add a way to save/restore alerts

This commit is contained in:
surtur 2022-08-16 15:33:18 +02:00
parent 3cc42597d1
commit 8379ab4c8c
Signed by: wanderer
GPG Key ID: 19CE1EC1D9E0486D
3 changed files with 744 additions and 0 deletions

16
Makefile Normal file

@ -0,0 +1,16 @@
# ref:
# https://community.grafana.com/t/ngalert-grafana-8-alert-feature-how-to-export-import-alerts-as-yml-json/51677/22
# Guard target: abort with a clear message when GRAFANA_TOKEN is unset or
# empty. Other targets list it as a prerequisite so the check runs first.
#
# `ifndef` is evaluated while the Makefile is parsed, so the `$(error ...)`
# line only becomes part of this recipe when the token is missing. Because it
# is a (TAB-indented) recipe line, it is expanded - and make stops - only when
# this target is actually being built, not on every invocation of make.
.PHONY: CHECK-GRAFANA-TOKEN
CHECK-GRAFANA-TOKEN:
ifndef GRAFANA_TOKEN
	$(error GRAFANA_TOKEN is required)
endif
# Save all Grafana-managed alert rules to ./alerts/alerts.json
# (pretty-printed by jq).
#
# - `$$GRAFANA_TOKEN` is expanded by the shell, not by make, so the secret is
#   not printed when make echoes the recipe. The token must therefore be in
#   the environment or given on the make command line (make exports both to
#   recipes).
# - `--fail` makes curl exit non-zero on an HTTP error instead of handing the
#   error body to jq.
# - The response is downloaded and formatted into temporary files and only
#   moved over alerts.json once every step succeeded, so a failed run never
#   clobbers the previously saved alerts.
.PHONY: DOWNLOAD-GRAFANA-ALERTS
DOWNLOAD-GRAFANA-ALERTS: CHECK-GRAFANA-TOKEN
	mkdir -p './alerts'
	curl --fail --silent --show-error -X GET \
		-H "Authorization: Bearer $$GRAFANA_TOKEN" \
		-o './alerts/alerts.json.download' \
		'https://grafana.dotya.ml/api/ruler/grafana/api/v1/rules'
	jq '.' './alerts/alerts.json.download' > './alerts/alerts.json.tmp'
	mv -f './alerts/alerts.json.tmp' './alerts/alerts.json'
	$(RM) './alerts/alerts.json.download'
# Restore alert rules by running ./upload-grafana-alerts.sh from this
# directory. The script reads GRAFANA_TOKEN from its environment; the
# CHECK-GRAFANA-TOKEN prerequisite rejects a missing token up front.
# Declared phony: this is a command, not a file to be built.
.PHONY: UPLOAD-GRAFANA-ALERTS
UPLOAD-GRAFANA-ALERTS: CHECK-GRAFANA-TOKEN
	./upload-grafana-alerts.sh

712
alerts/alerts.json Normal file

@ -0,0 +1,712 @@
{
"general-alerting": [
{
"name": "nebula-ci",
"interval": "1m",
"rules": [
{
"expr": "",
"for": "5m",
"labels": {
"issue": "pending_builds",
"service": "ci"
},
"annotations": {
"__dashboardUid__": "Ha2VyrtGz",
"__panelId__": "8",
"description": "monitor the number of pending CI builds and alert if the number has been >= 3 for the duration of 5 minutes",
"summary": "3 (or more) CI builds have been in a pending state for (at least) the duration of 5 minutes"
},
"grafana_alert": {
"id": 2,
"orgId": 2,
"title": "ci - pending builds",
"condition": "E",
"data": [
{
"refId": "A",
"queryType": "",
"relativeTimeRange": {
"from": 86400,
"to": 0
},
"datasourceUid": "wntsJ72Gz",
"model": {
"datasource": {
"type": "prometheus",
"uid": "wntsJ72Gz"
},
"exemplar": true,
"expr": "drone_running_builds",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"intervalMs": 15000,
"legendFormat": "running builds",
"maxDataPoints": 43200,
"refId": "A"
}
},
{
"refId": "C",
"queryType": "",
"relativeTimeRange": {
"from": 43200,
"to": 0
},
"datasourceUid": "wntsJ72Gz",
"model": {
"datasource": {
"type": "prometheus",
"uid": "wntsJ72Gz"
},
"editorMode": "code",
"exemplar": true,
"expr": "drone_running_jobs",
"hide": false,
"interval": "",
"intervalFactor": 1,
"intervalMs": 15000,
"legendFormat": "running jobs",
"maxDataPoints": 43200,
"range": true,
"refId": "C"
}
},
{
"refId": "B",
"queryType": "",
"relativeTimeRange": {
"from": 86400,
"to": 0
},
"datasourceUid": "wntsJ72Gz",
"model": {
"datasource": {
"type": "prometheus",
"uid": "wntsJ72Gz"
},
"expr": "drone_pending_builds",
"format": "time_series",
"interval": "",
"intervalMs": 15000,
"legendFormat": "pending builds",
"maxDataPoints": 43200,
"refId": "B"
}
},
{
"refId": "D",
"queryType": "",
"relativeTimeRange": {
"from": 86400,
"to": 0
},
"datasourceUid": "wntsJ72Gz",
"model": {
"datasource": {
"type": "prometheus",
"uid": "wntsJ72Gz"
},
"exemplar": true,
"expr": "drone_pending_jobs",
"hide": false,
"interval": "",
"intervalMs": 15000,
"legendFormat": "pending jobs",
"maxDataPoints": 43200,
"refId": "D"
}
},
{
"refId": "E",
"queryType": "",
"relativeTimeRange": {
"from": 0,
"to": 0
},
"datasourceUid": "-100",
"model": {
"conditions": [
{
"evaluator": {
"params": [
3
],
"type": "gt"
},
"operator": {
"type": "when"
},
"query": {
"params": [
"B"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "-100"
},
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "E",
"type": "classic_conditions"
}
}
],
"updated": "2022-08-16T11:01:40Z",
"intervalSeconds": 60,
"version": 8,
"uid": "r6IWmzg4z",
"namespace_uid": "cxtgDx37z",
"namespace_id": 11,
"rule_group": "nebula-ci",
"no_data_state": "NoData",
"exec_err_state": "Alerting"
}
},
{
"expr": "",
"for": "5m",
"labels": {
"issue": "pending_jobs",
"service": "ci"
},
"annotations": {
"__dashboardUid__": "Ha2VyrtGz",
"__panelId__": "8",
"description": "monitor the number of pending CI jobs and alert if the number has been >=3 for the duration of 5 minutes",
"summary": "3 (or more) CI jobs have been in a pending state for (at least) the duration of 5 minutes"
},
"grafana_alert": {
"id": 3,
"orgId": 2,
"title": "ci - pending jobs",
"condition": "E",
"data": [
{
"refId": "A",
"queryType": "",
"relativeTimeRange": {
"from": 86400,
"to": 0
},
"datasourceUid": "wntsJ72Gz",
"model": {
"datasource": {
"type": "prometheus",
"uid": "wntsJ72Gz"
},
"exemplar": true,
"expr": "drone_running_builds",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"intervalMs": 15000,
"legendFormat": "running builds",
"maxDataPoints": 43200,
"refId": "A"
}
},
{
"refId": "C",
"queryType": "",
"relativeTimeRange": {
"from": 86400,
"to": 0
},
"datasourceUid": "wntsJ72Gz",
"model": {
"datasource": {
"type": "prometheus",
"uid": "wntsJ72Gz"
},
"exemplar": true,
"expr": "drone_running_jobs",
"hide": false,
"interval": "",
"intervalFactor": 1,
"intervalMs": 15000,
"legendFormat": "running jobs",
"maxDataPoints": 43200,
"refId": "C"
}
},
{
"refId": "B",
"queryType": "",
"relativeTimeRange": {
"from": 86400,
"to": 0
},
"datasourceUid": "wntsJ72Gz",
"model": {
"datasource": {
"type": "prometheus",
"uid": "wntsJ72Gz"
},
"expr": "drone_pending_builds",
"format": "time_series",
"interval": "",
"intervalMs": 15000,
"legendFormat": "pending builds",
"maxDataPoints": 43200,
"refId": "B"
}
},
{
"refId": "D",
"queryType": "",
"relativeTimeRange": {
"from": 86400,
"to": 0
},
"datasourceUid": "wntsJ72Gz",
"model": {
"datasource": {
"type": "prometheus",
"uid": "wntsJ72Gz"
},
"exemplar": true,
"expr": "drone_pending_jobs",
"hide": false,
"interval": "",
"intervalMs": 15000,
"legendFormat": "pending jobs",
"maxDataPoints": 43200,
"refId": "D"
}
},
{
"refId": "E",
"queryType": "",
"relativeTimeRange": {
"from": 0,
"to": 0
},
"datasourceUid": "-100",
"model": {
"conditions": [
{
"evaluator": {
"params": [
3
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"D"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "-100"
},
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "E",
"type": "classic_conditions"
}
}
],
"updated": "2022-07-14T12:34:50Z",
"intervalSeconds": 60,
"version": 3,
"uid": "20B7ZzRVz",
"namespace_uid": "cxtgDx37z",
"namespace_id": 11,
"rule_group": "nebula-ci",
"no_data_state": "NoData",
"exec_err_state": "Alerting"
}
},
{
"expr": "",
"for": "5m",
"labels": {
"alertname": "ci - missing build count data",
"issue": "missing_build_count_data",
"service": "ci"
},
"annotations": {
"__dashboardUid__": "Ha2VyrtGz",
"__panelId__": "8",
"description": "we are missing build count data, which likely means that Prometheus wasn't able to scrape CI (likely because CI is down).",
"summary": "Prometheus wasn't able to scrape fresh build count data from CI."
},
"grafana_alert": {
"id": 4,
"orgId": 2,
"title": "ci - missing build count data",
"condition": "F",
"data": [
{
"refId": "A",
"queryType": "",
"relativeTimeRange": {
"from": 86400,
"to": 0
},
"datasourceUid": "wntsJ72Gz",
"model": {
"datasource": {
"type": "prometheus",
"uid": "wntsJ72Gz"
},
"exemplar": true,
"expr": "drone_running_builds",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"intervalMs": 15000,
"legendFormat": "running builds",
"maxDataPoints": 43200,
"refId": "A"
}
},
{
"refId": "B",
"queryType": "",
"relativeTimeRange": {
"from": 86400,
"to": 0
},
"datasourceUid": "wntsJ72Gz",
"model": {
"datasource": {
"type": "prometheus",
"uid": "wntsJ72Gz"
},
"exemplar": true,
"expr": "drone_running_jobs",
"hide": false,
"interval": "",
"intervalFactor": 1,
"intervalMs": 15000,
"legendFormat": "running jobs",
"maxDataPoints": 43200,
"refId": "B"
}
},
{
"refId": "C",
"queryType": "",
"relativeTimeRange": {
"from": 86400,
"to": 0
},
"datasourceUid": "wntsJ72Gz",
"model": {
"datasource": {
"type": "prometheus",
"uid": "wntsJ72Gz"
},
"expr": "drone_pending_builds",
"format": "time_series",
"interval": "",
"intervalMs": 15000,
"legendFormat": "pending builds",
"maxDataPoints": 43200,
"refId": "C"
}
},
{
"refId": "D",
"queryType": "",
"relativeTimeRange": {
"from": 86400,
"to": 0
},
"datasourceUid": "wntsJ72Gz",
"model": {
"datasource": {
"type": "prometheus",
"uid": "wntsJ72Gz"
},
"exemplar": true,
"expr": "drone_pending_jobs",
"hide": false,
"interval": "",
"intervalMs": 15000,
"legendFormat": "pending jobs",
"maxDataPoints": 43200,
"refId": "D"
}
},
{
"refId": "E",
"queryType": "",
"relativeTimeRange": {
"from": 86400,
"to": 0
},
"datasourceUid": "wntsJ72Gz",
"model": {
"datasource": {
"type": "prometheus",
"uid": "wntsJ72Gz"
},
"editorMode": "code",
"exemplar": false,
"expr": "absent(drone_build_count)",
"format": "time_series",
"hide": false,
"instant": false,
"interval": "",
"intervalMs": 1000,
"legendFormat": "missing build count data",
"maxDataPoints": 43200,
"range": true,
"refId": "E"
}
},
{
"refId": "F",
"queryType": "",
"relativeTimeRange": {
"from": 0,
"to": 0
},
"datasourceUid": "-100",
"model": {
"conditions": [
{
"evaluator": {
"params": [
0,
0
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"E"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
},
{
"evaluator": {
"params": [
0,
0
],
"type": "no_value"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
},
{
"evaluator": {
"params": [
0,
0
],
"type": "no_value"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"B"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"name": "Expression",
"type": "__expr__",
"uid": "__expr__"
},
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "F",
"type": "classic_conditions"
}
}
],
"updated": "2022-08-16T11:02:29Z",
"intervalSeconds": 60,
"version": 12,
"uid": "Tw1EuLiVz",
"namespace_uid": "cxtgDx37z",
"namespace_id": 11,
"rule_group": "nebula-ci",
"no_data_state": "NoData",
"exec_err_state": "Alerting"
}
}
]
},
{
"name": "nebula",
"interval": "5m",
"rules": [
{
"expr": "",
"for": "30m",
"annotations": {
"__dashboardUid__": "Ha2VyrtGz",
"__panelId__": "4",
"summary": "fs free bytes dropped below threshold (currently 35GB)"
},
"grafana_alert": {
"id": 1,
"orgId": 2,
"title": "node_rootfs-low_on_disk_space",
"condition": "B",
"data": [
{
"refId": "A",
"queryType": "",
"relativeTimeRange": {
"from": 3600,
"to": 0
},
"datasourceUid": "wntsJ72Gz",
"model": {
"datasource": {
"type": "prometheus",
"uid": "wntsJ72Gz"
},
"editorMode": "code",
"exemplar": false,
"expr": "node_filesystem_free_bytes{mountpoint=\"/rootfs\"} / 1000000000",
"hide": false,
"instant": false,
"interval": "",
"intervalMs": 15000,
"legendFormat": "{{mountpoint}}",
"maxDataPoints": 43200,
"range": true,
"refId": "A"
}
},
{
"refId": "B",
"queryType": "",
"relativeTimeRange": {
"from": 0,
"to": 0
},
"datasourceUid": "-100",
"model": {
"conditions": [
{
"evaluator": {
"params": [
35,
0
],
"type": "lt"
},
"operator": {
"type": "when"
},
"query": {
"params": [
"A"
]
},
"reducer": {
"params": [],
"type": "min"
},
"type": "query"
},
{
"evaluator": {
"params": [
0,
0
],
"type": "no_value"
},
"operator": {
"type": "or"
},
"query": {
"params": [
"A"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"name": "Expression",
"type": "__expr__",
"uid": "__expr__"
},
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "B",
"type": "classic_conditions"
}
}
],
"updated": "2022-08-16T11:03:09Z",
"intervalSeconds": 300,
"version": 4,
"uid": "LZoRhwenk",
"namespace_uid": "cxtgDx37z",
"namespace_id": 11,
"rule_group": "nebula",
"no_data_state": "NoData",
"exec_err_state": "Error"
}
}
]
}
]
}

16
upload-grafana-alerts.sh Executable file

@ -0,0 +1,16 @@
#!/bin/bash
# Restore Grafana alert rule groups from a saved ruler export.
#
# The export (ALERTS_JSON_PATH) is a JSON object that maps each folder name to
# an array of rule groups. Every group of every folder is POSTed to the ruler
# API endpoint of its folder.
#
# Environment:
#   GRAFANA_TOKEN  API token sent as a Bearer token (required).
#
# Exit status: 0 if every group was accepted, 1 if any upload failed.
set -euo pipefail

ALERTS_JSON_PATH=./alerts/alerts.json
RULER_URL='https://grafana.dotya.ml/api/ruler/grafana/api/v1/rules'

# Fail early with a readable message instead of sending an empty token.
: "${GRAFANA_TOKEN:?GRAFANA_TOKEN is required}"

# Folder names are the top-level keys of the export. Reading them from the
# file (rather than hard-coding one) keeps the script in sync with whatever
# was downloaded. A jq failure (missing/invalid file) aborts here via set -e.
FOLDER_NAMES=$(jq -r 'keys_unsorted[]' "${ALERTS_JSON_PATH}")

FAILURES=0
while IFS= read -r FOLDER_NAME; do
    # An export without folders yields a single empty line; skip it.
    [[ -n "${FOLDER_NAME}" ]] || continue

    # Folder names may contain spaces or other characters that are not
    # valid in a URL path segment.
    FOLDER_URI=$(jq -rn --arg f "${FOLDER_NAME}" '$f | @uri')
    NUMBER_OF_ALERTS=$(jq --arg f "${FOLDER_NAME}" '.[$f] | length' "${ALERTS_JSON_PATH}")

    for ((i=0; i<NUMBER_OF_ALERTS; i++)); do
        # Strip the uid of EVERY rule in the group (a group can hold several
        # rules), not just the first one.
        ALERT_OBJECT=$(jq -c --arg f "${FOLDER_NAME}" --arg i "$i" \
            '.[$f][($i | tonumber)] | del(.rules[].grafana_alert.uid)' \
            "${ALERTS_JSON_PATH}")
        ALERT_NAME=$(jq -r --arg f "${FOLDER_NAME}" --arg i "$i" \
            '.[$f][($i | tonumber)].name' "${ALERTS_JSON_PATH}")

        printf "Creating %s/%s..." "${FOLDER_NAME}" "${ALERT_NAME}"

        # The HTTP status code is appended on its own last line so that both
        # the response body and the status can be inspected.
        if ! RESPONSE=$(curl --silent --show-error -X POST \
            -H "Authorization: Bearer ${GRAFANA_TOKEN}" \
            -H "Content-type: application/json" \
            --write-out '\n%{http_code}' \
            "${RULER_URL}/${FOLDER_URI}" \
            -d "${ALERT_OBJECT}"); then
            printf "\nrequest failed\n" >&2
            FAILURES=$((FAILURES + 1))
            continue
        fi

        HTTP_STATUS=${RESPONSE##*$'\n'}
        printf "%s\n" "${RESPONSE%$'\n'*}"

        if [[ "${HTTP_STATUS}" != 2?? ]]; then
            printf "upload failed with HTTP status %s\n" "${HTTP_STATUS}" >&2
            FAILURES=$((FAILURES + 1))
        fi
    done
done <<< "${FOLDER_NAMES}"

if ((FAILURES > 0)); then
    printf "%d rule group upload(s) failed\n" "${FAILURES}" >&2
    exit 1
fi