diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..fbeb44b
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,35 @@
+# Export / import Grafana alert rules as JSON.
+# GRAFANA_TOKEN must be set (environment or make command line); it is sent as
+# a Bearer token on the download request.
+#
+# ref:
+# https://community.grafana.com/t/ngalert-grafana-8-alert-feature-how-to-export-import-alerts-as-yml-json/51677/22
+
+# These targets are commands, not files, so always run them.
+.PHONY: CHECK-GRAFANA-TOKEN DOWNLOAD-GRAFANA-ALERTS UPLOAD-GRAFANA-ALERTS
+
+# Abort with a clear message when GRAFANA_TOKEN is unset or empty.
+CHECK-GRAFANA-TOKEN:
+ifndef GRAFANA_TOKEN
+	$(error GRAFANA_TOKEN is required)
+endif
+
+# Download all rules and store them pretty-printed in ./alerts/alerts.json.
+# - --fail turns HTTP errors into a non-zero exit instead of piping the error
+#   body into the export.
+# - $${GRAFANA_TOKEN} is expanded by the shell rather than by make, so the
+#   secret does not show up in the command line that make echoes.
+# - The response is written and formatted in scratch files first, so a failed
+#   download or invalid JSON never truncates the existing export.
+DOWNLOAD-GRAFANA-ALERTS: CHECK-GRAFANA-TOKEN
+	curl --fail --silent --show-error \
+	  -H "Authorization: Bearer $${GRAFANA_TOKEN}" \
+	  -o './alerts/alerts.json.download' \
+	  'https://grafana.dotya.ml/api/ruler/grafana/api/v1/rules'
+	jq . './alerts/alerts.json.download' > './alerts/alerts.json.tmp'
+	mv './alerts/alerts.json.tmp' './alerts/alerts.json'
+	rm -f './alerts/alerts.json.download'
+
+# Run the upload helper, which reads ./alerts/alerts.json.
+UPLOAD-GRAFANA-ALERTS: CHECK-GRAFANA-TOKEN
+	./upload-grafana-alerts.sh
diff --git a/alerts/alerts.json b/alerts/alerts.json
new file mode 100644
index 0000000..d547d0e
--- /dev/null
+++ b/alerts/alerts.json
@@ -0,0 +1,712 @@
+{
+  "general-alerting": [
+    {
+      "name": "nebula-ci",
+      "interval": "1m",
+      "rules": [
+        {
+          "expr": "",
+          "for": "5m",
+          "labels": {
+            "issue": "pending_builds",
+            "service": "ci"
+          },
+          "annotations": {
+            "__dashboardUid__": "Ha2VyrtGz",
+            "__panelId__": "8",
+            "description": "monitor the number of pending CI builds and alert if the number has been >= 3 for the duration of 5 minutes",
+            "summary": "3 (or more) CI builds have been in a pending state for (at least) the duration of 5 minutes"
+          },
+          "grafana_alert": {
+            "id": 2,
+            "orgId": 2,
+            "title": "ci - pending builds",
+            "condition": "E",
+            "data": [
+              {
+                "refId": "A",
+                "queryType": "",
+                "relativeTimeRange": {
+                  "from": 86400,
+                  "to": 0
+                },
+                "datasourceUid": "wntsJ72Gz",
+                "model": {
+                  "datasource": {
+                    "type": "prometheus",
+                    "uid": "wntsJ72Gz"
+                  },
+                  "exemplar": true,
+                  "expr": "drone_running_builds",
+                  "format": "time_series",
+                  "interval": "",
+                  "intervalFactor": 1,
+                  "intervalMs": 15000,
+                  "legendFormat": "running builds",
+                  "maxDataPoints": 43200,
+                  "refId": "A"
+                }
+              },
+              {
+                "refId": "C",
+                "queryType": "",
+                "relativeTimeRange": {
+                  "from": 43200,
+                  "to": 0
+                },
+                "datasourceUid": "wntsJ72Gz",
+                "model": {
+                  "datasource": {
+                    "type":
"prometheus", + "uid": "wntsJ72Gz" + }, + "editorMode": "code", + "exemplar": true, + "expr": "drone_running_jobs", + "hide": false, + "interval": "", + "intervalFactor": 1, + "intervalMs": 15000, + "legendFormat": "running jobs", + "maxDataPoints": 43200, + "range": true, + "refId": "C" + } + }, + { + "refId": "B", + "queryType": "", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "datasourceUid": "wntsJ72Gz", + "model": { + "datasource": { + "type": "prometheus", + "uid": "wntsJ72Gz" + }, + "expr": "drone_pending_builds", + "format": "time_series", + "interval": "", + "intervalMs": 15000, + "legendFormat": "pending builds", + "maxDataPoints": 43200, + "refId": "B" + } + }, + { + "refId": "D", + "queryType": "", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "datasourceUid": "wntsJ72Gz", + "model": { + "datasource": { + "type": "prometheus", + "uid": "wntsJ72Gz" + }, + "exemplar": true, + "expr": "drone_pending_jobs", + "hide": false, + "interval": "", + "intervalMs": 15000, + "legendFormat": "pending jobs", + "maxDataPoints": 43200, + "refId": "D" + } + }, + { + "refId": "E", + "queryType": "", + "relativeTimeRange": { + "from": 0, + "to": 0 + }, + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 3 + ], + "type": "gt" + }, + "operator": { + "type": "when" + }, + "query": { + "params": [ + "B" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "hide": false, + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "E", + "type": "classic_conditions" + } + } + ], + "updated": "2022-08-16T11:01:40Z", + "intervalSeconds": 60, + "version": 8, + "uid": "r6IWmzg4z", + "namespace_uid": "cxtgDx37z", + "namespace_id": 11, + "rule_group": "nebula-ci", + "no_data_state": "NoData", + "exec_err_state": "Alerting" + } + }, + { + "expr": "", + "for": "5m", + "labels": { + "issue": "pending_jobs", + "service": 
"ci" + }, + "annotations": { + "__dashboardUid__": "Ha2VyrtGz", + "__panelId__": "8", + "description": "monitor the number of pending CI jobs and alert if the number has been >=3 for the duration of 5 minutes", + "summary": "3 (or more) CI jobs have been in a pending state for (at least) the duration of 5 minutes" + }, + "grafana_alert": { + "id": 3, + "orgId": 2, + "title": "ci - pending jobs", + "condition": "E", + "data": [ + { + "refId": "A", + "queryType": "", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "datasourceUid": "wntsJ72Gz", + "model": { + "datasource": { + "type": "prometheus", + "uid": "wntsJ72Gz" + }, + "exemplar": true, + "expr": "drone_running_builds", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "intervalMs": 15000, + "legendFormat": "running builds", + "maxDataPoints": 43200, + "refId": "A" + } + }, + { + "refId": "C", + "queryType": "", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "datasourceUid": "wntsJ72Gz", + "model": { + "datasource": { + "type": "prometheus", + "uid": "wntsJ72Gz" + }, + "exemplar": true, + "expr": "drone_running_jobs", + "hide": false, + "interval": "", + "intervalFactor": 1, + "intervalMs": 15000, + "legendFormat": "running jobs", + "maxDataPoints": 43200, + "refId": "C" + } + }, + { + "refId": "B", + "queryType": "", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "datasourceUid": "wntsJ72Gz", + "model": { + "datasource": { + "type": "prometheus", + "uid": "wntsJ72Gz" + }, + "expr": "drone_pending_builds", + "format": "time_series", + "interval": "", + "intervalMs": 15000, + "legendFormat": "pending builds", + "maxDataPoints": 43200, + "refId": "B" + } + }, + { + "refId": "D", + "queryType": "", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "datasourceUid": "wntsJ72Gz", + "model": { + "datasource": { + "type": "prometheus", + "uid": "wntsJ72Gz" + }, + "exemplar": true, + "expr": "drone_pending_jobs", + "hide": false, + "interval": "", + 
"intervalMs": 15000, + "legendFormat": "pending jobs", + "maxDataPoints": 43200, + "refId": "D" + } + }, + { + "refId": "E", + "queryType": "", + "relativeTimeRange": { + "from": 0, + "to": 0 + }, + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 3 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "D" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "hide": false, + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "E", + "type": "classic_conditions" + } + } + ], + "updated": "2022-07-14T12:34:50Z", + "intervalSeconds": 60, + "version": 3, + "uid": "20B7ZzRVz", + "namespace_uid": "cxtgDx37z", + "namespace_id": 11, + "rule_group": "nebula-ci", + "no_data_state": "NoData", + "exec_err_state": "Alerting" + } + }, + { + "expr": "", + "for": "5m", + "labels": { + "alertname": "ci - missing build count data", + "issue": "missing_build_count_data", + "service": "ci" + }, + "annotations": { + "__dashboardUid__": "Ha2VyrtGz", + "__panelId__": "8", + "description": "we are missing build count data, which likely means that Prometheus wasn't able to scrape CI (likely because CI is down).", + "summary": "Prometheus wasn't able to scrape fresh build count data from CI." 
+ }, + "grafana_alert": { + "id": 4, + "orgId": 2, + "title": "ci - missing build count data", + "condition": "F", + "data": [ + { + "refId": "A", + "queryType": "", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "datasourceUid": "wntsJ72Gz", + "model": { + "datasource": { + "type": "prometheus", + "uid": "wntsJ72Gz" + }, + "exemplar": true, + "expr": "drone_running_builds", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "intervalMs": 15000, + "legendFormat": "running builds", + "maxDataPoints": 43200, + "refId": "A" + } + }, + { + "refId": "B", + "queryType": "", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "datasourceUid": "wntsJ72Gz", + "model": { + "datasource": { + "type": "prometheus", + "uid": "wntsJ72Gz" + }, + "exemplar": true, + "expr": "drone_running_jobs", + "hide": false, + "interval": "", + "intervalFactor": 1, + "intervalMs": 15000, + "legendFormat": "running jobs", + "maxDataPoints": 43200, + "refId": "B" + } + }, + { + "refId": "C", + "queryType": "", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "datasourceUid": "wntsJ72Gz", + "model": { + "datasource": { + "type": "prometheus", + "uid": "wntsJ72Gz" + }, + "expr": "drone_pending_builds", + "format": "time_series", + "interval": "", + "intervalMs": 15000, + "legendFormat": "pending builds", + "maxDataPoints": 43200, + "refId": "C" + } + }, + { + "refId": "D", + "queryType": "", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "datasourceUid": "wntsJ72Gz", + "model": { + "datasource": { + "type": "prometheus", + "uid": "wntsJ72Gz" + }, + "exemplar": true, + "expr": "drone_pending_jobs", + "hide": false, + "interval": "", + "intervalMs": 15000, + "legendFormat": "pending jobs", + "maxDataPoints": 43200, + "refId": "D" + } + }, + { + "refId": "E", + "queryType": "", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "datasourceUid": "wntsJ72Gz", + "model": { + "datasource": { + "type": "prometheus", + "uid": "wntsJ72Gz" + 
}, + "editorMode": "code", + "exemplar": false, + "expr": "absent(drone_build_count)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalMs": 1000, + "legendFormat": "missing build count data", + "maxDataPoints": 43200, + "range": true, + "refId": "E" + } + }, + { + "refId": "F", + "queryType": "", + "relativeTimeRange": { + "from": 0, + "to": 0 + }, + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 0, + 0 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "E" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + }, + { + "evaluator": { + "params": [ + 0, + 0 + ], + "type": "no_value" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + }, + { + "evaluator": { + "params": [ + 0, + 0 + ], + "type": "no_value" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "datasource": { + "name": "Expression", + "type": "__expr__", + "uid": "__expr__" + }, + "hide": false, + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "F", + "type": "classic_conditions" + } + } + ], + "updated": "2022-08-16T11:02:29Z", + "intervalSeconds": 60, + "version": 12, + "uid": "Tw1EuLiVz", + "namespace_uid": "cxtgDx37z", + "namespace_id": 11, + "rule_group": "nebula-ci", + "no_data_state": "NoData", + "exec_err_state": "Alerting" + } + } + ] + }, + { + "name": "nebula", + "interval": "5m", + "rules": [ + { + "expr": "", + "for": "30m", + "annotations": { + "__dashboardUid__": "Ha2VyrtGz", + "__panelId__": "4", + "summary": "fs free bytes dropped below threshold (currently 35GB)" + }, + "grafana_alert": { + "id": 1, + "orgId": 2, + "title": "node_rootfs-low_on_disk_space", + "condition": "B", + "data": [ + { + "refId": 
"A", + "queryType": "", + "relativeTimeRange": { + "from": 3600, + "to": 0 + }, + "datasourceUid": "wntsJ72Gz", + "model": { + "datasource": { + "type": "prometheus", + "uid": "wntsJ72Gz" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_filesystem_free_bytes{mountpoint=\"/rootfs\"} / 1000000000", + "hide": false, + "instant": false, + "interval": "", + "intervalMs": 15000, + "legendFormat": "{{mountpoint}}", + "maxDataPoints": 43200, + "range": true, + "refId": "A" + } + }, + { + "refId": "B", + "queryType": "", + "relativeTimeRange": { + "from": 0, + "to": 0 + }, + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 35, + 0 + ], + "type": "lt" + }, + "operator": { + "type": "when" + }, + "query": { + "params": [ + "A" + ] + }, + "reducer": { + "params": [], + "type": "min" + }, + "type": "query" + }, + { + "evaluator": { + "params": [ + 0, + 0 + ], + "type": "no_value" + }, + "operator": { + "type": "or" + }, + "query": { + "params": [ + "A" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "datasource": { + "name": "Expression", + "type": "__expr__", + "uid": "__expr__" + }, + "hide": false, + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "B", + "type": "classic_conditions" + } + } + ], + "updated": "2022-08-16T11:03:09Z", + "intervalSeconds": 300, + "version": 4, + "uid": "LZoRhwenk", + "namespace_uid": "cxtgDx37z", + "namespace_id": 11, + "rule_group": "nebula", + "no_data_state": "NoData", + "exec_err_state": "Error" + } + } + ] + } + ] +} diff --git a/upload-grafana-alerts.sh b/upload-grafana-alerts.sh new file mode 100755 index 0000000..cdf70e5 --- /dev/null +++ b/upload-grafana-alerts.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +ALERTS_JSON_PATH=./alerts/alerts.json +NUMBER_OF_ALERTS=$(jq -c '.["folder-name"] | length' ${ALERTS_JSON_PATH}) + +for ((i=0; i