713 lines
22 KiB
JSON
713 lines
22 KiB
JSON
{
|
|
"general-alerting": [
|
|
{
|
|
"name": "nebula-ci",
|
|
"interval": "1m",
|
|
"rules": [
|
|
{
|
|
"expr": "",
|
|
"for": "5m",
|
|
"labels": {
|
|
"issue": "pending_builds",
|
|
"service": "ci"
|
|
},
|
|
"annotations": {
|
|
"__dashboardUid__": "Ha2VyrtGz",
|
|
"__panelId__": "8",
|
|
"description": "monitor the number of pending CI builds and alert if the number has been > 3 for the duration of 5 minutes",
|
|
"summary": "more than 3 CI builds have been in a pending state for (at least) the duration of 5 minutes"
|
|
},
|
|
"grafana_alert": {
|
|
"id": 2,
|
|
"orgId": 2,
|
|
"title": "ci - pending builds",
|
|
"condition": "E",
|
|
"data": [
|
|
{
|
|
"refId": "A",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 86400,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "wntsJ72Gz",
|
|
"model": {
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "wntsJ72Gz"
|
|
},
|
|
"exemplar": true,
|
|
"expr": "drone_running_builds",
|
|
"format": "time_series",
|
|
"interval": "",
|
|
"intervalFactor": 1,
|
|
"intervalMs": 15000,
|
|
"legendFormat": "running builds",
|
|
"maxDataPoints": 43200,
|
|
"refId": "A"
|
|
}
|
|
},
|
|
{
|
|
"refId": "C",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 43200,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "wntsJ72Gz",
|
|
"model": {
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "wntsJ72Gz"
|
|
},
|
|
"editorMode": "code",
|
|
"exemplar": true,
|
|
"expr": "drone_running_jobs",
|
|
"hide": false,
|
|
"interval": "",
|
|
"intervalFactor": 1,
|
|
"intervalMs": 15000,
|
|
"legendFormat": "running jobs",
|
|
"maxDataPoints": 43200,
|
|
"range": true,
|
|
"refId": "C"
|
|
}
|
|
},
|
|
{
|
|
"refId": "B",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 86400,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "wntsJ72Gz",
|
|
"model": {
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "wntsJ72Gz"
|
|
},
|
|
"expr": "drone_pending_builds",
|
|
"format": "time_series",
|
|
"interval": "",
|
|
"intervalMs": 15000,
|
|
"legendFormat": "pending builds",
|
|
"maxDataPoints": 43200,
|
|
"refId": "B"
|
|
}
|
|
},
|
|
{
|
|
"refId": "D",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 86400,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "wntsJ72Gz",
|
|
"model": {
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "wntsJ72Gz"
|
|
},
|
|
"exemplar": true,
|
|
"expr": "drone_pending_jobs",
|
|
"hide": false,
|
|
"interval": "",
|
|
"intervalMs": 15000,
|
|
"legendFormat": "pending jobs",
|
|
"maxDataPoints": 43200,
|
|
"refId": "D"
|
|
}
|
|
},
|
|
{
|
|
"refId": "E",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 0,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "-100",
|
|
"model": {
|
|
"conditions": [
|
|
{
|
|
"evaluator": {
|
|
"params": [
|
|
3
|
|
],
|
|
"type": "gt"
|
|
},
|
|
"operator": {
|
|
"type": "when"
|
|
},
|
|
"query": {
|
|
"params": [
|
|
"B"
|
|
]
|
|
},
|
|
"reducer": {
|
|
"params": [],
|
|
"type": "last"
|
|
},
|
|
"type": "query"
|
|
}
|
|
],
|
|
"datasource": {
|
|
"type": "__expr__",
|
|
"uid": "-100"
|
|
},
|
|
"hide": false,
|
|
"intervalMs": 1000,
|
|
"maxDataPoints": 43200,
|
|
"refId": "E",
|
|
"type": "classic_conditions"
|
|
}
|
|
}
|
|
],
|
|
"updated": "2022-08-16T11:01:40Z",
|
|
"intervalSeconds": 60,
|
|
"version": 8,
|
|
"uid": "r6IWmzg4z",
|
|
"namespace_uid": "cxtgDx37z",
|
|
"namespace_id": 11,
|
|
"rule_group": "nebula-ci",
|
|
"no_data_state": "NoData",
|
|
"exec_err_state": "Alerting"
|
|
}
|
|
},
|
|
{
|
|
"expr": "",
|
|
"for": "5m",
|
|
"labels": {
|
|
"issue": "pending_jobs",
|
|
"service": "ci"
|
|
},
|
|
"annotations": {
|
|
"__dashboardUid__": "Ha2VyrtGz",
|
|
"__panelId__": "8",
|
|
"description": "monitor the number of pending CI jobs and alert if the number has been > 3 for the duration of 5 minutes",
|
|
"summary": "more than 3 CI jobs have been in a pending state for (at least) the duration of 5 minutes"
|
|
},
|
|
"grafana_alert": {
|
|
"id": 3,
|
|
"orgId": 2,
|
|
"title": "ci - pending jobs",
|
|
"condition": "E",
|
|
"data": [
|
|
{
|
|
"refId": "A",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 86400,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "wntsJ72Gz",
|
|
"model": {
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "wntsJ72Gz"
|
|
},
|
|
"exemplar": true,
|
|
"expr": "drone_running_builds",
|
|
"format": "time_series",
|
|
"interval": "",
|
|
"intervalFactor": 1,
|
|
"intervalMs": 15000,
|
|
"legendFormat": "running builds",
|
|
"maxDataPoints": 43200,
|
|
"refId": "A"
|
|
}
|
|
},
|
|
{
|
|
"refId": "C",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 86400,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "wntsJ72Gz",
|
|
"model": {
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "wntsJ72Gz"
|
|
},
|
|
"exemplar": true,
|
|
"expr": "drone_running_jobs",
|
|
"hide": false,
|
|
"interval": "",
|
|
"intervalFactor": 1,
|
|
"intervalMs": 15000,
|
|
"legendFormat": "running jobs",
|
|
"maxDataPoints": 43200,
|
|
"refId": "C"
|
|
}
|
|
},
|
|
{
|
|
"refId": "B",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 86400,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "wntsJ72Gz",
|
|
"model": {
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "wntsJ72Gz"
|
|
},
|
|
"expr": "drone_pending_builds",
|
|
"format": "time_series",
|
|
"interval": "",
|
|
"intervalMs": 15000,
|
|
"legendFormat": "pending builds",
|
|
"maxDataPoints": 43200,
|
|
"refId": "B"
|
|
}
|
|
},
|
|
{
|
|
"refId": "D",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 86400,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "wntsJ72Gz",
|
|
"model": {
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "wntsJ72Gz"
|
|
},
|
|
"exemplar": true,
|
|
"expr": "drone_pending_jobs",
|
|
"hide": false,
|
|
"interval": "",
|
|
"intervalMs": 15000,
|
|
"legendFormat": "pending jobs",
|
|
"maxDataPoints": 43200,
|
|
"refId": "D"
|
|
}
|
|
},
|
|
{
|
|
"refId": "E",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 0,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "-100",
|
|
"model": {
|
|
"conditions": [
|
|
{
|
|
"evaluator": {
|
|
"params": [
|
|
3
|
|
],
|
|
"type": "gt"
|
|
},
|
|
"operator": {
|
|
"type": "and"
|
|
},
|
|
"query": {
|
|
"params": [
|
|
"D"
|
|
]
|
|
},
|
|
"reducer": {
|
|
"params": [],
|
|
"type": "last"
|
|
},
|
|
"type": "query"
|
|
}
|
|
],
|
|
"datasource": {
|
|
"type": "__expr__",
|
|
"uid": "-100"
|
|
},
|
|
"hide": false,
|
|
"intervalMs": 1000,
|
|
"maxDataPoints": 43200,
|
|
"refId": "E",
|
|
"type": "classic_conditions"
|
|
}
|
|
}
|
|
],
|
|
"updated": "2022-07-14T12:34:50Z",
|
|
"intervalSeconds": 60,
|
|
"version": 3,
|
|
"uid": "20B7ZzRVz",
|
|
"namespace_uid": "cxtgDx37z",
|
|
"namespace_id": 11,
|
|
"rule_group": "nebula-ci",
|
|
"no_data_state": "NoData",
|
|
"exec_err_state": "Alerting"
|
|
}
|
|
},
|
|
{
|
|
"expr": "",
|
|
"for": "5m",
|
|
"labels": {
|
|
"alertname": "ci - missing build count data",
|
|
"issue": "missing_build_count_data",
|
|
"service": "ci"
|
|
},
|
|
"annotations": {
|
|
"__dashboardUid__": "Ha2VyrtGz",
|
|
"__panelId__": "8",
|
|
"description": "we are missing build count data, which likely means that Prometheus wasn't able to scrape CI (likely because CI is down).",
|
|
"summary": "Prometheus wasn't able to scrape fresh build count data from CI."
|
|
},
|
|
"grafana_alert": {
|
|
"id": 4,
|
|
"orgId": 2,
|
|
"title": "ci - missing build count data",
|
|
"condition": "F",
|
|
"data": [
|
|
{
|
|
"refId": "A",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 86400,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "wntsJ72Gz",
|
|
"model": {
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "wntsJ72Gz"
|
|
},
|
|
"exemplar": true,
|
|
"expr": "drone_running_builds",
|
|
"format": "time_series",
|
|
"interval": "",
|
|
"intervalFactor": 1,
|
|
"intervalMs": 15000,
|
|
"legendFormat": "running builds",
|
|
"maxDataPoints": 43200,
|
|
"refId": "A"
|
|
}
|
|
},
|
|
{
|
|
"refId": "B",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 86400,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "wntsJ72Gz",
|
|
"model": {
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "wntsJ72Gz"
|
|
},
|
|
"exemplar": true,
|
|
"expr": "drone_running_jobs",
|
|
"hide": false,
|
|
"interval": "",
|
|
"intervalFactor": 1,
|
|
"intervalMs": 15000,
|
|
"legendFormat": "running jobs",
|
|
"maxDataPoints": 43200,
|
|
"refId": "B"
|
|
}
|
|
},
|
|
{
|
|
"refId": "C",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 86400,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "wntsJ72Gz",
|
|
"model": {
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "wntsJ72Gz"
|
|
},
|
|
"expr": "drone_pending_builds",
|
|
"format": "time_series",
|
|
"interval": "",
|
|
"intervalMs": 15000,
|
|
"legendFormat": "pending builds",
|
|
"maxDataPoints": 43200,
|
|
"refId": "C"
|
|
}
|
|
},
|
|
{
|
|
"refId": "D",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 86400,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "wntsJ72Gz",
|
|
"model": {
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "wntsJ72Gz"
|
|
},
|
|
"exemplar": true,
|
|
"expr": "drone_pending_jobs",
|
|
"hide": false,
|
|
"interval": "",
|
|
"intervalMs": 15000,
|
|
"legendFormat": "pending jobs",
|
|
"maxDataPoints": 43200,
|
|
"refId": "D"
|
|
}
|
|
},
|
|
{
|
|
"refId": "E",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 86400,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "wntsJ72Gz",
|
|
"model": {
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "wntsJ72Gz"
|
|
},
|
|
"editorMode": "code",
|
|
"exemplar": false,
|
|
"expr": "absent(drone_build_count)",
|
|
"format": "time_series",
|
|
"hide": false,
|
|
"instant": false,
|
|
"interval": "",
|
|
"intervalMs": 1000,
|
|
"legendFormat": "missing build count data",
|
|
"maxDataPoints": 43200,
|
|
"range": true,
|
|
"refId": "E"
|
|
}
|
|
},
|
|
{
|
|
"refId": "F",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 0,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "-100",
|
|
"model": {
|
|
"conditions": [
|
|
{
|
|
"evaluator": {
|
|
"params": [
|
|
0,
|
|
0
|
|
],
|
|
"type": "gt"
|
|
},
|
|
"operator": {
|
|
"type": "and"
|
|
},
|
|
"query": {
|
|
"params": [
|
|
"E"
|
|
]
|
|
},
|
|
"reducer": {
|
|
"params": [],
|
|
"type": "last"
|
|
},
|
|
"type": "query"
|
|
},
|
|
{
|
|
"evaluator": {
|
|
"params": [
|
|
0,
|
|
0
|
|
],
|
|
"type": "no_value"
|
|
},
|
|
"operator": {
|
|
"type": "and"
|
|
},
|
|
"query": {
|
|
"params": [
|
|
"A"
|
|
]
|
|
},
|
|
"reducer": {
|
|
"params": [],
|
|
"type": "last"
|
|
},
|
|
"type": "query"
|
|
},
|
|
{
|
|
"evaluator": {
|
|
"params": [
|
|
0,
|
|
0
|
|
],
|
|
"type": "no_value"
|
|
},
|
|
"operator": {
|
|
"type": "and"
|
|
},
|
|
"query": {
|
|
"params": [
|
|
"B"
|
|
]
|
|
},
|
|
"reducer": {
|
|
"params": [],
|
|
"type": "last"
|
|
},
|
|
"type": "query"
|
|
}
|
|
],
|
|
"datasource": {
|
|
"name": "Expression",
|
|
"type": "__expr__",
|
|
"uid": "__expr__"
|
|
},
|
|
"hide": false,
|
|
"intervalMs": 1000,
|
|
"maxDataPoints": 43200,
|
|
"refId": "F",
|
|
"type": "classic_conditions"
|
|
}
|
|
}
|
|
],
|
|
"updated": "2022-08-16T11:02:29Z",
|
|
"intervalSeconds": 60,
|
|
"version": 12,
|
|
"uid": "Tw1EuLiVz",
|
|
"namespace_uid": "cxtgDx37z",
|
|
"namespace_id": 11,
|
|
"rule_group": "nebula-ci",
|
|
"no_data_state": "NoData",
|
|
"exec_err_state": "Alerting"
|
|
}
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "nebula",
|
|
"interval": "5m",
|
|
"rules": [
|
|
{
|
|
"expr": "",
|
|
"for": "30m",
|
|
"annotations": {
|
|
"__dashboardUid__": "Ha2VyrtGz",
|
|
"__panelId__": "4",
|
|
"summary": "free bytes on the /rootfs filesystem dropped below threshold (currently 35 GB)"
|
|
},
|
|
"grafana_alert": {
|
|
"id": 1,
|
|
"orgId": 2,
|
|
"title": "node_rootfs-low_on_disk_space",
|
|
"condition": "B",
|
|
"data": [
|
|
{
|
|
"refId": "A",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 3600,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "wntsJ72Gz",
|
|
"model": {
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "wntsJ72Gz"
|
|
},
|
|
"editorMode": "code",
|
|
"exemplar": false,
|
|
"expr": "node_filesystem_free_bytes{mountpoint=\"/rootfs\"} / 1000000000",
|
|
"hide": false,
|
|
"instant": false,
|
|
"interval": "",
|
|
"intervalMs": 15000,
|
|
"legendFormat": "{{mountpoint}}",
|
|
"maxDataPoints": 43200,
|
|
"range": true,
|
|
"refId": "A"
|
|
}
|
|
},
|
|
{
|
|
"refId": "B",
|
|
"queryType": "",
|
|
"relativeTimeRange": {
|
|
"from": 0,
|
|
"to": 0
|
|
},
|
|
"datasourceUid": "-100",
|
|
"model": {
|
|
"conditions": [
|
|
{
|
|
"evaluator": {
|
|
"params": [
|
|
35,
|
|
0
|
|
],
|
|
"type": "lt"
|
|
},
|
|
"operator": {
|
|
"type": "when"
|
|
},
|
|
"query": {
|
|
"params": [
|
|
"A"
|
|
]
|
|
},
|
|
"reducer": {
|
|
"params": [],
|
|
"type": "min"
|
|
},
|
|
"type": "query"
|
|
},
|
|
{
|
|
"evaluator": {
|
|
"params": [
|
|
0,
|
|
0
|
|
],
|
|
"type": "no_value"
|
|
},
|
|
"operator": {
|
|
"type": "or"
|
|
},
|
|
"query": {
|
|
"params": [
|
|
"A"
|
|
]
|
|
},
|
|
"reducer": {
|
|
"params": [],
|
|
"type": "last"
|
|
},
|
|
"type": "query"
|
|
}
|
|
],
|
|
"datasource": {
|
|
"name": "Expression",
|
|
"type": "__expr__",
|
|
"uid": "__expr__"
|
|
},
|
|
"hide": false,
|
|
"intervalMs": 1000,
|
|
"maxDataPoints": 43200,
|
|
"refId": "B",
|
|
"type": "classic_conditions"
|
|
}
|
|
}
|
|
],
|
|
"updated": "2022-08-16T11:03:09Z",
|
|
"intervalSeconds": 300,
|
|
"version": 4,
|
|
"uid": "LZoRhwenk",
|
|
"namespace_uid": "cxtgDx37z",
|
|
"namespace_id": 11,
|
|
"rule_group": "nebula",
|
|
"no_data_state": "NoData",
|
|
"exec_err_state": "Error"
|
|
}
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|