# infrastructure/roles/prometheus/files/node.rules.yml

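# Prometheus alerting rules for the Arch Linux infrastructure, grouped by
# subsystem: node health, Prometheus self-monitoring, pacman, btrfs, borg
# backups, systemd units, GitLab, blackbox probes and rebuilderd.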
groups:
- name: node_common
interval: 60s
rules:
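# Fires when the average non-idle CPU usage across all cores exceeds 80% for
# 5 minutes. build.archlinux.org and the repro* hosts are excluded, presumably
# because sustained full load is expected there.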
- alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle",instance!~"build.archlinux.org",instance!~"repro1.pkgbuild.com",instance!~"repro2.pkgbuild.com"}[5m])) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Host high CPU load (instance {{ $labels.instance }})"
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Host swap is filling up (instance {{ $labels.instance }})"
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of memory (instance {{ $labels.instance }})"
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of disk space (instance {{ $labels.instance }})"
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
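# predict_linear() extrapolates the last hour of free-space samples 4 hours
# ahead; a predicted value below zero means the filesystem would fill up at the
# current write rate. tmpfs filesystems and /backup are excluded.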
- alert: HostDiskWillFillIn4Hours
expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs",mountpoint!~"/backup"}[1h], 4 * 3600) < 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})"
description: "Disk will fill in 4 hours at current write rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfInodes
expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of inodes (instance {{ $labels.instance }})"
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[5m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host OOM kill detected (instance {{ $labels.instance }})"
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
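# Self-monitoring of Prometheus, Alertmanager and the notification pipeline.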
- name: prometheus
interval: 60s
rules:
- alert: PrometheusTargetMissing
expr: up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus target missing (instance {{ $labels.instance }})"
description: "A Prometheus target has disappeared. An exporter might have crashed.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus too many restarts (instance {{ $labels.instance }})"
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusNotConnectedToAlertmanager
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
description: "Prometheus cannot connect to the Alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTemplateTextExpansionFailures
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus notifications backlog (instance {{ $labels.instance }})"
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
description: "Alertmanager is failing to send notifications\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus large scrape (instance {{ $labels.instance }})"
description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbCheckpointCreationFailures
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB compaction failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbWalTruncationsFailed
expr: increase(prometheus_tsdb_wal_truncations_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- name: pacman
interval: 2m
rules:
- alert: pacman_updates_pending
expr: pacman_updates_pending > 50
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} has out-of-date packages'
summary: '{{ $labels.instance }} has {{ $value }} (> 50) out-of-date packages'
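# btrfs device error counters; an alert fires when more than one error of the
# given type has been recorded on a host.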
- name: btrfs
interval: 2m
rules:
- alert: btrfs_corruption_errs
expr: btrfs_corruption_errs > 1
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} btrfs corruption errors'
summary: '{{ $labels.instance }} has {{ $value }} btrfs_corruption_errs'
- alert: btrfs_write_io_errs
expr: btrfs_write_io_errs > 1
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} btrfs write_io errors'
summary: '{{ $labels.instance }} has {{ $value }} btrfs_write_io_errs'
- alert: btrfs_read_io_errs
expr: btrfs_read_io_errs > 1
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} btrfs read_io errors'
summary: '{{ $labels.instance }} has {{ $value }} btrfs_read_io_errs'
- alert: btrfs_flush_io_errs
expr: btrfs_flush_io_errs > 1
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} btrfs flush_io errors'
summary: '{{ $labels.instance }} has {{ $value }} btrfs_flush_io_errs'
- name: borg
interval: 60s
rules:
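# Backups are presumably taken daily; the 86400 * 1.2 threshold allows roughly
# 5 hours of slack past the 24-hour mark before a missing archive alerts.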
- alert: BorgHetznerMissingBackup
expr: time() - borg_hetzner_last_archive_timestamp > 86400 * 1.2
for: 2m
labels:
severity: critical
annotations:
summary: 'Borg Hetzner missing backup (instance {{ $labels.instance }})'
description: 'Borg has not created a backup for more than 24 hours. Last backup made on {{ $value | humanizeTimestamp }}'
- alert: BorgOffsiteMissingBackup
expr: time() - borg_offsite_last_archive_timestamp > 86400 * 1.2
for: 2m
labels:
severity: critical
annotations:
summary: 'Borg Offsite missing backup (instance {{ $labels.instance }})'
description: 'Borg has not created a backup for more than 24 hours. Last backup made on {{ $value | humanizeTimestamp }}'
- name: systemd_unit
interval: 15s
rules:
- alert: systemd_unit_failed
expr: |
node_systemd_unit_state{state="failed"} > 0
for: 3m
labels:
severity: critical
annotations:
description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} failed'
summary: 'Systemd unit failed'
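# Flapping heuristic: more than 5 active-state changes within 5 minutes, or more
# than 15 within an hour unless the last 30 minutes were comparatively quiet
# (fewer than 7 changes).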
- alert: systemd_unit_flapping
expr: |
changes(node_systemd_unit_state{state="active"}[5m]) > 5 or (changes(node_systemd_unit_state{state="active"}[60m]) > 15 unless changes(node_systemd_unit_state{state="active"}[30m]) < 7)
labels:
severity: critical
annotations:
description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} flapping'
summary: 'Systemd unit flapping'
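# GitLab service alerts. The job_grpc:*, job_route_method_code:* and
# instance:*_utilization:ratio series are recording rules, assumed to be
# provided by the bundled GitLab monitoring configuration.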
- name: gitlab
interval: 15s
rules:
- alert: ServiceDown
expr: avg_over_time(up[5m]) * 100 < 50
annotations:
description: The service {{ $labels.job }} instance {{ $labels.instance }} is
not responding for more than 50% of the time for 5 minutes.
summary: The service {{ $labels.job }} is not responding
- alert: RedisDown
expr: avg_over_time(redis_up[5m]) * 100 < 50
annotations:
description: The Redis service {{ $labels.job }} instance {{ $labels.instance
}} is not responding for more than 50% of the time for 5 minutes.
summary: The Redis service {{ $labels.job }} is not responding
- alert: PostgresDown
expr: avg_over_time(pg_up[5m]) * 100 < 50
annotations:
description: The Postgres service {{ $labels.job }} instance {{ $labels.instance
}} is not responding for more than 50% of the time for 5 minutes.
summary: The Postgres service {{ $labels.job }} is not responding
- alert: UnicornQueueing
expr: avg_over_time(unicorn_queued_connections[30m]) > 1
annotations:
description: Unicorn instance {{ $labels.instance }} is queueing requests with
an average of {{ $value | printf "%.1f" }} over the last 30 minutes.
summary: Unicorn is queueing requests
- alert: PumaQueueing
expr: avg_over_time(puma_queued_connections[30m]) > 1
annotations:
description: Puma instance {{ $labels.instance }} is queueing requests with
an average of {{ $value | printf "%.1f" }} over the last 30 minutes.
summary: Puma is queueing requests
- alert: HighUnicornUtilization
expr: instance:unicorn_utilization:ratio * 100 > 90
for: 60m
annotations:
description: Unicorn instance {{ $labels.instance }} has more than 90% worker utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
summary: Unicorn has high utilization
- alert: HighPumaUtilization
expr: instance:puma_utilization:ratio * 100 > 90
for: 60m
annotations:
description: Puma instance {{ $labels.instance }} has more than 90% thread utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
summary: Puma has high utilization
- alert: SidekiqJobsQueuing
expr: sum by (name) (sidekiq_queue_size) > 0
for: 60m
annotations:
summary: Sidekiq has jobs queued
description: Sidekiq queue {{ $labels.name }} has {{ $value }} jobs queued for 60 minutes.
- alert: HighgRPCResourceExhaustedRate
expr: >
sum without (grpc_code) (
job_grpc:grpc_server_handled_total:rate5m{grpc_code="ResourceExhausted"}
) /
sum without (grpc_code) (
job_grpc:grpc_server_handled_total:rate5m
) * 100 > 1
for: 60m
annotations:
summary: High gRPC ResourceExhausted error rate
description: gRPC is returning more than 1% ({{ $value | printf "%.1f" }}%) ResourceExhausted errors over the last 60 minutes.
- alert: PostgresDatabaseDeadlocks
expr: increase(pg_stat_database_deadlocks[5m]) > 0
annotations:
summary: Postgres database has deadlocks
description: Postgres database {{ $labels.instance }} had {{ $value | printf "%.0f" }} deadlocks in the last 5 minutes.
- alert: PostgresDatabaseDeadlockCancels
expr: increase(pg_stat_database_deadlocks[5m]) > 0
annotations:
summary: Postgres database has queries canceled due to deadlocks
description: Postgres database {{ $labels.instance }} had {{ $value | printf "%.0f" }} queries canceled due to deadlocks in the last 5 minutes.
# Low-traffic - < 10 QPS (600 RPM)
- alert: WorkhorseHighErrorRate
expr: >
(
sum without (job, code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."}
) /
sum without (job,code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m
) < 10
) * 100 > 50
annotations:
summary: Workhorse has high error rates
description: Workhorse route {{ $labels.route }} method {{ $labels.method }} has more than 50% errors ({{ $value | printf "%.1f" }}%) for the last 60 minutes.
# High-traffic - >= 10 QPS (600 RPM)
- alert: WorkhorseHighErrorRate
expr: >
(
sum without (job, code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."}
) /
sum without (job,code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m
) > 10
) * 100 > 10
annotations:
summary: Workhorse has high error rates
description: Workhorse route {{ $labels.route }} method {{ $labels.method }} has more than 10% errors ({{ $value | printf "%.1f" }}%) for the last 60 minutes.
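# blackbox_exporter probes: alert on failed probes, HTTP responses outside
# 200-399, and TLS certificates expiring within 25 days.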
- name: blackbox
interval: 15s
rules:
- alert: BlackboxProbeFailed
expr: probe_success == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Blackbox probe failed (instance {{ $labels.instance }})"
description: "Probe failed\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: BlackboxProbeHttpFailure
expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
for: 5m
labels:
severity: critical
annotations:
summary: "Blackbox probe HTTP failure (instance {{ $labels.instance }})"
description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 25
for: 5m
labels:
severity: critical
annotations:
summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
description: "SSL certificate expires in less than 25 days\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
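# rebuilderd (reproducible-builds verification): alert on a persistent package
# queue backlog and on missing workers.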
- name: rebuilderd
interval: 15m
rules:
- alert: RebuilderdQueueNotEmpty
expr: rebuilderd_queue_length > 2000
for: 24h
labels:
severity: warning
service: rebuilderd
annotations:
summary: "Rebuilderd queue length is not empty (instance {{ $labels.instance }})"
description: "Rebuilderd's queue length is now: {{ $value }}"
- alert: RebuilderdWorkersOffline
expr: rebuilderd_workers < 3
for: 5m
labels:
severity: warning
service: rebuilderd
annotations:
summary: "Rebuilderd workers offline (instance {{ $labels.instance }})"
description: "Not all rebuilderd workers are online; currently {{ $value }} workers are online"