# roles/prometheus/files/node.rules.yml
groups:
- name: node_common
interval: 60s
rules:
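      # CPU usage here is 100 minus the per-instance average idle percentage
      # (irate over 10m). Build, repro and runner hosts are excluded,
      # presumably because full CPU load is normal for them.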
- alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle",instance!~"build.archlinux.org",instance!~"repro3.pkgbuild.com",instance!~"repro2.pkgbuild.com",instance!~"runner1.archlinux.org",instance!~"runner3.archlinux.org"}[10m])) * 100) > 90
for: 10m
labels:
severity: warning
annotations:
summary: "Host high CPU load (instance {{ $labels.instance }})"
description: "CPU load is > 90%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Host swap is filling up (instance {{ $labels.instance }})"
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of memory (instance {{ $labels.instance }})"
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
      ### Disabled because it only (incorrectly) triggered for america.mirror.pkgbuild.com
# - alert: HostUnusualNetworkThroughputOut
# expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
# for: 5m
# labels:
# severity: warning
# annotations:
# summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
# description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{fstype!="tmpfs",mountpoint!~"/backup.*"} * 100) / node_filesystem_size_bytes < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of disk space (instance {{ $labels.instance }})"
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostDiskWillFillIn24Hours
expr: (node_filesystem_avail_bytes{fstype!="tmpfs",mountpoint!~"/backup.*"} * 100) / node_filesystem_size_bytes < 10 and predict_linear(node_filesystem_avail_bytes[1h], 24 * 3600) < 0
for: 2m
labels:
severity: warning
annotations:
summary: "Host disk will fill in 24 hours (instance {{ $labels.instance }})"
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfInodes
expr: (node_filesystem_files_free{fstype!="tmpfs",mountpoint!~"/backup.*"} * 100) / node_filesystem_files < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of inodes (instance {{ $labels.instance }})"
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[5m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host OOM kill detected (instance {{ $labels.instance }})"
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- name: prometheus
interval: 60s
rules:
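      # `up` is set to 1 by Prometheus for every successful scrape and to 0 for
      # a failed one, so up == 0 held for 5m means a target has been
      # unreachable for five consecutive minutes.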
- alert: PrometheusTargetMissing
expr: up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus target missing (instance {{ $labels.instance }})"
description: "A Prometheus target {{ $value }} has disappeared. An exporter might have crashed."
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus too many restarts (instance {{ $labels.instance }})"
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusNotConnectedToAlertmanager
expr: prometheus_notifications_alertmanagers_discovered{instance!~"dashboards.archlinux.org"} < 1
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTemplateTextExpansionFailures
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus notifications backlog (instance {{ $labels.instance }})"
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 90
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus large scrape (instance {{ $labels.instance }})"
description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbCheckpointCreationFailures
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbWalTruncationsFailed
expr: increase(prometheus_tsdb_wal_truncations_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- name: pacman
interval: 2m
rules:
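      # The pacman_* metrics are not part of node_exporter; they presumably
      # come from a site-specific pacman exporter or textfile collector.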
- alert: pacman_updates_pending
expr: pacman_updates_pending > 50
for: 15m
labels:
severity: warning
service: pacman
annotations:
description: 'host {{ $labels.instance }} has out of date packages'
          summary: '{{ $labels.instance }} has {{ $value }} out of date packages (threshold: 50)'
- alert: pacman_security_updates_pending
expr: pacman_security_updates_pending > 0
for: 15m
labels:
severity: warning
service: pacman
annotations:
          description: 'host {{ $labels.instance }} has vulnerable packages'
summary: '{{ $labels.instance }} has {{ $value }} vulnerable packages'
- alert: pacman_orphans
expr: pacman_orphans > 0
for: 15m
labels:
severity: warning
service: pacman
annotations:
description: 'host {{ $labels.instance }} has orphan packages'
summary: '{{ $labels.instance }} has {{ $value }} orphan packages'
- alert: pacman_foreigns
expr: pacman_foreigns > 0
for: 15m
labels:
severity: warning
service: pacman
annotations:
description: 'host {{ $labels.instance }} has foreign packages'
summary: '{{ $labels.instance }} has {{ $value }} foreign packages'
- name: btrfs
interval: 2m
rules:
- alert: btrfs_corruption_errs
expr: btrfs_corruption_errs > 1
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} btrfs corruption errors'
summary: '{{ $labels.instance }} has {{ $value }} btrfs_corruption_errs'
- alert: btrfs_write_io_errs
expr: btrfs_write_io_errs > 1
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} btrfs write_io errors'
summary: '{{ $labels.instance }} has {{ $value }} btrfs_write_io_errs'
- alert: btrfs_read_io_errs
expr: btrfs_read_io_errs > 1
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} btrfs read_io errors'
summary: '{{ $labels.instance }} has {{ $value }} btrfs_read_io_errs'
- alert: btrfs_flush_io_errs
expr: btrfs_flush_io_errs > 1
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} btrfs flush_io errors'
summary: '{{ $labels.instance }} has {{ $value }} btrfs_flush_io_errs'
- name: smart
interval: 1m
rules:
- alert: smart_device_smart_healthy
expr: smart_device_smart_healthy == 0
for: 2m
labels:
severity: critical
service: smart
annotations:
description: 'host {{ $labels.instance }} has an unhealthy disk {{ $labels.disk }}'
summary: '{{ $labels.instance }} has an unhealthy disk {{ $labels.disk }}'
- alert: smart_device_self_test
expr: smart_device_self_test == 0
for: 2m
labels:
severity: critical
service: smart
annotations:
          description: 'host {{ $labels.instance }} has a disk {{ $labels.disk }} that is not passing its self-test'
summary: '{{ $labels.instance }} has an unhealthy disk {{ $labels.disk }}'
- name: borg
interval: 60s
rules:
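      # 86400 * 1.5 = 129600 s, i.e. the alert fires once the newest archive is
      # older than 36 hours; {{ $value }} is that age in seconds, rendered by
      # humanizeDuration in the description.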
- alert: BorgHetznerMissingBackup
expr: time() - borg_hetzner_last_archive_timestamp > 86400 * 1.5
for: 2m
labels:
severity: critical
annotations:
summary: 'Borg Hetzner missing backup (instance {{ $labels.instance }})'
          description: 'Borg has not completed a backup for more than 36 hours. Last backup was made {{ $value | humanizeDuration }} ago'
- alert: BorgOffsiteMissingBackup
expr: time() - borg_offsite_last_archive_timestamp > 86400 * 1.5
for: 2m
labels:
severity: critical
annotations:
summary: 'Borg Offsite missing backup (instance {{ $labels.instance }})'
          description: 'Borg has not completed a backup for more than 36 hours. Last backup was made {{ $value | humanizeDuration }} ago'
- name: systemd_unit
interval: 15s
rules:
- alert: systemd_unit_failed
expr: |
node_systemd_unit_state{state="failed"} > 0
for: 3m
labels:
severity: critical
annotations:
description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} failed'
summary: 'Systemd unit failed'
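      # Flapping heuristic: more than 5 active-state changes within 5 minutes,
      # or more than 15 within the last hour unless fewer than 7 happened in
      # the last 30 minutes (i.e. the flapping has not already died down).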
- alert: systemd_unit_flapping
expr: |
changes(node_systemd_unit_state{state="active"}[5m]) > 5 or (changes(node_systemd_unit_state{state="active"}[60m]) > 15 unless changes(node_systemd_unit_state{state="active"}[30m]) < 7)
labels:
severity: critical
annotations:
description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} flapping'
summary: 'Systemd unit flapping'
- name: gitlab
interval: 15s
rules:
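      # avg_over_time(up[10m]) is the availability ratio over the window; a
      # value below 0.5 means the target failed more than half of its scrapes
      # in the last 10 minutes.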
- alert: ServiceDown
expr: avg_over_time(up[10m]) * 100 < 50
annotations:
          description: The service {{ $labels.job }} instance {{ $labels.instance }} is
            not responding for more than 50% of the time over the last 10 minutes.
summary: The service {{ $labels.job }} is not responding
- alert: RedisDown
expr: avg_over_time(redis_up[5m]) * 100 < 50
annotations:
description: The Redis service {{ $labels.job }} instance {{ $labels.instance
}} is not responding for more than 50% of the time for 5 minutes.
summary: The Redis service {{ $labels.job }} is not responding
- alert: PostgresDown
expr: avg_over_time(pg_up[5m]) * 100 < 50
annotations:
description: The Postgres service {{ $labels.job }} instance {{ $labels.instance
}} is not responding for more than 50% of the time for 5 minutes.
summary: The Postgres service {{ $labels.job }} is not responding
- alert: UnicornQueueing
expr: avg_over_time(unicorn_queued_connections[30m]) > 1
annotations:
description: Unicorn instance {{ $labels.instance }} is queueing requests with
an average of {{ $value | printf "%.1f" }} over the last 30 minutes.
summary: Unicorn is queueing requests
- alert: PumaQueueing
expr: avg_over_time(puma_queued_connections[30m]) > 1
annotations:
description: Puma instance {{ $labels.instance }} is queueing requests with
an average of {{ $value | printf "%.1f" }} over the last 30 minutes.
summary: Puma is queueing requests
- alert: HighUnicornUtilization
expr: instance:unicorn_utilization:ratio * 100 > 90
for: 60m
annotations:
description: Unicorn instance {{ $labels.instance }} has more than 90% worker utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
          summary: Unicorn has high utilization
- alert: HighPumaUtilization
expr: instance:puma_utilization:ratio * 100 > 90
for: 60m
annotations:
description: Puma instance {{ $labels.instance }} has more than 90% thread utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
          summary: Puma has high utilization
- alert: SidekiqJobsQueuing
expr: sum by (name) (sidekiq_queue_size) > 0
for: 60m
annotations:
summary: Sidekiq has jobs queued
description: Sidekiq queue {{ $labels.name }} has {{ $value }} jobs queued for 60 minutes.
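      # The job_grpc:...:rate5m series follow the level:metric:operation
      # recording-rule naming convention; the recording rules themselves are
      # assumed to be defined elsewhere (presumably GitLab's bundled rules).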
- alert: HighgRPCResourceExhaustedRate
expr: >
sum without (grpc_code) (
job_grpc:grpc_server_handled_total:rate5m{grpc_code="ResourceExhausted"}
) /
sum without (grpc_code) (
job_grpc:grpc_server_handled_total:rate5m
) * 100 > 1
for: 60m
annotations:
summary: High gRPC ResourceExhausted error rate
description: gRPC is returning more than 1% ({{ $value | printf "%.1f" }}%) ResourceExhausted errors over the last 60 minutes.
- alert: PostgresDatabaseDeadlocks
expr: increase(pg_stat_database_deadlocks[5m]) > 0
annotations:
summary: Postgres database has deadlocks
          description: Postgres database {{ $labels.instance }} had {{ $value | printf "%.0f" }} deadlocks in the last 5 minutes.
- alert: PostgresDatabaseDeadlockCancels
expr: increase(pg_stat_database_deadlocks[5m]) > 0
annotations:
summary: Postgres database has queries canceled due to deadlocks
          description: Postgres database {{ $labels.instance }} had {{ $value | printf "%.0f" }} queries canceled due to deadlocks in the last 5 minutes.
# Low-traffic - < 10 QPS (600 RPM)
- alert: WorkhorseHighErrorRate
expr: >
(
sum without (job, code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."}
) /
sum without (job,code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m
) < 10
) * 100 > 50
annotations:
summary: Workhorse has high error rates
description: Workhorse route {{ $labels.route }} method {{ $labels.method }} has more than 50% errors ({{ $value | printf "%.1f" }}%) for the last 60 minutes.
# High-traffic - >= 10 QPS (600 RPM)
- alert: WorkhorseHighErrorRate
expr: >
(
sum without (job, code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."}
) /
sum without (job,code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m
) > 10
) * 100 > 10
annotations:
summary: Workhorse has high error rates
description: Workhorse route {{ $labels.route }} method {{ $labels.method }} has more than 10% errors ({{ $value | printf "%.1f" }}%) for the last 60 minutes.
- name: blackbox
interval: 15s
rules:
- alert: BlackboxProbeFailed
expr: probe_success == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Blackbox probe failed (instance {{ $labels.instance }})"
description: "Probe failed\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: BlackboxProbeHttpFailure
        expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
for: 5m
labels:
severity: critical
annotations:
summary: "Blackbox probe HTTP failure (instance {{ $labels.instance }})"
description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: probe_ssl_earliest_cert_expiry and probe_ssl_earliest_cert_expiry - time() < 86400 * 25
for: 5m
labels:
severity: critical
annotations:
summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
description: "SSL certificate expires in 25 days\n VALUE = {{ $value | humanizeTimestamp}}\n LABELS: {{ $labels }}"
- name: rebuilderd
interval: 15m
rules:
- alert: RebuilderdQueueNotEmpty
expr: rebuilderd_queue_length > 2000
for: 24h
labels:
severity: warning
service: rebuilderd
annotations:
summary: "Rebuilderd queue length is not empty {{ $labels.instance }})"
description: "Rebuilderd's queue length is now: {{ $value }}"
- alert: RebuilderdWorkersOffline
expr: rebuilderd_workers < 3
for: 5m
labels:
severity: warning
service: rebuilderd
annotations:
summary: "Rebuilderd workers offline {{ $labels.instance }})"
description: "Not all rebuilder-workers are online, currently {{ $value }} workers are online"