groups:
- name: node_common
interval: 60s
rules:
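      # CPU usage is derived from the idle counter: 100 minus the per-instance average
      # idle percentage over a 10 minute irate window. The build, repro and runner hosts
      # are excluded, presumably because sustained full CPU load is normal there.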
- alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle",instance!~"build.archlinux.org",instance!~"repro3.pkgbuild.com",instance!~"repro2.pkgbuild.com",instance!~"runner1.archlinux.org",instance!~"runner3.archlinux.org"}[10m])) * 100) > 90
for: 10m
labels:
severity: warning
annotations:
summary: "Host high CPU load (instance {{ $labels.instance }})"
description: "CPU load is > 90%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Host swap is filling up (instance {{ $labels.instance }})"
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of memory (instance {{ $labels.instance }})"
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
### Disabled due to only (incorrectly) triggering for america.mirror.pkgbuild.com
# - alert: HostUnusualNetworkThroughputOut
# expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
# for: 5m
# labels:
# severity: warning
# annotations:
# summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
# description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{fstype!="tmpfs",mountpoint!~"/backup.*"} * 100) / node_filesystem_size_bytes < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of disk space (instance {{ $labels.instance }})"
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostDiskWillFillIn24Hours
expr: (node_filesystem_avail_bytes{fstype!="tmpfs",mountpoint!~"/backup.*"} * 100) / node_filesystem_size_bytes < 10 and predict_linear(node_filesystem_avail_bytes[1h], 24 * 3600) < 0
for: 2m
labels:
severity: warning
annotations:
summary: "Host disk will fill in 24 hours (instance {{ $labels.instance }})"
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfInodes
expr: (node_filesystem_files_free{fstype!="tmpfs",mountpoint!~"/backup.*"} * 100) / node_filesystem_files < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of inodes (instance {{ $labels.instance }})"
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[5m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host OOM kill detected (instance {{ $labels.instance }})"
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- name: prometheus
interval: 60s
rules:
- alert: PrometheusTargetMissing
expr: up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus target missing (instance {{ $labels.instance }})"
description: "A Prometheus target {{ $value }} has disappeared. An exporter might have crashed."
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus too many restarts (instance {{ $labels.instance }})"
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusNotConnectedToAlertmanager
expr: prometheus_notifications_alertmanagers_discovered{instance!~"dashboards.archlinux.org"} < 1
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTemplateTextExpansionFailures
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus notifications backlog (instance {{ $labels.instance }})"
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 90
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus large scrape (instance {{ $labels.instance }})"
description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbCheckpointCreationFailures
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbWalTruncationsFailed
expr: increase(prometheus_tsdb_wal_truncations_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- name: pacman
interval: 2m
rules:
- alert: pacman_updates_pending
expr: pacman_updates_pending > 50
for: 15m
labels:
severity: warning
service: pacman
annotations:
          description: 'host {{ $labels.instance }} has out-of-date packages'
          summary: '{{ $labels.instance }} has {{ $value }} (> 50) out-of-date packages'
- alert: pacman_security_updates_pending
expr: pacman_security_updates_pending > 0
for: 15m
labels:
severity: warning
service: pacman
annotations:
          description: 'host {{ $labels.instance }} has vulnerable packages'
summary: '{{ $labels.instance }} has {{ $value }} vulnerable packages'
- alert: pacman_orphans
expr: pacman_orphans > 0
for: 15m
labels:
severity: warning
service: pacman
annotations:
description: 'host {{ $labels.instance }} has orphan packages'
summary: '{{ $labels.instance }} has {{ $value }} orphan packages'
- alert: pacman_foreigns
expr: pacman_foreigns > 0
for: 15m
labels:
severity: warning
service: pacman
annotations:
description: 'host {{ $labels.instance }} has foreign packages'
summary: '{{ $labels.instance }} has {{ $value }} foreign packages'
- name: btrfs
interval: 2m
rules:
- alert: btrfs_corruption_errs
expr: btrfs_corruption_errs > 1
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} btrfs corruption errors'
summary: '{{ $labels.instance }} has {{ $value }} btrfs_corruption_errs'
- alert: btrfs_write_io_errs
expr: btrfs_write_io_errs > 1
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} btrfs write_io errors'
summary: '{{ $labels.instance }} has {{ $value }} btrfs_write_io_errs'
- alert: btrfs_read_io_errs
expr: btrfs_read_io_errs > 1
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} btrfs read_io errors'
summary: '{{ $labels.instance }} has {{ $value }} btrfs_read_io_errs'
- alert: btrfs_flush_io_errs
expr: btrfs_flush_io_errs > 1
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} btrfs flush_io errors'
summary: '{{ $labels.instance }} has {{ $value }} btrfs_flush_io_errs'
- name: smart
interval: 1m
rules:
- alert: smart_device_smart_healthy
expr: smart_device_smart_healthy == 0
for: 2m
labels:
severity: critical
service: smart
annotations:
description: 'host {{ $labels.instance }} has an unhealthy disk {{ $labels.disk }}'
summary: '{{ $labels.instance }} has an unhealthy disk {{ $labels.disk }}'
- alert: smart_device_self_test
expr: smart_device_self_test == 0
for: 2m
labels:
severity: critical
service: smart
annotations:
          description: 'host {{ $labels.instance }} has a disk that is not passing its self-test'
summary: '{{ $labels.instance }} has an unhealthy disk {{ $labels.disk }}'
- name: borg
interval: 60s
rules:
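      # 86400 * 1.5 = 36 hours: fire when the newest Borg archive is more than a day and
      # a half old, leaving some slack on top of what is presumably a daily backup schedule.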
- alert: BorgHetznerMissingBackup
expr: time() - borg_hetzner_last_archive_timestamp > 86400 * 1.5
for: 2m
labels:
severity: critical
annotations:
summary: 'Borg Hetzner missing backup (instance {{ $labels.instance }})'
          description: 'Borg has not created a backup for more than 36 hours. Last backup was made {{ $value | humanizeDuration }} ago'
- alert: BorgOffsiteMissingBackup
expr: time() - borg_offsite_last_archive_timestamp > 86400 * 1.5
for: 2m
labels:
severity: critical
annotations:
summary: 'Borg Offsite missing backup (instance {{ $labels.instance }})'
          description: 'Borg has not created a backup for more than 36 hours. Last backup was made {{ $value | humanizeDuration }} ago'
- name: systemd_unit
interval: 15s
rules:
- alert: systemd_unit_failed
expr: |
node_systemd_unit_state{state="failed"} > 0
for: 3m
labels:
severity: critical
annotations:
description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} failed'
summary: 'Systemd unit failed'
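      # Flapping heuristic: more than 5 active-state changes within 5 minutes, or more
      # than 15 within the last hour unless fewer than 7 happened in the last 30 minutes
      # (i.e. the flapping has already calmed down).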
- alert: systemd_unit_flapping
expr: |
changes(node_systemd_unit_state{state="active"}[5m]) > 5 or (changes(node_systemd_unit_state{state="active"}[60m]) > 15 unless changes(node_systemd_unit_state{state="active"}[30m]) < 7)
labels:
severity: critical
annotations:
description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} flapping'
summary: 'Systemd unit flapping'
- name: gitlab
interval: 15s
rules:
- alert: ServiceDown
expr: avg_over_time(up[10m]) * 100 < 50
annotations:
          description: The service {{ $labels.job }} instance {{ $labels.instance }} is
            not responding for more than 50% of the time for 10 minutes.
summary: The service {{ $labels.job }} is not responding
- alert: RedisDown
expr: avg_over_time(redis_up[5m]) * 100 < 50
annotations:
description: The Redis service {{ $labels.job }} instance {{ $labels.instance
}} is not responding for more than 50% of the time for 5 minutes.
summary: The Redis service {{ $labels.job }} is not responding
- alert: PostgresDown
expr: avg_over_time(pg_up[5m]) * 100 < 50
annotations:
description: The Postgres service {{ $labels.job }} instance {{ $labels.instance
}} is not responding for more than 50% of the time for 5 minutes.
summary: The Postgres service {{ $labels.job }} is not responding
- alert: UnicornQueueing
expr: avg_over_time(unicorn_queued_connections[30m]) > 1
annotations:
description: Unicorn instance {{ $labels.instance }} is queueing requests with
an average of {{ $value | printf "%.1f" }} over the last 30 minutes.
summary: Unicorn is queueing requests
- alert: PumaQueueing
expr: avg_over_time(puma_queued_connections[30m]) > 1
annotations:
description: Puma instance {{ $labels.instance }} is queueing requests with
an average of {{ $value | printf "%.1f" }} over the last 30 minutes.
summary: Puma is queueing requests
- alert: HighUnicornUtilization
expr: instance:unicorn_utilization:ratio * 100 > 90
for: 60m
annotations:
description: Unicorn instance {{ $labels.instance }} has more than 90% worker utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
          summary: Unicorn has high utilization
- alert: HighPumaUtilization
expr: instance:puma_utilization:ratio * 100 > 90
for: 60m
annotations:
description: Puma instance {{ $labels.instance }} has more than 90% thread utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
          summary: Puma has high utilization
- alert: SidekiqJobsQueuing
expr: sum by (name) (sidekiq_queue_size) > 0
for: 60m
annotations:
summary: Sidekiq has jobs queued
description: Sidekiq queue {{ $labels.name }} has {{ $value }} jobs queued for 60 minutes.
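      # Ratio of gRPC responses with code ResourceExhausted to all handled gRPC requests,
      # based on the job_grpc:grpc_server_handled_total:rate5m recording rule; alert when
      # it stays above 1% for an hour.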
- alert: HighgRPCResourceExhaustedRate
expr: >
sum without (grpc_code) (
job_grpc:grpc_server_handled_total:rate5m{grpc_code="ResourceExhausted"}
) /
sum without (grpc_code) (
job_grpc:grpc_server_handled_total:rate5m
) * 100 > 1
for: 60m
annotations:
summary: High gRPC ResourceExhausted error rate
description: gRPC is returning more than 1% ({{ $value | printf "%.1f" }}%) ResourceExhausted errors over the last 60 minutes.
- alert: PostgresDatabaseDeadlocks
expr: increase(pg_stat_database_deadlocks[5m]) > 0
annotations:
summary: Postgres database has deadlocks
          description: Postgres database {{ $labels.instance }} had {{ $value | printf "%.0f" }} deadlocks in the last 5 minutes.
- alert: PostgresDatabaseDeadlockCancels
expr: increase(pg_stat_database_deadlocks[5m]) > 0
annotations:
summary: Postgres database has queries canceled due to deadlocks
          description: Postgres database {{ $labels.instance }} had {{ $value | printf "%.0f" }} queries canceled due to deadlocks in the last 5 minutes.
# Low-traffic - < 10 QPS (600 RPM)
- alert: WorkhorseHighErrorRate
expr: >
(
sum without (job, code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."}
) /
sum without (job,code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m
) < 10
) * 100 > 50
annotations:
summary: Workhorse has high error rates
description: Workhorse route {{ $labels.route }} method {{ $labels.method }} has more than 50% errors ({{ $value | printf "%.1f" }}%) for the last 60 minutes.
# High-traffic - >= 10 QPS (600 RPM)
- alert: WorkhorseHighErrorRate
expr: >
(
sum without (job, code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."}
) /
sum without (job,code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m
) > 10
) * 100 > 10
annotations:
summary: Workhorse has high error rates
description: Workhorse route {{ $labels.route }} method {{ $labels.method }} has more than 10% errors ({{ $value | printf "%.1f" }}%) for the last 60 minutes.
- name: blackbox
interval: 15s
rules:
- alert: BlackboxProbeFailed
expr: probe_success == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Blackbox probe failed (instance {{ $labels.instance }})"
description: "Probe failed\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: BlackboxProbeHttpFailure
expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
for: 5m
labels:
severity: critical
annotations:
summary: "Blackbox probe HTTP failure (instance {{ $labels.instance }})"
description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: probe_ssl_earliest_cert_expiry and probe_ssl_earliest_cert_expiry - time() < 86400 * 25
for: 5m
labels:
severity: critical
annotations:
summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
description: "SSL certificate expires in 25 days\n VALUE = {{ $value | humanizeTimestamp}}\n LABELS: {{ $labels }}"
- name: rebuilderd
interval: 15m
rules:
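      # Tolerate a long queue: only alert once it has stayed above 2000 entries for a
      # full 24 hours.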
- alert: RebuilderdQueueNotEmpty
expr: rebuilderd_queue_length > 2000
for: 24h
labels:
severity: warning
service: rebuilderd
annotations:
summary: "Rebuilderd queue length is not empty {{ $labels.instance }})"
description: "Rebuilderd's queue length is now: {{ $value }}"
- alert: RebuilderdWorkersOffline
expr: rebuilderd_workers < 3
for: 5m
labels:
severity: warning
service: rebuilderd
annotations:
summary: "Rebuilderd workers offline {{ $labels.instance }})"
description: "Not all rebuilder-workers are online, currently {{ $value }} workers are online"
- name: fastly
interval: 60m
rules:
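      # If the month-to-date estimate exceeds $0, we have run out of credit and/or are
      # doing something wrong; in either case we want to be alerted.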
- alert: FastlyCostNotZero
expr: max_over_time(fastly_mtd_estimate_dollars_total[2h]) > 0
labels:
severity: warning
service: fastly
annotations:
summary: "Fastly is consuming real money"
description: "Fastly month-to-date billing estimate is now: {{ $value }}"