mirror of
https://gitlab.archlinux.org/archlinux/infrastructure.git
synced 2024-05-05 14:06:04 +02:00
4112bdf9fd
yaml: truthy value should be one of [false, true] (truthy) yaml: wrong indentation: expected 4 but found 2 (indentation) yaml: too few spaces before comment (comments) yaml: missing starting space in comment (comments) yaml: too many blank lines (1 > 0) (empty-lines) yaml: too many spaces after colon (colons) yaml: comment not indented like content (comments-indentation) yaml: no new line character at the end of file (new-line-at-end-of-file) load-failure: Failed to load or parse file parser-error: couldn't resolve module/action 'hosts'. This often indicates a misspelling, missing collection, or incorrect module path.
453 lines
20 KiB
YAML
453 lines
20 KiB
YAML
groups:
# Host-level resource alerts built on node_* metrics. The group is evaluated
# every 60s and every rule must hold for 5 minutes before it fires.
- name: node_common
  interval: 60s
  rules:
  # Busy CPU percentage, computed as 100 minus the idle percentage averaged per
  # instance. Three build/repro hosts are excluded by instance name.
  - alert: HostHighCpuLoad
    expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle",instance!~"build.archlinux.org",instance!~"repro1.pkgbuild.com",instance!~"repro2.pkgbuild.com"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host high CPU load (instance {{ $labels.instance }})"
      description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # Used swap as a percentage of total swap.
  - alert: HostSwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host swap is filling up (instance {{ $labels.instance }})"
      description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # Available memory as a percentage of total memory.
  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host out of memory (instance {{ $labels.instance }})"
      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # More than 1000 major page faults per second, measured over a 1m window.
  - alert: HostMemoryUnderMemoryPressure
    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
      description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # Received bytes/s summed per instance; / 1024 / 1024 converts to MiB/s.
  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # Transmitted bytes/s summed per instance; / 1024 / 1024 converts to MiB/s.
  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # Free space percentage on the filesystem mounted at /rootfs.
  # NOTE(review): this only matches series whose mountpoint label is exactly
  # "/rootfs" - confirm the exporter reports the root filesystem under that name.
  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host out of disk space (instance {{ $labels.instance }})"
      # The text now states the same 10% threshold that the expression uses.
      description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # Linear extrapolation of the last hour of free bytes, 4 * 3600 seconds ahead.
  # tmpfs filesystems and the /backup mountpoint are excluded.
  - alert: HostDiskWillFillIn4Hours
    expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs",mountpoint!~"/backup"}[1h], 4 * 3600) < 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})"
      description: "Disk will fill in 4 hours at current write rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # Free inode percentage on the filesystem mounted at /rootfs.
  # NOTE(review): same "/rootfs" mountpoint assumption as HostOutOfDiskSpace.
  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint ="/rootfs"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host out of inodes (instance {{ $labels.instance }})"
      description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # The OOM-kill counter increased within the last 5 minutes.
  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[5m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host OOM kill detected (instance {{ $labels.instance }})"
      description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# Self-monitoring of the Prometheus / Alertmanager stack. The group is evaluated
# every 60s and every rule must hold for 5 minutes before it fires.
- name: prometheus
  interval: 60s
  rules:
  # Any scrape target whose `up` series is 0.
  # NOTE(review): the expression only returns samples equal to 0, so
  # {{ $value }} in the description always renders as 0.
  - alert: PrometheusTargetMissing
    expr: up == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus target missing (instance {{ $labels.instance }})"
      description: "A Prometheus target {{ $value }} has disappeared. An exporter might have crashed."

  # Process start time changed more than twice in 15 minutes for the
  # prometheus, pushgateway or alertmanager jobs.
  - alert: PrometheusTooManyRestarts
    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus too many restarts (instance {{ $labels.instance }})"
      description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # No Alertmanager instance has been discovered.
  - alert: PrometheusNotConnectedToAlertmanager
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
      description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # The rule-evaluation failure counter grew within the last 3 minutes.
  - alert: PrometheusRuleEvaluationFailures
    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # The template text expansion failure counter grew within the last 3 minutes.
  - alert: PrometheusTemplateTextExpansionFailures
    expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # The minimum queue length over 10 minutes is above 0, i.e. the queue was
  # never empty during that window.
  - alert: PrometheusNotificationsBacklog
    expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus notifications backlog (instance {{ $labels.instance }})"
      description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # Alertmanager's failed-notification counter is increasing.
  - alert: PrometheusAlertmanagerNotificationFailing
    expr: rate(alertmanager_notifications_failed_total[1m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
      description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # 90th percentile of the interval between scrapes exceeds 60 seconds.
  - alert: PrometheusTargetScrapingSlow
    expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
      description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # More than 10 scrapes exceeded the sample limit within 10 minutes.
  - alert: PrometheusLargeScrape
    expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus large scrape (instance {{ $labels.instance }})"
      description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # The four TSDB rules below share one shape: a failure/corruption counter
  # that grew within the last 3 minutes.
  - alert: PrometheusTsdbCheckpointCreationFailures
    expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  - alert: PrometheusTsdbCompactionsFailed
    expr: increase(prometheus_tsdb_compactions_failed_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  - alert: PrometheusTsdbWalCorruptions
    expr: increase(prometheus_tsdb_wal_corruptions_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  - alert: PrometheusTsdbWalTruncationsFailed
    expr: increase(prometheus_tsdb_wal_truncations_failed_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# Package-update backlog, evaluated every 2 minutes.
- name: pacman
  interval: 2m
  rules:
  # Fires once the pacman_updates_pending value has stayed above 50 for
  # 15 minutes.
  - alert: pacman_updates_pending
    expr: pacman_updates_pending > 50
    for: 15m
    labels:
      severity: warning
    annotations:
      description: 'host {{ $labels.instance }} has out of date packages'
      summary: '{{ $labels.instance }} has {{ $value }} > 50 out of date packages'
# Btrfs error counters, evaluated every 2 minutes. Each rule fires when its
# counter has been above 1 for 15 minutes.
- name: btrfs
  interval: 2m
  rules:
  - alert: btrfs_corruption_errs
    expr: btrfs_corruption_errs > 1
    for: 15m
    labels:
      severity: warning
    annotations:
      description: 'host {{ $labels.instance }} btrfs corruption errors'
      summary: '{{ $labels.instance }} has {{ $value }} btrfs_corruption_errs'

  - alert: btrfs_write_io_errs
    expr: btrfs_write_io_errs > 1
    for: 15m
    labels:
      severity: warning
    annotations:
      description: 'host {{ $labels.instance }} btrfs write_io errors'
      summary: '{{ $labels.instance }} has {{ $value }} btrfs_write_io_errs'

  - alert: btrfs_read_io_errs
    expr: btrfs_read_io_errs > 1
    for: 15m
    labels:
      severity: warning
    annotations:
      description: 'host {{ $labels.instance }} btrfs read_io errors'
      summary: '{{ $labels.instance }} has {{ $value }} btrfs_read_io_errs'

  - alert: btrfs_flush_io_errs
    expr: btrfs_flush_io_errs > 1
    for: 15m
    labels:
      severity: warning
    annotations:
      description: 'host {{ $labels.instance }} btrfs flush_io errors'
      summary: '{{ $labels.instance }} has {{ $value }} btrfs_flush_io_errs'

  # This rule used to be a second, identical copy of btrfs_corruption_errs.
  # It now covers generation errors, the one `btrfs device stats` counter the
  # group did not alert on.
  # NOTE(review): assumes the exporter publishes btrfs_generation_errs alongside
  # the other four counters - confirm before deploying.
  - alert: btrfs_generation_errs
    expr: btrfs_generation_errs > 1
    for: 15m
    labels:
      severity: warning
    annotations:
      description: 'host {{ $labels.instance }} btrfs generation errors'
      summary: '{{ $labels.instance }} has {{ $value }} btrfs_generation_errs'
# Backup freshness, evaluated every 60s. Both rules compare the age of the
# newest archive with 86400 * 1.2 seconds (24h plus 20% slack = 28.8h) and
# fire after 2 minutes.
- name: borg
  interval: 60s
  rules:
  # $value is `time() - last_archive_timestamp`, i.e. an age in seconds, so the
  # description renders it with humanizeDuration. humanizeTimestamp would treat
  # that age as a Unix timestamp and print a date close to 1970.
  - alert: BorgHetznerMissingBackup
    expr: time() - borg_hetzner_last_archive_timestamp > 86400 * 1.2
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Borg Hetzner missing backup (instance {{ $labels.instance }})'
      description: 'Borg has not backed up for more than 24 hours. Last backup made {{ $value | humanizeDuration }} ago'

  # Same check for the offsite repository.
  - alert: BorgOffsiteMissingBackup
    expr: time() - borg_offsite_last_archive_timestamp > 86400 * 1.2
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Borg Offsite missing backup (instance {{ $labels.instance }})'
      description: 'Borg has not backed up for more than 24 hours. Last backup made {{ $value | humanizeDuration }} ago'
# systemd unit health, evaluated every 15s.
- name: systemd_unit
  interval: 15s
  rules:
  # A unit has reported state="failed" for 3 minutes.
  - alert: systemd_unit_failed
    expr: |
      node_systemd_unit_state{state="failed"} > 0
    for: 3m
    labels:
      severity: critical
    annotations:
      description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} failed'
      summary: 'Systemd unit failed'

  # The "active" state series changed more than 5 times in 5 minutes, or more
  # than 15 times in 60 minutes unless it changed fewer than 7 times in the
  # last 30 minutes. There is no `for:`, so this fires on the first matching
  # evaluation.
  - alert: systemd_unit_flapping
    expr: |
      changes(node_systemd_unit_state{state="active"}[5m]) > 5 or (changes(node_systemd_unit_state{state="active"}[60m]) > 15 unless changes(node_systemd_unit_state{state="active"}[30m]) < 7)
    labels:
      severity: critical
    annotations:
      description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} flapping'
      summary: 'Systemd unit flapping'
# GitLab service alerts, evaluated every 15s.
# NOTE(review): unlike the other groups in this file, none of these rules sets
# a `severity` label - confirm the Alertmanager routing handles that.
- name: gitlab
  interval: 15s
  rules:
  # Average of `up` over 5 minutes below 50%.
  - alert: ServiceDown
    expr: avg_over_time(up[5m]) * 100 < 50
    annotations:
      description: >-
        The service {{ $labels.job }} instance {{ $labels.instance }} is
        not responding for more than 50% of the time for 5 minutes.
      summary: The service {{ $labels.job }} is not responding

  - alert: RedisDown
    expr: avg_over_time(redis_up[5m]) * 100 < 50
    annotations:
      description: >-
        The Redis service {{ $labels.job }} instance {{ $labels.instance }}
        is not responding for more than 50% of the time for 5 minutes.
      summary: The Redis service {{ $labels.job }} is not responding

  - alert: PostgresDown
    expr: avg_over_time(pg_up[5m]) * 100 < 50
    annotations:
      description: >-
        The Postgres service {{ $labels.job }} instance {{ $labels.instance }}
        is not responding for more than 50% of the time for 5 minutes.
      summary: The Postgres service {{ $labels.job }} is not responding

  # Average number of queued connections over 30 minutes is above 1.
  - alert: UnicornQueueing
    expr: avg_over_time(unicorn_queued_connections[30m]) > 1
    annotations:
      description: >-
        Unicorn instance {{ $labels.instance }} is queueing requests with
        an average of {{ $value | printf "%.1f" }} over the last 30 minutes.
      summary: Unicorn is queueing requests

  - alert: PumaQueueing
    expr: avg_over_time(puma_queued_connections[30m]) > 1
    annotations:
      description: >-
        Puma instance {{ $labels.instance }} is queueing requests with
        an average of {{ $value | printf "%.1f" }} over the last 30 minutes.
      summary: Puma is queueing requests

  # Utilization ratio above 90% sustained for 60 minutes.
  - alert: HighUnicornUtilization
    expr: instance:unicorn_utilization:ratio * 100 > 90
    for: 60m
    annotations:
      description: >-
        Unicorn instance {{ $labels.instance }} has more than 90% worker
        utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
      summary: Unicorn has high utilization

  - alert: HighPumaUtilization
    expr: instance:puma_utilization:ratio * 100 > 90
    for: 60m
    annotations:
      description: >-
        Puma instance {{ $labels.instance }} has more than 90% thread
        utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
      summary: Puma has high utilization

  # A named Sidekiq queue has been non-empty for 60 minutes.
  - alert: SidekiqJobsQueuing
    expr: sum by (name) (sidekiq_queue_size) > 0
    for: 60m
    annotations:
      summary: Sidekiq has jobs queued
      description: Sidekiq queue {{ $labels.name }} has {{ $value }} jobs queued for 60 minutes.

  # Share of gRPC calls finishing with ResourceExhausted, in percent.
  - alert: HighgRPCResourceExhaustedRate
    expr: >
      sum without (grpc_code) (
        job_grpc:grpc_server_handled_total:rate5m{grpc_code="ResourceExhausted"}
      ) /
      sum without (grpc_code) (
        job_grpc:grpc_server_handled_total:rate5m
      ) * 100 > 1
    for: 60m
    annotations:
      summary: High gRPC ResourceExhausted error rate
      description: >-
        gRPC is returning more than 1% ({{ $value | printf "%.1f" }}%)
        ResourceExhausted errors over the last 60 minutes.

  - alert: PostgresDatabaseDeadlocks
    expr: increase(pg_stat_database_deadlocks[5m]) > 0
    annotations:
      summary: Postgres database has deadlocks
      description: >-
        Postgres database {{ $labels.instance }} had
        {{ $value | printf "%d" }} deadlocks in the last 5 minutes.

  # This rule used to repeat the PostgresDatabaseDeadlocks expression, so the
  # two alerts always fired together. It now watches the counter of queries
  # canceled because of deadlocks, which is what the annotations describe.
  # NOTE(review): confirm pg_stat_database_conflicts_confl_deadlock is exported
  # by the Postgres exporter in use.
  - alert: PostgresDatabaseDeadlockCancels
    expr: increase(pg_stat_database_conflicts_confl_deadlock[5m]) > 0
    annotations:
      summary: Postgres database has queries canceled due to deadlocks
      description: >-
        Postgres database {{ $labels.instance }} had
        {{ $value | printf "%d" }} queries canceled due to deadlocks in the
        last 5 minutes.

  # Low-traffic - < 10 QPS (600 RPM)
  # The traffic filter is parenthesised onto the denominator. `/` binds tighter
  # than `<`, so without the parentheses the comparison applied to the error
  # ratio (which never exceeds 1) and was always true.
  # NOTE(review): the description says "60 minutes" but the rule has no `for:`
  # and is based on a 5m rate - confirm the intended duration.
  - alert: WorkhorseHighErrorRate
    expr: >
      (
        sum without (job, code) (
          job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."}
        ) /
        (
          sum without (job, code) (
            job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m
          ) < 10
        )
      ) * 100 > 50
    annotations:
      summary: Workhorse has high error rates
      description: >-
        Workhorse route {{ $labels.route }} method {{ $labels.method }} has
        more than 50% errors ({{ $value | printf "%.1f" }}%) for the last
        60 minutes.

  # High-traffic - >= 10 QPS (600 RPM)
  # Same parenthesisation fix as the low-traffic rule. The unparenthesised
  # `> 10` compared the error ratio against 10 and therefore never matched.
  # `>=` makes the two rules cover all traffic levels, matching this comment.
  - alert: WorkhorseHighErrorRate
    expr: >
      (
        sum without (job, code) (
          job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."}
        ) /
        (
          sum without (job, code) (
            job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m
          ) >= 10
        )
      ) * 100 > 10
    annotations:
      summary: Workhorse has high error rates
      description: >-
        Workhorse route {{ $labels.route }} method {{ $labels.method }} has
        more than 10% errors ({{ $value | printf "%.1f" }}%) for the last
        60 minutes.
# Probe alerts, evaluated every 15s. Every rule must hold for 5 minutes before
# it fires.
- name: blackbox
  interval: 15s
  rules:
  # The probe reported failure (probe_success is 0).
  - alert: BlackboxProbeFailed
    expr: probe_success == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Blackbox probe failed (instance {{ $labels.instance }})"
      description: "Probe failed\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # HTTP status outside the 200-399 range.
  - alert: BlackboxProbeHttpFailure
    expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Blackbox probe HTTP failure (instance {{ $labels.instance }})"
      description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # Time until the earliest certificate expiry is under 86400 * 25 seconds
  # (25 days).
  - alert: BlackboxSslCertificateWillExpireSoon
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 25
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
      description: "SSL certificate expires in 25 days\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# rebuilderd queue and worker alerts, evaluated every 15 minutes. Both rules
# carry an extra `service: rebuilderd` label.
- name: rebuilderd
  interval: 15m
  rules:
  # Queue length has stayed above 2000 for 24 hours.
  - alert: RebuilderdQueueNotEmpty
    expr: rebuilderd_queue_length > 2000
    for: 24h
    labels:
      severity: warning
      service: rebuilderd
    annotations:
      # The summary previously ended in a stray ")"; it now uses the
      # "(instance ...)" form found in the other summaries of this file.
      summary: "Rebuilderd queue length is not empty (instance {{ $labels.instance }})"
      description: "Rebuilderd's queue length is now: {{ $value }}"

  # Fewer than 3 workers have been online for 5 minutes.
  - alert: RebuilderdWorkersOffline
    expr: rebuilderd_workers < 3
    for: 5m
    labels:
      severity: warning
      service: rebuilderd
    annotations:
      summary: "Rebuilderd workers offline (instance {{ $labels.instance }})"
      description: "Not all rebuilder-workers are online, currently {{ $value }} workers are online"