# infrastructure/roles/prometheus/files/node.rules.yml

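# Prometheus alerting rules for the Arch Linux infrastructure, grouped by
# subsystem: node health, Prometheus self-monitoring, pacman, btrfs, borg
# backups, systemd units, GitLab, blackbox probes and rebuilderd.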
groups:
- name: node_common
interval: 60s
rules:
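# Fires when the average non-idle CPU usage across all cores exceeds 80% for
# 5 minutes. build.archlinux.org and the repro* hosts are excluded, presumably
# because sustained full load is expected there.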
- alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle",instance!~"build.archlinux.org",instance!~"repro1.pkgbuild.com",instance!~"repro2.pkgbuild.com"}[5m])) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Host high CPU load (instance {{ $labels.instance }})"
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Host swap is filling up (instance {{ $labels.instance }})"
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of memory (instance {{ $labels.instance }})"
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of disk space (instance {{ $labels.instance }})"
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
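# predict_linear() extrapolates the last hour of free-space samples 4 hours
# ahead; a predicted value below zero means the filesystem would fill up at the
# current write rate. tmpfs filesystems and /backup are excluded.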
- alert: HostDiskWillFillIn4Hours
expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs",mountpoint!~"/backup"}[1h], 4 * 3600) < 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})"
description: "Disk will fill in 4 hours at current write rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfInodes
expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of inodes (instance {{ $labels.instance }})"
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[5m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host OOM kill detected (instance {{ $labels.instance }})"
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
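# Self-monitoring of Prometheus, Alertmanager and the notification pipeline.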
- name: prometheus
interval: 60s
rules:
- alert: PrometheusTargetMissing
expr: up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus target missing (instance {{ $labels.instance }})"
description: "A Prometheus target has disappeared. An exporter might have crashed.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus too many restarts (instance {{ $labels.instance }})"
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusNotConnectedToAlertmanager
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
description: "Prometheus cannot connect to the Alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTemplateTextExpansionFailures
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus notifications backlog (instance {{ $labels.instance }})"
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
description: "Alertmanager is failing to send notifications\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus large scrape (instance {{ $labels.instance }})"
description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbCheckpointCreationFailures
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB compaction failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbWalTruncationsFailed
expr: increase(prometheus_tsdb_wal_truncations_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- name: pacman
interval: 2m
rules:
- alert: pacman_updates_pending
expr: pacman_updates_pending > 50
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} has out-of-date packages'
summary: '{{ $labels.instance }} has {{ $value }} (> 50) out-of-date packages'
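# btrfs device error counters; an alert fires when more than one error of the
# given type has been recorded on a host.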
- name: btrfs
interval: 2m
rules:
- alert: btrfs_corruption_errs
expr: btrfs_corruption_errs > 1
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} btrfs corruption errors'
summary: '{{ $labels.instance }} has {{ $value }} btrfs_corruption_errs'
- alert: btrfs_write_io_errs
expr: btrfs_write_io_errs > 1
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} btrfs write_io errors'
summary: '{{ $labels.instance }} has {{ $value }} btrfs_write_io_errs'
- alert: btrfs_read_io_errs
expr: btrfs_read_io_errs > 1
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} btrfs read_io errors'
summary: '{{ $labels.instance }} has {{ $value }} btrfs_read_io_errs'
- alert: btrfs_flush_io_errs
expr: btrfs_flush_io_errs > 1
for: 15m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} btrfs flush_io errors'
summary: '{{ $labels.instance }} has {{ $value }} btrfs_flush_io_errs'
- name: borg
interval: 60s
rules:
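# Backups are presumably taken daily; the 86400 * 1.2 threshold allows roughly
# 5 hours of slack past the 24-hour mark before a missing archive alerts.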
- alert: BorgHetznerMissingBackup
expr: time() - borg_hetzner_last_archive_timestamp > 86400 * 1.2
for: 2m
labels:
severity: critical
annotations:
summary: 'Borg Hetzner missing backup (instance {{ $labels.instance }})'
description: 'Borg has not created a backup for more than 24 hours. Last backup made on {{ $value | humanizeTimestamp }}'
- alert: BorgOffsiteMissingBackup
expr: time() - borg_offsite_last_archive_timestamp > 86400 * 1.2
for: 2m
labels:
severity: critical
annotations:
summary: 'Borg Offsite missing backup (instance {{ $labels.instance }})'
description: 'Borg has not created a backup for more than 24 hours. Last backup made on {{ $value | humanizeTimestamp }}'
- name: systemd_unit
interval: 15s
rules:
- alert: systemd_unit_failed
expr: |
node_systemd_unit_state{state="failed"} > 0
for: 3m
labels:
severity: critical
annotations:
description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} failed'
summary: 'Systemd unit failed'
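# Flapping heuristic: more than 5 active-state changes within 5 minutes, or more
# than 15 within an hour unless the last 30 minutes were comparatively quiet
# (fewer than 7 changes).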
- alert: systemd_unit_flapping
expr: |
changes(node_systemd_unit_state{state="active"}[5m]) > 5 or (changes(node_systemd_unit_state{state="active"}[60m]) > 15 unless changes(node_systemd_unit_state{state="active"}[30m]) < 7)
labels:
severity: critical
annotations:
description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} flapping'
summary: 'Systemd unit flapping'
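# GitLab service alerts. The job_grpc:*, job_route_method_code:* and
# instance:*_utilization:ratio series are recording rules, assumed to be
# provided by the bundled GitLab monitoring configuration.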
- name: gitlab
interval: 15s
rules:
- alert: ServiceDown
expr: avg_over_time(up[5m]) * 100 < 50
annotations:
description: The service {{ $labels.job }} instance {{ $labels.instance }} is
not responding for more than 50% of the time for 5 minutes.
summary: The service {{ $labels.job }} is not responding
- alert: RedisDown
expr: avg_over_time(redis_up[5m]) * 100 < 50
annotations:
description: The Redis service {{ $labels.job }} instance {{ $labels.instance
}} is not responding for more than 50% of the time for 5 minutes.
summary: The Redis service {{ $labels.job }} is not responding
- alert: PostgresDown
expr: avg_over_time(pg_up[5m]) * 100 < 50
annotations:
description: The Postgres service {{ $labels.job }} instance {{ $labels.instance
}} is not responding for more than 50% of the time for 5 minutes.
summary: The Postgres service {{ $labels.job }} is not responding
- alert: UnicornQueueing
expr: avg_over_time(unicorn_queued_connections[30m]) > 1
annotations:
description: Unicorn instance {{ $labels.instance }} is queueing requests with
an average of {{ $value | printf "%.1f" }} over the last 30 minutes.
summary: Unicorn is queueing requests
- alert: PumaQueueing
expr: avg_over_time(puma_queued_connections[30m]) > 1
annotations:
description: Puma instance {{ $labels.instance }} is queueing requests with
an average of {{ $value | printf "%.1f" }} over the last 30 minutes.
summary: Puma is queueing requests
- alert: HighUnicornUtilization
expr: instance:unicorn_utilization:ratio * 100 > 90
for: 60m
annotations:
description: Unicorn instance {{ $labels.instance }} has more than 90% worker utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
summary: Unicorn has high utilization
- alert: HighPumaUtilization
expr: instance:puma_utilization:ratio * 100 > 90
for: 60m
annotations:
description: Puma instance {{ $labels.instance }} has more than 90% thread utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
summary: Puma has high utilization
- alert: SidekiqJobsQueuing
expr: sum by (name) (sidekiq_queue_size) > 0
for: 60m
annotations:
summary: Sidekiq has jobs queued
description: Sidekiq queue {{ $labels.name }} has {{ $value }} jobs queued for 60 minutes.
- alert: HighgRPCResourceExhaustedRate
expr: >
sum without (grpc_code) (
job_grpc:grpc_server_handled_total:rate5m{grpc_code="ResourceExhausted"}
) /
sum without (grpc_code) (
job_grpc:grpc_server_handled_total:rate5m
) * 100 > 1
for: 60m
annotations:
summary: High gRPC ResourceExhausted error rate
description: gRPC is returning more than 1% ({{ $value | printf "%.1f" }}%) ResourceExhausted errors over the last 60 minutes.
- alert: PostgresDatabaseDeadlocks
expr: increase(pg_stat_database_deadlocks[5m]) > 0
annotations:
summary: Postgres database has deadlocks
description: Postgres database {{ $labels.instance }} had {{ $value | printf "%.0f" }} deadlocks in the last 5 minutes.
- alert: PostgresDatabaseDeadlockCancels
expr: increase(pg_stat_database_deadlocks[5m]) > 0
annotations:
summary: Postgres database has queries canceled due to deadlocks
description: Postgres database {{ $labels.instance }} had {{ $value | printf "%.0f" }} queries canceled due to deadlocks in the last 5 minutes.
# Low-traffic - < 10 QPS (600 RPM)
- alert: WorkhorseHighErrorRate
expr: >
(
sum without (job, code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."}
) /
sum without (job,code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m
) < 10
) * 100 > 50
annotations:
summary: Workhorse has high error rates
description: Workhorse route {{ $labels.route }} method {{ $labels.method }} has more than 50% errors ({{ $value | printf "%.1f" }}%) for the last 60 minutes.
# High-traffic - >= 10 QPS (600 RPM)
- alert: WorkhorseHighErrorRate
expr: >
(
sum without (job, code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."}
) /
sum without (job,code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m
) > 10
) * 100 > 10
annotations:
summary: Workhorse has high error rates
description: Workhorse route {{ $labels.route }} method {{ $labels.method }} has more than 10% errors ({{ $value | printf "%.1f" }}%) for the last 60 minutes.
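# blackbox_exporter probes: alert on failed probes, HTTP responses outside
# 200-399, and TLS certificates expiring within 25 days.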
- name: blackbox
interval: 15s
rules:
- alert: BlackboxProbeFailed
expr: probe_success == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Blackbox probe failed (instance {{ $labels.instance }})"
description: "Probe failed\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: BlackboxProbeHttpFailure
expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
for: 5m
labels:
severity: critical
annotations:
summary: "Blackbox probe HTTP failure (instance {{ $labels.instance }})"
description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 25
for: 5m
labels:
severity: critical
annotations:
summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
description: "SSL certificate expires in less than 25 days\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
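# rebuilderd (reproducible-builds verification): alert on a persistent package
# queue backlog and on missing workers.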
- name: rebuilderd
interval: 15m
rules:
- alert: RebuilderdQueueNotEmpty
expr: rebuilderd_queue_length > 2000
for: 24h
labels:
severity: warning
service: rebuilderd
annotations:
summary: "Rebuilderd queue length is not empty (instance {{ $labels.instance }})"
description: "Rebuilderd's queue length is now: {{ $value }}"
- alert: RebuilderdWorkersOffline
expr: rebuilderd_workers < 3
for: 5m
labels:
severity: warning
service: rebuilderd
annotations:
summary: "Rebuilderd workers offline (instance {{ $labels.instance }})"
description: "Not all rebuilderd workers are online; currently {{ $value }} workers are online"