groups:
  - name: smardigo
    rules:
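      # The {{ '{{' }} ... {{ '}}' }} constructs render as literal {{ ... }} after
      # Jinja templating, so Prometheus still receives its own template variables
      # such as $labels.instance and $value.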
      - alert: too_many_connections
        expr: pg_stat_database_numbackends > 30
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.datname {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.datname {{ '}}' }}> has too many open db connections"
          description: "Too many connections for more than 2 minutes."

      - alert: service_down
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> down"
          description: "Down for more than 2 minutes."

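      # CPU usage is derived by inverting the idle percentage: 100 minus the
      # average idle CPU rate per instance; the alert fires above 90% usage.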
      - alert: high_load
        expr: 100 - (avg by (instance, env, job) (irate(node_cpu_seconds_total{env="{{ stage }}", mode="idle"}[30s])) * 100) > 90
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> under high load"
          description: "High load for more than 2 minutes."

      - alert: apt_upgrades_pending
        expr: apt_upgrades_pending{origin=~".*security.*"} > 0
        for: 1w
        labels:
          severity: warning
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> has pending updates."
          description: "Pending updates [{{ '{{' }} $labels.origin {{ '}}' }}] / [{{ '{{' }} $labels.arch {{ '}}' }}]."

      - alert: reboot_required
        expr: node_reboot_required == 1
        for: 1w
        labels:
          severity: warning
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs reboot."
          description: "Reboot required."

      - alert: veeam_backup_failed
        expr: veeam_backup_failed == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed."
          description: "Backup failed."

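      # probe_ssl_earliest_cert_expiry is a Unix timestamp; subtracting time()
      # gives the remaining certificate lifetime in seconds (86400 * 30 = 30 days).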
      - alert: probe_ssl_certificates
        expr: probe_ssl_earliest_cert_expiry{job=~"http_check_200_secure|http_check_403_secure"} - time() < 86400 * 30
        for: 2m
        labels:
          severity: warning
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs a new certificate within the next 30 days."
          description: "Certificate expires within the next 30 days."

      - alert: probe_ssl_certificates_50
        expr: probe_ssl_earliest_cert_expiry{job="http_check_200_secure", instance="https://mail.sparkassenfinanzportal.de"} - time() < 86400 * 50
        for: 2m
        labels:
          severity: warning
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs a new certificate within the next 50 days."
          description: "Certificate expires within the next 50 days."

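      # Used space in percent on the root filesystem (mountpoint / or /rootfs);
      # the warning threshold comes from the prometheus_alert_diskspaceusage_warning
      # template variable.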
      - alert: DiskSpaceUsage
        expr: 100.0 - 100 * (node_filesystem_free_bytes{env="{{ stage }}",mountpoint=~"/|/rootfs"} / node_filesystem_size_bytes{device!="/dev/loop1",env="{{ stage }}",mountpoint=~"/|/rootfs"}) > {{ prometheus_alert_diskspaceusage_warning }}
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Disk Space Usage (instance {{ '{{' }} $labels.instance {{ '}}' }})"
          description: "Disk space usage on the root filesystem is above {{ prometheus_alert_diskspaceusage_warning }}%\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS: {{ '{{' }} $labels {{ '}}' }}"

      - alert: software_raid_disks_active
        expr: node_md_disks_active != 2
        for: 2m
        labels:
          severity: warning
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> software-raid disks broken."
          description: "software-raid disks broken."

      - alert: software_raid_active
        expr: node_md_is_active != 1
        for: 2m
        labels:
          severity: warning
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> raid inactive."
          description: "software-raid inactive."

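      # Each restic_cmd_return_code_* metric is the exit code of one backup stage
      # (mount, backup, forget, prune, check, stats, umount); any non-zero code
      # makes the sum positive and trips the alert.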
      - alert: restic_backup_failed
        expr: (restic_cmd_return_code_mount + restic_cmd_return_code_backup + restic_cmd_return_code_forget + restic_cmd_return_code_prune + restic_cmd_return_code_check + restic_cmd_return_code_stats + restic_cmd_return_code_umount + restic_backup_failed) > 0
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed."
          description: "backup failed."

      - alert: megaraid_smart_errors
        expr: megaraid_smart_errors > 1
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> disk(s) reporting smart errors."
          description: "megaraid smart errors."

      - alert: megaraid_status_failed
        expr: megaraid_status_failed > 1
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting raid status failed."
          description: "megaraid status errors."

      - alert: megaraid_other_error_count
        expr: megaraid_other_error_count > 1
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting raid controller errors."
          description: "megaraid other error count."

      - alert: megaraid_exit_status
        expr: megaraid_exit_status > 1
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> megaraid exporter status failed."
          description: "megaraid exit status."

      - alert: adaptec_controller_defunct_disk_drives
        expr: adaptec_controller_defunct_disk_drives > 1
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting disk failure."
          description: "adaptec controller defunct disk drives."

      - alert: adaptec_physical_devices_smart_failed
        expr: adaptec_physical_devices_smart_failed > 1
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> physical devices reporting smart errors."
          description: "adaptec physical devices smart failed."

      - alert: adaptec_physical_devices_smart_warnings
        expr: adaptec_physical_devices_smart_warnings > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> physical devices reporting smart warnings."
          description: "adaptec physical devices smart warnings."

      - alert: adaptec_controller_logical_failed
        expr: adaptec_controller_logical_failed > 1
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller logical failure."
          description: "adaptec controller logical failed."

      - alert: adaptec_controller_status_failed
        expr: adaptec_controller_status_failed > 1
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller status failure."
          description: "adaptec controller status failed."

      - alert: adaptec_controller_temperature_status_failed
        expr: adaptec_controller_temperature_status_failed > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller temperature status failure."
          description: "adaptec controller temperature status failed."

      - alert: adaptec_logical_degraded
        expr: adaptec_logical_degraded > 1
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller logical degraded."
          description: "adaptec logical degraded."

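      # restic_backup_timestamp is the Unix time of the last backup run;
      # (time() - timestamp) / 60 is its age in minutes, so 180 minutes
      # (3 hours) without a new run triggers the alert.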
      - alert: backup_execution
        expr: (time() - restic_backup_timestamp) / 60 > 180
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting latest backup execution is older than 3 hours."
          description: "Latest backup is older than 3 hours."

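      # Combined runtime of all restic stages, divided by 60 to get minutes;
      # more than 60 minutes raises a warning.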
      - alert: backup_duration
        expr: (restic_cmd_duration_backup + restic_cmd_duration_check + restic_cmd_duration_forget + restic_cmd_duration_mount + restic_cmd_duration_prune + restic_cmd_duration_stats + restic_cmd_duration_umount) / 60 > 60
        for: 2m
        labels:
          severity: warning
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting backup duration took more than 60 minutes."
          description: "Backup duration took more than 60 minutes."

      - alert: endpoint down
        expr: probe_success == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. Please check."
          description: "Checks whether the endpoint is reachable."

      - alert: postgres down
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Postgres service on instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. Please check."
          description: "Checks whether the postgres service is running."

      - alert: postgres replication broken
        expr: pg_replication_lag > {{ prometheus_alert_pg_replication_lag }} or absent(pg_replication_lag)
        for: 5m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. Please check."
          description: "Checks that postgres replication is healthy; a lag above the configured threshold indicates a problem."

      - alert: postgres replication broken too many wal files
        expr: pg_replication_wal_files_count > 200
        for: 1m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. Please check."
          description: "Checks that postgres replication is healthy; more than 200 retained WAL files indicates a problem."

      - alert: ssh root login
        expr: authlog_root_login_hits > 0 or absent(authlog_root_login_hits)
        for: 1m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "ssh root login on Instance <{{ '{{' }} $labels.instance {{ '}}' }}> detected. Please check."
          description: "Unexpected ssh root login detected."

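      # Each offsite_backup_*_seconds metric is the Unix timestamp of an archive,
      # transfer or forget step; any step older than 27 hours (27 * 3600 s) fires.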
      - alert: offsite backup pending
        expr: |
          (time() - offsite_backup_archive_started_seconds) > 27 * 3600 or
          (time() - offsite_backup_archive_ended_seconds) > 27 * 3600 or
          (time() - offsite_backup_transfer_started_seconds) > 27 * 3600 or
          (time() - offsite_backup_transfer_ended_seconds) > 27 * 3600 or
          (time() - offsite_backup_forget_started_seconds) > 27 * 3600 or
          (time() - offsite_backup_forget_ended_seconds) > 27 * 3600
        for: 1m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "One or more offsite backup metric timestamps for Instance <{{ '{{' }} $labels.instance {{ '}}' }}> are older than 27h."
          description: "Offsite backups are older than 27h."

      - alert: offsite backup metrics unavailable
        expr: |
          absent(offsite_backup_archive_started_seconds) or
          absent(offsite_backup_archive_ended_seconds) or
          absent(offsite_backup_transfer_started_seconds) or
          absent(offsite_backup_transfer_ended_seconds) or
          absent(offsite_backup_forget_started_seconds) or
          absent(offsite_backup_forget_ended_seconds)
        for: 5m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "One or more offsite backup metrics unavailable for Instance <{{ '{{' }} $labels.instance {{ '}}' }}>."
          description: "Offsite backup metrics unavailable."

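      # (remaining * 100) / limit gives the percentage of the DigitalOcean API
      # rate limit still available; below 50%, or a missing metric, fires the alert.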
      - alert: DO_API_REQUEST_usage
        expr: (digitalocean_api_ratelimit_remaining * 100)/digitalocean_api_ratelimit_limit < 50 or absent(digitalocean_api_ratelimit_remaining)
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Too many requests against the DigitalOcean API. Please check."
          description: "Alert for requests against the DigitalOcean API."

      - alert: elasticsearch health status not green
        expr: avg(elasticsearch_cluster_health_status{color="green"}) < 1
        for: 30m
        labels:
          severity: critical
        annotations:
          summary: "Elasticsearch health status is not green. Please check."
          description: "Alert for Elasticsearch health status."

      - alert: awx job failed with status error
        expr: changes(awx_status_total{status="error"}[2m]) > 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "AWX job failed with status error."
          description: "An AWX job finished with status error."

      - alert: awx job failed with status failed
        expr: changes(awx_status_total{status="failed"}[2m]) > 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "AWX job failed with status failed."
          description: "An AWX job finished with status failed."

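      # Used space on the dedicated postgres backup volume in percent; staying
      # above 10% for two hours suggests old backups ("zombies") were not purged.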
      - alert: postgres backup zombies
        expr: 100 - ((node_filesystem_avail_bytes{instance=~"{{ stage }}-postgres-01.smardigo.digital",job=~"node-exporter",device='/dev/mapper/vg.postgres_backup-lv.postgres_backup'} * 100) / node_filesystem_size_bytes{instance=~"{{ stage }}-postgres-01.smardigo.digital",job=~"node-exporter",device='/dev/mapper/vg.postgres_backup-lv.postgres_backup'}) > 10
        for: 2h
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Postgres backup zombies have not been deleted."
          description: "Postgres backup zombies have not been deleted."

      - alert: hetzner unattached volumes
        expr: hetzner_api_unattached_volumes > 0 or absent(hetzner_api_unattached_volumes)
        for: 2h
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "unattached volumes in hetzner"
          description: "unattached volumes in hetzner"

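      # Remaining Hetzner API quota as a percentage of the total limit; the
      # denominator assumes a hetzner_api_ratelimit_limit metric analogous to
      # digitalocean_api_ratelimit_limit above.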
      - alert: hetzner ratelimit_remaining low
        expr: (hetzner_api_ratelimit_remaining * 100) / hetzner_api_ratelimit_limit < 50 or absent(hetzner_api_ratelimit_remaining)
        for: 10m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "hetzner ratelimit_remaining below 50%"
          description: "hetzner ratelimit_remaining below 50%"

      - alert: hetzner locked server exists
        expr: hetzner_api_locked_servers > 0 or absent(hetzner_api_locked_servers)
        for: 1h
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "hetzner locked server exists"
          description: "hetzner locked server exists"