hetzner-ansible/templates/prometheus/config/prometheus/alert.rules.j2

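# Prometheus alert rules, rendered by Ansible from this Jinja2 template.
# Prometheus' own Go-template expressions (for example $labels.instance wrapped in
# double curly braces) are escaped as quoted Jinja literals so they survive template
# rendering unchanged; "stage" and the "prometheus_alert_*" values are Ansible variables.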
groups:
- name: smardigo
  rules:
  - alert: too_many_connections
    expr: pg_stat_database_numbackends > 30
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.datname {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.datname {{ '}}' }}> has too many open db connections"
      description: "Too many connections for more than 2 minutes."
  - alert: service_down
    expr: up == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> down"
      description: "Down for more than 2 minutes."
  - alert: high_load
    expr: 100 - (avg by (instance, env, job) (irate(node_cpu_seconds_total{env="{{ stage }}", mode="idle"}[30s])) * 100) > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> under high load"
      description: "High load for more than 2 minutes."
  - alert: apt_upgrades_pending
    expr: apt_upgrades_pending{origin=~".*security.*"} > 0
    for: 1w
    labels:
      severity: warning
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> has pending security updates."
      description: "Pending updates [{{ '{{' }} $labels.origin {{ '}}' }}] / [{{ '{{' }} $labels.arch {{ '}}' }}]."
  - alert: reboot_required
    expr: node_reboot_required == 1
    for: 1w
    labels:
      severity: warning
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs a reboot."
      description: "Reboot required for more than a week."
  - alert: veeam_backup_failed
    expr: veeam_backup_failed == 1
    for: 1m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed."
      description: "Backup failed."
  - alert: probe_ssl_certificates
    expr: probe_ssl_earliest_cert_expiry{job=~"http_check_200_secure|http_check_403_secure"} - time() < 86400 * 30
    for: 2m
    labels:
      severity: warning
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs a new certificate: the current one expires within the next 30 days."
      description: "Certificate expires within the next 30 days."
  - alert: probe_ssl_certificates_50
    expr: probe_ssl_earliest_cert_expiry{job="http_check_200_secure", instance="https://mail.sparkassenfinanzportal.de"} - time() < 86400 * 50
    for: 2m
    labels:
      severity: warning
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs a new certificate: the current one expires within the next 50 days."
      description: "Certificate expires within the next 50 days."
  - alert: DiskSpaceUsage
    expr: 100.0 - 100 * (node_filesystem_free_bytes{env="{{ stage }}",mountpoint=~"/|/rootfs"} / node_filesystem_size_bytes{device!="/dev/loop1",env="{{ stage }}",mountpoint=~"/|/rootfs"}) > {{ prometheus_alert_diskspaceusage_warning }}
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Disk Space Usage (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      description: "Disk space usage is above {{ prometheus_alert_diskspaceusage_warning }}%\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS: {{ '{{' }} $labels {{ '}}' }}"
  - alert: software_raid_disks_active
    expr: node_md_disks_active != 2
    for: 2m
    labels:
      severity: warning
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> software RAID disks broken."
      description: "Software RAID does not report exactly 2 active disks."
  - alert: software_raid_active
    expr: node_md_is_active != 1
    for: 2m
    labels:
      severity: warning
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> software RAID inactive."
      description: "Software RAID array is not active."
  - alert: restic_backup_failed
    expr: (restic_cmd_return_code_mount + restic_cmd_return_code_backup + restic_cmd_return_code_forget + restic_cmd_return_code_prune + restic_cmd_return_code_check + restic_cmd_return_code_stats + restic_cmd_return_code_umount + restic_backup_failed) > 0
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed."
      description: "backup failed."
  - alert: megaraid_smart_errors
    expr: megaraid_smart_errors > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> disk(s) reporting smart errors."
      description: "megaraid smart errors."
  - alert: megaraid_status_failed
    expr: megaraid_status_failed > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting raid status failed."
      description: "megaraid status errors."
  - alert: megaraid_other_error_count
    expr: megaraid_other_error_count > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting raid controller errors."
      description: "megaraid other error count."
  - alert: megaraid_exit_status
    expr: megaraid_exit_status > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> megaraid exporter status failed."
      description: "megaraid exit status."
  - alert: adaptec_controller_defunct_disk_drives
    expr: adaptec_controller_defunct_disk_drives > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting disk failure."
      description: "adaptec controller defunct disk drives."
  - alert: adaptec_physical_devices_smart_failed
    expr: adaptec_physical_devices_smart_failed > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> physical devices reporting smart errors."
      description: "adaptec physical devices smart failed."
  - alert: adaptec_physical_devices_smart_warnings
    expr: adaptec_physical_devices_smart_warnings > 1
    for: 2m
    labels:
      severity: warning
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> physical devices reporting smart warnings."
      description: "adaptec physical devices smart warnings."
  - alert: adaptec_controller_logical_failed
    expr: adaptec_controller_logical_failed > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller logical failure."
      description: "adaptec controller logical failed."
  - alert: adaptec_controller_status_failed
    expr: adaptec_controller_status_failed > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller status failure."
      description: "adaptec controller status failed."
  - alert: adaptec_controller_temperature_status_failed
    expr: adaptec_controller_temperature_status_failed > 1
    for: 2m
    labels:
      severity: warning
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller temperature status failure."
      description: "adaptec controller temperature status failed."
  - alert: adaptec_logical_degraded
    expr: adaptec_logical_degraded > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller logical degraded."
      description: "adaptec logical degraded."
  - alert: backup_execution
    expr: (time() - restic_backup_timestamp) / 60 > 180
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting latest backup execution is older than 3 hours."
      description: "Latest backup is older than 3 hours."
  - alert: backup_duration
    expr: (restic_cmd_duration_backup + restic_cmd_duration_check + restic_cmd_duration_forget + restic_cmd_duration_mount + restic_cmd_duration_prune + restic_cmd_duration_stats + restic_cmd_duration_umount) / 60 > 60
    for: 2m
    labels:
      severity: warning
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting backup duration took more than 60 minutes."
      description: "Backup duration took more than 60 minutes."
  - alert: endpoint_down
    expr: probe_success == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. Please check."
      description: "Checks whether the endpoint is reachable."
  - alert: postgres_down
    expr: pg_up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Postgres service on instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. Please check."
      description: "Checks whether the Postgres service is running."
  - alert: postgres_replication_broken
    expr: pg_replication_lag > {{ prometheus_alert_pg_replication_lag }} or absent(pg_replication_lag)
    for: 5m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> is broken. Please check."
      description: "Checks that Postgres replication works; a lag above the threshold indicates a problem."
  - alert: postgres_replication_broken_too_many_wal_files
    expr: pg_replication_wal_files_count > 200
    for: 1m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> is broken. Please check."
      description: "Checks that Postgres replication works; more than 200 retained WAL files indicates a problem."
  - alert: ssh_root_login
    expr: authlog_root_login_hits > 0 or absent(authlog_root_login_hits)
    for: 1m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "SSH root login on instance <{{ '{{' }} $labels.instance {{ '}}' }}> detected. Please check."
      description: "Unexpected SSH root login detected."
  - alert: offsite_backup_pending
    expr: |
      (time() - offsite_backup_archive_started_seconds) > 27 * 3600 or
      (time() - offsite_backup_archive_ended_seconds) > 27 * 3600 or
      (time() - offsite_backup_transfer_started_seconds) > 27 * 3600 or
      (time() - offsite_backup_transfer_ended_seconds) > 27 * 3600 or
      (time() - offsite_backup_forget_started_seconds) > 27 * 3600 or
      (time() - offsite_backup_forget_ended_seconds) > 27 * 3600
    for: 1m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "One or more offsite backup metric timestamps for instance <{{ '{{' }} $labels.instance {{ '}}' }}> are older than 27h."
      description: "Offsite backups are older than 27h."
  - alert: offsite_backup_metrics_unavailable
    expr: |
      absent(offsite_backup_archive_started_seconds) or
      absent(offsite_backup_archive_ended_seconds) or
      absent(offsite_backup_transfer_started_seconds) or
      absent(offsite_backup_transfer_ended_seconds) or
      absent(offsite_backup_forget_started_seconds) or
      absent(offsite_backup_forget_ended_seconds)
    for: 5m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "One or more offsite backup metrics are unavailable for instance <{{ '{{' }} $labels.instance {{ '}}' }}>."
      description: "Offsite backup metrics unavailable."
  - alert: DO_API_REQUEST_usage
    expr: (digitalocean_api_ratelimit_remaining * 100) / digitalocean_api_ratelimit_limit < 50 or absent(digitalocean_api_ratelimit_remaining)
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: "Too many requests against the DigitalOcean API. Please check."
      description: "Alert on DigitalOcean API rate limit usage."
  - alert: elasticsearch_health_status_not_green
    expr: avg(elasticsearch_cluster_health_status{color="green"}) < 1
    for: 30m
    labels:
      severity: critical
    annotations:
      summary: "Elasticsearch health status is not green. Please check."
      description: "Alert on Elasticsearch cluster health status."
  - alert: awx_job_failed_with_status_error
    expr: changes(awx_status_total{status="error"}[2m]) > 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "AWX job failed with status error."
      description: "An AWX job finished with status error."
  - alert: awx_job_failed_with_status_failed
    expr: changes(awx_status_total{status="failed"}[2m]) > 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "AWX job failed with status failed."
      description: "An AWX job finished with status failed."
  - alert: postgres_backup_zombies
    expr: 100 - ((node_filesystem_avail_bytes{instance=~"{{ stage }}-postgres-01.smardigo.digital",job=~"node-exporter",device='/dev/mapper/vg.postgres_backup-lv.postgres_backup'} * 100) / node_filesystem_size_bytes{instance=~"{{ stage }}-postgres-01.smardigo.digital",job=~"node-exporter",device='/dev/mapper/vg.postgres_backup-lv.postgres_backup'}) > 10
    for: 2h
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Postgres backup zombies have not been deleted."
      description: "Postgres backup zombies have not been deleted."
  - alert: hetzner_unattached_volumes
    expr: hetzner_api_unattached_volumes > 0 or absent(hetzner_api_unattached_volumes)
    for: 2h
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Unattached volumes exist in Hetzner."
      description: "Unattached volumes exist in Hetzner."
  - alert: hetzner_ratelimit_remaining_low
    # NOTE: assumes the exporter exposes hetzner_api_ratelimit_limit alongside
    # hetzner_api_ratelimit_remaining, mirroring the DigitalOcean alert above.
    expr: (hetzner_api_ratelimit_remaining * 100) / hetzner_api_ratelimit_limit < 50 or absent(hetzner_api_ratelimit_remaining)
    for: 10m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Hetzner API rate limit remaining below 50%."
      description: "Hetzner API rate limit remaining below 50%."
  - alert: hetzner_locked_server_exists
    expr: hetzner_api_locked_servers > 0 or absent(hetzner_api_locked_servers)
    for: 1h
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "A locked server exists in Hetzner."
      description: "A locked server exists in Hetzner."