You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
413 lines
16 KiB
Django/Jinja
413 lines
16 KiB
Django/Jinja
groups:
|
|
- name: smardigo
|
|
rules:
|
|
- alert: too_many_connections
|
|
expr: pg_stat_database_numbackends > 30
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.datname {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.datname {{ '}}' }}> has too many open db connections"
|
|
description: "Too many connections for more than 2 minutes."
|
|
|
|
- alert: service_down
|
|
expr: up == 0
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> down"
|
|
description: "Down for more than 2 minutes."
|
|
|
|
- alert: high_load
|
|
expr: 100 - (avg by (instance, env, instance, job) (irate(node_cpu_seconds_total{env="{{ stage }}", mode="idle"}[30s])) * 100) > 90
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> under high load"
|
|
description: "High load for more than 2 minutes."
|
|
|
|
- alert: apt_upgrades_pending
|
|
expr: apt_upgrades_pending{origin=~".*security.*"} > 0
|
|
for: 1w
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> has pending updates."
|
|
description: "Pending updates [{{ '{{' }} $labels.origin {{ '}}' }}] / [{{ '{{' }} $labels.arch {{ '}}' }}]."
|
|
|
|
- alert: reboot_required
|
|
expr: node_reboot_required == 1
|
|
for: 1w
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs reboot."
|
|
description: "Need reboot!"
|
|
|
|
- alert: veeam_backup_failed
|
|
expr: veeam_backup_failed == 1
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed."
|
|
description: "Backup failed."
|
|
|
|
- alert: probe_ssl_certificates
|
|
expr: probe_ssl_earliest_cert_expiry{job=~"http_check_200_secure|http_check_403_secure"} - time() < 86400 * 30
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs a new certificate until next 30 days."
|
|
description: "needs a new certificate until next 30 days."
|
|
|
|
- alert: probe_ssl_certificates_50
|
|
expr: probe_ssl_earliest_cert_expiry{job="http_check_200_secure", instance="https://mail.sparkassenfinanzportal.de"} - time() < 86400 * 50
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs a new certificate until next 50 days."
|
|
description: "needs a new certificate until next 50 days."
|
|
|
|
- alert: DiskSpaceUsage
|
|
expr: 100.0 - 100 * (node_filesystem_free_bytes{env="{{ stage }}",mountpoint=~"/|/rootfs"} / node_filesystem_size_bytes{device!="/dev/loop1",env="{{ stage }}",mountpoint=~"/|/rootfs"}) > {{ prometheus_alert_diskspaceusage_warning }}
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Disk Space Usage (instance {{ '{{' }} $labels.instance {{ '}}' }})"
|
|
description: "Disk Space on Drive is used more than {{ prometheus_alert_diskspaceusage_warning }}%\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS: {{ '{{' }} $labels {{ '}}' }}"
|
|
|
|
- alert: software_raid_disks_active
|
|
expr: node_md_disks_active != 2
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> software-raid disks broken."
|
|
description: "software-raid disks broken."
|
|
|
|
- alert: software_raid_active
|
|
expr: node_md_is_active != 1
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> raid inactive."
|
|
description: "software-raid inactive."
|
|
|
|
- alert: restic_backup_failed
|
|
expr: (restic_cmd_return_code_mount + restic_cmd_return_code_backup + restic_cmd_return_code_forget + restic_cmd_return_code_prune + restic_cmd_return_code_check + restic_cmd_return_code_stats + restic_cmd_return_code_umount + restic_backup_failed) > 0
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed."
|
|
description: "backup failed."
|
|
|
|
- alert: megaraid_smart_errors
|
|
expr: megaraid_smart_errors > 1
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> disk(s) reporting smart errors."
|
|
description: "megaraid smart errors."
|
|
|
|
- alert: megaraid_status_failed
|
|
expr: megaraid_status_failed > 1
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting raid status failed."
|
|
description: "megaraid status errors."
|
|
|
|
- alert: megaraid_other_error_count
|
|
expr: megaraid_other_error_count > 1
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting raid controller errors."
|
|
description: "megaraid other error count."
|
|
|
|
- alert: megaraid_exit_status
|
|
expr: megaraid_exit_status > 1
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> megaraid exporter status failed."
|
|
description: "megaraid exit status."
|
|
|
|
- alert: adaptec_controller_defunct_disk_drives
|
|
expr: adaptec_controller_defunct_disk_drives > 1
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting disk failure."
|
|
description: "adaptec controller defunct disk drives."
|
|
|
|
- alert: adaptec_physical_devices_smart_failed
|
|
expr: adaptec_physical_devices_smart_failed > 1
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> physical devices reporting smart errors."
|
|
description: "adaptec physical devices smart failed."
|
|
|
|
- alert: adaptec_physical_devices_smart_warnings
|
|
expr: adaptec_physical_devices_smart_warnings > 1
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> physical devices reporting smart warnings."
|
|
description: "adaptec physical devices smart warnings."
|
|
|
|
- alert: adaptec_controller_logical_failed
|
|
expr: adaptec_controller_logical_failed > 1
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller logical failure."
|
|
description: "adaptec controller logical failed."
|
|
|
|
- alert: adaptec_controller_status_failed
|
|
expr: adaptec_controller_status_failed > 1
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller status failure."
|
|
description: "adaptec controller status failed."
|
|
|
|
- alert: adaptec_controller_temperature_status_failed
|
|
expr: adaptec_controller_temperature_status_failed > 1
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller temperatur status failure."
|
|
description: "adaptec controller temperature status failed"
|
|
|
|
- alert: adaptec_logical_degraded
|
|
expr: adaptec_logical_degraded > 1
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller logical degraded."
|
|
description: "adaptec logical degraded."
|
|
|
|
- alert: backup_execution
|
|
expr: (time() - restic_backup_timestamp) / 60 > 180
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting latest backup execution is older then 3 hours."
|
|
description: "latest backup is older then 90 minutes."
|
|
|
|
- alert: backup_duration
|
|
expr: (restic_cmd_duration_backup + restic_cmd_duration_check + restic_cmd_duration_forget + restic_cmd_duration_mount + restic_cmd_duration_prune + restic_cmd_duration_stats + restic_cmd_duration_umount) / 60 > 60
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting backup duration took more than 30 minutes."
|
|
description: "backup duration took more than 60 minutes."
|
|
|
|
- alert: endpoint down
|
|
expr: probe_success == 0
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. plz check."
|
|
description: "checks if endpoint is reachable or not"
|
|
|
|
- alert: postgres down
|
|
expr: pg_up == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Postgres service on instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. plz. check"
|
|
description: "checks if postgres service is running.."
|
|
|
|
- alert: postgres replication broken
|
|
expr: pg_replication_lag > {{ prometheus_alert_pg_replication_lag }} or absent(pg_replication_lag)
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. plz. check"
|
|
description: "checks if postgres replication works well, if lag is higher than X - something bad happened."
|
|
|
|
- alert: postgres replication broken too many wal files
|
|
expr: pg_replication_wal_files_count > 200
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. plz. check"
|
|
description: "checks if postgres replication works well, if count of wal files higher than X - something bad happened."
|
|
|
|
- alert: ssh root login
|
|
expr: authlog_root_login_hits > 0 or absent(authlog_root_login_hits)
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "ssh root login on Instance <{{ '{{' }} $labels.instance {{ '}}' }}> detected. plz check"
|
|
description: "unexpected ssh root login detected."
|
|
|
|
- alert: offsite backup pending
|
|
expr: |
|
|
(time() - offsite_backup_archive_started_seconds) > 27 * 3600 or
|
|
(time() - offsite_backup_archive_ended_seconds) > 27 * 3600 or
|
|
(time() - offsite_backup_transfer_started_seconds) > 27 * 3600 or
|
|
(time() - offsite_backup_transfer_ended_seconds) > 27 * 3600 or
|
|
(time() - offsite_backup_forget_started_seconds) > 27 * 3600 or
|
|
(time() - offsite_backup_forget_ended_seconds) > 27 * 3600
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "one or more offsite backup metric timestamps for Instance <{{ '{{' }} $labels.instance {{ '}}' }}> older then 27h"
|
|
description: "offsite backups older then 27h."
|
|
|
|
- alert: offsite backup metrics unavailable
|
|
expr: |
|
|
absent(offsite_backup_archive_started_seconds) or
|
|
absent(offsite_backup_archive_ended_seconds) or
|
|
absent(offsite_backup_transfer_started_seconds) or
|
|
absent(offsite_backup_transfer_ended_seconds) or
|
|
absent(offsite_backup_forget_started_seconds) or
|
|
absent(offsite_backup_forget_ended_seconds)
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "one or more offsite backup metric unavailable for Instance <{{ '{{' }} $labels.instance {{ '}}' }}>"
|
|
description: "offsite metrics unavailable."
|
|
|
|
- alert: DO_API_REQUEST_usage
|
|
expr: (digitalocean_api_ratelimit_remaining * 100)/digitalocean_api_ratelimit_limit < 50 or absent(digitalocean_api_ratelimit_remaining)
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Too many request against digitalocean API. plz check."
|
|
description: "Alert for request against digitalocean API."
|
|
|
|
- alert: elasticsearch health status not green
|
|
expr: avg(elasticsearch_cluster_health_status{color="green"}) < 1
|
|
for: 30m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Elasticsearch health status is not green. Please Check"
|
|
description: "Alert for Elasticsearch health status"
|
|
|
|
- alert: awx job failed with status error
|
|
expr: changes(awx_status_total{status="error"}[2m]) > 0
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "awx job failed with status error"
|
|
description: "Alert awx jobs has an error"
|
|
|
|
- alert: awx job failed with status failed
|
|
expr: changes(awx_status_total{status="failed"}[2m]) > 0
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "awx job failed with status failed"
|
|
description: "Alert awx jobs failed"
|
|
|
|
- alert: postgres backup zombies
|
|
expr: 100 - ((node_filesystem_avail_bytes{instance=~"{{ stage }}-postgres-01.smardigo.digital",job=~"node-exporter",device='/dev/mapper/vg.postgres_backup-lv.postgres_backup'} * 100) / node_filesystem_size_bytes{instance=~"{{ stage }}-postgres-01.smardigo.digital",job=~"node-exporter",device='/dev/mapper/vg.postgres_backup-lv.postgres_backup'}) > 10
|
|
for: 2h
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "postgres backup zombies, have not been deleted"
|
|
description: "postgres backup zombies, have not been deleted"
|
|
|
|
- alert: hetzner unattached volumes
|
|
expr: hetzner_api_unattached_volumes > 0 or absent(hetzner_api_unattached_volumes)
|
|
for: 2h
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "unattached volumes in hetzner"
|
|
description: "unattached volumes in hetzner"
|
|
|
|
|
|
- alert: hetzner ratelimit_remaining low
|
|
expr: (hetzner_api_ratelimit_remaining * 100)/ hetzner_api_ratelimit_remaining < 50 or absent(hetzner_api_ratelimit_remaining)
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "hetzner ratelimit_remaining below 50%"
|
|
description: "hetzner ratelimit_remaining below 50%"
|
|
|
|
- alert: hetzner locked server exists
|
|
expr: hetzner_api_locked_servers > 0 or absent(hetzner_api_locked_servers)
|
|
for: 1h
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
summary: "hetzner locked server exists"
|
|
description: "hetzner locked server exists"
|
|
|
|
|