groups:
- name: smardigo
  rules:
  - alert: too_many_connections
    expr: pg_stat_database_numbackends > 30
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.datname {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.datname {{ '}}' }}> has too many open db connections"
      description: "Too many connections for more than 2 minutes."

  - alert: service_down
    expr: up == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> down"
      description: "Down for more than 2 minutes."

{% set high_load_instance_exclude = [] %}
{% for server_info in stage_server_infos | default([]) %}
{%- if (hostvars[server_info.name].prometheus_alert_extra_config is defined) %}
{#- append via "set" so the expression does not render "None" into the output #}
{%- set _ = high_load_instance_exclude.append("instance!=" + '\"' + server_info.name + '.' + hostvars[server_info.name].domain + '\"') %}
{%- endif %}
{% endfor %}
  - alert: high_load
    expr: 100 - (avg by (instance, env, job, service) (irate(node_cpu_seconds_total{env="{{ stage }}", mode="idle"{{ ", " + high_load_instance_exclude | join(', ') if high_load_instance_exclude }}} [30s])) * 100) > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> under high load"
      description: "High load for more than 2 minutes."

{% for server_info in stage_server_infos | default([]) %}
{% if (hostvars[server_info.name].prometheus_alert_extra_config is defined) %}
{% set high_load_instance_custom = "instance=" + '\"' + server_info.name + '.' + hostvars[server_info.name].domain + '\"' %}
  - alert: high_load_{{ server_info.name }}
    expr: 100 - (avg by (instance, env, job, service) (irate(node_cpu_seconds_total{env="{{ stage }}", mode="idle"{{ ", " + high_load_instance_custom if high_load_instance_custom }}} [30s])) * 100) > 90
    for: "{{ hostvars[server_info.name].prometheus_alert_extra_config.high_load.duration }}"
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> under high load"
      description: "{{ hostvars[server_info.name].prometheus_alert_extra_config.high_load.description }}"
{% endif %}
{% endfor %}

  - alert: apt_upgrades_pending
    expr: apt_upgrades_pending{origin=~".*security.*"} > 0
    for: 1w
    labels:
      severity: warning
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> has pending updates."
      description: "Pending updates [{{ '{{' }} $labels.origin {{ '}}' }}] / [{{ '{{' }} $labels.arch {{ '}}' }}]."

  - alert: reboot_required
    expr: node_reboot_required == 1
    for: 1w
    labels:
      severity: warning
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs reboot."
      description: "Reboot required!"

  - alert: veeam_backup_failed
    expr: veeam_backup_failed == 1
    for: 1m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed."
      description: "Backup failed."

  - alert: probe_ssl_certificates
    expr: probe_ssl_earliest_cert_expiry{job=~"http_check_200_secure|http_check_403_secure"} - time() < 86400 * 30
    for: 2m
    labels:
      severity: warning
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs a new certificate within the next 30 days."
description: "needs a new certificate until next 30 days." - alert: probe_ssl_certificates_50 expr: probe_ssl_earliest_cert_expiry{job="http_check_200_secure", instance="https://mail.sparkassenfinanzportal.de"} - time() < 86400 * 50 for: 2m labels: severity: warning annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs a new certificate until next 50 days." description: "needs a new certificate until next 50 days." - alert: DiskSpaceUsage expr: 100.0 - 100 * (node_filesystem_free_bytes{env="{{ stage }}",mountpoint=~"/|/rootfs"} / node_filesystem_size_bytes{device!="/dev/loop1",env="{{ stage }}",mountpoint=~"/|/rootfs"}) > {{ prometheus_alert_diskspaceusage_warning }} for: 10m labels: severity: warning annotations: summary: "Disk Space Usage (instance {{ '{{' }} $labels.instance {{ '}}' }})" description: "Disk Space on Drive is used more than {{ prometheus_alert_diskspaceusage_warning }}%\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS: {{ '{{' }} $labels {{ '}}' }}" - alert: DiskSpaceUsage mounted volumes expr: 100.0 - 100 * (node_filesystem_free_bytes{env="{{ stage }}",device=~"/dev/mapper/.*"} / node_filesystem_size_bytes{device!="/dev/loop1",env="{{ stage }}",device=~"/dev/mapper/.*"}) > {{ prometheus_alert_diskspaceusage_warning }} for: 10m labels: severity: critical annotations: summary: "Disk Space Usage (instance {{ '{{' }} $labels.instance {{ '}}' }})" description: "Disk Space on Drive is used more than {{ prometheus_alert_diskspaceusage_warning }}%\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS: {{ '{{' }} $labels {{ '}}' }}" - alert: software_raid_disks_active expr: node_md_disks_active != 2 for: 2m labels: severity: warning annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> software-raid disks broken." description: "software-raid disks broken." - alert: software_raid_active expr: node_md_is_active != 1 for: 2m labels: severity: warning annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> raid inactive." description: "software-raid inactive." - alert: restic_backup_failed expr: (restic_cmd_return_code_mount + restic_cmd_return_code_backup + restic_cmd_return_code_forget + restic_cmd_return_code_prune + restic_cmd_return_code_check + restic_cmd_return_code_stats + restic_cmd_return_code_umount + restic_backup_failed) > 0 for: 2m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed." description: "backup failed." - alert: nightly_backup_failed_maria expr: | (time() - nightly_backup_successful_maria_finished_seconds) > 30 * 3600 or absent(nightly_backup_successful_maria_finished_seconds) for: 2m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> nightly backup failed." description: "MariaDB backup failed." - alert: nightly_backup_failed_postgres expr: | (time() - nightly_backup_successful_postgres_finished_seconds) > 30 * 3600 or absent(nightly_backup_successful_postgres_finished_seconds) for: 2m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> nightly backup failed." description: "PostgreSQL backup failed." 
  - alert: nightly_restoretest_failed_postgres
    expr: |
      (time() - nightly_restore_successful_generic{job="restore_test",database_engine="postgres"}) > 30 * 3600
      or absent(nightly_restore_successful_generic{job="restore_test",database_engine="postgres"})
    for: 3h
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> nightly restore test failed."
      description: "PostgreSQL restore test failed."

  - alert: nightly_restoretest_failed_maria
    expr: |
      (time() - nightly_restore_successful_generic{job="restore_test",database_engine="maria"}) > 30 * 3600
      or absent(nightly_restore_successful_generic{job="restore_test",database_engine="maria"})
    for: 3h
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> nightly restore test failed."
      description: "MariaDB restore test failed."

  - alert: megaraid_smart_errors
    expr: megaraid_smart_errors > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> disk(s) reporting smart errors."
      description: "megaraid smart errors."

  - alert: megaraid_status_failed
    expr: megaraid_status_failed > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting raid status failed."
      description: "megaraid status errors."

  - alert: megaraid_other_error_count
    expr: megaraid_other_error_count > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting raid controller errors."
      description: "megaraid other error count."

  - alert: megaraid_exit_status
    expr: megaraid_exit_status > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> megaraid exporter status failed."
      description: "megaraid exit status."

  - alert: adaptec_controller_defunct_disk_drives
    expr: adaptec_controller_defunct_disk_drives > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting disk failure."
      description: "adaptec controller defunct disk drives."

  - alert: adaptec_physical_devices_smart_failed
    expr: adaptec_physical_devices_smart_failed > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> physical devices reporting smart errors."
      description: "adaptec physical devices smart failed."

  - alert: adaptec_physical_devices_smart_warnings
    expr: adaptec_physical_devices_smart_warnings > 1
    for: 2m
    labels:
      severity: warning
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> physical devices reporting smart warnings."
      description: "adaptec physical devices smart warnings."

  - alert: adaptec_controller_logical_failed
    expr: adaptec_controller_logical_failed > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller logical failure."
      description: "adaptec controller logical failed."
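  # Note: the megaraid_* and adaptec_* hardware RAID rules use "> 1" thresholds, so a
  # single reported error does not trigger an alert; they fire only once the exporter
  # reports a count greater than 1.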
  - alert: adaptec_controller_status_failed
    expr: adaptec_controller_status_failed > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller status failure."
      description: "adaptec controller status failed."

  - alert: adaptec_controller_temperature_status_failed
    expr: adaptec_controller_temperature_status_failed > 1
    for: 2m
    labels:
      severity: warning
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller temperature status failure."
      description: "adaptec controller temperature status failed."

  - alert: adaptec_logical_degraded
    expr: adaptec_logical_degraded > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller logical degraded."
      description: "adaptec logical degraded."

  - alert: backup_execution
    expr: (time() - restic_backup_timestamp) / 60 > 180
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting latest backup execution is older than 3 hours."
      description: "Latest backup is older than 3 hours."

  - alert: backup_duration
    expr: (restic_cmd_duration_backup + restic_cmd_duration_check + restic_cmd_duration_forget + restic_cmd_duration_mount + restic_cmd_duration_prune + restic_cmd_duration_stats + restic_cmd_duration_umount) / 60 > 60
    for: 2m
    labels:
      severity: warning
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting backup duration took more than 60 minutes."
      description: "Backup duration took more than 60 minutes."

  - alert: endpoint_down
    expr: probe_success == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. Please check."
      description: "Checks whether the endpoint is reachable."

  - alert: postgres_down
    expr: pg_up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Postgres service on instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. Please check."
      description: "Checks whether the Postgres service is running."

  - alert: postgres_replication_broken
    expr: pg_replication_lag > {{ prometheus_alert_pg_replication_lag }} or absent(pg_replication_lag)
    for: 5m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. Please check."
      description: "Checks whether Postgres replication works; a lag above {{ prometheus_alert_pg_replication_lag }} indicates that something went wrong."

  - alert: postgres_replication_broken_too_many_wal_files
    expr: pg_replication_wal_files_count > 200
    for: 1m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. Please check."
      description: "Checks whether Postgres replication works; more than 200 retained WAL files indicates that something went wrong."
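  # A hedged sketch of a promtool unit test for the postgres_down rule above. It assumes
  # the rendered rules are written to alert.rules.yml next to the test file and uses a
  # hypothetical instance label "db1.example"; adjust both to the real deployment.
  # Run with: promtool test rules postgres_down_test.yml
  #
  #   rule_files:
  #     - alert.rules.yml
  #   evaluation_interval: 1m
  #   tests:
  #     - interval: 1m
  #       input_series:
  #         - series: 'pg_up{instance="db1.example"}'
  #           values: '0 0 0 0 0'
  #       alert_rule_test:
  #         - eval_time: 2m
  #           alertname: postgres_down
  #           exp_alerts:
  #             - exp_labels:
  #                 severity: critical
  #                 instance: db1.example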
  - alert: ssh_root_login
    expr: authlog_root_login_hits > 0 or absent(authlog_root_login_hits)
    for: 1m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "SSH root login on instance <{{ '{{' }} $labels.instance {{ '}}' }}> detected. Please check."
      description: "Unexpected SSH root login detected."

  - alert: offsite_backup_pending
    expr: |
      (time() - offsite_backup_archive_started_seconds) > 30 * 3600
      or (time() - offsite_backup_archive_ended_seconds) > 30 * 3600
      or (time() - offsite_backup_transfer_started_seconds) > 30 * 3600
      or (time() - offsite_backup_transfer_ended_seconds) > 30 * 3600
      or (time() - offsite_backup_forget_started_seconds) > 30 * 3600
      or (time() - offsite_backup_forget_ended_seconds) > 30 * 3600
    for: 1m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "One or more offsite backup metric timestamps for instance <{{ '{{' }} $labels.instance {{ '}}' }}> are older than 30h."
      description: "Offsite backups older than 30h."

  - alert: offsite_backup_metrics_unavailable
    expr: |
      absent(offsite_backup_archive_started_seconds)
      or absent(offsite_backup_archive_ended_seconds)
      or absent(offsite_backup_transfer_started_seconds)
      or absent(offsite_backup_transfer_ended_seconds)
      or absent(offsite_backup_forget_started_seconds)
      or absent(offsite_backup_forget_ended_seconds)
    for: 5m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
      summary: "One or more offsite backup metrics unavailable for instance <{{ '{{' }} $labels.instance {{ '}}' }}>."
      description: "Offsite backup metrics unavailable."

  - alert: DO_API_REQUEST_usage
    expr: (digitalocean_api_ratelimit_remaining * 100) / digitalocean_api_ratelimit_limit < 50 or absent(digitalocean_api_ratelimit_remaining)
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: "Too many requests against the DigitalOcean API. Please check."
      description: "DigitalOcean API rate limit remaining is below 50%."

  - alert: elasticsearch_health_status_not_green
    expr: avg(elasticsearch_cluster_health_status{color="green"}) < 1
    for: 30m
    labels:
      severity: critical
    annotations:
      summary: "Elasticsearch health status is not green. Please check."
      description: "Alert for Elasticsearch health status."

  - alert: elasticsearch_active_shards_usage_above_90_percent
    expr: avg(elasticsearch_cluster_health_active_shards) * 100 / (avg(elasticsearch_cluster_health_number_of_nodes) * {{ elastic_cluster_settings_max_shards }}) > 90
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: "ES cluster - active shards usage reaches WARN threshold. Please clean up."
      description: "WARNING: Alert for ES active shards usage."

  - alert: elasticsearch_active_shards_usage_above_95_percent
    expr: avg(elasticsearch_cluster_health_active_shards) * 100 / (avg(elasticsearch_cluster_health_number_of_nodes) * {{ elastic_cluster_settings_max_shards }}) > 95
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "ES cluster - active shards usage reaches CRITICAL threshold. Please clean up."
Please clean up" description: "WARNING: Alert for ES active shards usage" - alert: awx job failed with status error expr: changes(awx_status_total{status="error"}[2m]) > 0 for: 2m labels: severity: critical annotations: summary: "awx job failed with status error" description: "Alert awx jobs has an error" - alert: awx job failed with status failed expr: changes(awx_status_total{status="failed"}[2m]) > 0 for: 2m labels: severity: critical annotations: summary: "awx job failed with status failed" description: "Alert awx jobs failed" - alert: postgres backup zombies expr: 100 - ((node_filesystem_avail_bytes{instance=~"{{ shared_service_postgres_primary }}.smardigo.digital",job=~"node-exporter",device='/dev/mapper/vg.postgres_backup-lv.postgres_backup'} * 100) / node_filesystem_size_bytes{instance=~"{{ shared_service_postgres_primary }}.smardigo.digital",job=~"node-exporter",device='/dev/mapper/vg.postgres_backup-lv.postgres_backup'}) > 10 for: 2h labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "postgres backup zombies, have not been deleted" description: "postgres backup zombies, have not been deleted" - alert: hetzner ratelimit_remaining low expr: (hetzner_api_ratelimit_remaining * 100)/ hetzner_api_ratelimit_limit < 50 or absent(hetzner_api_ratelimit_remaining) for: 10m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "hetzner ratelimit_remaining below 50%" description: "hetzner ratelimit_remaining below 50%" - alert: hetzner unattached volumes expr: hetzner_api_unattached_volumes > 0 or absent(hetzner_api_unattached_volumes) for: 2h labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "unattached volumes in hetzner" description: "unattached volumes in hetzner" - alert: hetzner locked server exists expr: hetzner_api_locked_servers > 0 or absent(hetzner_api_locked_servers) for: 1h labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "hetzner locked server exists" description: "hetzner locked server exists" - alert: prom2teams down expr: process_cpu_seconds_total{application="prom2teams"} <= 0 or absent(process_cpu_seconds_total{application="prom2teams"}) for: 5m labels: severity: critical receiver: email annotations: summary: "prom2teams is down" description: "prom2teams is down"