---
# Prometheus alerting rules for the "smardigo" group.
# This file is an Ansible/Jinja2 template: {{ stage }} etc. are rendered by
# Ansible; {{ '{{' }} ... {{ '}}' }} escapes render to literal {{ ... }} for
# Prometheus' own alert templating.
groups:
  - name: smardigo
    rules:
      # Postgres backend connections per database above a fixed ceiling.
      - alert: too_many_connections
        expr: pg_stat_database_numbackends > 30
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.datname {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.datname {{ '}}' }}> has too many open db connections"
          description: "Too many connections for more than 2 minutes."

      # Any scrape target reporting down.
      - alert: service_down
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> down"
          description: "Down for more than 2 minutes."

      # Non-idle CPU above 90% for 2 minutes.
      # FIX(review): the original grouped "by (instance, env, instance, job)";
      # the duplicated "instance" grouping label is a PromQL parse error.
      - alert: high_load
        expr: 100 - (avg by (instance, env, job) (irate(node_cpu_seconds_total{env="{{ stage }}", mode="idle"}[30s])) * 100) > 90
        for: 2m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: '{{ '{{' }} $labels.instance {{ '}}' }} under high load'
          description: "High load for more than 2 minutes."

      # Pending security updates, tolerated for one week before alerting.
      - alert: apt_upgrades_pending
        expr: apt_upgrades_pending{origin=~".*security.*"} > 0
        for: 1w
        labels:
          severity: warning
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> has pending updates."
          description: "Pending updates [{{ '{{' }} $labels.origin {{ '}}' }}] / [{{ '{{' }} $labels.arch {{ '}}' }}]."

      # Host requires a reboot (e.g. after kernel update).
      - alert: reboot_required
        expr: node_reboot_required == 1
        for: 1w
        labels:
          severity: warning
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs reboot."
          description: "Need reboot!"

      # Veeam backup job reported failure.
      - alert: veeam_backup_failed
        expr: veeam_backup_failed == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed."
          description: "Backup failed."
- alert: probe_ssl_certificates expr: probe_ssl_earliest_cert_expiry{job=~"http_check_200_secure|http_check_403_secure"} - time() < 86400 * 30 for: 2m labels: severity: warning annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs a new certificate until next 30 days." description: "needs a new certificate until next 30 days." - alert: probe_ssl_certificates_50 expr: probe_ssl_earliest_cert_expiry{job="http_check_200_secure", instance="https://mail.sparkassenfinanzportal.de"} - time() < 86400 * 50 for: 2m labels: severity: warning annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs a new certificate until next 50 days." description: "needs a new certificate until next 50 days." - alert: DiskSpaceUsage expr: 100.0 - 100 * (node_filesystem_free_bytes{env="{{ stage }}",mountpoint=~"/|/rootfs"} / node_filesystem_size_bytes{device!="/dev/loop1",env="{{ stage }}",mountpoint=~"/|/rootfs"}) > {{ prometheus_alert_diskspaceusage_warning }} for: 10m labels: severity: warning annotations: summary: "Disk Space Usage (instance {{ '{{' }} $labels.instance {{ '}}' }})" description: "Disk Space on Drive is used more than {{ prometheus_alert_diskspaceusage_warning }}%\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS: {{ '{{' }} $labels {{ '}}' }}" - alert: software_raid_disks_active expr: node_md_disks_active != 2 for: 2m labels: severity: warning annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> software-raid disks broken." description: "software-raid disks broken." - alert: software_raid_active expr: node_md_is_active != 1 for: 2m labels: severity: warning annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> raid inactive." description: "software-raid inactive." 
- alert: restic_backup_failed expr: (restic_cmd_return_code_mount + restic_cmd_return_code_backup + restic_cmd_return_code_forget + restic_cmd_return_code_prune + restic_cmd_return_code_check + restic_cmd_return_code_stats + restic_cmd_return_code_umount + restic_backup_failed) > 0 for: 2m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed." description: "backup failed." - alert: megaraid_smart_errors expr: megaraid_smart_errors > 1 for: 2m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> disk(s) reporting smart errors." description: "megaraid smart errors." - alert: megaraid_status_failed expr: megaraid_status_failed > 1 for: 2m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting raid status failed." description: "megaraid status errors." - alert: megaraid_other_error_count expr: megaraid_other_error_count > 1 for: 2m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting raid controller errors." description: "megaraid other error count." - alert: megaraid_exit_status expr: megaraid_exit_status > 1 for: 2m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> megaraid exporter status failed." description: "megaraid exit status." - alert: adaptec_controller_defunct_disk_drives expr: adaptec_controller_defunct_disk_drives > 1 for: 2m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting disk failure." 
description: "adaptec controller defunct disk drives." - alert: adaptec_physical_devices_smart_failed expr: adaptec_physical_devices_smart_failed > 1 for: 2m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> physical devices reporting smart errors." description: "adaptec physical devices smart failed." - alert: adaptec_physical_devices_smart_warnings expr: adaptec_physical_devices_smart_warnings > 1 for: 2m labels: severity: warning annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> physical devices reporting smart warnings." description: "adaptec physical devices smart warnings." - alert: adaptec_controller_logical_failed expr: adaptec_controller_logical_failed > 1 for: 2m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller logical failure." description: "adaptec controller logical failed." - alert: adaptec_controller_status_failed expr: adaptec_controller_status_failed > 1 for: 2m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller status failure." description: "adaptec controller status failed." - alert: adaptec_controller_temperature_status_failed expr: adaptec_controller_temperature_status_failed > 1 for: 2m labels: severity: warning annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller temperatur status failure." 
description: "adaptec controller temperature status failed" - alert: adaptec_logical_degraded expr: adaptec_logical_degraded > 1 for: 2m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller logical degraded." description: "adaptec logical degraded." - alert: backup_execution expr: (time() - restic_backup_timestamp) / 60 > 180 for: 2m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting latest backup execution is older then 3 hours." description: "latest backup is older then 90 minutes." - alert: backup_duration expr: (restic_cmd_duration_backup + restic_cmd_duration_check + restic_cmd_duration_forget + restic_cmd_duration_mount + restic_cmd_duration_prune + restic_cmd_duration_stats + restic_cmd_duration_umount) / 60 > 60 for: 2m labels: severity: warning annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting backup duration took more than 30 minutes." description: "backup duration took more than 60 minutes." - alert: endpoint down expr: probe_success == 0 for: 2m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. plz check." description: "checks if endpoint is reachable or not" - alert: postgres down expr: pg_up == 0 for: 1m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Postgres service on instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. plz. check" description: "checks if postgres service is running.." 
- alert: postgres replication broken expr: pg_replication_lag > {{ prometheus_alert_pg_replication_lag }} or absent(pg_replication_lag) for: 5m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. plz. check" description: "checks if postgres replication works well, if lag is higher than X - something bad happened." - alert: postgres replication broken too many wal files expr: pg_replication_wal_files_count > 200 for: 1m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. plz. check" description: "checks if postgres replication works well, if count of wal files higher than X - something bad happened." - alert: ssh root login expr: authlog_root_login_hits > 0 or absent(authlog_root_login_hits) for: 1m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "ssh root login on Instance <{{ '{{' }} $labels.instance {{ '}}' }}> detected. plz check" description: "unexpected ssh root login detected." - alert: offsite backup pending expr: | (time() - offsite_backup_archive_started_seconds) > 27 * 3600 or (time() - offsite_backup_archive_ended_seconds) > 27 * 3600 or (time() - offsite_backup_transfer_started_seconds) > 27 * 3600 or (time() - offsite_backup_transfer_ended_seconds) > 27 * 3600 or (time() - offsite_backup_forget_started_seconds) > 27 * 3600 or (time() - offsite_backup_forget_ended_seconds) > 27 * 3600 for: 1m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "one or more offsite backup metric timestamps for Instance <{{ '{{' }} $labels.instance {{ '}}' }}> older then 27h" description: "offsite backups older then 27h." 
- alert: offsite backup metrics unavailable expr: | absent(offsite_backup_archive_started_seconds) or absent(offsite_backup_archive_ended_seconds) or absent(offsite_backup_transfer_started_seconds) or absent(offsite_backup_transfer_ended_seconds) or absent(offsite_backup_forget_started_seconds) or absent(offsite_backup_forget_ended_seconds) for: 5m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "one or more offsite backup metric unavailable for Instance <{{ '{{' }} $labels.instance {{ '}}' }}>" description: "offsite metrics unavailable." - alert: DO_API_REQUEST_usage expr: (digitalocean_api_ratelimit_remaining * 100)/digitalocean_api_ratelimit_limit < 50 for: 10m labels: severity: critical annotations: summary: "Too many request against digitalocean API. plz check." description: "Alert for request against digitalocean API." - alert: elasticsearch health status not green expr: avg(elasticsearch_cluster_health_status{color="green"}) < 1 for: 30m labels: severity: critical annotations: summary: "Elasticsearch health status is not green. Please Check" description: "Alert for Elasticsearch health status"