From ff9c0d94a1f045c6b5e44ecbc94a26e5f42815c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michael=20H=C3=A4hnel?=
Date: Wed, 11 May 2022 10:38:08 +0000
Subject: [PATCH] Extended Monitoring/Alerting for PostgreSQL

Exporter (roles/postgres/defaults/main.yml):
- Add the pg_replication_wal_files query, which counts the entries
  returned by pg_ls_dir('pg_wal').
- The metric mapping key is "count" because that is the name of the
  query's only output column (COUNT(*)); the exported series is
  therefore pg_replication_wal_files_count.
- The metric is declared as a GAUGE: the number of files in pg_wal can
  go down as well as up, which a Prometheus counter must never do.

Alert rules (templates/prometheus/config/prometheus/alert.rules.j2):
- "postgres down" now checks pg_up == 0 instead of comparing network
  traffic on the enp7s0 interface.
- "postgres replication broken" also fires when pg_replication_lag is
  absent.
- New alert "postgres replication broken too many wal files" fires when
  pg_replication_wal_files_count stays above 200 for one minute.

---
NOTE(review): the indentation inside the hunks below was reconstructed
from a whitespace-collapsed copy of this patch. Verify it against the
target files (or apply with `git apply --ignore-whitespace`) before
merging.

 roles/postgres/defaults/main.yml                     |  8 +++++++-
 .../config/prometheus/alert.rules.j2                 | 20 ++++++++++++++-----
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/roles/postgres/defaults/main.yml b/roles/postgres/defaults/main.yml
index e2b8a09..09246b8 100644
--- a/roles/postgres/defaults/main.yml
+++ b/roles/postgres/defaults/main.yml
@@ -159,7 +159,13 @@ postgres_exporter_additional_queries:
       - lag:
           usage: "GAUGE"
           description: "Replication lag behind master in seconds"
-
+  pg_replication_wal_files:
+    query: "SELECT COUNT(*) FROM pg_ls_dir('pg_wal') as count"
+    master: true
+    metrics:
+      - count:
+          usage: "GAUGE"
+          description: "Number of WAL files"
   pg_postmaster:
     query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
     master: true
diff --git a/templates/prometheus/config/prometheus/alert.rules.j2 b/templates/prometheus/config/prometheus/alert.rules.j2
index f19a402..de69aa0 100644
--- a/templates/prometheus/config/prometheus/alert.rules.j2
+++ b/templates/prometheus/config/prometheus/alert.rules.j2
@@ -260,8 +260,18 @@ groups:
       summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. plz check."
       description: "checks if endpoint is reachable or not"
 
+  - alert: postgres down
+    expr: pg_up == 0
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
+      summary: "Postgres service on instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. plz. check"
+      description: "checks if postgres service is running.."
+
   - alert: postgres replication broken
-    expr: pg_replication_lag > 120
+    expr: pg_replication_lag > 120 or absent(pg_replication_lag)
     for: 1m
     labels:
       severity: critical
@@ -270,15 +280,15 @@ groups:
       summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. plz. check"
       description: "checks if postgres replication works well, if lag is higher than X - something bad happened."
 
-  - alert: postgres down
-    expr: (-100 * delta((rate(node_network_receive_bytes_total{instance=~".*postgres.*",job=~"node-exporter",device=~"enp7s0"}[1m])*8)[60m:])) / (rate(node_network_receive_bytes_total{instance=~".*postgres.*",job=~"node-exporter",device=~"enp7s0"}[1m] offset 1m)*8) > 80
+  - alert: postgres replication broken too many wal files
+    expr: pg_replication_wal_files_count > 200
     for: 1m
     labels:
       severity: critical
     annotations:
       identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
-      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> has less traffic than expected"
-      description: "checks if postgres receives traffic on internal interface."
+      summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. plz. check"
+      description: "checks if postgres replication works well, if count of wal files higher than X - something bad happened."
 
   - alert: ssh root login
     expr: authlog_root_login_hits > 0 or absent(authlog_root_login_hits)