From ff9c0d94a1f045c6b5e44ecbc94a26e5f42815c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michael=20H=C3=A4hnel?= <michael.haehnel@netgo.group>
Date: Wed, 11 May 2022 10:38:08 +0000
Subject: [PATCH] Extended Monitoring/Alerting for PostgreSQL

---
 roles/postgres/defaults/main.yml              |  8 +++++++-
 .../config/prometheus/alert.rules.j2          | 20 ++++++++++++++-----
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/roles/postgres/defaults/main.yml b/roles/postgres/defaults/main.yml
index e2b8a09..09246b8 100644
--- a/roles/postgres/defaults/main.yml
+++ b/roles/postgres/defaults/main.yml
@@ -159,7 +159,13 @@ postgres_exporter_additional_queries:
       - lag:
           usage: "GAUGE"
           description: "Replication lag behind master in seconds"
-
+  pg_replication_wal_files:
+    query: "SELECT COUNT(*) AS count FROM pg_ls_waldir()"  # counts WAL files only; the column alias must equal the metric key below
+    master: true
+    metrics:
+      - count:  # namespace + column name => exported as pg_replication_wal_files_count
+          usage: "GAUGE"  # the number of WAL files can shrink again, so it is not a monotonic counter
+          description: "Number of WAL files"
   pg_postmaster:
     query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
     master: true
diff --git a/templates/prometheus/config/prometheus/alert.rules.j2 b/templates/prometheus/config/prometheus/alert.rules.j2
index f19a402..de69aa0 100644
--- a/templates/prometheus/config/prometheus/alert.rules.j2
+++ b/templates/prometheus/config/prometheus/alert.rules.j2
@@ -260,8 +260,18 @@ groups:
       summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. plz check."
       description: "checks if endpoint is reachable or not"
 
+  - alert: postgres down
+    expr: pg_up == 0  # presumably the exporter's connectivity gauge (0 = database unreachable) - confirm against the exporter in use
+    for: 1m  # the expression has to stay true for one minute before the alert fires
+    labels:
+      severity: critical
+    annotations:
+      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
+      summary: "Postgres service on instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. plz. check"
+      description: "checks if postgres service is running.."
+
   - alert: postgres replication broken
-    expr: pg_replication_lag > 120
+    expr: pg_replication_lag > 120 or absent(pg_replication_lag)  # second operand also fires when no pg_replication_lag series exists at all
     for: 1m
     labels:
       severity: critical
@@ -270,15 +280,15 @@ groups:
       summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. plz. check"
       description: "checks if postgres replication works well, if lag is higher than X - something bad happened."
 
-  - alert: postgres down
-    expr: (-100 * delta((rate(node_network_receive_bytes_total{instance=~".*postgres.*",job=~"node-exporter",device=~"enp7s0"}[1m])*8)[60m:])) / (rate(node_network_receive_bytes_total{instance=~".*postgres.*",job=~"node-exporter",device=~"enp7s0"}[1m] offset 1m)*8) > 80
+  - alert: postgres replication broken too many wal files
+    expr: pg_replication_wal_files_count > 200  # keep this threshold in sync with the number quoted in the description
     for: 1m
     labels:
       severity: critical
     annotations:
       identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
-      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> has less traffic than expected"
-      description: "checks if postgres receives traffic on internal interface."
+      summary: "Postgres on instance <{{ '{{' }} $labels.instance {{ '}}' }}> has too many WAL files. plz. check"
+      description: "checks if postgres replication works well, if count of wal files is higher than 200 - something bad happened."
 
   - alert: ssh root login
     expr: authlog_root_login_hits > 0 or absent(authlog_root_login_hits)