Extended Monitoring/Alerting for PostgreSQL

4 years ago · ff9c0d94a1
parent acee683569
commit ff9c0d94a1
2 changed files with 22 additions and 6 deletions
--- a/roles/postgres/defaults/main.yml
+++ b/roles/postgres/defaults/main.yml
@@ -159,7 +159,13 @@ postgres_exporter_additional_queries:
      - lag:
          usage: "GAUGE"
          description: "Replication lag behind master in seconds"
-
+  pg_replication_wal_files:
+    query: "SELECT COUNT(*) FROM pg_ls_dir('pg_wal') as count"
+    master: true
+    metrics:
+      - wal_files_count:
+          usage: "COUNTER"
+          description: "Number of WAL files"
  pg_postmaster:
    query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
    master: true
--- a/templates/prometheus/config/prometheus/alert.rules.j2
+++ b/templates/prometheus/config/prometheus/alert.rules.j2
@@ -260,8 +260,18 @@ groups:
      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. plz check."
      description: "checks if endpoint is reachable or not"
+  - alert: postgres down
+    expr: pg_up == 0
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
+      summary: "Postgres service on instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. plz. check"
+      description: "checks if postgres service is running.."
  - alert: postgres replication broken
-    expr: pg_replication_lag > 120
+    expr: pg_replication_lag > 120 or absent(pg_replication_lag)
    for: 1m
    labels:
      severity: critical
@@ -270,15 +280,15 @@ groups:
      summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. plz. check"
      description: "checks if postgres replication works well, if lag is higher than X - something bad happened."
-  - alert: postgres down
+  - alert: postgres replication broken too many wal files
-    expr: (-100 * delta((rate(node_network_receive_bytes_total{instance=~".*postgres.*",job=~"node-exporter",device=~"enp7s0"}[1m])*8)[60m:])) / (rate(node_network_receive_bytes_total{instance=~".*postgres.*",job=~"node-exporter",device=~"enp7s0"}[1m] offset 1m)*8) > 80
+    expr: pg_replication_wal_files_count > 200
    for: 1m
    labels:
      severity: critical
    annotations:
      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
-      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> has less traffic than expected"
+      summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. plz. check"
-      description: "checks if postgres receives traffic on internal interface."
+      description: "checks if postgres replication works well, if count of wal files higher than X - something bad happened."
  - alert: ssh root login
    expr: authlog_root_login_hits > 0 or absent(authlog_root_login_hits)