Extended Monitoring/Alerting for PostgreSQL

feature/DEV-380
Michael Hähnel 4 years ago committed by Görz, Friedrich
parent acee683569
commit ff9c0d94a1

@ -159,7 +159,13 @@ postgres_exporter_additional_queries:
- lag: - lag:
usage: "GAUGE" usage: "GAUGE"
description: "Replication lag behind master in seconds" description: "Replication lag behind master in seconds"
# Custom postgres_exporter query: number of entries in the pg_wal directory.
# The exported series is named <namespace>_<column>, i.e.
# pg_replication_wal_files_count, so the key under "metrics" has to be the
# result column name ("count"); a key that matches no column is never applied.
# NOTE(review): pg_ls_dir lists every directory entry (e.g. the archive_status
# subdirectory), so the value can be slightly above the number of WAL segments.
pg_replication_wal_files:
  query: "SELECT COUNT(*) AS count FROM pg_ls_dir('pg_wal')"
  master: true
  metrics:
    - count:
        # GAUGE, not COUNTER: the file count drops again when WAL is recycled.
        usage: "GAUGE"
        description: "Number of WAL files"
pg_postmaster: pg_postmaster:
query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()" query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
master: true master: true

@ -260,8 +260,18 @@ groups:
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. plz check." summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. plz check."
description: "checks if endpoint is reachable or not" description: "checks if endpoint is reachable or not"
# Alert rule: raised with severity "critical" when pg_up equals 0.
# NOTE(review): pg_up is presumably the exporter's "database reachable" metric;
# confirm against the postgres_exporter version in use.
- alert: postgres down
expr: pg_up == 0
# The expression has to stay true for one minute before the alert fires.
for: 1m
labels:
severity: critical
annotations:
# The quoted brace literals below are escapes for an outer templating pass
# (presumably Jinja2 - confirm), so the rendered rule file still contains a
# plain $labels.instance template expression.
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Postgres service on instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. plz. check"
description: "checks if postgres service is running.."
- alert: postgres replication broken - alert: postgres replication broken
expr: pg_replication_lag > 120 expr: pg_replication_lag > 120 or absent(pg_replication_lag)
for: 1m for: 1m
labels: labels:
severity: critical severity: critical
@ -270,15 +280,15 @@ groups:
summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. plz. check" summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. plz. check"
description: "checks if postgres replication works well, if lag is higher than X - something bad happened." description: "checks if postgres replication works well, if lag is higher than X - something bad happened."
- alert: postgres down - alert: postgres replication broken too many wal files
expr: (-100 * delta((rate(node_network_receive_bytes_total{instance=~".*postgres.*",job=~"node-exporter",device=~"enp7s0"}[1m])*8)[60m:])) / (rate(node_network_receive_bytes_total{instance=~".*postgres.*",job=~"node-exporter",device=~"enp7s0"}[1m] offset 1m)*8) > 80 expr: pg_replication_wal_files_count > 200
for: 1m for: 1m
labels: labels:
severity: critical severity: critical
annotations: annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> has less traffic than expected" summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. plz. check"
description: "checks if postgres receives traffic on internal interface." description: "checks if postgres replication works well, if count of wal files higher than X - something bad happened."
- alert: ssh root login - alert: ssh root login
expr: authlog_root_login_hits > 0 or absent(authlog_root_login_hits) expr: authlog_root_login_hits > 0 or absent(authlog_root_login_hits)

Loading…
Cancel
Save