|
|
|
|
@ -260,8 +260,18 @@ groups:
|
|
|
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. plz check."
|
|
|
|
|
description: "checks if endpoint is reachable or not"
|
|
|
|
|
|
|
|
|
|
- alert: postgres down
|
|
|
|
|
expr: pg_up == 0
|
|
|
|
|
for: 1m
|
|
|
|
|
labels:
|
|
|
|
|
severity: critical
|
|
|
|
|
annotations:
|
|
|
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
|
|
|
summary: "Postgres service on instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. plz. check"
|
|
|
|
|
description: "checks if postgres service is running.."
|
|
|
|
|
|
|
|
|
|
- alert: postgres replication broken
|
|
|
|
|
expr: pg_replication_lag > 120
|
|
|
|
|
expr: pg_replication_lag > 120 or absent(pg_replication_lag)
|
|
|
|
|
for: 1m
|
|
|
|
|
labels:
|
|
|
|
|
severity: critical
|
|
|
|
|
@ -270,15 +280,15 @@ groups:
|
|
|
|
|
summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. plz. check"
|
|
|
|
|
description: "checks if postgres replication works well, if lag is higher than X - something bad happened."
|
|
|
|
|
|
|
|
|
|
- alert: postgres down
|
|
|
|
|
expr: (-100 * delta((rate(node_network_receive_bytes_total{instance=~".*postgres.*",job=~"node-exporter",device=~"enp7s0"}[1m])*8)[60m:])) / (rate(node_network_receive_bytes_total{instance=~".*postgres.*",job=~"node-exporter",device=~"enp7s0"}[1m] offset 1m)*8) > 80
|
|
|
|
|
- alert: postgres replication broken too many wal files
|
|
|
|
|
expr: pg_replication_wal_files_count > 200
|
|
|
|
|
for: 1m
|
|
|
|
|
labels:
|
|
|
|
|
severity: critical
|
|
|
|
|
annotations:
|
|
|
|
|
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
|
|
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> has less traffic than expected"
|
|
|
|
|
description: "checks if postgres receives traffic on internal interface."
|
|
|
|
|
summary: "Postgres replication on instance <{{ '{{' }} $labels.instance {{ '}}' }}> broken. plz. check"
|
|
|
|
|
description: "checks if postgres replication works well, if count of wal files higher than X - something bad happened."
|
|
|
|
|
|
|
|
|
|
- alert: ssh root login
|
|
|
|
|
expr: authlog_root_login_hits > 0 or absent(authlog_root_login_hits)
|
|
|
|
|
|