From 6c6dd5c1aecf022bc11281b8f96eeb64d8ccd21f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=B6rz=2C=20Friedrich?= Date: Wed, 11 May 2022 20:00:52 +0000 Subject: [PATCH] DEV-442: added threshold for pg_repl_lag to avoid false positives on DEV-stage --- group_vars/all/plain.yml | 1 + group_vars/stage_dev/plain.yml | 2 ++ group_vars/stage_prodnso/plain.yml | 2 ++ templates/prometheus/config/prometheus/alert.rules.j2 | 4 ++-- 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/group_vars/all/plain.yml b/group_vars/all/plain.yml index af32dd4..619b968 100644 --- a/group_vars/all/plain.yml +++ b/group_vars/all/plain.yml @@ -214,3 +214,4 @@ k8s_basic_services: selfsigned_ca_private_key_passphrase: '{{ selfsigned_ca_private_key_passphrase_vault }}' prometheus_alert_diskspaceusage_warning: 85 +prometheus_alert_pg_replication_lag: 120 diff --git a/group_vars/stage_dev/plain.yml b/group_vars/stage_dev/plain.yml index 3e3c5e9..fe03fcf 100644 --- a/group_vars/stage_dev/plain.yml +++ b/group_vars/stage_dev/plain.yml @@ -374,3 +374,5 @@ webdav_opentracing_jaeger_enabled: true webdav_opentracing_jaeger_http_sender_url: "http://{{ shared_service_kube_jaeger_collector_hostname }}/api/traces" connect_opentracing_jaeger_enabled: true connect_opentracing_jaeger_http_sender_url: "http://{{ shared_service_kube_jaeger_collector_hostname }}/api/traces" + +prometheus_alert_pg_replication_lag: 300 diff --git a/group_vars/stage_prodnso/plain.yml b/group_vars/stage_prodnso/plain.yml index e834c30..e77dd67 100644 --- a/group_vars/stage_prodnso/plain.yml +++ b/group_vars/stage_prodnso/plain.yml @@ -367,3 +367,5 @@ management_oidc_client_secret: "{{ management_oidc_client_secret_vault }}" # https://git.dev-at.de/smardigo-hetzner/communication-keys/ # push mirror: https://prodnso-gitea-01.smardigo.digital/gitea-admin/communication-keys/ gpg_key_smardigo_automation__private: '{{ gpg_key_smardigo_automation__private__vault }}' + +prometheus_alert_pg_replication_lag: 60 diff --git a/templates/prometheus/config/prometheus/alert.rules.j2 b/templates/prometheus/config/prometheus/alert.rules.j2 index de69aa0..defac25 100644 --- a/templates/prometheus/config/prometheus/alert.rules.j2 +++ b/templates/prometheus/config/prometheus/alert.rules.j2 @@ -271,8 +271,8 @@ groups: description: "checks if postgres service is running.." - alert: postgres replication broken - expr: pg_replication_lag > 120 or absent(pg_replication_lag) - for: 1m + expr: pg_replication_lag > {{ prometheus_alert_pg_replication_lag }} or absent(pg_replication_lag) + for: 5m labels: severity: critical annotations: