Make the "postgres replication broken" alert threshold configurable per stage.

Adds the variable prometheus_alert_pg_replication_lag (default 120 in
group_vars/all, 300 for stage_dev, 60 for stage_prodnso), substitutes it for
the hard-coded 120 in the alert expression, and raises the alert's `for`
duration from 1m to 5m.

NOTE(review): the leading whitespace of the context lines was not recoverable
from the collapsed source and has been re-indented conventionally; regenerate
this patch from the repository if it does not apply cleanly.

diff --git a/group_vars/all/plain.yml b/group_vars/all/plain.yml
index af32dd4..619b968 100644
--- a/group_vars/all/plain.yml
+++ b/group_vars/all/plain.yml
@@ -214,3 +214,4 @@ k8s_basic_services:
 selfsigned_ca_private_key_passphrase: '{{ selfsigned_ca_private_key_passphrase_vault }}'
 
 prometheus_alert_diskspaceusage_warning: 85
+prometheus_alert_pg_replication_lag: 120
diff --git a/group_vars/stage_dev/plain.yml b/group_vars/stage_dev/plain.yml
index 3e3c5e9..fe03fcf 100644
--- a/group_vars/stage_dev/plain.yml
+++ b/group_vars/stage_dev/plain.yml
@@ -374,3 +374,5 @@ webdav_opentracing_jaeger_enabled: true
 webdav_opentracing_jaeger_http_sender_url: "http://{{ shared_service_kube_jaeger_collector_hostname }}/api/traces"
 connect_opentracing_jaeger_enabled: true
 connect_opentracing_jaeger_http_sender_url: "http://{{ shared_service_kube_jaeger_collector_hostname }}/api/traces"
+
+prometheus_alert_pg_replication_lag: 300
diff --git a/group_vars/stage_prodnso/plain.yml b/group_vars/stage_prodnso/plain.yml
index e834c30..e77dd67 100644
--- a/group_vars/stage_prodnso/plain.yml
+++ b/group_vars/stage_prodnso/plain.yml
@@ -367,3 +367,5 @@ management_oidc_client_secret: "{{ management_oidc_client_secret_vault }}"
 # https://git.dev-at.de/smardigo-hetzner/communication-keys/
 # push mirror: https://prodnso-gitea-01.smardigo.digital/gitea-admin/communication-keys/
 gpg_key_smardigo_automation__private: '{{ gpg_key_smardigo_automation__private__vault }}'
+
+prometheus_alert_pg_replication_lag: 60
diff --git a/templates/prometheus/config/prometheus/alert.rules.j2 b/templates/prometheus/config/prometheus/alert.rules.j2
index de69aa0..defac25 100644
--- a/templates/prometheus/config/prometheus/alert.rules.j2
+++ b/templates/prometheus/config/prometheus/alert.rules.j2
@@ -271,8 +271,8 @@ groups:
       description: "checks if postgres service is running.."
 
   - alert: postgres replication broken
-    expr: pg_replication_lag > 120 or absent(pg_replication_lag)
-    for: 1m
+    expr: pg_replication_lag > {{ prometheus_alert_pg_replication_lag }} or absent(pg_replication_lag)
+    for: 5m
     labels:
       severity: critical
     annotations: