From 8374ae0d2abf9a70d2ae311ba3ae04d7b15eeaf5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michael=20H=C3=A4hnel?=
Date: Mon, 27 Mar 2023 10:40:59 +0000
Subject: [PATCH] DEV-880 Configure Prometheus high_load Alert instance specific

---
Reviewer notes (free text between the separator and the first diff header;
not part of the commit message).

What the patch does
  * group_vars / host_vars: add `prometheus_alert_extra_config.high_load`
    with a `duration` and a `description`.
  * alert.rules.j2: hosts that carry such an override are excluded from the
    generic 2m `high_load` rule and get their own `high_load_<name>` rule
    whose `for:` and description come from the override.
  * prometheus.yml.j2: every node-exporter target string gains a third
    `!`-separated field (`server_info.service`), which a new relabel rule
    copies into a `service` label. NOTE(review): this assumes every entry
    of `stage_server_infos` has a `service` attribute - confirm.

Changes made during review (per-hunk line counts are unchanged)
  * The exclusion list was filled with `{{ list.append(...) }}`, an output
    expression that prints the call's return value into the rendered rules
    file unless the host environment suppresses it. It is now a
    `{% set _ = ... %}` statement, which emits nothing.
  * Both loops tested `prometheus_alert_extra_config is defined` but then
    read `.high_load.*`. They now test for the `high_load` entry itself, so
    a host with an override for some other alert keeps the generic rule
    instead of losing it or breaking the render.
  * `avg by (...)` listed `instance` three times; de-duplicated.
  * Dropped the always-true inline `if` around the per-host matcher and the
    stray space before `[30s]`; the generic rule's inline `if` got an
    explicit `else ''`.
  * The three vars files that ended without a final newline now end with
    one.

Caveats
  * This file had lost its line structure. Indentation and the position of
    blank context lines were reconstructed; only their number is fixed by
    the hunk headers. Run `git apply --check` (if needed with
    `--ignore-whitespace`) against the target tree, or regenerate the
    patch from the commit.
  * The post-image hashes on the `index` lines predate the review edits.

 group_vars/ubuntu_docker/plain.yml            |  5 ++++
 host_vars/dev-devops-iaas-01.yml              |  5 ++++
 host_vars/prodnso-hocr-iaas-01.yml            |  7 +++++-
 host_vars/prodnso-platform-iaas-01.yml        |  5 ++++
 .../config/prometheus/alert.rules.j2          | 23 ++++++++++++++++++-
 .../config/prometheus/prometheus.yml.j2       | 10 +++++---
 6 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/group_vars/ubuntu_docker/plain.yml b/group_vars/ubuntu_docker/plain.yml
index 3b19b97..6298b53 100644
--- a/group_vars/ubuntu_docker/plain.yml
+++ b/group_vars/ubuntu_docker/plain.yml
@@ -7,3 +7,8 @@ traefik_letsencrypt_provider: "hetzner"
 
 hetzner_server_type: cpx21
 hetzner_server_labels: "stage={{ stage }} service=ubuntu_docker"
+
+prometheus_alert_extra_config:
+  high_load:
+    duration: 1h
+    description: "High load for more than 1 hour."
diff --git a/host_vars/dev-devops-iaas-01.yml b/host_vars/dev-devops-iaas-01.yml
index f51337b..9fa4c36 100644
--- a/host_vars/dev-devops-iaas-01.yml
+++ b/host_vars/dev-devops-iaas-01.yml
@@ -3,3 +3,8 @@ pass_tenant_id: "devops"
 
 
 custom_plattform_users: []
+
+prometheus_alert_extra_config:
+  high_load:
+    duration: 1h
+    description: "High load for more than 1 hour."
diff --git a/host_vars/prodnso-hocr-iaas-01.yml b/host_vars/prodnso-hocr-iaas-01.yml
index 12017da..cf3e0dd 100644
--- a/host_vars/prodnso-hocr-iaas-01.yml
+++ b/host_vars/prodnso-hocr-iaas-01.yml
@@ -4,4 +4,9 @@ pass_tenant_id: "hocr"
 
 hetzner_server_type: cpx41
 
-custom_plattform_users: []
\ No newline at end of file
+custom_plattform_users: []
+
+prometheus_alert_extra_config:
+  high_load:
+    duration: 1h
+    description: "High load for more than 1 hour."
diff --git a/host_vars/prodnso-platform-iaas-01.yml b/host_vars/prodnso-platform-iaas-01.yml
index a6474f9..786c063 100644
--- a/host_vars/prodnso-platform-iaas-01.yml
+++ b/host_vars/prodnso-platform-iaas-01.yml
@@ -8,3 +8,8 @@ custom_plattform_users:
   - 'daniel.risse'
   - 'esther.fuhrmann'
   - 'philipp.eichhorn'
+
+prometheus_alert_extra_config:
+  high_load:
+    duration: 1h
+    description: "High load for more than 1 hour."
diff --git a/templates/prometheus/config/prometheus/alert.rules.j2 b/templates/prometheus/config/prometheus/alert.rules.j2
index 0f3a44f..397c5ca 100644
--- a/templates/prometheus/config/prometheus/alert.rules.j2
+++ b/templates/prometheus/config/prometheus/alert.rules.j2
@@ -21,8 +21,14 @@ groups:
       summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> down"
       description: "Down for more than 2 minutes."
 
+{% set high_load_instance_exclude=[] %}
+{% for server_info in stage_server_infos | default([]) %}
+{%- if (hostvars[server_info.name].prometheus_alert_extra_config | default({})).high_load is defined %}
+{%- set _ = high_load_instance_exclude.append('instance!="' ~ server_info.name ~ '.' ~ hostvars[server_info.name].domain ~ '"') %}
+{%- endif %}
+{% endfor %}
   - alert: high_load
-    expr: 100 - (avg by (instance, env, instance, job) (irate(node_cpu_seconds_total{env="{{ stage }}", mode="idle"}[30s])) * 100) > 90
+    expr: 100 - (avg by (instance, env, job, service) (irate(node_cpu_seconds_total{env="{{ stage }}", mode="idle"{{ ", " + high_load_instance_exclude | join(', ') if high_load_instance_exclude else '' }}}[30s])) * 100) > 90
     for: 2m
     labels:
       severity: critical
@@ -31,6 +37,21 @@ groups:
       summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> under high load"
       description: "High load for more than 2 minutes."
 
+{% for server_info in stage_server_infos | default([]) %}
+{% if (hostvars[server_info.name].prometheus_alert_extra_config | default({})).high_load is defined %}
+{% set high_load_instance_custom = 'instance="' ~ server_info.name ~ '.' ~ hostvars[server_info.name].domain ~ '"' %}
+  - alert: high_load_{{ server_info.name }}
+    expr: 100 - (avg by (instance, env, job, service) (irate(node_cpu_seconds_total{env="{{ stage }}", mode="idle", {{ high_load_instance_custom }}}[30s])) * 100) > 90
+    for: "{{ hostvars[server_info.name].prometheus_alert_extra_config.high_load.duration }}"
+    labels:
+      severity: critical
+    annotations:
+      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
+      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> under high load"
+      description: "{{ hostvars[server_info.name].prometheus_alert_extra_config.high_load.description }}"
+
+{% endif %}
+{% endfor %}
   - alert: apt_upgrades_pending
     expr: apt_upgrades_pending{origin=~".*security.*"} > 0
     for: 1w
diff --git a/templates/prometheus/config/prometheus/prometheus.yml.j2 b/templates/prometheus/config/prometheus/prometheus.yml.j2
index c560b75..aa5ab0e 100644
--- a/templates/prometheus/config/prometheus/prometheus.yml.j2
+++ b/templates/prometheus/config/prometheus/prometheus.yml.j2
@@ -381,7 +381,7 @@ scrape_configs:
                and hostvars[server_info.name].node_exporter_enabled)
                or (hostvars[server_info.name].node_exporter_enabled is not defined)
           %}
-          '{{ server_info.private_ip }}:{{ monitor_port_system }}!{{ server_info.name }}.{{ hostvars[server_info.name].domain }}',
+          '{{ server_info.private_ip }}:{{ monitor_port_system }}!{{ server_info.name }}.{{ hostvars[server_info.name].domain }}!{{ server_info.service }}',
           {% endif %}
           {% endfor %}
         ]
@@ -390,11 +390,15 @@ scrape_configs:
           project: servers
     relabel_configs:
       - source_labels: [__address__]
-        regex: .*!(.*)
+        regex: .*!(.*)!.*
         target_label: instance
         replacement: $1
       - source_labels: [__address__]
-        regex: (.*)!.*
+        regex: .*!.*!(.*)
+        target_label: service
+        replacement: $1
+      - source_labels: [__address__]
+        regex: (.*)!.*!.*
         target_label: __address__
         replacement: $1
 