DEV-880 Configure instance-specific Prometheus high_load alert

qa
Michael Hähnel 3 years ago committed by Görz, Friedrich
parent b9f753fa92
commit 8374ae0d2a

@ -7,3 +7,8 @@ traefik_letsencrypt_provider: "hetzner"
hetzner_server_type: cpx21 hetzner_server_type: cpx21
hetzner_server_labels: "stage={{ stage }} service=ubuntu_docker" hetzner_server_labels: "stage={{ stage }} service=ubuntu_docker"
# Per-instance overrides for Prometheus alerts, keyed by alert name.
# When this variable is defined for a host, the rules template leaves the host
# out of the generic high_load rule and renders a dedicated rule for it, using
# `duration` as the rule's `for:` value and `description` as its description
# annotation.
prometheus_alert_extra_config:
  high_load:
    duration: 1h
    description: "High load for more than 1 hour."

@ -3,3 +3,8 @@
pass_tenant_id: "devops" pass_tenant_id: "devops"
custom_plattform_users: [] custom_plattform_users: []
# Per-instance overrides for Prometheus alerts, keyed by alert name.
# When this variable is defined for a host, the rules template leaves the host
# out of the generic high_load rule and renders a dedicated rule for it, using
# `duration` as the rule's `for:` value and `description` as its description
# annotation.
prometheus_alert_extra_config:
  high_load:
    duration: 1h
    description: "High load for more than 1 hour."

@ -4,4 +4,9 @@ pass_tenant_id: "hocr"
hetzner_server_type: cpx41 hetzner_server_type: cpx41
custom_plattform_users: [] custom_plattform_users: []
# Per-instance overrides for Prometheus alerts, keyed by alert name.
# When this variable is defined for a host, the rules template leaves the host
# out of the generic high_load rule and renders a dedicated rule for it, using
# `duration` as the rule's `for:` value and `description` as its description
# annotation.
prometheus_alert_extra_config:
  high_load:
    duration: 1h
    description: "High load for more than 1 hour."

@ -8,3 +8,8 @@ custom_plattform_users:
- 'daniel.risse' - 'daniel.risse'
- 'esther.fuhrmann' - 'esther.fuhrmann'
- 'philipp.eichhorn' - 'philipp.eichhorn'
# Per-instance overrides for Prometheus alerts, keyed by alert name.
# When this variable is defined for a host, the rules template leaves the host
# out of the generic high_load rule and renders a dedicated rule for it, using
# `duration` as the rule's `for:` value and `description` as its description
# annotation.
prometheus_alert_extra_config:
  high_load:
    duration: 1h
    description: "High load for more than 1 hour."

@ -21,8 +21,14 @@ groups:
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> down" summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> down"
description: "Down for more than 2 minutes." description: "Down for more than 2 minutes."
{# Build the PromQL matchers (`instance!="<name>.<domain>"`) that remove every
   server defining prometheus_alert_extra_config from the generic high_load
   rule. The list is mutated via append() because a plain `set` inside the loop
   would not be visible after the loop ends; `set _ =` discards append()'s None
   return value instead of emitting it into the rendered rules file. #}
{% set high_load_instance_exclude = [] %}
{% for server_info in stage_server_infos | default([]) %}
{%- if hostvars[server_info.name].prometheus_alert_extra_config is defined %}
{% set _ = high_load_instance_exclude.append('instance!="' ~ server_info.name ~ '.' ~ hostvars[server_info.name].domain ~ '"') %}
{%- endif %}
{% endfor %}
- alert: high_load - alert: high_load
expr: 100 - (avg by (instance, env, instance, job) (irate(node_cpu_seconds_total{env="{{ stage }}", mode="idle"}[30s])) * 100) > 90 expr: 100 - (avg by (instance, env, instance, job, instance, service) (irate(node_cpu_seconds_total{env="{{ stage }}", mode="idle"{{ ", "+high_load_instance_exclude|join(', ') if high_load_instance_exclude }}} [30s])) * 100) > 90
for: 2m for: 2m
labels: labels:
severity: critical severity: critical
@ -31,6 +37,21 @@ groups:
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> under high load" summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> under high load"
description: "High load for more than 2 minutes." description: "High load for more than 2 minutes."
{# Render one dedicated high_load rule per server that defines
   prometheus_alert_extra_config. The rule is restricted to that server's
   instance label (`<name>.<domain>`); its `for:` duration and description are
   taken from the server's high_load settings. #}
{% for server_info in stage_server_infos | default([]) %}
{% if hostvars[server_info.name].prometheus_alert_extra_config is defined %}
{% set high_load_settings = hostvars[server_info.name].prometheus_alert_extra_config.high_load %}
{% set high_load_instance_custom = 'instance="' ~ server_info.name ~ '.' ~ hostvars[server_info.name].domain ~ '"' %}
- alert: high_load_{{ server_info.name }}
  expr: 100 - (avg by (instance, env, job, service) (irate(node_cpu_seconds_total{env="{{ stage }}", mode="idle", {{ high_load_instance_custom }}}[30s])) * 100) > 90
  for: "{{ high_load_settings.duration }}"
  labels:
    severity: critical
  annotations:
    identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
    summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> under high load"
    description: "{{ high_load_settings.description }}"
{% endif %}
{% endfor %}
- alert: apt_upgrades_pending - alert: apt_upgrades_pending
expr: apt_upgrades_pending{origin=~".*security.*"} > 0 expr: apt_upgrades_pending{origin=~".*security.*"} > 0
for: 1w for: 1w

@ -381,7 +381,7 @@ scrape_configs:
and hostvars[server_info.name].node_exporter_enabled) and hostvars[server_info.name].node_exporter_enabled)
or (hostvars[server_info.name].node_exporter_enabled is not defined) or (hostvars[server_info.name].node_exporter_enabled is not defined)
%} %}
'{{ server_info.private_ip }}:{{ monitor_port_system }}!{{ server_info.name }}.{{ hostvars[server_info.name].domain }}', '{{ server_info.private_ip }}:{{ monitor_port_system }}!{{ server_info.name }}.{{ hostvars[server_info.name].domain }}!{{ server_info.service }}',
{% endif %} {% endif %}
{% endfor %} {% endfor %}
] ]
@ -390,11 +390,15 @@ scrape_configs:
project: servers project: servers
relabel_configs: relabel_configs:
- source_labels: [__address__] - source_labels: [__address__]
regex: .*!(.*) regex: .*!(.*)!.*
target_label: instance target_label: instance
replacement: $1 replacement: $1
- source_labels: [__address__] - source_labels: [__address__]
regex: (.*)!.* regex: .*!.*!(.*)
target_label: service
replacement: $1
- source_labels: [__address__]
regex: (.*)!.*!.*
target_label: __address__ target_label: __address__
replacement: $1 replacement: $1

Loading…
Cancel
Save