|
|
|
|
@ -21,8 +21,14 @@ groups:
|
|
|
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> down"
|
|
|
|
|
description: "Down for more than 2 minutes."
|
|
|
|
|
|
|
|
|
|
{#
  Build the list of label matchers that remove hosts from the generic
  high_load rule: one `instance!="<name>.<domain>"` entry for every host in
  stage_server_infos that defines prometheus_alert_extra_config.

  append() is invoked through `set _ = ...` because only its side effect is
  wanted; written as an output expression it would print its return value
  (None) into the rendered file wherever the renderer does not blank None.
#}
{% set high_load_instance_exclude = [] %}
{% for server_info in stage_server_infos | default([]) %}
{%- if hostvars[server_info.name].prometheus_alert_extra_config is defined %}
{% set _ = high_load_instance_exclude.append('instance!="' ~ server_info.name ~ '.' ~ hostvars[server_info.name].domain ~ '"') %}
{%- endif %}
{% endfor %}
|
|
|
|
|
# Generic CPU alert: fires when the non-idle CPU share of an instance stays
# above 90 for 2m. Matchers collected in high_load_instance_exclude are
# appended to the selector so the listed hosts are left out of this rule.
# Only one `expr` key is kept: a mapping with two `expr` entries is invalid
# YAML, and the retained expression is the one carrying the exclusions.
- alert: high_load
  expr: 100 - (avg by (instance, env, job, service) (irate(node_cpu_seconds_total{env="{{ stage }}", mode="idle"{{ (', ' ~ high_load_instance_exclude | join(', ')) if high_load_instance_exclude else '' }}}[30s])) * 100) > 90
  for: 2m
  labels:
    severity: critical
|
|
|
|
|
@ -31,6 +37,21 @@ groups:
|
|
|
|
|
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> under high load"
|
|
|
|
|
description: "High load for more than 2 minutes."
|
|
|
|
|
|
|
|
|
|
{#
  Per-host variant of the high_load rule: every host in stage_server_infos
  that defines prometheus_alert_extra_config gets its own
  high_load_<name> rule, restricted to instance="<name>.<domain>".

  `for` and `description` are read from
  prometheus_alert_extra_config.high_load. If that entry or one of the two
  fields is absent, the render no longer aborts: the duration falls back to
  2m (the value used by the generic high_load rule) and the description is
  derived from the effective duration.
#}
{% for server_info in stage_server_infos | default([]) %}
{% if hostvars[server_info.name].prometheus_alert_extra_config is defined %}
{% set high_load_cfg = hostvars[server_info.name].prometheus_alert_extra_config.high_load | default({}) %}
{% set high_load_for = high_load_cfg.duration | default('2m') %}
- alert: high_load_{{ server_info.name }}
  expr: 100 - (avg by (instance, env, job, service) (irate(node_cpu_seconds_total{env="{{ stage }}", mode="idle", instance="{{ server_info.name }}.{{ hostvars[server_info.name].domain }}"}[30s])) * 100) > 90
  for: "{{ high_load_for }}"
  labels:
    severity: critical
  annotations:
    identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
    summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> under high load"
    description: "{{ high_load_cfg.description | default('High load for more than ' ~ high_load_for ~ '.') }}"

{% endif %}
{% endfor %}
|
|
|
|
|
- alert: apt_upgrades_pending
|
|
|
|
|
expr: apt_upgrades_pending{origin=~".*security.*"} > 0
|
|
|
|
|
for: 1w
|
|
|
|
|
|