Feature/dev 962 es cluster active shards alert

qa
Görz, Friedrich 3 years ago committed by Michael Hähnel
parent 2d771f4989
commit 96da6ef83f

@ -17,3 +17,5 @@ kubernetes_prometheus_endpoint: "{{ stage_kube }}-prometheus.{{ domain }}"
prometheus_alert_diskspaceusage_warning: 85
prometheus_alert_pg_replication_lag: 120
# Value sent to Elasticsearch as cluster.max_shards_per_node; the
# active-shards alert expressions are rendered with the same number.
elastic_cluster_settings_max_shards: 1000

@ -14,3 +14,4 @@ node_exporter_extra_servers:
kubernetes_prometheus_endpoint: "prodnso-prometheus.{{ domain }}"
# Value sent to Elasticsearch as cluster.max_shards_per_node; the
# active-shards alert expressions are rendered with the same number.
elastic_cluster_settings_max_shards: 1500

@ -80,3 +80,26 @@
tags: tags:
- update_config - update_config
- update_deployment - update_deployment
# Build the request payload for the Elasticsearch cluster-settings call and
# store it as the fact es_cluster_settings.
- name: 'Set VAR'
  set_fact:
    es_cluster_settings:
      persistent:
        cluster.max_shards_per_node: '{{ elastic_cluster_settings_max_shards }}'
# PUT the es_cluster_settings fact to the _cluster/settings endpoint of the
# Elasticsearch instance listening on localhost. The request is issued only
# from the first host of the "elastic" inventory group.
- name: 'Set cluster config'
  when: inventory_hostname == groups['elastic'][0]
  uri:
    url: 'https://localhost:9200/_cluster/settings'
    method: PUT
    # CA certificate used to validate the server certificate.
    ca_path: '{{ service_base_path }}/{{ elastic_id }}/certs/ca/ca.crt'
    user: '{{ elastic_admin_username }}'
    password: '{{ elastic_admin_password }}'
    # Send the credentials with the first request instead of waiting for a 401.
    force_basic_auth: true
    headers:
      Accept: 'application/json'
      Content-Type: 'application/json'
    body_format: json
    body: '{{ es_cluster_settings | to_json }}'
    # Any response other than HTTP 200 fails the task.
    status_code: 200

@ -383,6 +383,24 @@ groups:
summary: "Elasticsearch health status is not green. Please Check" summary: "Elasticsearch health status is not green. Please Check"
description: "Alert for Elasticsearch health status" description: "Alert for Elasticsearch health status"
# Warn when the active shards have used more than 90% of the cluster-wide
# shard limit for 30 minutes.
# Elasticsearch computes that limit as cluster.max_shards_per_node multiplied
# by the number of (non-frozen) data nodes, so the denominator counts data
# nodes rather than all nodes; otherwise dedicated master or coordinating
# nodes would inflate the apparent capacity.
- alert: elasticsearch - usage of active shards greater than 90%
  expr: avg(elasticsearch_cluster_health_active_shards) * 100 / (avg(elasticsearch_cluster_health_number_of_data_nodes) * {{ elastic_cluster_settings_max_shards }}) > 90
  for: 30m
  labels:
    severity: warning
  annotations:
    summary: "ES cluster - active shards reaches WARN-threshold... Please clean up"
    description: "WARNING: Alert for ES active shards usage"
# Raise a critical alert when the active shards have used more than 95% of
# the cluster-wide shard limit for 5 minutes.
# Elasticsearch computes that limit as cluster.max_shards_per_node multiplied
# by the number of (non-frozen) data nodes, so the denominator counts data
# nodes rather than all nodes.
- alert: elasticsearch - usage of active shards greater than 95%
  expr: avg(elasticsearch_cluster_health_active_shards) * 100 / (avg(elasticsearch_cluster_health_number_of_data_nodes) * {{ elastic_cluster_settings_max_shards }}) > 95
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "ES cluster - active shards reaches CRITICAL-threshold... Please clean up"
    # The description label now matches the critical severity of this rule.
    description: "CRITICAL: Alert for ES active shards usage"
- alert: awx job failed with status error - alert: awx job failed with status error
expr: changes(awx_status_total{status="error"}[2m]) > 0 expr: changes(awx_status_total{status="error"}[2m]) > 0
for: 2m for: 2m

Loading…
Cancel
Save