Feature/dev 962 es cluster active shards alert

3 years ago · 96da6ef83f
parent 2d771f4989
commit 96da6ef83f
4 changed files with 44 additions and 0 deletions
--- a/group_vars/all/prometheus.yml
+++ b/group_vars/all/prometheus.yml
@ -17,3 +17,5 @@ kubernetes_prometheus_endpoint: "{{ stage_kube }}-prometheus.{{ domain }}"
 prometheus_alert_diskspaceusage_warning: 85
 prometheus_alert_pg_replication_lag: 120
 # Per-node shard limit applied to Elasticsearch as cluster.max_shards_per_node;
 # the active-shards alert rules use the same value to compute cluster capacity.
 elastic_cluster_settings_max_shards: 1000
--- a/group_vars/stage_prodnso/prometheus.yml
+++ b/group_vars/stage_prodnso/prometheus.yml
@ -14,3 +14,4 @@ node_exporter_extra_servers:
 kubernetes_prometheus_endpoint: "prodnso-prometheus.{{ domain }}"
 # Stage-specific per-node shard limit; group_vars/all sets 1000.
 elastic_cluster_settings_max_shards: 1500
--- a/roles/elastic/tasks/main.yaml
+++ b/roles/elastic/tasks/main.yaml
@ -80,3 +80,26 @@
  tags:
    - update_config
    - update_deployment
 # Build the body for the cluster-settings request below: a persistent
 # override of cluster.max_shards_per_node taken from
 # elastic_cluster_settings_max_shards.
 - name: "Set VAR"
  set_fact:
    es_cluster_settings:
      persistent:
        cluster.max_shards_per_node: "{{ elastic_cluster_settings_max_shards }}"
 # Send es_cluster_settings to the local node's _cluster/settings endpoint
 # with an authenticated HTTPS PUT.
 - name: "Set cluster config"
  uri:
    url: "https://localhost:9200/_cluster/settings"
    method: PUT
    # CA certificate used to validate the server certificate.
    ca_path: "{{ service_base_path }}/{{ elastic_id }}/certs/ca/ca.crt"
    user: "{{ elastic_admin_username }}"
    password: "{{ elastic_admin_password }}"
    headers:
      Content-Type: "application/json"
      Accept: "application/json"
    body_format: "json"
    body: "{{ es_cluster_settings |to_json }}"
    # Send the Basic auth header on the first request instead of waiting
    # for a 401 challenge.
    force_basic_auth: true
    # Any response other than HTTP 200 fails the task.
    status_code: 200
  when:
    # Only the first host of the 'elastic' group sends the request;
    # presumably once is enough because persistent settings are cluster-wide.
    - inventory_hostname == groups['elastic'][0]
--- a/templates/prometheus/config/prometheus/alert.rules.j2
+++ b/templates/prometheus/config/prometheus/alert.rules.j2
@ -383,6 +383,24 @@ groups:
      summary: "Elasticsearch health status is not green. Please Check"
      description: "Alert for Elasticsearch health status"
  # Warn when active shards exceed 90% of the cluster shard capacity for 30 minutes.
  # Elasticsearch derives the cluster-wide shard limit from the number of data
  # nodes, so capacity is the data-node count times the configured per-node limit;
  # counting master-only or coordinating nodes would overstate it.
  - alert: elasticsearch - usage of active shards greater than 90%
    expr: avg(elasticsearch_cluster_health_active_shards) * 100 / (avg(elasticsearch_cluster_health_number_of_data_nodes) * {{ elastic_cluster_settings_max_shards }} ) > 90
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: "ES cluster - active shards reaches WARN-threshold... Please clean up"
      description: "WARNING: Alert for ES active shards usage"
  # Critical when active shards exceed 95% of the cluster shard capacity for 5 minutes.
  # Elasticsearch derives the cluster-wide shard limit from the number of data
  # nodes, so capacity is the data-node count times the configured per-node limit;
  # counting master-only or coordinating nodes would overstate it.
  - alert: elasticsearch - usage of active shards greater than 95%
    expr: avg(elasticsearch_cluster_health_active_shards) * 100 / (avg(elasticsearch_cluster_health_number_of_data_nodes) * {{ elastic_cluster_settings_max_shards }} ) > 95
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "ES cluster - active shards reaches CRITICAL-threshold... Please clean up"
      description: "CRITICAL: Alert for ES active shards usage"
  - alert: awx job failed with status error
    expr: changes(awx_status_total{status="error"}[2m]) > 0
    for: 2m
`@ -14,3 +14,4 @@ node_exporter_extra_servers:`


	`kubernetes_prometheus_endpoint: "prodnso-prometheus.{{ domain }}"`	`kubernetes_prometheus_endpoint: "prodnso-prometheus.{{ domain }}"`
		`elastic_cluster_settings_max_shards: 1500`