diff --git a/group_vars/all/prometheus.yml b/group_vars/all/prometheus.yml
index eca2bb6..fd5ed39 100644
--- a/group_vars/all/prometheus.yml
+++ b/group_vars/all/prometheus.yml
@@ -17,3 +17,5 @@ kubernetes_prometheus_endpoint: "{{ stage_kube }}-prometheus.{{ domain }}"
 
 prometheus_alert_diskspaceusage_warning: 85
 prometheus_alert_pg_replication_lag: 120
+
+elastic_cluster_settings_max_shards: 1000
diff --git a/group_vars/stage_prodnso/prometheus.yml b/group_vars/stage_prodnso/prometheus.yml
index ad7977e..8e0483f 100644
--- a/group_vars/stage_prodnso/prometheus.yml
+++ b/group_vars/stage_prodnso/prometheus.yml
@@ -14,3 +14,4 @@ node_exporter_extra_servers:
 
 kubernetes_prometheus_endpoint: "prodnso-prometheus.{{ domain }}"
 
+elastic_cluster_settings_max_shards: 1500
diff --git a/roles/elastic/tasks/main.yaml b/roles/elastic/tasks/main.yaml
index 99bb24d..adeee89 100644
--- a/roles/elastic/tasks/main.yaml
+++ b/roles/elastic/tasks/main.yaml
@@ -80,3 +80,26 @@
   tags:
     - update_config
     - update_deployment
+
+- name: "Build Elasticsearch persistent cluster settings"
+  set_fact:
+    es_cluster_settings:
+      persistent:
+        cluster.max_shards_per_node: "{{ elastic_cluster_settings_max_shards }}"
+
+- name: "Apply Elasticsearch persistent cluster settings"
+  uri:
+    url: "https://localhost:9200/_cluster/settings"
+    method: PUT
+    ca_path: "{{ service_base_path }}/{{ elastic_id }}/certs/ca/ca.crt"
+    user: "{{ elastic_admin_username }}"
+    password: "{{ elastic_admin_password }}"
+    headers:
+      Content-Type: "application/json"
+      Accept: "application/json"
+    body_format: "json"
+    body: "{{ es_cluster_settings | to_json }}"
+    force_basic_auth: true
+    status_code: 200
+  when:
+    - inventory_hostname == groups['elastic'][0]
diff --git a/templates/prometheus/config/prometheus/alert.rules.j2 b/templates/prometheus/config/prometheus/alert.rules.j2
index 1f57d5b..0f3a44f 100644
--- a/templates/prometheus/config/prometheus/alert.rules.j2
+++ b/templates/prometheus/config/prometheus/alert.rules.j2
@@ -383,6 +383,24 @@ groups:
           summary: "Elasticsearch health status is not green. Please Check"
           description: "Alert for Elasticsearch health status"
 
+      - alert: elasticsearch - usage of active shards greater than 90%
+        expr: avg(elasticsearch_cluster_health_active_shards) * 100 / (avg(elasticsearch_cluster_health_number_of_data_nodes) * {{ elastic_cluster_settings_max_shards }} ) > 90
+        for: 30m
+        labels:
+          severity: warning
+        annotations:
+          summary: "ES cluster - active shards reaches WARN-threshold... Please clean up"
+          description: "WARNING: Alert for ES active shards usage"
+
+      - alert: elasticsearch - usage of active shards greater than 95%
+        expr: avg(elasticsearch_cluster_health_active_shards) * 100 / (avg(elasticsearch_cluster_health_number_of_data_nodes) * {{ elastic_cluster_settings_max_shards }} ) > 95
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "ES cluster - active shards reaches CRITICAL-threshold... Please clean up"
+          description: "CRITICAL: Alert for ES active shards usage"
+
       - alert: awx job failed with status error
         expr: changes(awx_status_total{status="error"}[2m]) > 0
         for: 2m