From 3905dff581df91fdf0c0be70af9afad5d41a264a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=B6rz=2C=20Friedrich?=
Date: Tue, 28 Mar 2023 12:47:47 +0000
Subject: [PATCH] DEV-471: added push metrics part to restore playbook

---
NOTE(review): this patch was re-flowed from a whitespace-collapsed copy. The
leading indentation of the unchanged context lines is reconstructed and must
be verified against the target files (or apply with
`git apply --ignore-whitespace`).

 restore-remote-database-backup.yml            | 34 +++++++++++++++++++
 .../config/prometheus/alert.rules.j2          | 28 +++++++++++++++
 2 files changed, 62 insertions(+)

diff --git a/restore-remote-database-backup.yml b/restore-remote-database-backup.yml
index 326d345..8364c80 100644
--- a/restore-remote-database-backup.yml
+++ b/restore-remote-database-backup.yml
@@ -247,6 +247,40 @@
       vars:
         record_to_remove: '{{ inventory_hostname }}'
 
+#############################################################
+# Sending metric to prometheus push gateway
+#############################################################
+
+# Pushes the current epoch time as nightly_restore_successful_generic to the
+# pushgateway; the alert rules compare this timestamp against time().
+- hosts: "{{ stage }}-virtual-host-to-read-groups-vars"
+  serial: "{{ serial_number | default(1) }}"
+  gather_facts: false
+  connection: local
+  run_once: true
+  vars:
+    # pushgw-endpoint within k8s cluster running AWX
+    prometheus_pushgw_host: "prometheus-pushgateway.monitoring:9091"
+  tasks:
+    # attention: do not remove the manually set newline char at the end of the metric line
+    # in case of removing: request will die with HTTP 400 bad request
+    - name: "Set Fact"
+      set_fact:
+        metric: "nightly_restore_successful_generic {{ lookup('pipe', 'date +%s') }}\n"
+
+    # body is passed through unmodified (no trim filter), so the trailing
+    # newline set above is sent along; the task is retried until HTTP 200
+    - name: "Send metric to prometheus pushgw"
+      uri:
+        url: "http://{{ prometheus_pushgw_host }}/metrics/job/restore_test/creator/awx-run/database_engine/{{ database_engine }}"
+        method: POST
+        body: "{{ metric }}"
+        status_code: [200]
+      register: send_prome_metric
+      retries: 5
+      delay: 5
+      until: send_prome_metric.status in [200]
+
 #############################################################
 # Sending smardigo management message to process
 #############################################################
diff --git a/templates/prometheus/config/prometheus/alert.rules.j2 b/templates/prometheus/config/prometheus/alert.rules.j2
index 397c5ca..0386a87 100644
--- a/templates/prometheus/config/prometheus/alert.rules.j2
+++ b/templates/prometheus/config/prometheus/alert.rules.j2
@@ -174,6 +174,34 @@ groups:
           summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> nightly backup failed."
           description: "PostgreSQL backup failed."
 
+      # Restore test alerts: fire when the timestamp pushed to the pushgateway
+      # (job="restore_test") is older than 30 hours or the series is absent.
+      # NOTE(review): absent() only keeps labels from equality matchers, so
+      # $labels.instance is presumably empty on that branch - confirm.
+      - alert: nightly_restoretest_failed_postgres
+        expr: |
+          (time() - nightly_restore_successful_generic{job="restore_test",database_engine="postgres"}) > 30 * 3600 or
+          absent(nightly_restore_successful_generic{job="restore_test",database_engine="postgres"})
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
+          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> nightly restore test failed."
+          description: "PostgreSQL restore test failed."
+
+      - alert: nightly_restoretest_failed_maria
+        expr: |
+          (time() - nightly_restore_successful_generic{job="restore_test",database_engine="maria"}) > 30 * 3600 or
+          absent(nightly_restore_successful_generic{job="restore_test",database_engine="maria"})
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
+          summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> nightly restore test failed."
+          description: "MariaDB restore test failed."
+
       - alert: megaraid_smart_errors
         expr: megaraid_smart_errors > 1
         for: 2m