From 0c390415c92a4937f83a774a41033b0f5a2b62a1 Mon Sep 17 00:00:00 2001
From: Hoan To
Date: Fri, 14 Apr 2023 07:34:53 +0000
Subject: [PATCH] DEV-580: Added prom2teams alert and receiver for email

---
Review notes (this section is ignored by git am):

* Line breaks were restored from a whitespace-collapsed copy of this patch.
  Hunk line counts match the @@ headers and the diffstat, but the indentation
  depth of every hunk line is reconstructed from the YAML nesting - verify the
  context lines against the target files before applying.
* NOTE(review): in alertmanager/config.yml.j2 the new email route is listed
  after the msteams route. Alertmanager stops at the first matching sibling
  route unless `continue: true` is set, so confirm the "prom2teams down" alert
  does not also carry a `stage` label equal to '{{ stage }}'; otherwise it is
  delivered through prom2teams, the very service reported as down.
* NOTE(review): `env: {{ stage }}` in prometheus.yml.j2 renders unquoted;
  confirm `stage` can never be empty or a boolean-looking word (no/off/...).

 roles/prometheus/vars/main.yml                     | 10 ++++++++++
 .../prometheus/config/alertmanager/config.yml.j2   | 13 +++++++++++++
 .../prometheus/config/prometheus/alert.rules.j2    | 10 ++++++++++
 .../prometheus/config/prometheus/prometheus.yml.j2 | 12 ++++++++++++
 4 files changed, 45 insertions(+)

diff --git a/roles/prometheus/vars/main.yml b/roles/prometheus/vars/main.yml
index 921867d..b797d6f 100644
--- a/roles/prometheus/vars/main.yml
+++ b/roles/prometheus/vars/main.yml
@@ -126,6 +126,16 @@ prometheus_docker: {
       name: "{{ prom2teams_id }}",
       image_name: "{{ prom2teams_image_name }}",
       image_version: "{{ prom_prom2teams_version }}",
+      environment: [
+        "PROM2TEAMS_PROMETHEUS_METRICS: \"true\"",
+        "DEBUG_METRICS: \"true\"",
+      ],
+      ports: [
+        {
+          "external": "8089",
+          "internal": "8089",
+        },
+      ],
       volumes: [
         '"./config/prom2teams/config.ini:/opt/prom2teams/config.ini"',
       ],
diff --git a/templates/prometheus/config/alertmanager/config.yml.j2 b/templates/prometheus/config/alertmanager/config.yml.j2
index a41d214..9e3521c 100644
--- a/templates/prometheus/config/alertmanager/config.yml.j2
+++ b/templates/prometheus/config/alertmanager/config.yml.j2
@@ -8,9 +8,22 @@ route:
   - receiver: 'netgo_msteams_receiver'
     match:
       stage: '{{ stage }}'
+  - receiver: 'netgo_email_receiver'
+    match:
+      receiver: email
 
 receivers:
 - name: 'netgo_msteams_receiver'
   webhook_configs:
   - send_resolved: true
     url: 'http://{{ prom2teams_id }}:8089/v2/NetgoTeamsConnector'
+
+- name: 'netgo_email_receiver'
+  email_configs:
+  - to: '{{ devops_email_address }}'
+    from: 'prometheus_{{ stage }}@smardigo.digital'
+    smarthost: '{{ shared_service_mail_hostname }}:25'
+    auth_username: ''
+    auth_identity: ''
+    auth_password: ''
+    require_tls: false
diff --git a/templates/prometheus/config/prometheus/alert.rules.j2 b/templates/prometheus/config/prometheus/alert.rules.j2
index 0386a87..869ff72 100644
--- a/templates/prometheus/config/prometheus/alert.rules.j2
+++ b/templates/prometheus/config/prometheus/alert.rules.j2
@@ -503,3 +503,13 @@ groups:
           identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
           summary: "hetzner locked server exists"
           description: "hetzner locked server exists"
+
+      - alert: prom2teams down
+        expr: process_cpu_seconds_total{application="prom2teams"} <= 0 or absent(process_cpu_seconds_total{application="prom2teams"})
+        for: 5m
+        labels:
+          severity: critical
+          receiver: email
+        annotations:
+          summary: "prom2teams is down"
+          description: "prom2teams is down"
diff --git a/templates/prometheus/config/prometheus/prometheus.yml.j2 b/templates/prometheus/config/prometheus/prometheus.yml.j2
index aa5ab0e..b9d4905 100644
--- a/templates/prometheus/config/prometheus/prometheus.yml.j2
+++ b/templates/prometheus/config/prometheus/prometheus.yml.j2
@@ -57,6 +57,18 @@ scrape_configs:
         target_label: instance
         replacement: '{{ inventory_hostname }}-alertmanager.{{ domain }}'
 
+  - job_name: 'prom2teams'
+    scheme: http
+    metrics_path: '/metrics'
+    static_configs:
+      - targets: [
+          '{{ inventory_hostname }}-prom2teams:8089'
+        ]
+        labels:
+          env: {{ stage }}
+          project: monitoring
+          application: prom2teams
+
   - job_name: 'blackbox'
     metrics_path: /probe
     params: