Rename the DigitalOcean/Hetzner API metric scripts, run the Hetzner script
every minute, clean up the pre-rename leftovers and move the
"hetzner unattached volumes" alert below the ratelimit alert.

Review fix: the existing DigitalOcean cron task is converted in place so that
it registers the entry "update digitalocean metrics". As originally written,
the patched play kept that task under the old entry name
"update replication table", added a second task with the same task name for
the new entry, and then deleted the old entry again in the cleanup task, so
every run reported a change and an interrupted run left the script scheduled
twice per minute.

NOTE(review): line breaks and indentation were restored by convention from a
whitespace-collapsed copy of this patch; the whitespace-only change pairs are
assumed to strip trailing whitespace, and the post-image blob hash recorded
for tasks/main.yml no longer matches the fixed content. Regenerate the patch
with `git diff` against the repository before applying it.

diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index ea3d9cf..601c823 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -3,6 +3,8 @@
 ### tags:
 ### update_config
 ### update_deployment
+### update-digitalocean-metrics
+### update-hetzner-metrics
 
 - name: "Create/Resize LVM for datadir"
   include_role:
@@ -133,39 +135,62 @@
     }
   when: grafana_stats.json.users == 1
 
-- name: "Create DO-metrics script from template"
+- name: "Create digitalocean api metric script from template"
   template:
-    src: do_too_many_req_metrics.sh.j2
-    dest: /root/do_too_many_req_metrics.sh
+    src: digitalocean_api.sh.j2
+    dest: /root/digitalocean_api.sh
     mode: 0700
     owner: root
-    group: root 
+    group: root
   tags:
-    - update-do-metrics
+    - update-digitalocean-metrics
 
-- name: "Create cronjob to exec DO-metrics script"
+- name: "Create cronjob to exec digitalocean api metric script"
   ansible.builtin.cron:
-    name: "update replication table"
+    name: "update digitalocean metrics"
     minute: "*"
-    job: /root/do_too_many_req_metrics.sh
+    job: /root/digitalocean_api.sh
   tags:
-    - update-do-metrics
+    - update-digitalocean-metrics
 
-- name: "Create hetzner-metrics script from template"
+- name: "Create hetzner api metric script from template"
   template:
-    src: hetzner_unattached_volumes.py.j2
-    dest: /root/hetzner_unattached_volumes.py
+    src: hetzner_api.py.j2
+    dest: /root/hetzner_api.py
     mode: 0700
     owner: root
-    group: root 
+    group: root
   tags:
     - update-hetzner-metrics
 
-- name: "Create cronjob to exec hetzner-metrics script"
+- name: "Create cronjob to exec hetzner api metric script"
   ansible.builtin.cron:
     name: "update hetzner metrics"
-    minute: "*/5"
-    job: /root/hetzner_unattached_volumes.py
+    minute: "*"
+    job: /root/hetzner_api.py
   tags:
     - update-hetzner-metrics
 
+# legacy cleanup (cron entry under its old name "update replication table") - remove after 31.12.2022
+- name: "Remove old cronjob to exec digitalocean api metric script"
+  ansible.builtin.cron:
+    name: "update replication table"
+    state: "absent"
+  tags:
+    - update-digitalocean-metrics
+
+# legacy cleanup (script file under its pre-rename path) - remove after 31.12.2022
+- name: "Remove old digitalocean api metric script"
+  ansible.builtin.file:
+    path: /root/do_too_many_req_metrics.sh
+    state: absent
+  tags:
+    - update-digitalocean-metrics
+
+# legacy cleanup (script file under its pre-rename path) - remove after 31.12.2022
+- name: "Remove old hetzner api metric script"
+  ansible.builtin.file:
+    path: /root/hetzner_unattached_volumes.py
+    state: absent
+  tags:
+    - update-hetzner-metrics
diff --git a/roles/prometheus/templates/do_too_many_req_metrics.sh.j2 b/roles/prometheus/templates/digitalocean_api.sh.j2
similarity index 100%
rename from roles/prometheus/templates/do_too_many_req_metrics.sh.j2
rename to roles/prometheus/templates/digitalocean_api.sh.j2
diff --git a/roles/prometheus/templates/hetzner_unattached_volumes.py.j2 b/roles/prometheus/templates/hetzner_api.py.j2
similarity index 100%
rename from roles/prometheus/templates/hetzner_unattached_volumes.py.j2
rename to roles/prometheus/templates/hetzner_api.py.j2
diff --git a/templates/prometheus/config/prometheus/alert.rules.j2 b/templates/prometheus/config/prometheus/alert.rules.j2
index a8c7a44..8a81e12 100644
--- a/templates/prometheus/config/prometheus/alert.rules.j2
+++ b/templates/prometheus/config/prometheus/alert.rules.j2
@@ -367,7 +367,7 @@ groups:
     annotations:
       summary: "awx job failed with status failed"
       description: "Alert awx jobs failed"
-    
+
   - alert: postgres backup zombies
     expr: 100 - ((node_filesystem_avail_bytes{instance=~"{{ stage }}-postgres-01.smardigo.digital",job=~"node-exporter",device='/dev/mapper/vg.postgres_backup-lv.postgres_backup'} * 100) / node_filesystem_size_bytes{instance=~"{{ stage }}-postgres-01.smardigo.digital",job=~"node-exporter",device='/dev/mapper/vg.postgres_backup-lv.postgres_backup'}) > 10
     for: 2h
@@ -378,17 +378,6 @@ groups:
       summary: "postgres backup zombies, have not been deleted"
       description: "postgres backup zombies, have not been deleted"
 
-  - alert: hetzner unattached volumes
-    expr: hetzner_api_unattached_volumes > 0 or absent(hetzner_api_unattached_volumes)
-    for: 2h
-    labels:
-      severity: critical
-    annotations:
-      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
-      summary: "unattached volumes in hetzner"
-      description: "unattached volumes in hetzner"
-
-
   - alert: hetzner ratelimit_remaining low
     expr: (hetzner_api_ratelimit_remaining * 100)/ hetzner_api_ratelimit_limit < 50 or absent(hetzner_api_ratelimit_remaining)
     for: 10m
@@ -399,6 +388,16 @@ groups:
       summary: "hetzner ratelimit_remaining below 50%"
       description: "hetzner ratelimit_remaining below 50%"
 
+  - alert: hetzner unattached volumes
+    expr: hetzner_api_unattached_volumes > 0 or absent(hetzner_api_unattached_volumes)
+    for: 2h
+    labels:
+      severity: critical
+    annotations:
+      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
+      summary: "unattached volumes in hetzner"
+      description: "unattached volumes in hetzner"
+
   - alert: hetzner locked server exists
     expr: hetzner_api_locked_servers > 0 or absent(hetzner_api_locked_servers)
     for: 1h
@@ -408,5 +407,3 @@ groups:
       identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
       summary: "hetzner locked server exists"
       description: "hetzner locked server exists"
-
-