DEV-540: added AWX dashboard to Grafana, added alert for failed jobs

feature/DEV-655
Hoan To 3 years ago
parent 223141da20
commit 244245336f

@ -63,6 +63,9 @@ awx_ansible_user_name: "awx"
awx_ansible_user_ssh_key_private: "{{ ansible_ssh_key_private_vault }}"
awx_credential_machine_hetzner_name: hetzner-ansible-ssh
# NOTE(review): unlike the SSH key above, which is read from a *_vault
# variable, the AWX username/password below are plain-text literals in this
# file. Consider sourcing the password from a vault variable as well —
# TODO confirm against the project's vault setup.
awx_ansible_username: ansible
awx_ansible_password: ansible
gitlab_ansible_user_name: "gitlabci"
backupuser_user_name: backupuser
@ -225,3 +228,5 @@ prometheus_alert_pg_replication_lag: 120
upstream_dns_servers:
- 185.12.64.1
- 185.12.64.2

File diff suppressed because it is too large Load Diff

@ -23,6 +23,7 @@
vars:
username: "{{ awx_ansible_username }}"
password: "{{ awx_ansible_password }}"
is_system_auditor: "true"
uri:
url: "{{ awx_base_url }}/api/v2/users/"
method: POST

@ -350,6 +350,24 @@ groups:
summary: "Elasticsearch health status is not green. Please Check"
description: "Alert for Elasticsearch health status"
- alert: awx job failed with status error
  # Fires when the number of AWX jobs reported with status "error" changed
  # within the last 2 minutes.
  expr: changes(awx_status_total{status="error"}[2m]) > 0
  # A job failure is a one-off event: after a single change the expression is
  # only true for roughly the 2m range window, so requiring it to hold for a
  # further 2m would let the alert go back to inactive before it ever fires.
  # Fire on the first evaluation that sees the change instead.
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "awx job failed with status error"
    description: "Alert awx jobs has an error"
- alert: awx job failed with status failed
  # Fires when the number of AWX jobs reported with status "failed" changed
  # within the last 2 minutes.
  expr: changes(awx_status_total{status="failed"}[2m]) > 0
  # A job failure is a one-off event: after a single change the expression is
  # only true for roughly the 2m range window, so requiring it to hold for a
  # further 2m would let the alert go back to inactive before it ever fires.
  # Fire on the first evaluation that sees the change instead.
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "awx job failed with status failed"
    description: "Alert awx jobs failed"
- alert: postgres backup zombies
expr: 100 - ((node_filesystem_avail_bytes{instance=~"{{ stage }}-postgres-01.smardigo.digital",job=~"node-exporter",device='/dev/mapper/vg.postgres_backup-lv.postgres_backup'} * 100) / node_filesystem_size_bytes{instance=~"{{ stage }}-postgres-01.smardigo.digital",job=~"node-exporter",device='/dev/mapper/vg.postgres_backup-lv.postgres_backup'}) > 10
for: 2h

@ -452,3 +452,18 @@ scrape_configs:
static_configs:
- targets: ['{{ kubernetes_prometheus_endpoint }}']
{% endif %}
##############################################
### awx ####
##############################################
- job_name: 'awx'
  # Scrape the AWX API metrics path over HTTPS using HTTP basic auth; the
  # credentials and target host are filled in by the template variables.
  scheme: https
  metrics_path: '/api/v2/metrics'
  scrape_interval: 5s
  basic_auth:
    username: '{{ awx_ansible_username }}'
    password: '{{ awx_ansible_password }}'
  static_configs:
    - targets:
        - '{{ shared_service_kube_awx_hostname }}'

Loading…
Cancel
Save