From 9e6f28c62a837fa5ad3bfee2cf786da5797d6264 Mon Sep 17 00:00:00 2001 From: friedrich goerz Date: Wed, 14 Sep 2022 21:15:02 +0200 Subject: [PATCH] DEV-563: added hetzner dashboard + svennes dashboard + refactoring alert for hetzner_api_rate_limit --- roles/prometheus/tasks/main.yml | 2 +- .../Stage Overview-1663181809417.json | 420 ++++++++++++++++++ .../hetzner metrics-1663181386348.json | 357 +++++++++++++++ .../config/prometheus/alert.rules.j2 | 10 +- 4 files changed, 783 insertions(+), 6 deletions(-) create mode 100644 templates/prometheus/config/grafana/provisioning/dashboards/Stage Overview-1663181809417.json create mode 100644 templates/prometheus/config/grafana/provisioning/dashboards/hetzner metrics-1663181386348.json diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 9f1d77c..ea3d9cf 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -164,7 +164,7 @@ - name: "Create cronjob to exec hetzner-metrics script" ansible.builtin.cron: name: "update hetzner metrics" - minute: "*" + minute: "*/5" job: /root/hetzner_unattached_volumes.py tags: - update-hetzner-metrics diff --git a/templates/prometheus/config/grafana/provisioning/dashboards/Stage Overview-1663181809417.json b/templates/prometheus/config/grafana/provisioning/dashboards/Stage Overview-1663181809417.json new file mode 100644 index 0000000..7e8fc9b --- /dev/null +++ b/templates/prometheus/config/grafana/provisioning/dashboards/Stage Overview-1663181809417.json @@ -0,0 +1,420 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 22, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 9, + "panels": [], + "title": "Rate Limits", + "type": "row" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "remaining rate limit", + "axisPlacement": "auto", + "axisSoftMax": 100, + "axisSoftMin": 0, + "axisWidth": 40, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "exemplar": true, + "expr": "(hetzner_api_ratelimit_remaining / hetzner_api_ratelimit_limit) * 100", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{env}}", + "refId": "A" + } + ], + "title": "Hetzner", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "remaining rate limit", + "axisPlacement": "auto", + "axisSoftMax": 100, + "axisSoftMin": 0, + "axisWidth": 40, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "exemplar": true, + "expr": "(digitalocean_api_ratelimit_remaining / digitalocean_api_ratelimit_limit) * 100", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{env}}", + "refId": "A" + } + ], + "title": "Digital Ocean", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 7, + "panels": [], + "title": "Errors", + "type": "row" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "Count/5m", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "exemplar": true, + "expr": "changes(awx_status_total{status=\"failed\"}[5m])", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Failed AWX Jobs", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "Count/5m", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "exemplar": true, + "expr": "irate(logback_events_total{level=\"error\"}[5m])", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Application Errors", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 34, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-2d", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Stage Overview", + "uid": "NXk5Nk74k", + "version": 7, + "weekStart": "" +} \ No newline at end of file diff --git a/templates/prometheus/config/grafana/provisioning/dashboards/hetzner metrics-1663181386348.json b/templates/prometheus/config/grafana/provisioning/dashboards/hetzner metrics-1663181386348.json new file mode 100644 index 0000000..20bca40 --- /dev/null +++ b/templates/prometheus/config/grafana/provisioning/dashboards/hetzner metrics-1663181386348.json @@ -0,0 +1,357 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 21, + "links": [], + "liveNow": false, + "panels": [ + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "max_requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "exemplar": true, + "expr": "hetzner_api_ratelimit_limit", + "interval": "", + "legendFormat": "max_requests", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "exemplar": true, + "expr": "hetzner_api_ratelimit_remaining", + "hide": false, + "interval": "", + "legendFormat": "remaining requests", + "refId": "B" + } + ], + "title": "API requests", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "max_requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "exemplar": true, + "expr": "hetzner_api_unattached_volumes", + "interval": "", + "legendFormat": "locked servers", + "refId": "A" + } + ], + "title": "unattached volumes", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "max_requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "exemplar": true, + "expr": "hetzner_api_locked_servers", + "interval": "", + "legendFormat": "locked servers", + "refId": "A" + } + ], + "title": "locked servers", + "type": "timeseries" + } + ], + "schemaVersion": 34, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "hetzner metrics", + "uid": "DClNWkn4k", + "version": 2, + "weekStart": "" +} diff --git a/templates/prometheus/config/prometheus/alert.rules.j2 b/templates/prometheus/config/prometheus/alert.rules.j2 index 4dbc5cb..ecc3e8b 100644 --- a/templates/prometheus/config/prometheus/alert.rules.j2 +++ b/templates/prometheus/config/prometheus/alert.rules.j2 @@ -333,7 +333,7 @@ groups: description: "offsite metrics unavailable." - alert: DO_API_REQUEST_usage - expr: (digitalocean_api_ratelimit_remaining * 100)/digitalocean_api_ratelimit_limit < 50 + expr: (digitalocean_api_ratelimit_remaining * 100)/digitalocean_api_ratelimit_limit < 50 or absent(digitalocean_api_ratelimit_remaining) for: 10m labels: severity: critical @@ -390,14 +390,14 @@ groups: - alert: hetzner ratelimit_remaining low - expr: hetzner_api_ratelimit_remaining < 720 or absent(hetzner_api_ratelimit_remaining) - for: 1h + expr: (hetzner_api_ratelimit_remaining * 100)/ hetzner_api_ratelimit_remaining < 50 or absent(hetzner_api_ratelimit_remaining) + for: 10m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' - summary: "hetzner ratelimit_remaining below 720" - description: "hetzner ratelimit_remaining below 720" + summary: "hetzner ratelimit_remaining below 50%" + description: "hetzner ratelimit_remaining below 50%" - alert: hetzner locked server exists expr: hetzner_api_locked_servers > 0 or absent(hetzner_api_locked_servers)