diff --git a/README.md b/README.md index 9c7da9d..06ee771 100644 --- a/README.md +++ b/README.md @@ -59,11 +59,60 @@ Create/Start servers for stage-dev > with Poetry prefix with `poetry run` +# ansible-builder +[AWX](https://www.ansible.com/products/awx-project/) is used in the smardigo setup to execute several playbooks in response to user interaction within the smardigo product. For Ansible runs to finish successfully, every Ansible dependency (collection, role or pip package) must be installed. ansible-builder lets us configure the environment needed to execute plays successfully. + +Before every git merge that introduces new Ansible collections/roles and/or new pip packages, please run the following: + + cd ansible-builder/ + ansible-builder build --tag awx-custom-ee:latest --container-runtime docker + + + +This should produce output similar to: + + cd ansible-builder/ + ATTENTION: to get more information please set --verbosity 3 + ansible-builder build --tag awx-custom-ee:latest --container-runtime docker + Running command: + docker build -f context/Dockerfile -t awx-custom-ee:latest context + + Complete! The build context can be found at: /home/friedrich/sandbox/netgo_stuff/hetzner-ansible/ansible-builder/context + + real 2m56,131s + user 0m0,208s + sys 0m0,102s + +You can start the docker container locally with e.g. 
: + docker run -it --rm --mount type=bind,source="$(pwd)"/,target=/gitrepo/ awx-custom-ee:latest /bin/bash + + +Afterwards you can execute Ansible runs such as: +(please resolve dependencies before starting the docker container) + cd /gitrepo + ansible-playbook -i stage-digitalocean external_monitoring.yml --ask-vault-password -u root --private-key sshkey_pw_less + + export HETZNER_LABEL_SELECTOR='stage=dev' + ansible-playbook -i stage-netgo-hcloud.yml -l redis smardigo.yml --ask-vault-password -u root --private-key sshkey_pw_less + + +If everything works fine, please push the created docker image with: + + docker login dev-docker-registry-01.smardigo.digital + docker tag XXXXXXXX dev-docker-registry-01.smardigo.digital/awx/awx-custom-ee:latest + docker push dev-docker-registry-01.smardigo.digital/awx/awx-custom-ee + + + + + + + # TODO IPFire 149.233.6.129 - eShelter - 212.121.131.106 - Siemansdamm + 212.121.131.106 - Siemensdamm Prometheus curl -X POST https://{{ prometheus-url }}/-/reload diff --git a/ansible-builder/README.md b/ansible-builder/README.md index c5489bf..ff6f5e9 100644 --- a/ansible-builder/README.md +++ b/ansible-builder/README.md @@ -1,4 +1,6 @@ # Execution Environment for AWX ansible-builder build --tag dev-docker-registry-01.smardigo.digital/awx/awx-custom-ee:latest + docker login dev-docker-registry-01.smardigo.digital + docker tag XXXXXXXX dev-docker-registry-01.smardigo.digital/awx/awx-custom-ee:latest docker push dev-docker-registry-01.smardigo.digital/awx/awx-custom-ee diff --git a/external_monitoring.yml b/external_monitoring.yml new file mode 100644 index 0000000..9158d77 --- /dev/null +++ b/external_monitoring.yml @@ -0,0 +1,66 @@ +--- +- name: 'apply setup to {{ host | default("all") }}' + hosts: '{{ host | default("all") }}' + serial: "{{ serial_number | default(5) }}" + tasks: + - set_fact: + dev_prometheus_fqdn: 'dev-prometheus-01.{{ domain }}' + qa_prometheus_fqdn: 'qa-prometheus-01.{{ domain }}' + + - set_fact: + 
dev_prometheus_ip: "{{ lookup('community.general.dig', dev_prometheus_fqdn ) }}" + qa_prometheus_ip: "{{ lookup('community.general.dig', qa_prometheus_fqdn ) }}" + + - name: "Allow SSH in UFW" + ufw: + rule: limit + port: 22 + proto: tcp + + - name: "Allow port 9100 for node-exporter in UFW" + ufw: + rule: allow + port: 9100 + proto: tcp + src: "{{ item }}" + loop: + - "{{ dev_prometheus_ip }}" + - "{{ qa_prometheus_ip }}" + + - name: "Allow port 9115 for blackbox-exporter in UFW" + ufw: + rule: allow + port: 9115 + proto: tcp + src: "{{ item }}" + loop: + - "{{ dev_prometheus_ip }}" + - "{{ qa_prometheus_ip }}" + + - name: "Set firewall default policy" + ufw: + state: enabled + policy: reject + + - name: "configure ssh_hardening" + include_role: + # include role from collection called 'devsec' + name: devsec.hardening.ssh_hardening + apply: + tags: + - ssh_hardening + tags: + - ssh_hardening + + - name: "Install node-exporter via include_role" + include_role: + name: cloudalchemy.node-exporter + + - name: "Install blackbox-exporter via include_role" + include_role: + name: cloudalchemy.blackbox-exporter + apply: + tags: + - blackbox + tags: + - blackbox diff --git a/galaxy-requirements.yml b/galaxy-requirements.yml index 59954e7..ae1253f 100644 --- a/galaxy-requirements.yml +++ b/galaxy-requirements.yml @@ -4,6 +4,15 @@ roles: version: 1.7.0 - name: idealista.prometheus_redis_exporter_role version: 2.1.0 +- name: cloudalchemy.node-exporter + version: 2.0.0 + scm: git + src: https://github.com/cloudalchemy/ansible-node-exporter +- name: cloudalchemy.blackbox-exporter + version: 1.0.0 + src: https://github.com/cloudalchemy/ansible-blackbox-exporter + scm: git + collections: - name: hetzner.hcloud version: 1.6.0 @@ -11,3 +20,8 @@ collections: - name: kubernetes.core - name: community.mysql - name: community.postgresql +- name: community.digitalocean + version: 1.11.0 +- name: devsec.hardening + version: 7.12.0 + src: 
https://github.com/dev-sec/ansible-collection-hardening diff --git a/group_vars/all/plain.yml b/group_vars/all/plain.yml index c8c30b2..651d21d 100644 --- a/group_vars/all/plain.yml +++ b/group_vars/all/plain.yml @@ -1,6 +1,29 @@ --- debug: false +ssh_macs: + - umac-128-etm@openssh.com + - hmac-sha2-256-etm@openssh.com + - hmac-sha2-512-etm@openssh.com +ssh_host_key_algorithms: + - rsa-sha2-512 + - rsa-sha2-256 + - ssh-ed25519 +ssh_kex: + - curve25519-sha256 + - curve25519-sha256@libssh.org + - diffie-hellman-group-exchange-sha256 + - diffie-hellman-group16-sha512 + - diffie-hellman-group18-sha512 + - diffie-hellman-group14-sha256 +ssh_ciphers: + - chacha20-poly1305@openssh.com + - aes128-ctr + - aes192-ctr + - aes256-ctr + - aes128-gcm@openssh.com + - aes256-gcm@openssh.com +ssh_permit_root_login: 'yes' docker_enabled: true traefik_enabled: true @@ -147,3 +170,9 @@ backup_directory: "/backups" #grafana_user_smardigo_login: "< see vault >" #grafana_user_smardigo_password: "< see vault >" + +blackbox_exporter_fqdn: "dev-blackbox-01.{{ domain }}" +blackbox_http_2xx_targets: +- 'https://{{ stage }}-keycloak-01.smardigo.digital/auth/' +- 'https://{{ stage }}-management-01-connect.smardigo.digital/auth/' +blackbox_http_2xx_additional_targets: [] diff --git a/group_vars/digitalocean/plain.yml b/group_vars/digitalocean/plain.yml new file mode 100644 index 0000000..b3a4d69 --- /dev/null +++ b/group_vars/digitalocean/plain.yml @@ -0,0 +1,7 @@ +--- +ansible_ssh_host: "{{ inventory_hostname }}.{{ domain }}" + +droplet_defaults: + size: s-1vcpu-1gb + region: fra1 + image: ubuntu-20-04-x64 diff --git a/group_vars/service_blackbox/plain.yml b/group_vars/service_blackbox/plain.yml new file mode 100644 index 0000000..c04c6f1 --- /dev/null +++ b/group_vars/service_blackbox/plain.yml @@ -0,0 +1,3 @@ +--- +blackbox_exporter_cli_flags: + log.level: warn diff --git a/hcloud_firewall.yml b/hcloud_firewall.yml new file mode 100644 index 0000000..29f21fe --- /dev/null +++ 
b/hcloud_firewall.yml @@ -0,0 +1,36 @@ +--- +- hosts: localhost + vars: + hcloud_firewall_objects: + - + name: monitoring-extern-https + state: present + rules: + - + direction: in + protocol: tcp + port: '443' + source_ips: + - "{{ lookup('community.general.dig', 'dev-blackbox-01.smardigo.digital' ) }}/32" + destination_ips: [] + description: null + apply_to: + - + type: label_selector + label_selector: + selector: 'service=connect' + - + type: label_selector + label_selector: + selector: 'service=keycloak' + + + + tasks: + - name: "Setup hcloud firewall via include_role" + include_role: + name: hcloud + tasks_from: configure-firewall2 + loop: "{{ hcloud_firewall_objects }}" + loop_control: + loop_var: firewall_object diff --git a/pip-requirements b/pip-requirements index 6f16ad3..05dff90 100644 --- a/pip-requirements +++ b/pip-requirements @@ -1,5 +1,7 @@ ansible ansible-core>=2.10 +ansible-builder +dnspython hcloud>=1.16.0 jmespath netaddr diff --git a/provisioning.yml b/provisioning.yml index 9963c74..96b9e03 100644 --- a/provisioning.yml +++ b/provisioning.yml @@ -57,3 +57,15 @@ vars: hetzner_state: 'started' when: "'hcloud' in group_names" + tasks: + - name: "Create server in DO-cloud via include_tasks" + include_role: + name: _digitalocean + tasks_from: _create_server + vars: + droplet: + name: dev-blackbox-01 + tags: + - service_blackbox + - stage_dev + when: "'digitalocean' in group_names" diff --git a/roles/_digitalocean/tasks/_create_server.yml b/roles/_digitalocean/tasks/_create_server.yml new file mode 100644 index 0000000..0755c72 --- /dev/null +++ b/roles/_digitalocean/tasks/_create_server.yml @@ -0,0 +1,69 @@ +- name: "Create ssh key" + delegate_to: localhost + community.digitalocean.digital_ocean_sshkey: + oauth_token: "{{ digitalocean_authentication_token }}" + name: "{{ item }}" + ssh_pub_key: "{{ lookup('file', 'users/' + item + '/ssh.pub') }}" + state: present + register: result + loop: '{{ smardigo_plattform_users }}' + +- name: "Get 
fingerprints for ssh_keys" + delegate_to: localhost + community.digitalocean.digital_ocean_sshkey_info: + oauth_token: "{{ digitalocean_authentication_token }}" + register: do_sshkeys_found + +- set_fact: + droplet_combined: "{{ droplet_defaults | combine(droplet) }}" + +- block: + - set_fact: + tag_service: "{{ droplet.name | regex_search('[a-z]+-([a-z]+)-[0-9]+','\\1') | first | string }}" + tag_stage: "{{ droplet.name | regex_search('([a-z]+)-[a-z]+-[0-9]+','\\1') | first | string }}" + + - name: "Create new droplet" + delegate_to: localhost + community.digitalocean.digital_ocean_droplet: + oauth_token: "{{ digitalocean_authentication_token }}" + state: "{{ droplet_combined.state | default('present') }}" + name: "{{ droplet_combined.name }}" + size: "{{ droplet_combined.size }}" + region: "{{ droplet_combined.region }}" + image: "{{ droplet_combined.image }}" + wait_timeout: "{{ droplet_combined.wait_timeout | default(500) }}" + unique_name: "{{ droplet_combined.unique_name | default(True) }}" + ssh_keys: "{{ do_sshkeys_found.data | json_query(querystring) }}" + register: new_droplet + vars: + querystring: "[*].fingerprint" + + - name: "Tag new server" + delegate_to: localhost + community.digitalocean.digital_ocean_tag: + oauth_token: "{{ digitalocean_authentication_token }}" + name: "{{ item | replace('-','_') }}" + resource_id: "{{ new_droplet.data.droplet.id }}" + state: present + register: tag_response + loop: "{{ droplet.tags }}" + + - name: "Set fact" + delegate_to: localhost + set_fact: + stage_server_ip: "{{ item }}" + loop: "{{ new_droplet.data | community.general.json_query(jsonquery_ipaddress) }}" + vars: + jsonquery_ipaddress: "droplet.networks.v4[?type=='public'].ip_address" + +# TODO: abolish _digitalocean/tasks/domain.yml + - name: "Create dns record for droplet" + delegate_to: localhost + community.digitalocean.digital_ocean_domain_record: + oauth_token: "{{ digitalocean_authentication_token }}" + state: present + domain: "{{ domain }}" + type: A 
+ name: "{{ new_droplet.data.droplet.name }}" + data: "{{ stage_server_ip }}" + force_update: yes diff --git a/roles/hcloud/tasks/configure-firewall2.yml b/roles/hcloud/tasks/configure-firewall2.yml new file mode 100644 index 0000000..a82963b --- /dev/null +++ b/roles/hcloud/tasks/configure-firewall2.yml @@ -0,0 +1,133 @@ +--- +- name: "Get all existing firewalls" + uri: + method: GET + url: "https://api.hetzner.cloud/v1/firewalls" + body_format: json + headers: + accept: application/json + authorization: Bearer {{ hetzner_authentication_token }} + status_code: [200] + register: hcloud_firewalls_all + delegate_to: 127.0.0.1 + become: false + +- name: "Setting hetzner firewall pagination count: <{{ hcloud_firewalls_all.json.meta.pagination.last_page }}>" + set_fact: + total_server_pages: "{{ hcloud_firewalls_all.json.meta.pagination.last_page }}" + become: false + tags: + - always + + +- name: "BLOCK << WITHOUT >> pagination" + block: + - set_fact: + lookup_fw_obj: "{{ hcloud_firewalls_all.json.firewalls | community.general.json_query(jsonquery_find_firewall_name) }}" + vars: + jsonquery_find_firewall_name: "[?name=='{{ firewall_object.name }}']" + when: + - total_server_pages == '1' + + +# Fetch every page in a loop (a looped register stores the responses under .results), flatten all pages into one firewall list, then filter by name. +- name: "<< WITH >> pagination" + block: + - name: "Get all existing firewalls" + uri: + method: GET + url: "https://api.hetzner.cloud/v1/firewalls?page={{ item }}" + body_format: json + headers: + accept: application/json + authorization: Bearer {{ hetzner_authentication_token }} + status_code: [200] + register: hcloud_firewalls_all + with_sequence: start=1 end={{ total_server_pages }} + delegate_to: 127.0.0.1 + become: false + + - set_fact: + lookup_fw_obj: "{{ hcloud_firewalls_all.results | community.general.json_query(querystr1) | community.general.json_query(querystr2) }}" + vars: + querystr1: "[].json.firewalls[]" + querystr2: "[?name=='{{ firewall_object.name }}']" + when: + - total_server_pages != '1' + + +- name: "Create firewall rule for <<{{ firewall_object.name }}>>" 
+ uri: + method: POST + url: "https://api.hetzner.cloud/v1/firewalls" + body_format: json + headers: + Content-Type: application/json + authorization: Bearer {{ hetzner_authentication_token }} + body: "{{ firewall_object | to_json }}" + return_content: yes + status_code: [201] + delegate_to: 127.0.0.1 + become: false + when: + - firewall_object.state == 'present' + - lookup_fw_obj | length == 0 + +- name: "Update firewall rule for <<{{ firewall_object.name }}>>" + uri: + method: PUT + url: "https://api.hetzner.cloud/v1/firewalls/{{ lookup_fw_obj.0.id }}" + body_format: json + headers: + Content-Type: application/json + authorization: Bearer {{ hetzner_authentication_token }} + body: "{{ firewall_object | to_json }}" + return_content: yes + status_code: [200] + delegate_to: 127.0.0.1 + become: false + when: + - firewall_object.state == 'present' + - lookup_fw_obj | length > 0 + +- name: "Delete firewall rule for <<{{ firewall_object.name }}>>" + block: + + - set_fact: + deactivate_fw_obj: + remove_from: "{{ firewall_object.apply_to }}" + + - name: "Step_1: Unset usage of firewall rule <<{{ firewall_object.name }}>>" + uri: + method: POST + url: "https://api.hetzner.cloud/v1/firewalls/{{ lookup_fw_obj.0.id }}/actions/remove_from_resources" + body_format: json + headers: + Content-Type: application/json + authorization: Bearer {{ hetzner_authentication_token }} + body: "{{ deactivate_fw_obj | to_json }}" + return_content: yes + status_code: [201] + delegate_to: 127.0.0.1 + become: false + + - name: "Step_2: Delete firewall rule for <<{{ firewall_object.name }}>>" + uri: + method: DELETE + url: "https://api.hetzner.cloud/v1/firewalls/{{ lookup_fw_obj.0.id }}" + body_format: json + headers: + Content-Type: application/json + authorization: Bearer {{ hetzner_authentication_token }} + return_content: yes + status_code: [204] + register: cleanup_firewall + delegate_to: 127.0.0.1 + become: false + until: cleanup_firewall.status in [204] + retries: 15 + delay: 10 + + when: + 
- firewall_object.state == 'absent' + - lookup_fw_obj | length > 0 diff --git a/stage-digitalocean b/stage-digitalocean new file mode 100644 index 0000000..580bb50 --- /dev/null +++ b/stage-digitalocean @@ -0,0 +1,11 @@ +[blackbox] +dev-blackbox-01 + +[stage_dev:children] +blackbox + +[all:children] +stage_dev + +[digitalocean:children] +stage_dev diff --git a/tasks/autodiscover_pre_tasks.yml b/tasks/autodiscover_pre_tasks.yml index 67af490..d1b8f7c 100644 --- a/tasks/autodiscover_pre_tasks.yml +++ b/tasks/autodiscover_pre_tasks.yml @@ -28,33 +28,31 @@ when: - total_server_pages == '1' -- name: "Reading hetzner server infos for stage <{{ stage }}> with pagination" - uri: - url: "https://api.hetzner.cloud/v1/servers?label_selector=stage={{ stage }}&page={{ item }}" - method: GET - headers: - authorization: Bearer {{ hetzner_authentication_token }} - register: hetzner_servers_results - with_sequence: start=1 end={{ total_server_pages }} - delegate_to: 127.0.0.1 - become: false - tags: - - always +- name: "Block << WITH >> pagination" + block: + - name: "Reading hetzner server infos for stage <{{ stage }}> with pagination" + uri: + url: "https://api.hetzner.cloud/v1/servers?label_selector=stage={{ stage }}&page={{ item }}" + method: GET + headers: + authorization: Bearer {{ hetzner_authentication_token }} + register: hetzner_servers_results + with_sequence: start=1 end={{ total_server_pages }} + delegate_to: 127.0.0.1 + become: false + + - name: "Reading hetzner server infos for stage <{{ stage }}> with pagination" + set_fact: + hetzner_servers: "{{ hetzner_servers_results.results | json_query(querystr1) | first | json_query(querystr2) | json_query(querystr2) }}" + vars: + querystr1: "[[*].json.servers]" + querystr2: "[]" + delegate_to: 127.0.0.1 + become: false when: - total_server_pages != '1' - -- name: "Reading hetzner server infos for stage <{{ stage }}> with pagination" - set_fact: - hetzner_servers: "{{ hetzner_servers_results.results | json_query(querystr1) | 
first | json_query(querystr2) | json_query(querystr2) }}" - vars: - querystr1: "[[*].json.servers]" - querystr2: "[]" - delegate_to: 127.0.0.1 - become: false tags: - always - when: - - total_server_pages != '1' - name: "Printing hetzner server infos for stage <{{ stage }}>" debug: diff --git a/templates/prometheus/config/grafana/provisioning/dashboards/BlackboxExporter.json b/templates/prometheus/config/grafana/provisioning/dashboards/BlackboxExporter.json new file mode 100644 index 0000000..3dc97ff --- /dev/null +++ b/templates/prometheus/config/grafana/provisioning/dashboards/BlackboxExporter.json @@ -0,0 +1,1087 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "Prometheus Blackbox Exporter Overview", + "editable": true, + "gnetId": 7587, + "graphTooltip": 0, + "id": 16, + "iteration": 1638378422339, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 138, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.2", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "probe_duration_seconds{instance=~\"$target\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + 
"legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Global Probe Duration", + "tooltip": { + "shared": true, + "sort": 1, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 15, + "panels": [], + "repeat": "target", + "title": "$target status", + "type": "row" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "text": "DOWN" + }, + "1": { + "text": "UP" + } + }, + "type": "value" + }, + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 1 + }, + { + "color": "#299c46", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 0, + "y": 9 + }, + "id": 2, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.1.2", + "repeat": null, + 
"repeatDirection": "v", + "targets": [ + { + "expr": "probe_success{instance=~\"$target\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Status", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 10, + "x": 4, + "y": 9 + }, + "hiddenSeries": false, + "id": 25, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.2", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "probe_http_duration_seconds{instance=~\"$target\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "{{ phase }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "HTTP Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 10, + "x": 14, + "y": 9 + }, + "hiddenSeries": false, + "id": 
17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.2", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "probe_duration_seconds{instance=~\"$target\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "seconds", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Probe Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "0": { + "text": "N/A" + }, + "1": { + "text": "YES" + } + }, + "type": "value" + }, + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 201 + }, + { + "color": "#d44a3a", + "value": 399 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 0, + "y": 11 + }, + 
"id": 20, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.1.2", + "repeat": null, + "repeatDirection": "h", + "targets": [ + { + "expr": "probe_http_status_code{instance=~\"$target\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "HTTP Status Code", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 0, + "y": 13 + }, + "id": 27, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.1.2", + "targets": [ + { + "expr": "probe_http_version{instance=~\"$target\"}", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "HTTP Version", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "text": "NO" + }, + "1": { + "text": "YES" + } + }, + "type": "value" + }, + { + "options": { + "match": "null", + "result": { + "text": 
"N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0 + }, + { + "color": "#299c46", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 0, + "y": 15 + }, + "id": 18, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.1.2", + "repeat": null, + "repeatDirection": "v", + "targets": [ + { + "expr": "probe_http_ssl{instance=~\"$target\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "SSL", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [ + { + "options": { + "0": { + "text": "NO" + }, + "1": { + "text": "YES" + } + }, + "type": "value" + }, + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0 + }, + { + "color": "#299c46", + "value": 1209600 + } + ] + }, + "unit": "dtdurations" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 10, + "x": 4, + "y": 15 + }, + "id": 19, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + 
}, + "pluginVersion": "8.1.2", + "repeat": null, + "repeatDirection": "h", + "targets": [ + { + "expr": "probe_ssl_earliest_cert_expiry{instance=~\"$target\"} - time()", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "refId": "A" + } + ], + "timeFrom": null, + "title": "SSL Expiry", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 14, + "y": 15 + }, + "id": 23, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.1.2", + "repeat": null, + "targets": [ + { + "expr": "avg(probe_duration_seconds{instance=~\"$target\"})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Average Probe Duration", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 19, + "y": 15 + }, + "id": 24, + "interval": null, + 
"links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.1.2", + "repeat": null, + "repeatDirection": "h", + "targets": [ + { + "expr": "avg(probe_dns_lookup_time_seconds{instance=~\"$target\"})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Average DNS Lookup", + "type": "stat" + } + ], + "refresh": "1m", + "schemaVersion": 30, + "style": "dark", + "tags": [ + "blackbox", + "prometheus" + ], + "templating": { + "list": [ + { + "auto": true, + "auto_count": 10, + "auto_min": "10s", + "current": { + "selected": false, + "text": "1m", + "value": "1m" + }, + "description": null, + "error": null, + "hide": 0, + "label": "Interval", + "name": "interval", + "options": [ + { + "selected": false, + "text": "auto", + "value": "$__auto_interval_interval" + }, + { + "selected": false, + "text": "5s", + "value": "5s" + }, + { + "selected": false, + "text": "10s", + "value": "10s" + }, + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": true, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "5s,10s,30s,1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "queryValue": "", + "refresh": 2, 
+ "skipUrlSync": false, + "type": "interval" + }, + { + "allValue": null, + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "Prometheus", + "definition": "", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "target", + "options": [], + "query": { + "query": "label_values(probe_success, instance)", + "refId": "Prometheus-target-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Prometheus Blackbox Exporter", + "uid": "N1eLnqtnz", + "version": 1 +} \ No newline at end of file diff --git a/templates/prometheus/config/prometheus/alert.rules.j2 b/templates/prometheus/config/prometheus/alert.rules.j2 index a324a67..3deee42 100644 --- a/templates/prometheus/config/prometheus/alert.rules.j2 +++ b/templates/prometheus/config/prometheus/alert.rules.j2 @@ -250,3 +250,13 @@ groups: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting backup duration took more than 30 minutes." description: "backup duration took more than 60 minutes." + + - alert: endpoint down + expr: probe_success == 0 + for: 2m + labels: + severity: critical + annotations: + identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' + summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> is down. plz check." 
+ description: "checks if endpoint is reachable or not"
diff --git a/templates/prometheus/config/prometheus/prometheus.yml.j2 b/templates/prometheus/config/prometheus/prometheus.yml.j2
index 3ffb78b..05cb46b 100644
--- a/templates/prometheus/config/prometheus/prometheus.yml.j2
+++ b/templates/prometheus/config/prometheus/prometheus.yml.j2
@@ -26,7 +26,7 @@ alerting:
 scrape_configs:

 ##############################################
-### Apllications ####
+### Applications ####
 ##############################################

   - job_name: 'prometheus'
@@ -336,6 +336,33 @@ scrape_configs:
         target_label: __address__
         replacement: $1

+  # Scrapes port 9100 on the blackbox exporter host. The first relabel rule
+  # rewrites the job label to 'node-exporter'.
+  - job_name: 'node-exporter-external-monitoring(digitalocean)'
+    scheme: http
+    metrics_path: '/metrics'
+    static_configs:
+      - targets:
+          - '{{ blackbox_exporter_fqdn }}:9100'
+        labels:
+          env: '{{ stage }}'
+          project: smardigo
+    relabel_configs:
+      - source_labels: [job]
+        target_label: job
+        replacement: 'node-exporter'
+      # NOTE(review): the two '!' rules below only fire for targets written as
+      # 'address!instance'. The static target above contains no '!', so they
+      # look like no-ops for this job - confirm before removing them.
+      - source_labels: [__address__]
+        regex: .*!(.*)
+        target_label: instance
+        replacement: $1
+      - source_labels: [__address__]
+        regex: (.*)!.*
+        target_label: __address__
+        replacement: $1
+
 ##############################################
 ### Databases ####
 ##############################################
@@ -416,3 +443,27 @@ scrape_configs:
         regex: (.*)!.*
         target_label: __address__
         replacement: $1
+##############################################
+### blackbox ####
+##############################################
+  - job_name: 'blackbox_smardigo'
+    metrics_path: /probe
+    params:
+      module: [http_2xx]
+    static_configs:
+      # to_json renders the combined list as a flow sequence with safe quoting.
+      - targets: {{ (blackbox_http_2xx_targets + blackbox_http_2xx_additional_targets) | to_json }}
+        labels:
+          env: '{{ stage }}'
+          project: smardigo
+    relabel_configs:
+      # The probed URL is copied into the target request parameter and the
+      # instance label; the scrape itself is then sent to the exporter.
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      # NOTE(review): the dig lookup is evaluated when the template is rendered;
+      # presumably the exporter IP is stable - confirm, else re-render on change.
+      - target_label: __address__
+        replacement: "{{ lookup('community.general.dig', blackbox_exporter_fqdn ) }}:9115"