DEV-543: integrated DO-blackbox VM into DEV-patchday + increased threshold for...

feature/DEV-655
Görz, Friedrich 3 years ago committed by Hoan To
parent a0ff9a5d8e
commit ffb3aa2122

@@ -305,6 +305,25 @@ run-patchday-dev:
rules: rules:
- if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "main" - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "main"
# Scheduled patchday run against the DigitalOcean stage inventory.
# Mirrors run-patchday-dev, but targets the `stage-digitalocean` inventory
# (DO-blackbox VM integration, DEV-543) instead of extending .run-patchday.
run-patchday-dev-digitalocean:
  stage: run-patchday
  before_script:
    # Materialize the vault password for --vault-password-file below.
    - echo "${ANSIBLE_VAULT_PASS_DEV}" > /tmp/vault-pass
  script:
    # Ensure an SSH client exists in the job image (Debian/Ubuntu based).
    - 'command -v ssh-agent >/dev/null || ( apt-get update -y && apt-get install openssh-client -y )'
    - eval $(ssh-agent -s)
    # CI variable holds the private key; strip CR in case it was pasted with CRLF.
    - 'echo "$GITLAB_SSH_KEY" | tr -d "\r" | ssh-add -'
    - mkdir -p ~/.ssh
    - chmod 0700 ~/.ssh
    # Inside a container there is no persistent known_hosts; disable host key checking.
    - '[[ -f /.dockerenv ]] && echo -e "Host *\n\tStrictHostKeyChecking no\n\n" >> ~/.ssh/config'
    # Log the loaded key fingerprint for debugging.
    - ssh-add -L
    - ansible-playbook -i stage-digitalocean patchday.yml --vault-password-file=/tmp/vault-pass -u gitlabci
  after_script:
    # -f: after_script runs even when before_script failed, so the file may
    # not exist; a bare `rm` would then mark the teardown as failed.
    - rm -f /tmp/vault-pass
  timeout: 2h
  rules:
    - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "main"
run-patchday-qa: run-patchday-qa:
extends: .run-patchday extends: .run-patchday
resource_group: qa resource_group: qa

@@ -9,19 +9,20 @@
start: '{{ ansible_date_time.epoch }}' start: '{{ ansible_date_time.epoch }}'
tasks: tasks:
- set_fact: - name: "Set VAR for silence start and end"
startsAt: "{{ '%Y-%m-%d %H:%M:%S' | strftime(start) }}" set_fact:
endsAt: "{{ '%Y-%m-%d %H:%M:%S' | strftime( ( start | int ) + 3600 |int ) }}" silence_starts_at: "{{ '%Y-%m-%d %H:%M:%S' | strftime(start) }}"
silence_ends_at: "{{ '%Y-%m-%d %H:%M:%S' | strftime( ( start | int ) + 3600 |int ) }}"
- name: "set fact" - name: "Set VAR - define prometheus silence object"
set_fact: set_fact:
silence: silence:
matchers: matchers:
- name: job - name: job
value: .+ value: .+
isRegex: true isRegex: true
startsAt: '{{ startsAt }}' silence_starts_at: '{{ silence_starts_at }}'
endsAt: '{{ endsAt }}' silence_ends_at: '{{ silence_ends_at }}'
createdBy: patchday-automatism createdBy: patchday-automatism
comment: patchday comment: patchday
id: id:
@@ -35,7 +36,8 @@
Content-Type: application/json Content-Type: application/json
body_format: json body_format: json
body: '{{ silence | to_json }}' body: '{{ silence | to_json }}'
ignore_errors: yes ignore_errors: yes # noqa ignore-errors
# if failing, patchday continues
- hosts: elastic - hosts: elastic
serial: 1 serial: 1
@@ -190,6 +192,12 @@
serial: 10 serial: 10
become: yes become: yes
tasks: tasks:
# Resolve this host's public IP by a DNS A lookup on "<inventory_hostname>.<domain>"
# via the community.general.dig lookup plugin, and store it as stage_server_ip.
# Only runs for hosts in the 'blackbox' group (DO-blackbox VM integration).
# NOTE(review): requires dnspython on the controller for the dig lookup — confirm.
- name: "set VAR"
set_fact:
stage_server_ip: "{{ lookup('community.general.dig', inventory_hostname + '.' + domain ) }}"
when:
- "'blackbox' in group_names"
- name: "Smardigo Patchday: update pkgs" - name: "Smardigo Patchday: update pkgs"
ansible.builtin.apt: ansible.builtin.apt:
upgrade: yes upgrade: yes
@@ -231,6 +239,42 @@
state: present state: present
loop: '{{ docker_compose_services.files }}' loop: '{{ docker_compose_services.files }}'
# Health-check the SMA-portal after the docker-compose services were brought up;
# if it never becomes reachable, bounce (down + up) all compose projects once.
- name: "Ensure SMA-portal is up and running"
# there is a hard dependency within SMA-portal (VM: <<stage>>-management-01) during application start process
# to iam-instance (VM: <<stage>>-iam-01)
# grouped tasks within ansible block statement is just a work around until smardigo-app depending problem will be fixed
#
# ATTENTION: iam-server must be up and running => SMA-portal will be restarted and will finished successfully its application start process
block:
# Poll the portal's prometheus management endpoint up to 5 times, 10s apart.
# Any final failure (non-200 or connection error) falls through to `rescue`.
- name: "Check SMA-portal if reachable"
become: no
uri:
url: "https://{{ stage }}-management-01-connect.{{ domain }}:{{ admin_port_service }}/management/prometheus"
method: GET
# NOTE(review): status_code [200] is redundant with the `until` condition
# below — both accept exactly 200.
status_code: [200]
register: sma_portal_avail
delay: 10
retries: 5
# NOTE(review): no_log suppresses the HTTP response in failure output,
# which makes diagnosing an unreachable portal harder — confirm it is
# needed here (the request carries no obvious secret).
no_log: true
until: sma_portal_avail.status in [200]
rescue:
# Portal did not come up: stop every compose project found earlier ...
- name: "Smardigo Patchday: SMA-portal not reachable - shutdown services"
community.docker.docker_compose:
project_src: '{{ item.path | dirname }}'
state: absent
loop: '{{ docker_compose_services.files }}'
# ... then start them again so SMA-portal restarts after iam is up.
# NOTE(review): no re-check after the restart — success is assumed.
- name: "Smardigo Patchday: SMA-portal not reachable - start services again"
community.docker.docker_compose:
project_src: '{{ item.path | dirname }}'
state: present
loop: '{{ docker_compose_services.files }}'
# Whole block only applies to the management host(s).
when:
- "'management' in inventory_hostname"
- hosts: k8s_cluster - hosts: k8s_cluster
serial: 1 serial: 1
become: yes become: yes

@@ -302,19 +302,19 @@ groups:
- alert: offsite backup pending - alert: offsite backup pending
expr: | expr: |
(time() - offsite_backup_archive_started_seconds) > 25 * 3600 or (time() - offsite_backup_archive_started_seconds) > 27 * 3600 or
(time() - offsite_backup_archive_ended_seconds) > 25 * 3600 or (time() - offsite_backup_archive_ended_seconds) > 27 * 3600 or
(time() - offsite_backup_transfer_started_seconds) > 25 * 3600 or (time() - offsite_backup_transfer_started_seconds) > 27 * 3600 or
(time() - offsite_backup_transfer_ended_seconds) > 25 * 3600 or (time() - offsite_backup_transfer_ended_seconds) > 27 * 3600 or
(time() - offsite_backup_forget_started_seconds) > 25 * 3600 or (time() - offsite_backup_forget_started_seconds) > 27 * 3600 or
(time() - offsite_backup_forget_ended_seconds) > 25 * 3600 (time() - offsite_backup_forget_ended_seconds) > 27 * 3600
for: 1m for: 1m
labels: labels:
severity: critical severity: critical
annotations: annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "one or more offsite backup metric timestamps for Instance <{{ '{{' }} $labels.instance {{ '}}' }}> older then 25h" summary: "one or more offsite backup metric timestamps for Instance <{{ '{{' }} $labels.instance {{ '}}' }}> older then 27h"
description: "offsite backups older then 25h." description: "offsite backups older then 27h."
- alert: offsite backup metrics unavailable - alert: offsite backup metrics unavailable
expr: | expr: |

Loading…
Cancel
Save