diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b3205b9..327ff92 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -305,6 +305,25 @@ run-patchday-dev: rules: - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "main" +run-patchday-dev-digitalocean: + stage: run-patchday + before_script: + - echo "${ANSIBLE_VAULT_PASS_DEV}" > /tmp/vault-pass + script: + - 'command -v ssh-agent >/dev/null || ( apt-get update -y && apt-get install openssh-client -y )' + - eval $(ssh-agent -s) + - 'echo "$GITLAB_SSH_KEY" | tr -d "\r" | ssh-add -' + - mkdir -p ~/.ssh + - chmod 0700 ~/.ssh + - '[[ -f /.dockerenv ]] && echo -e "Host *\n\tStrictHostKeyChecking no\n\n" >> ~/.ssh/config' + - ssh-add -L + - ansible-playbook -i stage-digitalocean patchday.yml --vault-password-file=/tmp/vault-pass -u gitlabci + after_script: + - rm /tmp/vault-pass + timeout: 2h + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "main" + run-patchday-qa: extends: .run-patchday resource_group: qa diff --git a/patchday.yml b/patchday.yml index 7ad45e7..84b70ca 100644 --- a/patchday.yml +++ b/patchday.yml @@ -9,19 +9,20 @@ start: '{{ ansible_date_time.epoch }}' tasks: - - set_fact: - startsAt: "{{ '%Y-%m-%d %H:%M:%S' | strftime(start) }}" - endsAt: "{{ '%Y-%m-%d %H:%M:%S' | strftime( ( start | int ) + 3600 |int ) }}" + - name: "Set VAR for silence start and end" + set_fact: + silence_starts_at: "{{ '%Y-%m-%d %H:%M:%S' | strftime(start) }}" + silence_ends_at: "{{ '%Y-%m-%d %H:%M:%S' | strftime( ( start | int ) + 3600 |int ) }}" - - name: "set fact" + - name: "Set VAR - define prometheus silence object" set_fact: silence: matchers: - - name: job - value: .+ - isRegex: true - startsAt: '{{ startsAt }}' - endsAt: '{{ endsAt }}' + - name: job + value: .+ + isRegex: true + startsAt: '{{ silence_starts_at }}' + endsAt: '{{ silence_ends_at }}' createdBy: patchday-automatism comment: patchday id: @@ -35,7 +36,8 @@ Content-Type: application/json body_format: json
body: '{{ silence | to_json }}' - ignore_errors: yes + ignore_errors: yes # noqa ignore-errors + # if failing, patchday continues - hosts: elastic serial: 1 @@ -149,20 +151,20 @@ - name: "Smardigo Patchday: error-handling - ensure postgres started and check listing on net internal ip address" block: - - name: "Smardigo Patchday: error-handling - ensure service(s) started" - ansible.builtin.systemd: - name: postgresql - state: started - - - name: "Smardigo Patchday: error-handling - check if postgres is listing on net internal ip address" - become: no - community.postgresql.postgresql_ping: - port: 5432 - ssl_mode: require - login_host: '{{ stage_private_server_ip }}' - register: check_postgres_again - retries: 5 - failed_when: not check_postgres_again.is_available + - name: "Smardigo Patchday: error-handling - ensure service(s) started" + ansible.builtin.systemd: + name: postgresql + state: started + + - name: "Smardigo Patchday: error-handling - check if postgres is listening on net internal ip address" + become: no + community.postgresql.postgresql_ping: + port: 5432 + ssl_mode: require + login_host: '{{ stage_private_server_ip }}' + register: check_postgres_again + retries: 5 + failed_when: not check_postgres_again.is_available rescue: - name: "Smardigo Patchday: error-handling - send mail to DEVOPS-DL" @@ -190,6 +192,12 @@ serial: 10 become: yes tasks: + - name: "set VAR" + set_fact: + stage_server_ip: "{{ lookup('community.general.dig', inventory_hostname + '.'
+ domain ) }}" + when: + - "'blackbox' in group_names" + - name: "Smardigo Patchday: update pkgs" ansible.builtin.apt: upgrade: yes @@ -231,6 +239,42 @@ state: present loop: '{{ docker_compose_services.files }}' + - name: "Ensure SMA-portal is up and running" + # there is a hard dependency within SMA-portal (VM: <>-management-01) during application start process + # to iam-instance (VM: <>-iam-01) + # grouped tasks within ansible block statement is just a workaround until the smardigo-app dependency problem is fixed + # + # ATTENTION: iam-server must be up and running => SMA-portal will be restarted and will successfully finish its application start process + block: + - name: "Check SMA-portal if reachable" + become: no + uri: + url: "https://{{ stage }}-management-01-connect.{{ domain }}:{{ admin_port_service }}/management/prometheus" + method: GET + status_code: [200] + register: sma_portal_avail + delay: 10 + retries: 5 + no_log: true + until: sma_portal_avail.status in [200] + + rescue: + - name: "Smardigo Patchday: SMA-portal not reachable - shutdown services" + community.docker.docker_compose: + project_src: '{{ item.path | dirname }}' + state: absent + loop: '{{ docker_compose_services.files }}' + + - name: "Smardigo Patchday: SMA-portal not reachable - start services again" + community.docker.docker_compose: + project_src: '{{ item.path | dirname }}' + state: present + loop: '{{ docker_compose_services.files }}' + + when: + - "'management' in inventory_hostname" + + - hosts: k8s_cluster serial: 1 become: yes diff --git a/templates/prometheus/config/prometheus/alert.rules.j2 b/templates/prometheus/config/prometheus/alert.rules.j2 index 18c966a..7bc7b0d 100644 --- a/templates/prometheus/config/prometheus/alert.rules.j2 +++ b/templates/prometheus/config/prometheus/alert.rules.j2 @@ -302,19 +302,19 @@ groups: - alert: offsite backup pending expr: | - (time() - offsite_backup_archive_started_seconds) > 25 * 3600 or - (time() -
offsite_backup_archive_ended_seconds) > 25 * 3600 or - (time() - offsite_backup_transfer_started_seconds) > 25 * 3600 or - (time() - offsite_backup_transfer_ended_seconds) > 25 * 3600 or - (time() - offsite_backup_forget_started_seconds) > 25 * 3600 or - (time() - offsite_backup_forget_ended_seconds) > 25 * 3600 + (time() - offsite_backup_archive_started_seconds) > 27 * 3600 or + (time() - offsite_backup_archive_ended_seconds) > 27 * 3600 or + (time() - offsite_backup_transfer_started_seconds) > 27 * 3600 or + (time() - offsite_backup_transfer_ended_seconds) > 27 * 3600 or + (time() - offsite_backup_forget_started_seconds) > 27 * 3600 or + (time() - offsite_backup_forget_ended_seconds) > 27 * 3600 for: 1m labels: severity: critical annotations: identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' - summary: "one or more offsite backup metric timestamps for Instance <{{ '{{' }} $labels.instance {{ '}}' }}> older then 25h" - description: "offsite backups older then 25h." + summary: "one or more offsite backup metric timestamps for Instance <{{ '{{' }} $labels.instance {{ '}}' }}> older than 27h" + description: "offsite backups older than 27h." - alert: offsite backup metrics unavailable expr: |