DEV-543: integrated DO-blackbox VM into DEV-patchday + increased threshold for...

feature/DEV-655
Görz, Friedrich 3 years ago committed by Hoan To
parent a0ff9a5d8e
commit ffb3aa2122

@@ -305,6 +305,25 @@ run-patchday-dev:
rules: rules:
- if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "main" - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "main"
# Scheduled patchday run against the DigitalOcean stage inventory.
# Mirrors run-patchday-dev, but targets the `stage-digitalocean` inventory
# (DO-blackbox VM integration, DEV-543) instead of extending .run-patchday.
run-patchday-dev-digitalocean:
  stage: run-patchday
  before_script:
    # Materialize the vault password for --vault-password-file below.
    - echo "${ANSIBLE_VAULT_PASS_DEV}" > /tmp/vault-pass
  script:
    # Ensure an SSH client exists in the job image (Debian/Ubuntu based).
    - 'command -v ssh-agent >/dev/null || ( apt-get update -y && apt-get install openssh-client -y )'
    - eval $(ssh-agent -s)
    # CI variable holds the private key; strip CR in case it was pasted with CRLF.
    - 'echo "$GITLAB_SSH_KEY" | tr -d "\r" | ssh-add -'
    - mkdir -p ~/.ssh
    - chmod 0700 ~/.ssh
    # Inside a container there is no persistent known_hosts; disable host key checking.
    - '[[ -f /.dockerenv ]] && echo -e "Host *\n\tStrictHostKeyChecking no\n\n" >> ~/.ssh/config'
    # Log the loaded key fingerprint for debugging.
    - ssh-add -L
    - ansible-playbook -i stage-digitalocean patchday.yml --vault-password-file=/tmp/vault-pass -u gitlabci
  after_script:
    # -f: after_script runs even when before_script failed, so the file may
    # not exist; a bare `rm` would then mark the teardown as failed.
    - rm -f /tmp/vault-pass
  timeout: 2h
  rules:
    - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "main"
run-patchday-qa: run-patchday-qa:
extends: .run-patchday extends: .run-patchday
resource_group: qa resource_group: qa

@@ -9,19 +9,20 @@
start: '{{ ansible_date_time.epoch }}' start: '{{ ansible_date_time.epoch }}'
tasks: tasks:
- set_fact: - name: "Set VAR for silence start and end"
startsAt: "{{ '%Y-%m-%d %H:%M:%S' | strftime(start) }}" set_fact:
endsAt: "{{ '%Y-%m-%d %H:%M:%S' | strftime( ( start | int ) + 3600 |int ) }}" silence_starts_at: "{{ '%Y-%m-%d %H:%M:%S' | strftime(start) }}"
silence_ends_at: "{{ '%Y-%m-%d %H:%M:%S' | strftime( ( start | int ) + 3600 |int ) }}"
- name: "set fact" - name: "Set VAR - define prometheus silence object"
set_fact: set_fact:
silence: silence:
matchers: matchers:
- name: job - name: job
value: .+ value: .+
isRegex: true isRegex: true
startsAt: '{{ startsAt }}' silence_starts_at: '{{ silence_starts_at }}'
endsAt: '{{ endsAt }}' silence_ends_at: '{{ silence_ends_at }}'
createdBy: patchday-automatism createdBy: patchday-automatism
comment: patchday comment: patchday
id: id:
@@ -35,7 +36,8 @@
Content-Type: application/json Content-Type: application/json
body_format: json body_format: json
body: '{{ silence | to_json }}' body: '{{ silence | to_json }}'
ignore_errors: yes ignore_errors: yes # noqa ignore-errors
# if failing, patchday continues
- hosts: elastic - hosts: elastic
serial: 1 serial: 1
@@ -190,6 +192,12 @@
serial: 10 serial: 10
become: yes become: yes
tasks: tasks:
# Resolve this host's public IP by a DNS A lookup on "<inventory_hostname>.<domain>"
# via the community.general.dig lookup plugin, and store it as stage_server_ip.
# Only runs for hosts in the 'blackbox' group (DO-blackbox VM integration).
# NOTE(review): requires dnspython on the controller for the dig lookup — confirm.
- name: "set VAR"
set_fact:
stage_server_ip: "{{ lookup('community.general.dig', inventory_hostname + '.' + domain ) }}"
when:
- "'blackbox' in group_names"
- name: "Smardigo Patchday: update pkgs" - name: "Smardigo Patchday: update pkgs"
ansible.builtin.apt: ansible.builtin.apt:
upgrade: yes upgrade: yes
@@ -231,6 +239,42 @@
state: present state: present
loop: '{{ docker_compose_services.files }}' loop: '{{ docker_compose_services.files }}'
# Health-check the SMA-portal after the docker-compose services were brought up;
# if it never becomes reachable, bounce (down + up) all compose projects once.
- name: "Ensure SMA-portal is up and running"
# there is a hard dependency within SMA-portal (VM: <<stage>>-management-01) during application start process
# to iam-instance (VM: <<stage>>-iam-01)
# grouped tasks within ansible block statement is just a work around until smardigo-app depending problem will be fixed
#
# ATTENTION: iam-server must be up and running => SMA-portal will be restarted and will finished successfully its application start process
block:
# Poll the portal's prometheus management endpoint up to 5 times, 10s apart.
# Any final failure (non-200 or connection error) falls through to `rescue`.
- name: "Check SMA-portal if reachable"
become: no
uri:
url: "https://{{ stage }}-management-01-connect.{{ domain }}:{{ admin_port_service }}/management/prometheus"
method: GET
# NOTE(review): status_code [200] is redundant with the `until` condition
# below — both accept exactly 200.
status_code: [200]
register: sma_portal_avail
delay: 10
retries: 5
# NOTE(review): no_log suppresses the HTTP response in failure output,
# which makes diagnosing an unreachable portal harder — confirm it is
# needed here (the request carries no obvious secret).
no_log: true
until: sma_portal_avail.status in [200]
rescue:
# Portal did not come up: stop every compose project found earlier ...
- name: "Smardigo Patchday: SMA-portal not reachable - shutdown services"
community.docker.docker_compose:
project_src: '{{ item.path | dirname }}'
state: absent
loop: '{{ docker_compose_services.files }}'
# ... then start them again so SMA-portal restarts after iam is up.
# NOTE(review): no re-check after the restart — success is assumed.
- name: "Smardigo Patchday: SMA-portal not reachable - start services again"
community.docker.docker_compose:
project_src: '{{ item.path | dirname }}'
state: present
loop: '{{ docker_compose_services.files }}'
# Whole block only applies to the management host(s).
when:
- "'management' in inventory_hostname"
- hosts: k8s_cluster - hosts: k8s_cluster
serial: 1 serial: 1
become: yes become: yes

@@ -302,19 +302,19 @@ groups:
- alert: offsite backup pending - alert: offsite backup pending
expr: | expr: |
(time() - offsite_backup_archive_started_seconds) > 25 * 3600 or (time() - offsite_backup_archive_started_seconds) > 27 * 3600 or
(time() - offsite_backup_archive_ended_seconds) > 25 * 3600 or (time() - offsite_backup_archive_ended_seconds) > 27 * 3600 or
(time() - offsite_backup_transfer_started_seconds) > 25 * 3600 or (time() - offsite_backup_transfer_started_seconds) > 27 * 3600 or
(time() - offsite_backup_transfer_ended_seconds) > 25 * 3600 or (time() - offsite_backup_transfer_ended_seconds) > 27 * 3600 or
(time() - offsite_backup_forget_started_seconds) > 25 * 3600 or (time() - offsite_backup_forget_started_seconds) > 27 * 3600 or
(time() - offsite_backup_forget_ended_seconds) > 25 * 3600 (time() - offsite_backup_forget_ended_seconds) > 27 * 3600
for: 1m for: 1m
labels: labels:
severity: critical severity: critical
annotations: annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}' identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "one or more offsite backup metric timestamps for Instance <{{ '{{' }} $labels.instance {{ '}}' }}> older then 25h" summary: "one or more offsite backup metric timestamps for Instance <{{ '{{' }} $labels.instance {{ '}}' }}> older then 27h"
description: "offsite backups older then 25h." description: "offsite backups older then 27h."
- alert: offsite backup metrics unavailable - alert: offsite backup metrics unavailable
expr: | expr: |

Loading…
Cancel
Save