---
### tags:
###   check_elastic_cluster

- hosts: prometheus
  vars:
    start: '{{ ansible_date_time.epoch }}'
  tasks:
    - name: "Silence alerts for patchday"
      block:
        - name: "Set VAR for silence start and end"
          set_fact:
            silence_starts_at: "{{ '%Y-%m-%d %H:%M:%S' | strftime(start) }}"
            silence_ends_at: "{{ '%Y-%m-%d %H:%M:%S' | strftime((start | int) + 3600) }}"
        - name: "Set VAR - define prometheus silence object"
          set_fact:
            silence:
              matchers:
                - name: job
                  value: .+
                  isRegex: true
              startsAt: '{{ silence_starts_at }}'
              endsAt: '{{ silence_ends_at }}'
              createdBy: patchday-automatism
              comment: patchday
              id:
        - name: "Schedule silences for stage..."
          uri:
            url: "https://{{ stage }}-prometheus-01-alertmanager.smardigo.digital/api/v2/silences"
            url_username: "{{ alertmanager_admin_username }}"
            url_password: "{{ alertmanager_admin_password }}"
            method: POST
            status_code: [200]
            headers:
              Content-Type: application/json
            body_format: json
            body: '{{ silence | to_json }}'
      rescue:
        - name: "Rescue silencing - sending mail to DEVOPS-DL"
          delegate_to: '{{ stage }}-mail-01'
          community.general.mail:
            host: localhost
            port: 25
            to: '{{ devops_email_address }}'
            subject: "patchday( {{ lookup('pipe','date +%Y-%m-%d_%H:%M') }} ) problem report for failed silencing"
            body: |
              Dear Sir or Madam,

              silencing alerts for patchday failed.
              Plz check what happened/ fix it little padawan ;)

              kind regards,
              your automation-bofh

- hosts: elastic
  serial: 1
  become: yes
  tasks:
    - name: "Smardigo Patchday: update pkgs"
      ansible.builtin.apt:
        upgrade: yes
        update_cache: yes
        autoremove: yes
        autoclean: yes
    - name: "Smardigo Patchday: find docker_compose.yml files"
      ansible.builtin.find:
        paths: '{{ service_base_path }}'
        pattern: 'docker*.yml'
        recurse: yes
      register: docker_compose_services
    - name: "Smardigo Patchday: shutdown services"
      community.docker.docker_compose:
        project_src: '{{ item.path | dirname }}'
        state: absent
      loop: '{{ docker_compose_services.files }}'
    - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
      ansible.builtin.reboot:
        post_reboot_delay: 30
        reboot_timeout: 300
    - name: "Smardigo Patchday: wait_for host after reboot"
      become: no
      delegate_to: localhost
      ansible.builtin.wait_for:
        delay: 15
        timeout: 180
        port: 22
        host: '{{ stage_server_ip }}'
        search_regex: OpenSSH
    - name: "Smardigo Patchday: start services"
      community.docker.docker_compose:
        project_src: '{{ item.path | dirname }}'
        state: present
      loop: '{{ docker_compose_services.files }}'
    - name: "Smardigo Patchday: wait until cluster is green"
      ansible.builtin.uri:
        url: "https://localhost:9200/_cluster/health"
        user: "{{ elastic_admin_username }}"
        password: "{{ elastic_admin_password }}"
        force_basic_auth: true
        status_code: 200
        ca_path: "{{ service_base_path }}/{{ elastic_id }}/certs/ca/ca.crt"
      register: check_elastic_cluster
      delay: 30
      retries: 30
      until:
        - check_elastic_cluster.json is defined
        - check_elastic_cluster.json.status == 'green'
      no_log: true
      tags:
        - check_elastic_cluster
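# Usage note: because the green-check above is tagged, it can be run on its
# own to verify cluster health without patching anything. A minimal sketch,
# assuming this playbook is saved as patchday.yml (hypothetical file name)
# and your usual inventory:
#
#   ansible-playbook -i <inventory> patchday.yml --tags check_elastic_cluster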
"Smardigo Patchday: start services" ansible.builtin.systemd: name: postgresql state: started # wait_for cannot be used anymore due to enabled SSL encryption for postgres connections in DEV-382 - name: "Smardigo Patchday: check if postgres is listing on net internal ip address" become: no community.postgresql.postgresql_ping: port: 5432 ssl_mode: require login_host: '{{ stage_private_server_ip }}' register: check_postgres ignore_errors: yes - name: "Smardigo Patchday: error-handling - ensure postgres started and check listing on net internal ip address" block: - name: "Smardigo Patchday: error-handling - ensure service(s) started" ansible.builtin.systemd: name: postgresql state: started - name: "Smardigo Patchday: error-handling - check if postgres is listing on net internal ip address" become: no community.postgresql.postgresql_ping: port: 5432 ssl_mode: require login_host: '{{ stage_private_server_ip }}' register: check_postgres_again retries: 5 failed_when: not check_postgres_again.is_available rescue: - name: "Smardigo Patchday: error-handling - send mail to DEVOPS-DL" delegate_to: '{{ stage }}-mail-01' community.general.mail: host: localhost port: 25 to: '{{ devops_email_address }}' subject: "patchday( {{ lookup('pipe','date +%Y-%m-%d_%H:%M') }} ) problem report for {{ inventory_hostname }}" body: | Dear Sir or Madam, I have to inform you that {{ inventory_hostname }} isn'n listening on {{ stage_private_server_ip }} anymore. Plz check what happened/ fix it little padawan ;) kind regards, your automation-bofh when: - not check_postgres.is_available # due to bloody dependencies in SMA application startup, iam must be available during startup # => patching IAM service outsourced in separate part to make sure that is up and running - hosts: iam serial: 10 become: yes tasks: - name: "Smardigo Patchday: update pkgs" ansible.builtin.apt: upgrade: yes update_cache: yes autoremove: yes autoclean: yes - name: "Smardigo Patchday: find docker_compose.yml files" ansible.builtin.find: paths: '{{ service_base_path }}' pattern: 'docker*.yml' recurse: yes register: docker_compose_services - name: "Smardigo Patchday: shutdown services" community.docker.docker_compose: project_src: '{{ item.path | dirname }}' state: absent loop: '{{ docker_compose_services.files }}' - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>" ansible.builtin.reboot: post_reboot_delay: 30 reboot_timeout: 300 - name: "Smardigo Patchday: wait_for host after reboot" become: no delegate_to: localhost ansible.builtin.wait_for: delay: 15 timeout: 180 port: 22 host: '{{ stage_server_ip }}' search_regex: OpenSSH - name: "Smardigo Patchday: start services" community.docker.docker_compose: project_src: '{{ item.path | dirname }}' state: present loop: '{{ docker_compose_services.files }}' - hosts: all,!elastic,!postgres,!k8s_cluster,!iam,!restore serial: 10 become: yes tasks: - name: "set VAR" set_fact: stage_server_ip: "{{ lookup('community.general.dig', inventory_hostname + '.' 
+ domain ) }}" when: - "'blackbox' in group_names" - name: "Smardigo Patchday: update pkgs" ansible.builtin.apt: upgrade: yes update_cache: yes autoremove: yes autoclean: yes - name: "Smardigo Patchday: find docker_compose.yml files" ansible.builtin.find: paths: '{{ service_base_path }}' pattern: 'docker*.yml' recurse: yes register: docker_compose_services - name: "Smardigo Patchday: shutdown services" community.docker.docker_compose: project_src: '{{ item.path | dirname }}' state: absent loop: '{{ docker_compose_services.files }}' - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>" ansible.builtin.reboot: post_reboot_delay: 30 reboot_timeout: 300 - name: "Smardigo Patchday: wait_for host after reboot" become: no delegate_to: localhost ansible.builtin.wait_for: delay: 15 timeout: 180 port: 22 host: '{{ stage_server_ip }}' search_regex: OpenSSH - name: "Smardigo Patchday: start services" community.docker.docker_compose: project_src: '{{ item.path | dirname }}' state: present loop: '{{ docker_compose_services.files }}' - name: "Ensure SMA-portal is up and running" # there is a hard dependency within SMA-portal (VM: <>-management-01) during application start process # to iam-instance (VM: <>-iam-01) # grouped tasks within ansible block statement is just a work around until smardigo-app depending problem will be fixed # # ATTENTION: iam-server must be up and running => SMA-portal will be restarted and will finished successfully its application start process block: - name: "Check SMA-portal if reachable" become: no uri: url: "https://{{ stage }}-management-01-connect.{{ domain }}:{{ admin_port_service }}/management/prometheus" method: GET status_code: [200] register: sma_portal_avail delay: 10 retries: 5 no_log: true until: sma_portal_avail.status in [200] rescue: - name: "Check SMA-portal dependency << iam-instance >>is reachable" become: no uri: url: "https://{{ stage }}-iam-01.{{ domain }}/api/v1/roles" method: GET status_code: [403] register: iam_avail delay: 10 retries: 10 no_log: true until: iam.status in [403] ignore_errors: yes # noqa ignore-errors # patchday continues ion case of failed request towards iam service; # iam service is hard dependency for SMA-portal-instance but not for # patchday itself - it;s just a work around - name: "Smardigo Patchday: SMA-portal not reachable - shutdown services" community.docker.docker_compose: project_src: '{{ item.path | dirname }}' state: absent loop: '{{ docker_compose_services.files }}' - name: "Smardigo Patchday: SMA-portal not reachable - start services again" community.docker.docker_compose: project_src: '{{ item.path | dirname }}' state: present loop: '{{ docker_compose_services.files }}' when: - "'management' in inventory_hostname" - hosts: k8s_cluster serial: 1 become: yes tasks: # draining the hard way # due to force( delete static pods) + relative short terminate_grace_period + # --delete-local-data to kick pods with emptyDir # # ATTENTION: needs to be done via command instead of kubernetes module # due to missing flag for --delete-emptydir-data # ¯\_(ツ)_/¯ - name: "Smardigo Patchday: drain node" delegate_to: "{{ groups['kube_control_plane'][0] }}" ansible.builtin.command: "/usr/local/bin/kubectl drain --timeout 2m --ignore-daemonsets --force --delete-emptydir-data {{ inventory_hostname | lower }}" register: node_drained until: node_drained retries: 3 delay: 30 failed_when: false - name: "Smardigo Patchday: stop k8s basic services" ansible.builtin.systemd: name: '{{ item }}' state: stopped loop: '{{ k8s_basic_services }}' 
- name: "Smardigo Patchday: update pkgs" ansible.builtin.apt: autoclean: yes autoremove: yes update_cache: yes upgrade: yes - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>" ansible.builtin.reboot: post_reboot_delay: 30 reboot_timeout: 300 - name: "Smardigo Patchday: wait_for host after reboot" become: no delegate_to: localhost ansible.builtin.wait_for: delay: 15 timeout: 180 port: 22 host: '{{ stage_server_ip }}' search_regex: OpenSSH - name: "Smardigo Patchday: start k8s basic services" ansible.builtin.systemd: name: '{{ item }}' state: started loop: '{{ k8s_basic_services }}' - name: "Smardigo Patchday: wait for node readiness" delegate_to: "{{ groups['kube_control_plane'][0] }}" kubernetes.core.k8s: kind: Node state: present name: '{{ inventory_hostname | lower }}' wait_condition: reason: KubeletReady type: Ready status: True wait_timeout: 120 retries: 5 delay: 10 - name: "Smardigo Patchday: uncordon node" delegate_to: "{{ groups['kube_control_plane'][0] }}" kubernetes.core.k8s_drain: state: uncordon name: '{{ inventory_hostname }}'