hetzner-ansible/patchday.yml


---
### tags:
### check_elastic_cluster
- hosts: prometheus
vars:
start: "{{ ansible_date_time.epoch }}"
tasks:
- name: "Set up alert silencing for patchday"
block:
- name: "Set VAR for silence start and end"
set_fact:
silence_starts_at: "{{ '%Y-%m-%d %H:%M:%S' | strftime(start) }}"
silence_ends_at: "{{ '%Y-%m-%d %H:%M:%S' | strftime((start | int) + 3600) }}"
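# The silence defined below covers the one-hour window computed above (start .. start + 3600 s)
# and uses a catch-all matcher (job =~ ".+"), i.e. every alert is silenced during the patchday run.
# Leaving "id" empty lets Alertmanager create a new silence instead of updating an existing one.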
- name: "Set VAR - define prometheus silence object"
set_fact:
silence:
matchers:
- name: job
value: .+
isRegex: true
startsAt: "{{ silence_starts_at }}"
endsAt: "{{ silence_ends_at }}"
createdBy: patchday-automatism
comment: patchday
id:
- name: "Schedule silences for stage..."
uri:
url: "https://{{ stage }}-prometheus-01-alertmanager.smardigo.digital/api/v2/silences"
url_username: "{{ alertmanager_admin_username }}"
url_password: "{{ alertmanager_admin_password }}"
method: POST
status_code: [200]
headers:
Content-Type: application/json
body_format: json
body: "{{ silence | to_json }}"
rescue:
- name: "Rescue silencing - sending mail to DEVOPS-DL"
delegate_to: "{{ stage }}-mail-01"
community.general.mail:
host: localhost
port: 25
to: "{{ devops_email_address }}"
subject: "patchday( {{ lookup('pipe','date +%Y-%m-%d_%H:%M') }} ) problem report for failed silencing"
body: |
Dear Sir or Madam,
silencing alerts for patchday failed.
Plz check what happened/ fix it little padawan ;)
kind regards,
your automation-bofh
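# Optional follow-up (illustrative sketch only, not part of this play): the POST above returns a
# JSON body containing "silenceID". Registering that response (e.g. "register: silence_post_result",
# a hypothetical name) would allow the silence to be expired right after patchday instead of
# waiting for the one-hour window to run out:
#
#   - name: "Expire patchday silence"
#     uri:
#       url: "https://{{ stage }}-prometheus-01-alertmanager.smardigo.digital/api/v2/silence/{{ silence_post_result.json.silenceID }}"
#       url_username: "{{ alertmanager_admin_username }}"
#       url_password: "{{ alertmanager_admin_password }}"
#       method: DELETE
#       status_code: [200]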
- name: "Harbor"
hosts: harbor
serial: 1
become: true
tasks:
- name: "Smardigo Patchday: update pkgs"
ansible.builtin.apt:
upgrade: true
update_cache: true
autoremove: true
autoclean: true
- name: "Smardigo Patchday: find docker_compose.yml files"
ansible.builtin.find:
paths: "{{ service_base_path }}"
pattern: 'docker*.yml'
recurse: true
register: docker_compose_services
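# Only compose projects whose directory name is prefixed with the current stage
# (".../<stage>-.../docker*.yml") are shut down; all other findings are ignored by the filter below.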
- name: "Smardigo Patchday: shutdown services"
community.docker.docker_compose_v2:
project_src: "{{ item | dirname }}"
state: absent
loop: "{{ docker_compose_services.files | map(attribute='path') | select('match', '.*/' + stage + '-.*') }}"
- name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
ansible.builtin.reboot:
post_reboot_delay: 30
reboot_timeout: 300
- name: "Smardigo Patchday: wait_for host after reboot"
become: false
delegate_to: localhost
ansible.builtin.wait_for:
delay: 15
timeout: 180
port: 22
host: "{{ stage_server_ip }}"
search_regex: OpenSSH
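# No explicit service start on the Harbor hosts: the wait below just gives Harbor time to come
# back up on its own after the reboot (assumption: Docker restart policies / Harbor's own startup
# handle this) before the next host is patched.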
- name: "Wait period for harbor restart"
become: false
ansible.builtin.wait_for:
timeout: 60
delegate_to: localhost
- hosts: elastic
serial: 1
become: yes
tasks:
- name: "Smardigo Patchday: update pkgs"
ansible.builtin.apt:
upgrade: yes
update_cache: yes
autoremove: yes
autoclean: yes
- name: "Smardigo Patchday: find docker_compose.yml files"
ansible.builtin.find:
paths: "{{ service_base_path }}"
pattern: 'docker*.yml'
recurse: yes
register: docker_compose_services
- name: "Smardigo Patchday: shutdown services"
community.docker.docker_compose:
project_src: "{{ item | dirname }}"
state: absent
loop: "{{ docker_compose_services.files | map(attribute='path') | select('match', '.*/'+stage+'-.*') }}"
- name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
ansible.builtin.reboot:
post_reboot_delay: 30
reboot_timeout: 300
- name: "Smardigo Patchday: wait_for host after reboot"
become: no
delegate_to: localhost
ansible.builtin.wait_for:
delay: 15
timeout: 180
port: 22
host: "{{ stage_server_ip }}"
search_regex: OpenSSH
- name: "Smardigo Patchday: start services"
community.docker.docker_compose:
project_src: "{{ item.path | dirname }}"
state: present
loop: "{{ docker_compose_services.files }}"
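# The check below polls the local node with basic auth and the service CA until the cluster
# reports "green", retrying up to 30 times with a 30-second delay (roughly 15 minutes in total).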
- name: "Smardigo Patchday: wait until cluster is green"
ansible.builtin.uri:
url: "https://localhost:9200/_cluster/health"
user: "{{ elastic_admin_username }}"
password: "{{ elastic_admin_password }}"
force_basic_auth: true
status_code: 200
ca_path: "{{ service_base_path }}/{{ elastic_id }}/certs/ca/ca.crt"
register: check_elastic_cluster
delay: 30
retries: 30
until:
- check_elastic_cluster.json is defined
- check_elastic_cluster.json.status == 'green'
no_log: true
tags:
- check_elastic_cluster
- hosts: postgres
serial: 1
become: true
tasks:
- name: "Smardigo Patchday: stop service(s)"
ansible.builtin.systemd:
name: postgresql
state: stopped
- name: "Smardigo Patchday: update pkgs"
ansible.builtin.apt:
upgrade: true
update_cache: true
autoremove: true
autoclean: true
- name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
ansible.builtin.reboot:
post_reboot_delay: 30
reboot_timeout: 300
- name: "Smardigo Patchday: wait_for host after reboot"
become: false
delegate_to: localhost
ansible.builtin.wait_for:
delay: 15
timeout: 180
port: 22
host: "{{ stage_server_ip }}"
search_regex: OpenSSH
- name: "Open and mount LUKS encrypted LVM for datadir"
ansible.builtin.include_role:
name: lvm_with_hetzner_volumes
when: lvm_volume_encryption | bool
vars:
lvm_with_hetzner_volumes__volprefix: "postgres_datadir"
lvm_with_hetzner_volumes__volsize: "{{ postgres_pgdatadir_lvm_hcloudvol_size }}"
lvm_with_hetzner_volumes__volcount: "{{ postgres_pgdatadir_lvm_hcloudvol_count }}"
lvm_with_hetzner_volumes__mountpath: "{{ postgres_pgdatadir_lvm_hcloudvol_mountpath }}"
lvm_with_hetzner_volumes__passphrase: "{{ postgres_volume_encryption_passphrase }}"
- name: "Open and mount LUKS encrypted LVM for backupdir"
ansible.builtin.include_role:
name: lvm_with_hetzner_volumes
when: lvm_volume_encryption | bool and server_type == "slave"
vars:
lvm_with_hetzner_volumes__volprefix: "postgres-backup"
lvm_with_hetzner_volumes__volsize: "{{ postgres_backup_volume_size }}"
lvm_with_hetzner_volumes__volcount: "{{ postgres_backup_volume_count }}"
lvm_with_hetzner_volumes__mountpath: "{{ backup_directory }}"
lvm_with_hetzner_volumes__passphrase: "{{ postgres_volume_encryption_passphrase }}"
- name: "Smardigo Patchday: restart services"
ansible.builtin.systemd:
name: postgresql
state: restarted
# wait_for cannot be used anymore due to enabled SSL encryption for postgres connections in DEV-382
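# postgresql_ping opens a real client connection (ssl_mode: require) against the net internal
# address instead of a plain TCP probe, so it also verifies that TLS connections are accepted.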
- name: "Smardigo Patchday: check if postgres is listening on net internal ip address"
become: false
community.postgresql.postgresql_ping:
port: 5432
ssl_mode: require
login_host: "{{ stage_private_server_ip }}"
register: check_postgres
ignore_errors: true
- name: "Smardigo Patchday: error-handling - ensure postgres is started and check that it is listening on net internal ip address"
when:
- not check_postgres.is_available
block:
- name: "Smardigo Patchday: error-handling - ensure service(s) started"
ansible.builtin.systemd:
name: postgresql
state: started
- name: "Smardigo Patchday: error-handling - check if postgres is listening on net internal ip address"
become: false
community.postgresql.postgresql_ping:
port: 5432
ssl_mode: require
login_host: "{{ stage_private_server_ip }}"
register: check_postgres_again
until: check_postgres_again.is_available
retries: 5
failed_when: not check_postgres_again.is_available
rescue:
- name: "Smardigo Patchday: error-handling - send mail to DEVOPS-DL"
delegate_to: "{{ stage }}-mail-01"
community.general.mail:
host: localhost
port: 25
to: "{{ devops_email_address }}"
subject: "patchday( {{ lookup('pipe', 'date +%Y-%m-%d_%H:%M') }} ) problem report for {{ inventory_hostname }}"
body: |
Dear Sir or Madam,
I have to inform you that {{ inventory_hostname }} isn't listening on {{ stage_private_server_ip }} anymore.
Plz check what happened/ fix it little padawan ;)
kind regards,
your automation-bofh
# due to bloody dependencies in the SMA application startup, iam must be available during startup
# => patching the IAM service is split into its own play to make sure it is up and running first
- hosts: iam,keycloak
serial: 10
become: true
tasks:
- name: "Smardigo Patchday: update pkgs"
ansible.builtin.apt:
upgrade: true
update_cache: true
autoremove: true
autoclean: true
- name: "Smardigo Patchday: find docker_compose.yml files"
ansible.builtin.find:
paths: "{{ service_base_path }}"
pattern: 'docker*.yml'
recurse: true
register: docker_compose_services
- name: "Smardigo Patchday: shutdown services"
community.docker.docker_compose:
project_src: "{{ item | dirname }}"
state: absent
loop: "{{ docker_compose_services.files | map(attribute='path') | select('match', '.*/'+stage+'-.*') }}"
- name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
ansible.builtin.reboot:
post_reboot_delay: 30
reboot_timeout: 300
- name: "Smardigo Patchday: wait_for host after reboot"
become: false
delegate_to: localhost
ansible.builtin.wait_for:
delay: 15
timeout: 180
port: 22
host: "{{ stage_server_ip }}"
search_regex: OpenSSH
- name: "Smardigo Patchday: start services"
community.docker.docker_compose:
project_src: "{{ item | dirname }}"
state: present
loop: "{{ docker_compose_services.files | map(attribute='path') | select('match', '.*/'+stage+'-.*') }}"
- hosts: all,!harbor,!elastic,!postgres,!iam,!keycloak,!k8s_cluster,!restore
serial: 10
become: yes
tasks:
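# For hosts in the blackbox group, stage_server_ip is resolved via DNS below
# (assumption: it is not provided by the inventory for those hosts).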
- name: "set VAR"
set_fact:
stage_server_ip: "{{ lookup('community.general.dig', inventory_hostname + '.' + domain ) }}"
when:
- "'blackbox' in group_names"
- name: "Smardigo Patchday: update pkgs"
ansible.builtin.apt:
upgrade: yes
update_cache: yes
autoremove: yes
autoclean: yes
- name: "Smardigo Patchday: find docker_compose.yml files"
ansible.builtin.find:
paths: "{{ service_base_path }}"
pattern: 'docker*.yml'
recurse: yes
register: docker_compose_services
- name: "Smardigo Patchday: shutdown services"
community.docker.docker_compose:
project_src: "{{ item.path | dirname }}"
state: absent
loop: "{{ docker_compose_services.files }}"
- name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
ansible.builtin.reboot:
post_reboot_delay: 30
reboot_timeout: 300
- name: "Smardigo Patchday: wait_for host after reboot"
become: no
delegate_to: localhost
ansible.builtin.wait_for:
delay: 15
timeout: 180
port: 22
host: "{{ stage_server_ip }}"
search_regex: OpenSSH
- name: "Smardigo Patchday: start services"
community.docker.docker_compose:
project_src: "{{ item.path | dirname }}"
state: present
loop: "{{ docker_compose_services.files }}"
- name: "Ensure SMA-portal is up and running"
# SMA-portal (VM: <<stage>>-management-01) has a hard dependency on the iam-instance
# (VM: <<stage>>-iam-01) during its application start process.
# Grouping the tasks in an ansible block statement is just a workaround until the
# smardigo-app dependency problem is fixed.
#
# ATTENTION: the iam-server must be up and running => SMA-portal will be restarted and will then
# finish its application start process successfully
block:
- name: "Check if SMA-portal is reachable"
become: no
uri:
url: "{{ shared_service_url_management }}:{{ admin_port_service }}/management/prometheus"
method: GET
status_code: [200]
register: sma_portal_avail
delay: 15
retries: 10
no_log: true
until: sma_portal_avail.status in [200]
rescue:
- name: "Check if SMA-portal dependency << iam-instance >> is reachable"
become: no
uri:
url: "https://{{ stage }}-iam-01.{{ domain }}/api/v1/roles"
method: GET
status_code: [403]
register: iam_avail
delay: 10
retries: 10
no_log: true
until: iam_avail.status in [403]
ignore_errors: yes # noqa ignore-errors
# patchday continues in case of a failed request towards the iam service;
# the iam service is a hard dependency for the SMA-portal-instance but not for
# patchday itself - it's just a workaround
- name: "Smardigo Patchday: SMA-portal not reachable - shutdown services"
community.docker.docker_compose:
project_src: "{{ item.path | dirname }}"
state: absent
loop: "{{ docker_compose_services.files }}"
- name: "Smardigo Patchday: SMA-portal not reachable - start services again"
community.docker.docker_compose:
project_src: "{{ item.path | dirname }}"
state: present
loop: "{{ docker_compose_services.files }}"
when:
- "'management' in inventory_hostname"
- hosts: k8s_cluster
serial: 1
become: yes
tasks:
# draining the hard way
# due to --force (delete static pods) + a relatively short terminate_grace_period +
# --delete-emptydir-data (formerly --delete-local-data) to kick pods with emptyDir volumes
#
# ATTENTION: needs to be done via command instead of the kubernetes module
# due to its missing flag for --delete-emptydir-data
# ¯\_(ツ)_/¯
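# Sketch (assumption, untested here): newer releases of the kubernetes.core collection expose a
# delete_options block on kubernetes.core.k8s_drain that includes delete_emptydir_data, which
# would make the kubectl command in the next task replaceable by something like:
#
#   - name: "Smardigo Patchday: drain node"
#     delegate_to: "{{ groups['kube_control_plane'][0] }}"
#     kubernetes.core.k8s_drain:
#       state: drain
#       name: "{{ inventory_hostname | lower }}"
#       delete_options:
#         force: true
#         ignore_daemonsets: true
#         delete_emptydir_data: true
#         terminate_grace_period: 60
#
# Until the collection in use ships that option, the kubectl command stays.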
- name: "Smardigo Patchday: drain node"
delegate_to: "{{ groups['kube_control_plane'][0] }}"
ansible.builtin.command: "/usr/local/bin/kubectl drain --timeout 2m --ignore-daemonsets --force --delete-emptydir-data {{ inventory_hostname | lower }}"
register: node_drained
until: node_drained.rc == 0
retries: 3
delay: 30
failed_when: false
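# k8s_basic_services is expected to be defined in group vars (assumption: typically the kubelet
# and container runtime units).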
- name: "Smardigo Patchday: stop k8s basic services"
ansible.builtin.systemd:
name: "{{ item }}"
state: stopped
loop: "{{ k8s_basic_services }}"
- name: "Smardigo Patchday: update pkgs"
ansible.builtin.apt:
autoclean: yes
autoremove: yes
update_cache: yes
upgrade: yes
- name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
ansible.builtin.reboot:
post_reboot_delay: 30
reboot_timeout: 300
- name: "Smardigo Patchday: wait_for host after reboot"
become: no
delegate_to: localhost
ansible.builtin.wait_for:
delay: 15
timeout: 180
port: 22
host: "{{ stage_server_ip }}"
search_regex: OpenSSH
- name: "Smardigo Patchday: start k8s basic services"
ansible.builtin.systemd:
name: "{{ item }}"
state: started
loop: "{{ k8s_basic_services }}"
- name: "Smardigo Patchday: wait for node readiness"
delegate_to: "{{ groups['kube_control_plane'][0] }}"
kubernetes.core.k8s:
kind: Node
state: present
name: "{{ inventory_hostname | lower }}"
wait: yes
wait_condition:
reason: KubeletReady
type: Ready
status: "True"
wait_timeout: 120
retries: 5
delay: 10
- name: "Smardigo Patchday: uncordon node"
delegate_to: "{{ groups['kube_control_plane'][0] }}"
kubernetes.core.k8s_drain:
state: uncordon
name: "{{ inventory_hostname | lower }}"