You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
hetzner-ansible/patchday.yml

308 lines
9.0 KiB
YAML

---
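### Tags available in this playbook:
###   check_elastic_cluster - selects the task that waits for the Elasticsearch cluster to report "green"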
# Play: silence alerts in the stage's Alertmanager for the patch window.
# Builds a silence object matching every value of the `job` label, starting at
# the gathered fact time and ending 3600 seconds later, and POSTs it to the
# Alertmanager v2 silences endpoint. A failing POST does not abort the run.
- hosts: prometheus
  vars:
    # Epoch seconds from the gathered facts; both timestamps derive from it.
    start: '{{ ansible_date_time.epoch }}'
  tasks:
    # NOTE(review): the rendered timestamps carry no timezone designator -
    # confirm Alertmanager interprets them in the zone you expect.
    - name: "Compute silence window (start and start + 3600 seconds)"
      ansible.builtin.set_fact:
        startsAt: "{{ '%Y-%m-%d %H:%M:%S' | strftime(start) }}"
        endsAt: "{{ '%Y-%m-%d %H:%M:%S' | strftime( ( start | int ) + 3600 |int ) }}"

    # Separate task: the facts set above are only usable in a later task.
    - name: "set fact"
      ansible.builtin.set_fact:
        silence:
          matchers:
            # Regex matcher on the `job` label; '.+' matches any non-empty value.
            - name: job
              value: '.+'
              isRegex: true
          startsAt: '{{ startsAt }}'
          endsAt: '{{ endsAt }}'
          createdBy: patchday-automatism
          comment: patchday
          # Explicit null instead of a bare empty value.
          id: null

    - name: "Schedule silences for stage..."
      ansible.builtin.uri:
        url: "https://{{ stage }}-prometheus-01-alertmanager.smardigo.digital/api/v2/silences"
        method: POST
        status_code: [200]
        headers:
          Content-Type: application/json
        body_format: json
        body: '{{ silence | to_json }}'
      # Best effort: patching proceeds even if the silence could not be created.
      ignore_errors: true
# Play: patch the Elasticsearch hosts one host at a time (serial: 1).
# Per host: upgrade packages, take every docker-compose project found below
# service_base_path down, reboot, bring the projects back up and then poll the
# local cluster health endpoint until it reports "green".
- hosts: elastic
  become: true
  serial: 1
  tasks:
    - name: "Smardigo Patchday: update pkgs"
      ansible.builtin.apt:
        update_cache: true
        # `upgrade` is a string choice of the apt module, hence the quotes.
        upgrade: "yes"
        autoremove: true
        autoclean: true

    # The registered file list drives both the shutdown and the start loop.
    - name: "Smardigo Patchday: find docker_compose.yml files"
      ansible.builtin.find:
        paths: "{{ service_base_path }}"
        pattern: "docker*.yml"
        recurse: true
      register: docker_compose_services

    - name: "Smardigo Patchday: shutdown services"
      loop: "{{ docker_compose_services.files }}"
      community.docker.docker_compose:
        state: absent
        project_src: "{{ item.path | dirname }}"

    - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
      ansible.builtin.reboot:
        reboot_timeout: 300
        post_reboot_delay: 30

    # Checked from the controller: the SSH banner must be visible on port 22.
    - name: "Smardigo Patchday: wait_for host after reboot"
      delegate_to: localhost
      become: false
      ansible.builtin.wait_for:
        host: "{{ stage_server_ip }}"
        port: 22
        search_regex: OpenSSH
        delay: 15
        timeout: 180

    - name: "Smardigo Patchday: start services"
      loop: "{{ docker_compose_services.files }}"
      community.docker.docker_compose:
        state: present
        project_src: "{{ item.path | dirname }}"

    # Up to 30 attempts, 30 seconds apart. no_log keeps the credentials passed
    # to the module out of the task output.
    - name: "Smardigo Patchday: wait until cluster is green"
      tags:
        - check_elastic_cluster
      no_log: true
      ansible.builtin.uri:
        url: "https://localhost:9200/_cluster/health"
        ca_path: "{{ service_base_path }}/{{ elastic_id }}/certs/ca/ca.crt"
        force_basic_auth: true
        user: "{{ elastic_admin_username }}"
        password: "{{ elastic_admin_password }}"
        status_code: 200
      register: check_elastic_cluster
      until:
        - check_elastic_cluster.json is defined
        - check_elastic_cluster.json.status == 'green'
      retries: 30
      delay: 30
# Play: patch the PostgreSQL hosts one host at a time (serial: 1).
# Per host: stop postgresql, upgrade packages, reboot, start postgresql and
# verify it answers on the private IP. If the first check fails, the service
# is started again and re-checked with retries; if that still fails, a problem
# report is mailed to the address in devops_email_address.
- hosts: postgres
  serial: 1
  become: true
  tasks:
    - name: "Smardigo Patchday: stop service(s)"
      ansible.builtin.systemd:
        name: postgresql
        state: stopped

    - name: "Smardigo Patchday: update pkgs"
      ansible.builtin.apt:
        # `upgrade` is a string choice of the apt module, hence the quotes.
        upgrade: "yes"
        update_cache: true
        autoremove: true
        autoclean: true

    - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
      ansible.builtin.reboot:
        post_reboot_delay: 30
        reboot_timeout: 300

    # Checked from the controller: the SSH banner must be visible on port 22.
    - name: "Smardigo Patchday: wait_for host after reboot"
      become: false
      delegate_to: localhost
      ansible.builtin.wait_for:
        delay: 15
        timeout: 180
        port: 22
        host: '{{ stage_server_ip }}'
        search_regex: OpenSSH

    - name: "Smardigo Patchday: start services"
      ansible.builtin.systemd:
        name: postgresql
        state: started

    # wait_for cannot be used anymore due to enabled SSL encryption for postgres connections in DEV-382
    - name: "Smardigo Patchday: check if postgres is listing on net internal ip address"
      become: false
      community.postgresql.postgresql_ping:
        port: 5432
        ssl_mode: require
        login_host: '{{ stage_private_server_ip }}'
      register: check_postgres
      ignore_errors: true

    - name: "Smardigo Patchday: error-handling - ensure postgres started and check listing on net internal ip address"
      # default(false): if the ping above failed before it could report
      # is_available, treat the server as unavailable instead of erroring out
      # on an undefined attribute.
      when:
        - not (check_postgres.is_available | default(false))
      block:
        - name: "Smardigo Patchday: error-handling - ensure service(s) started"
          ansible.builtin.systemd:
            name: postgresql
            state: started

        # An explicit `until` makes the retries effective on every
        # ansible-core release. Once all attempts are used up the task fails
        # and the rescue section below runs.
        - name: "Smardigo Patchday: error-handling - check if postgres is listing on net internal ip address"
          become: false
          community.postgresql.postgresql_ping:
            port: 5432
            ssl_mode: require
            login_host: '{{ stage_private_server_ip }}'
          register: check_postgres_again
          until: check_postgres_again.is_available | default(false)
          retries: 5
          delay: 5
          failed_when: not (check_postgres_again.is_available | default(false))
      rescue:
        # Sent through the stage's mail host via its local SMTP port.
        - name: "Smardigo Patchday: error-handling - send mail to DEVOPS-DL"
          delegate_to: '{{ stage }}-mail-01'
          community.general.mail:
            host: localhost
            port: 25
            to: '{{ devops_email_address }}'
            subject: "patchday( {{ lookup('pipe','date +%Y-%m-%d_%H:%M') }} ) problem report for {{ inventory_hostname }}"
            body: |
              Dear Sir or Madam,
              I have to inform you that {{ inventory_hostname }} isn't listening on {{ stage_private_server_ip }} anymore.
              Plz check what happened/ fix it little padawan ;)
              kind regards,
              your automation-bofh
# Play: patch every host that is not in the elastic, postgres or k8s_cluster
# groups, in batches of ten (serial: 10).
# Per host: upgrade packages, take every docker-compose project found below
# service_base_path down, reboot, and bring the projects back up.
- hosts: "all,!elastic,!postgres,!k8s_cluster"
  become: true
  serial: 10
  tasks:
    - name: "Smardigo Patchday: update pkgs"
      ansible.builtin.apt:
        update_cache: true
        # `upgrade` is a string choice of the apt module, hence the quotes.
        upgrade: "yes"
        autoremove: true
        autoclean: true

    # The registered file list drives both the shutdown and the start loop.
    - name: "Smardigo Patchday: find docker_compose.yml files"
      ansible.builtin.find:
        paths: "{{ service_base_path }}"
        pattern: "docker*.yml"
        recurse: true
      register: docker_compose_services

    - name: "Smardigo Patchday: shutdown services"
      loop: "{{ docker_compose_services.files }}"
      community.docker.docker_compose:
        state: absent
        project_src: "{{ item.path | dirname }}"

    - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
      ansible.builtin.reboot:
        reboot_timeout: 300
        post_reboot_delay: 30

    # Checked from the controller: the SSH banner must be visible on port 22.
    - name: "Smardigo Patchday: wait_for host after reboot"
      delegate_to: localhost
      become: false
      ansible.builtin.wait_for:
        host: "{{ stage_server_ip }}"
        port: 22
        search_regex: OpenSSH
        delay: 15
        timeout: 180

    - name: "Smardigo Patchday: start services"
      loop: "{{ docker_compose_services.files }}"
      community.docker.docker_compose:
        state: present
        project_src: "{{ item.path | dirname }}"
# Play: patch the Kubernetes nodes one node at a time (serial: 1).
# Per node: drain it via kubectl on the first control-plane host, stop the
# services listed in k8s_basic_services, upgrade packages, reboot, start the
# services again, wait until the node reports Ready and uncordon it.
- hosts: k8s_cluster
  serial: 1
  become: true
  tasks:
    # Draining the hard way: --force, a 2 minute timeout and
    # --delete-emptydir-data, so pods using emptyDir volumes are evicted too.
    #
    # ATTENTION: done via command instead of the kubernetes module.
    # NOTE(review): the original author states the module lacked an
    # equivalent of --delete-emptydir-data - confirm for the installed
    # kubernetes.core version.
    #
    # The retry condition tests the command's return code. A registered result
    # is always truthy, so testing the variable itself never retried.
    # The drain stays best effort: if every attempt fails, the error is
    # ignored and patching continues, as it did before.
    - name: "Smardigo Patchday: drain node"
      delegate_to: "{{ groups['kube_control_plane'][0] }}"
      ansible.builtin.command: "/usr/local/bin/kubectl drain --timeout 2m --ignore-daemonsets --force --delete-emptydir-data {{ inventory_hostname | lower }}"
      register: node_drained
      until: (node_drained.rc | default(1)) == 0
      retries: 3
      delay: 30
      ignore_errors: true

    - name: "Smardigo Patchday: stop k8s basic services"
      ansible.builtin.systemd:
        name: '{{ item }}'
        state: stopped
      loop: '{{ k8s_basic_services }}'

    - name: "Smardigo Patchday: update pkgs"
      ansible.builtin.apt:
        autoclean: true
        autoremove: true
        update_cache: true
        # `upgrade` is a string choice of the apt module, hence the quotes.
        upgrade: "yes"

    - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
      ansible.builtin.reboot:
        post_reboot_delay: 30
        reboot_timeout: 300

    # Checked from the controller: the SSH banner must be visible on port 22.
    - name: "Smardigo Patchday: wait_for host after reboot"
      become: false
      delegate_to: localhost
      ansible.builtin.wait_for:
        delay: 15
        timeout: 180
        port: 22
        host: '{{ stage_server_ip }}'
        search_regex: OpenSSH

    - name: "Smardigo Patchday: start k8s basic services"
      ansible.builtin.systemd:
        name: '{{ item }}'
        state: started
      loop: '{{ k8s_basic_services }}'

    # `wait: true` is required; without it the module ignores wait_condition
    # and returns immediately. `status` is quoted because the condition value
    # is the string "True", not a YAML boolean. The explicit `until` makes the
    # retries effective on every ansible-core release.
    - name: "Smardigo Patchday: wait for node readiness"
      delegate_to: "{{ groups['kube_control_plane'][0] }}"
      kubernetes.core.k8s:
        kind: Node
        state: present
        name: '{{ inventory_hostname | lower }}'
        wait: true
        wait_condition:
          reason: KubeletReady
          type: Ready
          status: "True"
        wait_timeout: 120
      register: node_ready
      until: node_ready is succeeded
      retries: 5
      delay: 10

    # Node name lower-cased to match the drain and readiness tasks above.
    - name: "Smardigo Patchday: uncordon node"
      delegate_to: "{{ groups['kube_control_plane'][0] }}"
      kubernetes.core.k8s_drain:
        state: uncordon
        name: '{{ inventory_hostname | lower }}'