You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
352 lines
11 KiB
YAML
352 lines
11 KiB
YAML
---
|
|
|
|
# Tags defined in this playbook:
#   check_elastic_cluster
|
|
|
|
|
|
# Play: build a silence object covering one hour from `start` and POST it
# to the stage's Alertmanager silences endpoint before patching begins.
- hosts: prometheus
  vars:
    # Epoch seconds from gathered facts; both silence timestamps derive from it.
    start: '{{ ansible_date_time.epoch }}'

  tasks:
    # NOTE(review): the rendered timestamps carry no timezone designator and
    # strftime presumably formats in the controller's local time - confirm
    # that Alertmanager interprets them as intended.
    - name: "Set VAR for silence start and end"
      ansible.builtin.set_fact:
        silence_starts_at: "{{ '%Y-%m-%d %H:%M:%S' | strftime(start) }}"
        # End of the window: start + 3600 seconds.
        silence_ends_at: "{{ '%Y-%m-%d %H:%M:%S' | strftime((start | int) + 3600) }}"

    - name: "Set VAR - define prometheus silence object"
      ansible.builtin.set_fact:
        silence:
          matchers:
            # Regex matcher on the `job` label; `.+` matches any non-empty value.
            - name: job
              value: '.+'
              isRegex: true
          startsAt: '{{ silence_starts_at }}'
          endsAt: '{{ silence_ends_at }}'
          createdBy: patchday-automatism
          comment: patchday
          # Explicit null instead of a bare empty value; no id is supplied.
          id: null

    - name: "Schedule silences for stage..."
      ansible.builtin.uri:
        url: "https://{{ stage }}-prometheus-01-alertmanager.smardigo.digital/api/v2/silences"
        method: POST
        status_code: [200]
        headers:
          Content-Type: application/json
        body_format: json
        body: '{{ silence | to_json }}'
      # if failing, patchday continues
      ignore_errors: true  # noqa ignore-errors
|
|
|
|
# Play: patch the elastic hosts one at a time (serial: 1). For each host:
# upgrade packages, bring the compose projects down, reboot, bring them back
# up, then poll the cluster health endpoint until it reports `green`.
- hosts: elastic
  become: true
  serial: 1
  tasks:
    - name: "Smardigo Patchday: update pkgs"
      ansible.builtin.apt:
        update_cache: true
        upgrade: true
        autoclean: true
        autoremove: true

    # Collect every docker*.yml below the service base path; the result is
    # reused by the shutdown and start tasks further down.
    - name: "Smardigo Patchday: find docker_compose.yml files"
      register: docker_compose_services
      ansible.builtin.find:
        paths: '{{ service_base_path }}'
        patterns: 'docker*.yml'
        recurse: true

    - name: "Smardigo Patchday: shutdown services"
      loop: '{{ docker_compose_services.files }}'
      community.docker.docker_compose:
        state: absent
        project_src: '{{ item.path | dirname }}'

    - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
      ansible.builtin.reboot:
        reboot_timeout: 300
        post_reboot_delay: 30

    # Runs on the controller without privilege escalation: wait until port 22
    # of the host answers with an OpenSSH banner.
    - name: "Smardigo Patchday: wait_for host after reboot"
      delegate_to: localhost
      become: false
      ansible.builtin.wait_for:
        host: '{{ stage_server_ip }}'
        port: 22
        search_regex: OpenSSH
        delay: 15
        timeout: 180

    - name: "Smardigo Patchday: start services"
      loop: '{{ docker_compose_services.files }}'
      community.docker.docker_compose:
        state: present
        project_src: '{{ item.path | dirname }}'

    # Poll up to 30 times, 30 seconds apart, until the health document exists
    # and its status is `green`. Output is hidden because the request carries
    # the admin credentials.
    - name: "Smardigo Patchday: wait until cluster is green"
      tags:
        - check_elastic_cluster
      no_log: true
      ansible.builtin.uri:
        url: "https://localhost:9200/_cluster/health"
        ca_path: "{{ service_base_path }}/{{ elastic_id }}/certs/ca/ca.crt"
        force_basic_auth: true
        user: "{{ elastic_admin_username }}"
        password: "{{ elastic_admin_password }}"
        status_code: 200
      register: check_elastic_cluster
      retries: 30
      delay: 30
      until:
        - check_elastic_cluster.json is defined
        - check_elastic_cluster.json.status == 'green'
|
|
|
|
# Play: patch the postgres hosts one at a time (serial: 1). For each host:
# stop postgresql, upgrade packages, reboot, start postgresql and ping it on
# the private address. If the ping reports the server as unavailable, the
# error-handling block starts the service again, re-checks, and sends a mail
# when the re-check fails.
- hosts: postgres
  serial: 1
  become: true
  tasks:
    - name: "Smardigo Patchday: stop service(s)"
      ansible.builtin.systemd:
        name: postgresql
        state: stopped

    - name: "Smardigo Patchday: update pkgs"
      ansible.builtin.apt:
        upgrade: true
        update_cache: true
        autoremove: true
        autoclean: true

    - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
      ansible.builtin.reboot:
        post_reboot_delay: 30
        reboot_timeout: 300

    # Runs on the controller without privilege escalation: wait until port 22
    # of the host answers with an OpenSSH banner.
    - name: "Smardigo Patchday: wait_for host after reboot"
      become: false
      delegate_to: localhost
      ansible.builtin.wait_for:
        delay: 15
        timeout: 180
        port: 22
        host: '{{ stage_server_ip }}'
        search_regex: OpenSSH

    - name: "Smardigo Patchday: start services"
      ansible.builtin.systemd:
        name: postgresql
        state: started

    # wait_for cannot be used anymore due to enabled SSL encryption for postgres connections in DEV-382
    - name: "Smardigo Patchday: check if postgres is listing on net internal ip address"
      become: false
      community.postgresql.postgresql_ping:
        port: 5432
        ssl_mode: require
        login_host: '{{ stage_private_server_ip }}'
      register: check_postgres
      # Errors are ignored here on purpose; the block below reacts to the result.
      ignore_errors: true

    - name: "Smardigo Patchday: error-handling - ensure postgres started and check listing on net internal ip address"
      block:
        - name: "Smardigo Patchday: error-handling - ensure service(s) started"
          ansible.builtin.systemd:
            name: postgresql
            state: started

        - name: "Smardigo Patchday: error-handling - check if postgres is listing on net internal ip address"
          become: false
          community.postgresql.postgresql_ping:
            port: 5432
            ssl_mode: require
            login_host: '{{ stage_private_server_ip }}'
          register: check_postgres_again
          # `retries` needs an `until` condition to loop on ansible-core
          # releases before 2.16; without it the ping ran only once.
          until: check_postgres_again.is_available | default(false)
          retries: 5
          # default(false): a hard module failure returns no `is_available` key.
          failed_when: not (check_postgres_again.is_available | default(false))

      rescue:
        - name: "Smardigo Patchday: error-handling - send mail to DEVOPS-DL"
          delegate_to: '{{ stage }}-mail-01'
          community.general.mail:
            host: localhost
            port: 25
            to: '{{ devops_email_address }}'
            subject: "patchday( {{ lookup('pipe','date +%Y-%m-%d_%H:%M') }} ) problem report for {{ inventory_hostname }}"
            body: |
              Dear Sir or Madam,

              I have to inform you that {{ inventory_hostname }} isn't listening on {{ stage_private_server_ip }} anymore.

              Plz check what happened/ fix it little padawan ;)

              kind regards,

              your automation-bofh

      when:
        # The first ping is allowed to fail outright (ignore_errors), in which
        # case `is_available` is undefined; treat that as "not available"
        # instead of aborting on an undefined-variable error.
        - not (check_postgres.is_available | default(false))
|
|
|
|
# Play: patch every host that is not in the elastic, postgres or k8s_cluster
# groups, ten hosts per batch. For each host: upgrade packages, bring the
# compose projects down, reboot, bring them back up. Hosts whose inventory
# name contains `management` additionally get the SMA-portal check.
- hosts: 'all,!elastic,!postgres,!k8s_cluster'
  become: true
  serial: 10
  tasks:
    # Only for members of the `blackbox` group: resolve the host's address
    # via a dig lookup of <inventory_hostname>.<domain>.
    - name: "set VAR"
      when:
        - "'blackbox' in group_names"
      ansible.builtin.set_fact:
        stage_server_ip: "{{ lookup('community.general.dig', inventory_hostname + '.' + domain ) }}"

    - name: "Smardigo Patchday: update pkgs"
      ansible.builtin.apt:
        update_cache: true
        upgrade: true
        autoclean: true
        autoremove: true

    # Collect every docker*.yml below the service base path; the result is
    # reused by all shutdown/start tasks further down.
    - name: "Smardigo Patchday: find docker_compose.yml files"
      register: docker_compose_services
      ansible.builtin.find:
        paths: '{{ service_base_path }}'
        patterns: 'docker*.yml'
        recurse: true

    - name: "Smardigo Patchday: shutdown services"
      loop: '{{ docker_compose_services.files }}'
      community.docker.docker_compose:
        state: absent
        project_src: '{{ item.path | dirname }}'

    - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
      ansible.builtin.reboot:
        reboot_timeout: 300
        post_reboot_delay: 30

    # Runs on the controller without privilege escalation: wait until port 22
    # of the host answers with an OpenSSH banner.
    - name: "Smardigo Patchday: wait_for host after reboot"
      delegate_to: localhost
      become: false
      ansible.builtin.wait_for:
        host: '{{ stage_server_ip }}'
        port: 22
        search_regex: OpenSSH
        delay: 15
        timeout: 180

    - name: "Smardigo Patchday: start services"
      loop: '{{ docker_compose_services.files }}'
      community.docker.docker_compose:
        state: present
        project_src: '{{ item.path | dirname }}'

    # SMA-portal (VM: <<stage>>-management-01) has a hard dependency on the
    # iam instance (VM: <<stage>>-iam-01) while its application starts.
    # Grouping these tasks in a block is only a workaround until that
    # dependency problem in smardigo-app is fixed.
    #
    # ATTENTION: the iam server must be up and running; SMA-portal is then
    # restarted and can finish its application start successfully.
    - name: "Ensure SMA-portal is up and running"
      when:
        - "'management' in inventory_hostname"
      block:
        # Poll the management endpoint up to 5 times, 10 seconds apart.
        - name: "Check SMA-portal if reachable"
          become: false
          no_log: true
          ansible.builtin.uri:
            url: "https://{{ stage }}-management-01-connect.{{ domain }}:{{ admin_port_service }}/management/prometheus"
            method: GET
            status_code: [200]
          register: sma_portal_avail
          retries: 5
          delay: 10
          until: sma_portal_avail.status in [200]

      rescue:
        # The portal did not answer with 200: cycle all compose projects once.
        - name: "Smardigo Patchday: SMA-portal not reachable - shutdown services"
          loop: '{{ docker_compose_services.files }}'
          community.docker.docker_compose:
            state: absent
            project_src: '{{ item.path | dirname }}'

        - name: "Smardigo Patchday: SMA-portal not reachable - start services again"
          loop: '{{ docker_compose_services.files }}'
          community.docker.docker_compose:
            state: present
            project_src: '{{ item.path | dirname }}'
|
|
|
|
|
|
# Play: patch the Kubernetes nodes one at a time (serial: 1). For each node:
# drain it, stop the services listed in `k8s_basic_services`, upgrade
# packages, reboot, start the services again, wait for the node to become
# Ready and finally uncordon it. All cluster-side calls are delegated to the
# first host of the `kube_control_plane` group.
- hosts: k8s_cluster
  serial: 1
  become: true

  tasks:
    # draining the hard way
    # due to --force (delete static pods) + relatively short timeout +
    # --delete-emptydir-data to kick pods with emptyDir
    #
    # ATTENTION: needs to be done via command instead of kubernetes module
    # due to missing flag for --delete-emptydir-data
    - name: "Smardigo Patchday: drain node"
      delegate_to: "{{ groups['kube_control_plane'][0] }}"
      ansible.builtin.command: "/usr/local/bin/kubectl drain --timeout 2m --ignore-daemonsets --force --delete-emptydir-data {{ inventory_hostname | lower }}"
      register: node_drained
      # A registered result is a non-empty dict and therefore always truthy,
      # so the retry condition must test the exit code explicitly.
      # default(1): a result without `rc` counts as "not drained".
      until: (node_drained.rc | default(1)) == 0
      retries: 3
      delay: 30
      # Best effort, as before: a drain that still fails after all retries
      # must not abort the patchday run for this node.
      ignore_errors: true  # noqa ignore-errors

    - name: "Smardigo Patchday: stop k8s basic services"
      ansible.builtin.systemd:
        name: '{{ item }}'
        state: stopped
      loop: '{{ k8s_basic_services }}'

    - name: "Smardigo Patchday: update pkgs"
      ansible.builtin.apt:
        autoclean: true
        autoremove: true
        update_cache: true
        upgrade: true

    - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
      ansible.builtin.reboot:
        post_reboot_delay: 30
        reboot_timeout: 300

    # Runs on the controller without privilege escalation: wait until port 22
    # of the host answers with an OpenSSH banner.
    - name: "Smardigo Patchday: wait_for host after reboot"
      become: false
      delegate_to: localhost
      ansible.builtin.wait_for:
        delay: 15
        timeout: 180
        port: 22
        host: '{{ stage_server_ip }}'
        search_regex: OpenSSH

    - name: "Smardigo Patchday: start k8s basic services"
      ansible.builtin.systemd:
        name: '{{ item }}'
        state: started
      loop: '{{ k8s_basic_services }}'

    - name: "Smardigo Patchday: wait for node readiness"
      delegate_to: "{{ groups['kube_control_plane'][0] }}"
      kubernetes.core.k8s:
        kind: Node
        state: present
        name: '{{ inventory_hostname | lower }}'
        # `wait_condition` and `wait_timeout` are ignored unless `wait` is
        # enabled, so without this the task returned immediately.
        wait: true
        wait_condition:
          reason: KubeletReady
          type: Ready
          # String choice ("True"/"False"/"Unknown"), not a YAML boolean.
          status: "True"
        wait_timeout: 120
      register: node_ready
      # `retries` needs an `until` condition to loop on ansible-core releases
      # before 2.16.
      until: node_ready is succeeded
      retries: 5
      delay: 10

    - name: "Smardigo Patchday: uncordon node"
      delegate_to: "{{ groups['kube_control_plane'][0] }}"
      kubernetes.core.k8s_drain:
        state: uncordon
        # Lower-cased like in the drain and readiness tasks above.
        name: '{{ inventory_hostname | lower }}'
|