---

### tags:
###   check_elastic_cluster


- hosts: prometheus
  vars:
    start: '{{ ansible_date_time.epoch }}'

  tasks:
    - name: "DO some stuff for silencing"
      block:
        - name: "Set VAR for silence start and end"
          set_fact:
            silence_starts_at: "{{ '%Y-%m-%d %H:%M:%S' | strftime(start) }}"
            silence_ends_at: "{{ '%Y-%m-%d %H:%M:%S' | strftime( ( start | int ) + 3600 |int ) }}"

        - name: "Set VAR - define prometheus silence object"
          set_fact:
            silence:
              matchers:
                - name: job
                  value: .+
                  isRegex: true
              startsAt: '{{ silence_starts_at }}'
              endsAt: '{{ silence_ends_at }}'
              createdBy: patchday-automatism
              comment: patchday
              id:

        - name: "Schedule silences for stage..."
          uri:
            url: "https://{{ stage }}-prometheus-01-alertmanager.smardigo.digital/api/v2/silences"
            url_username: "{{ alertmanager_admin_username }}"
            url_password: "{{ alertmanager_admin_password }}"
            method: POST
            status_code: [200]
            headers:
              Content-Type: application/json
            body_format: json
            body: '{{ silence | to_json }}'

      rescue:
        - name: "Rescue silencing - sending mail to DEVOPS-DL"
          delegate_to: '{{ stage }}-mail-01'
          community.general.mail:
            host: localhost
            port: 25
            to: '{{ devops_email_address }}'
            subject: "patchday( {{ lookup('pipe','date +%Y-%m-%d_%H:%M') }} ) problem report for failed silencing"
            body: |
              Dear Sir or Madam,

              silencing alerts for patchday failed.

              Plz check what happened/ fix it little padawan ;)

              kind regards,

              your automation-bofh


- hosts: elastic
  serial: 1
  become: yes
  tasks:
    - name: "Smardigo Patchday: update pkgs"
      ansible.builtin.apt:
        upgrade: yes
        update_cache: yes
        autoremove: yes
        autoclean: yes

    - name: "Smardigo Patchday: find docker_compose.yml files"
      ansible.builtin.find:
        paths: '{{ service_base_path }}'
        pattern: 'docker*.yml'
        recurse: yes
      register: docker_compose_services

    - name: "Smardigo Patchday: shutdown services"
      community.docker.docker_compose:
        project_src: '{{ item.path | dirname }}'
        state: absent
      loop: '{{ docker_compose_services.files }}'

    - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
      ansible.builtin.reboot:
        post_reboot_delay: 30
        reboot_timeout: 300

    - name: "Smardigo Patchday: wait_for host after reboot"
      become: no
      delegate_to: localhost
      ansible.builtin.wait_for:
        delay: 15
        timeout: 180
        port: 22
        host: '{{ stage_server_ip }}'
        search_regex: OpenSSH

    - name: "Smardigo Patchday: start services"
      community.docker.docker_compose:
        project_src: '{{ item.path | dirname }}'
        state: present
      loop: '{{ docker_compose_services.files }}'

    - name: "Smardigo Patchday: wait until cluster is green"
      ansible.builtin.uri:
        url: "https://localhost:9200/_cluster/health"
        user: "{{ elastic_admin_username }}"
        password: "{{ elastic_admin_password }}"
        force_basic_auth: true
        status_code: 200
        ca_path: "{{ service_base_path }}/{{ elastic_id }}/certs/ca/ca.crt"
      register: check_elastic_cluster
      delay: 30
      retries: 30
      until:
        - check_elastic_cluster.json is defined
        - check_elastic_cluster.json.status == 'green'
      no_log: true
      tags:
        - check_elastic_cluster

- hosts: postgres
  serial: 1
  become: yes
  tasks:
    - name: "Smardigo Patchday: stop service(s)"
      ansible.builtin.systemd:
        name: postgresql
        state: stopped

    - name: "Smardigo Patchday: update pkgs"
      ansible.builtin.apt:
        upgrade: yes
        update_cache: yes
        autoremove: yes
        autoclean: yes

    - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
      ansible.builtin.reboot:
        post_reboot_delay: 30
        reboot_timeout: 300

    - name: "Smardigo Patchday: wait_for host after reboot"
      become: no
      delegate_to: localhost
      ansible.builtin.wait_for:
        delay: 15
        timeout: 180
        port: 22
        host: '{{ stage_server_ip }}'
        search_regex: OpenSSH

    - name: "Smardigo Patchday: start services"
      ansible.builtin.systemd:
        name: postgresql
        state: started

    # wait_for cannot be used anymore due to enabled SSL encryption for postgres connections in DEV-382
    - name: "Smardigo Patchday: check if postgres is listing on net internal ip address"
      become: no
      community.postgresql.postgresql_ping:
        port: 5432
        ssl_mode: require
        login_host: '{{ stage_private_server_ip }}'
      register: check_postgres
      ignore_errors: yes

    - name: "Smardigo Patchday: error-handling - ensure postgres started and check listing on net internal ip address"
      block:

        - name: "Smardigo Patchday: error-handling - ensure service(s) started"
          ansible.builtin.systemd:
            name: postgresql
            state: started

        - name: "Smardigo Patchday: error-handling - check if postgres is listing on net internal ip address"
          become: no
          community.postgresql.postgresql_ping:
            port: 5432
            ssl_mode: require
            login_host: '{{ stage_private_server_ip }}'
          register: check_postgres_again
          retries: 5
          failed_when: not check_postgres_again.is_available

      rescue:
        - name: "Smardigo Patchday: error-handling - send mail to DEVOPS-DL"
          delegate_to: '{{ stage }}-mail-01'
          community.general.mail:
            host: localhost
            port: 25
            to: '{{ devops_email_address }}'
            subject: "patchday( {{ lookup('pipe','date +%Y-%m-%d_%H:%M') }} ) problem report for {{ inventory_hostname }}"
            body: |
              Dear Sir or Madam,

              I have to inform you that {{ inventory_hostname }} isn'n listening on {{ stage_private_server_ip }} anymore.

              Plz check what happened/ fix it little padawan ;)

              kind regards,

              your automation-bofh

      when:
        - not check_postgres.is_available

# due to bloody dependencies in SMA application startup, iam must be available during startup
# => patching IAM service outsourced in separate part to make sure that is up and running
- hosts: iam
  serial: 10
  become: yes
  tasks:
    - name: "Smardigo Patchday: update pkgs"
      ansible.builtin.apt:
        upgrade: yes
        update_cache: yes
        autoremove: yes
        autoclean: yes

    - name: "Smardigo Patchday: find docker_compose.yml files"
      ansible.builtin.find:
        paths: '{{ service_base_path }}'
        pattern: 'docker*.yml'
        recurse: yes
      register: docker_compose_services

    - name: "Smardigo Patchday: shutdown services"
      community.docker.docker_compose:
        project_src: '{{ item.path | dirname }}'
        state: absent
      loop: '{{ docker_compose_services.files }}'

    - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
      ansible.builtin.reboot:
        post_reboot_delay: 30
        reboot_timeout: 300

    - name: "Smardigo Patchday: wait_for host after reboot"
      become: no
      delegate_to: localhost
      ansible.builtin.wait_for:
        delay: 15
        timeout: 180
        port: 22
        host: '{{ stage_server_ip }}'
        search_regex: OpenSSH

    - name: "Smardigo Patchday: start services"
      community.docker.docker_compose:
        project_src: '{{ item.path | dirname }}'
        state: present
      loop: '{{ docker_compose_services.files }}'

- hosts: all,!elastic,!postgres,!k8s_cluster,!iam,!restore
  serial: 10
  become: yes
  tasks:
    - name: "set VAR"
      set_fact:
        stage_server_ip: "{{ lookup('community.general.dig', inventory_hostname + '.' + domain ) }}"
      when:
        - "'blackbox' in group_names"

    - name: "Smardigo Patchday: update pkgs"
      ansible.builtin.apt:
        upgrade: yes
        update_cache: yes
        autoremove: yes
        autoclean: yes

    - name: "Smardigo Patchday: find docker_compose.yml files"
      ansible.builtin.find:
        paths: '{{ service_base_path }}'
        pattern: 'docker*.yml'
        recurse: yes
      register: docker_compose_services

    - name: "Smardigo Patchday: shutdown services"
      community.docker.docker_compose:
        project_src: '{{ item.path | dirname }}'
        state: absent
      loop: '{{ docker_compose_services.files }}'

    - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
      ansible.builtin.reboot:
        post_reboot_delay: 30
        reboot_timeout: 300

    - name: "Smardigo Patchday: wait_for host after reboot"
      become: no
      delegate_to: localhost
      ansible.builtin.wait_for:
        delay: 15
        timeout: 180
        port: 22
        host: '{{ stage_server_ip }}'
        search_regex: OpenSSH

    - name: "Smardigo Patchday: start services"
      community.docker.docker_compose:
        project_src: '{{ item.path | dirname }}'
        state: present
      loop: '{{ docker_compose_services.files }}'

    - name: "Ensure SMA-portal is up and running"
      # there is a hard dependency within SMA-portal (VM: <<stage>>-management-01) during application start process
      # to iam-instance (VM: <<stage>>-iam-01)
      # grouped tasks within ansible block statement is just a work around until smardigo-app depending problem will be fixed
      #
      # ATTENTION: iam-server must be up and running => SMA-portal will be restarted and will finished successfully its application start process
      block:
        - name: "Check SMA-portal if reachable"
          become: no
          uri:
            url: "https://{{ stage }}-management-01-connect.{{ domain }}:{{ admin_port_service }}/management/prometheus"
            method: GET
            status_code: [200]
          register: sma_portal_avail
          delay: 10
          retries: 5
          no_log: true
          until: sma_portal_avail.status in [200]

      rescue:
        - name: "Check SMA-portal dependency << iam-instance >>is reachable"
          become: no
          uri:
            url: "https://{{ stage }}-iam-01.{{ domain }}/api/v1/roles"
            method: GET
            status_code: [403]
          register: iam_avail
          delay: 10
          retries: 10
          no_log: true
          until: iam.status in [403]
          ignore_errors: yes # noqa ignore-errors
          # patchday continues ion case of failed request towards iam service;
          # iam service is hard dependency for SMA-portal-instance but not for
          # patchday itself - it;s just a work around

        - name: "Smardigo Patchday: SMA-portal not reachable - shutdown services"
          community.docker.docker_compose:
            project_src: '{{ item.path | dirname }}'
            state: absent
          loop: '{{ docker_compose_services.files }}'

        - name: "Smardigo Patchday: SMA-portal not reachable - start services again"
          community.docker.docker_compose:
            project_src: '{{ item.path | dirname }}'
            state: present
          loop: '{{ docker_compose_services.files }}'

      when:
        - "'management' in inventory_hostname"


- hosts: k8s_cluster
  serial: 1
  become: yes

  tasks:
      # draining the hard way
      # due to force( delete static pods) + relative short terminate_grace_period +
      # --delete-local-data to kick pods with emptyDir
      #
      # ATTENTION: needs to be done via command instead of kubernetes module
      #            due to missing flag for --delete-emptydir-data
      #            ¯\_(ツ)_/¯
    - name: "Smardigo Patchday: drain node"
      delegate_to: "{{ groups['kube_control_plane'][0] }}"
      ansible.builtin.command: "/usr/local/bin/kubectl drain --timeout 2m --ignore-daemonsets --force --delete-emptydir-data {{ inventory_hostname | lower }}"
      register: node_drained
      until: node_drained
      retries: 3
      delay: 30
      failed_when: false

    - name: "Smardigo Patchday: stop k8s basic services"
      ansible.builtin.systemd:
        name: '{{ item }}'
        state: stopped
      loop: '{{ k8s_basic_services }}'

    - name: "Smardigo Patchday: update pkgs"
      ansible.builtin.apt:
        autoclean: yes
        autoremove: yes
        update_cache: yes
        upgrade: yes

    - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
      ansible.builtin.reboot:
        post_reboot_delay: 30
        reboot_timeout: 300

    - name: "Smardigo Patchday: wait_for host after reboot"
      become: no
      delegate_to: localhost
      ansible.builtin.wait_for:
        delay: 15
        timeout: 180
        port: 22
        host: '{{ stage_server_ip }}'
        search_regex: OpenSSH

    - name: "Smardigo Patchday: start k8s basic services"
      ansible.builtin.systemd:
        name: '{{ item }}'
        state: started
      loop: '{{ k8s_basic_services }}'

    - name: "Smardigo Patchday: wait for node readiness"
      delegate_to: "{{ groups['kube_control_plane'][0] }}"
      kubernetes.core.k8s:
        kind: Node
        state: present
        name: '{{ inventory_hostname | lower }}'
        wait_condition:
          reason: KubeletReady
          type: Ready
          status: True
        wait_timeout: 120
      retries: 5
      delay: 10

    - name: "Smardigo Patchday: uncordon node"
      delegate_to: "{{ groups['kube_control_plane'][0] }}"
      kubernetes.core.k8s_drain:
        state: uncordon
        name: '{{ inventory_hostname }}'