Rollout main=>qa 13.09.2022

feature/DEV-655
Görz, Friedrich 3 years ago
parent 5367c9929e
commit 01c972771b

@@ -306,6 +306,7 @@ run-patchday-dev:
    - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "main"

run-patchday-dev-digitalocean:
  extends: .run-ansible
  stage: run-patchday
  before_script:
    - echo "${ANSIBLE_VAULT_PASS_DEV}" > /tmp/vault-pass

@@ -63,6 +63,9 @@ awx_ansible_user_name: "awx"
awx_ansible_user_ssh_key_private: "{{ ansible_ssh_key_private_vault }}"
awx_credential_machine_hetzner_name: hetzner-ansible-ssh
awx_ansible_username: ansible
awx_ansible_password: ansible
gitlab_ansible_user_name: "gitlabci"
backupuser_user_name: backupuser
@@ -225,3 +228,5 @@ prometheus_alert_pg_replication_lag: 120
upstream_dns_servers:
  - 185.12.64.1
  - 185.12.64.2

File diff suppressed because it is too large

@@ -2,3 +2,6 @@
postgres_backup_volume_count: 4
postgres_backup_volume_size: 20
postgres_pgdatadir_lvm_hcloudvol_size: 20
postgres_pgdatadir_lvm_hcloudvol_count: 4

@@ -0,0 +1,5 @@
---
prometheus_lvm_hcloudvol_size: 30
prometheus_lvm_hcloudvol_count: 2
prometheus_tsdb_rentention_time: '90d'

@@ -188,7 +188,54 @@
      when:
        - not check_postgres.is_available

- hosts: all,!elastic,!postgres,!k8s_cluster
# due to bloody dependencies in SMA application startup, iam must be available during startup
# => patching of the iam service is split into a separate play to make sure it is up and running
- hosts: iam
  serial: 10
  become: yes
  tasks:
    - name: "Smardigo Patchday: update pkgs"
      ansible.builtin.apt:
        upgrade: yes
        update_cache: yes
        autoremove: yes
        autoclean: yes
    - name: "Smardigo Patchday: find docker_compose.yml files"
      ansible.builtin.find:
        paths: '{{ service_base_path }}'
        pattern: 'docker*.yml'
        recurse: yes
      register: docker_compose_services
    - name: "Smardigo Patchday: shutdown services"
      community.docker.docker_compose:
        project_src: '{{ item.path | dirname }}'
        state: absent
      loop: '{{ docker_compose_services.files }}'
    - name: "Smardigo Patchday: rebooting <{{ inventory_hostname }}>"
      ansible.builtin.reboot:
        post_reboot_delay: 30
        reboot_timeout: 300
    - name: "Smardigo Patchday: wait_for host after reboot"
      become: no
      delegate_to: localhost
      ansible.builtin.wait_for:
        delay: 15
        timeout: 180
        port: 22
        host: '{{ stage_server_ip }}'
        search_regex: OpenSSH
    - name: "Smardigo Patchday: start services"
      community.docker.docker_compose:
        project_src: '{{ item.path | dirname }}'
        state: present
      loop: '{{ docker_compose_services.files }}'

- hosts: all,!elastic,!postgres,!k8s_cluster,!iam
  serial: 10
  become: yes
  tasks:
@@ -259,6 +306,22 @@
          until: sma_portal_avail.status in [200]
      rescue:
        - name: "Check SMA-portal dependency << iam-instance >> is reachable"
          become: no
          uri:
            url: "https://{{ stage }}-iam-01.{{ domain }}/api/v1/roles"
            method: GET
            status_code: [403]
          register: iam_avail
          delay: 10
          retries: 10
          no_log: true
          until: iam_avail.status in [403]
          ignore_errors: yes # noqa ignore-errors
          # patchday continues in case of a failed request towards the iam service;
          # the iam service is a hard dependency for the SMA-portal instance but not for
          # patchday itself - it's just a workaround
        - name: "Smardigo Patchday: SMA-portal not reachable - shutdown services"
          community.docker.docker_compose:
            project_src: '{{ item.path | dirname }}'
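For a quick manual version of the reachability probe above, the same endpoint can be queried from any host that can reach the stage; the concrete stage and domain values come from the inventory, so "qa" and "example.org" below are purely illustrative:

  curl -s -o /dev/null -w '%{http_code}\n' https://qa-iam-01.example.org/api/v1/roles
  # 403 from an unauthenticated request is the expected answer and is what the rescue task treats as "iam is up"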

@@ -23,6 +23,7 @@
  vars:
    username: "{{ awx_ansible_username }}"
    password: "{{ awx_ansible_password }}"
    is_system_auditor: "true"
  uri:
    url: "{{ awx_base_url }}/api/v2/users/"
    method: POST

@@ -0,0 +1,80 @@
---
- name: "Creating some hcloud volumes for LVM purpose"
  hcloud_volume:
    api_token: "{{ hetzner_authentication_ansible }}"
    name: "{{ lvm_with_hetzner_volumes__volprefix }}-{{ inventory_hostname }}--vol{{ item }}"
    server: "{{ inventory_hostname }}"
    labels:
      stage: "{{ stage }}"
      used_for: "{{ lvm_with_hetzner_volumes__volprefix }}"
      bound_on: "{{ inventory_hostname }}"
      vol_no: "{{ item | string }}"
    size: "{{ lvm_with_hetzner_volumes__volsize }}"
    state: present
    delete_protection: yes
  loop: "{{ range(1, lvm_with_hetzner_volumes__volcount | int + 1) | list }}"
  register: created_volume
  delegate_to: localhost
  become: false

- name: "Getting all hcloud volumes for {{ inventory_hostname }}"
  hcloud_volume_info:
    api_token: "{{ hetzner_authentication_ansible }}"
    label_selector: "stage={{ stage }},used_for={{ lvm_with_hetzner_volumes__volprefix }},bound_on={{ inventory_hostname }}"
  register: hcloud_volumes_found
  delegate_to: localhost
  become: false

- name: "Getting all hcloud volumes for {{ inventory_hostname }}"
  debug:
    msg: "{{ hcloud_volumes_found }}"

- name: "Setting LVM related VARs"
  set_fact:
    pvs: "{{ hcloud_volumes_found.hcloud_volume_info | json_query(jmesquery) }}"
    vg_name: "vg.{{ lvm_with_hetzner_volumes__volprefix }}"
    lv_name: "lv.{{ lvm_with_hetzner_volumes__volprefix }}"
  vars:
    jmesquery: "[*].linux_device"

- name: "Creating a volume group on top of all found hcloud volumes"
  community.general.lvg:
    vg: "{{ vg_name }}"
    pvs: "{{ pvs }}"
    pvresize: yes
  register: create_vg

- name: "Create logical volume" # noqa no-handler
  community.general.lvol:
    vg: "{{ vg_name }}"
    lv: "{{ lv_name }}"
    size: '100%PVS'
  when:
    - create_vg.changed

- name: "Format volume"
  filesystem:
    fstype: ext4
    dev: "/dev/{{ vg_name }}/{{ lv_name }}"

- name: "Resize volume" # noqa no-handler
  filesystem:
    fstype: ext4
    dev: "/dev/{{ vg_name }}/{{ lv_name }}"
    resizefs: yes
  when:
    - create_vg.changed

# noqa 'risky-file-permissions' is set because file permissions/ownership for the mount path are unknown here;
# they must be set in the role in which this role is called!!!
- name: "Ensure mountpath exists without setting permission/ownership" # noqa risky-file-permissions
  file:
    path: "{{ lvm_with_hetzner_volumes__mountpath }}"
    state: directory

- name: "Mount created LVM volume"
  mount:
    path: "{{ lvm_with_hetzner_volumes__mountpath }}"
    src: "/dev/{{ vg_name }}/{{ lv_name }}"
    fstype: ext4
    state: mounted
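After a successful run the result can be sanity-checked on the target host roughly like this; the vg./lv. names follow the prefix scheme above, and postgres_datadir with its mount path is just one example of what the callers further down pass in:

  vgs vg.postgres_datadir
  lvs vg.postgres_datadir
  df -h /var/lib/postgresql    # whatever lvm_with_hetzner_volumes__mountpath points to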

@@ -24,6 +24,10 @@ database_state: present
postgres_homedir: '/var/lib/postgresql'
postgres_pgdatadir_lvm_hcloudvol_size: 10
postgres_pgdatadir_lvm_hcloudvol_count: 1
postgres_pgdatadir_lvm_hcloudvol_mountpath: '{{ postgres_homedir }}'
postgres_listen_addresses: "listen_addresses = 'localhost,{{ stage_private_server_ip }}'"
postgres_base_config:

@@ -12,6 +12,14 @@
    system: true
    shell: /bin/bash

- name: "Ensure postgres_homedir exists"
  file:
    path: "{{ postgres_homedir }}"
    state: directory
    owner: postgres
    group: postgres
    mode: "0755"

- name: "Ensuring repository meta is installed"
  apt:
    name: ["debian-keyring", "debian-archive-keyring", "apt-transport-https"]

@@ -2,6 +2,15 @@
### tags:

- name: "Create/Resize LVM for datadir"
  include_role:
    name: lvm_with_hetzner_volumes
  vars:
    lvm_with_hetzner_volumes__volprefix: postgres_datadir
    lvm_with_hetzner_volumes__volsize: "{{ postgres_pgdatadir_lvm_hcloudvol_size }}"
    lvm_with_hetzner_volumes__volcount: "{{ postgres_pgdatadir_lvm_hcloudvol_count }}"
    lvm_with_hetzner_volumes__mountpath: "{{ postgres_pgdatadir_lvm_hcloudvol_mountpath }}"

# Minimal requirements for postgres
- name: Include Base Requirements
  include_tasks: base-requirements.yml

@@ -1,4 +1,9 @@
---
prometheus_lvm_hcloudvol_size: 30
prometheus_lvm_hcloudvol_count: 1
prometheus_lvm_hcloudvol_mountpath: '/prometheus_datadir'
prometheus_datadir: "{{ prometheus_lvm_hcloudvol_mountpath }}"
# https://github.com/prometheus/prometheus
prometheus_image_name: "prom/prometheus"

@@ -4,6 +4,15 @@
### update_config
### update_deployment

- name: "Create/Resize LVM for datadir"
  include_role:
    name: lvm_with_hetzner_volumes
  vars:
    lvm_with_hetzner_volumes__volprefix: prometheus_datadir
    lvm_with_hetzner_volumes__volsize: "{{ prometheus_lvm_hcloudvol_size }}"
    lvm_with_hetzner_volumes__volcount: "{{ prometheus_lvm_hcloudvol_count }}"
    lvm_with_hetzner_volumes__mountpath: "{{ prometheus_lvm_hcloudvol_mountpath }}"

- name: "Setup DNS configuration for <{{ inventory_hostname }}>"
  include_role:
    name: sma_digitalocean
@@ -141,3 +150,22 @@
    job: /root/do_too_many_req_metrics.sh
  tags:
    - update-do-metrics

- name: "Create hetzner-metrics script from template"
  template:
    src: hetzner_unattached_volumes.py.j2
    dest: /root/hetzner_unattached_volumes.py
    mode: 0700
    owner: root
    group: root
  tags:
    - update-hetzner-metrics

- name: "Create cronjob to exec hetzner-metrics script"
  ansible.builtin.cron:
    name: "update hetzner metrics"
    minute: "*"
    job: /root/hetzner_unattached_volumes.py
  tags:
    - update-hetzner-metrics
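For reference, the cron task above should leave a root crontab entry along these lines; the "#Ansible:" marker line is how the cron module identifies entries it manages:

  #Ansible: update hetzner metrics
  * * * * * /root/hetzner_unattached_volumes.py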

@@ -0,0 +1,53 @@
#!/usr/bin/env python3
import requests

access_token = '{{ hetzner_authentication_ansible_vault }}'
node_exporter_txt_dir = '/var/lib/prometheus/node-exporter'
metrics_file = node_exporter_txt_dir + "/hetzner_metrics.prom"
query = {'per_page': '1000'}
headers = {'Authorization': 'Bearer ' + access_token}

try:
    response_volumes = requests.get("https://api.hetzner.cloud/v1/volumes", headers=headers, params=query)
    response_servers = requests.get("https://api.hetzner.cloud/v1/servers", headers=headers, params=query)
except requests.exceptions.RequestException as e:
    # on any request error write an empty metrics file and exit with the error
    with open(metrics_file, "w") as f:
        f.write("\n")
    raise SystemExit(e)

if response_volumes.ok and response_servers.ok:
    # count volumes that are not attached to any server
    unattached_volume_count = 0
    for volume in response_volumes.json()["volumes"]:
        if volume["server"] is None:
            unattached_volume_count += 1
    # count locked servers ("locked" is a JSON boolean, not the string "true")
    locked_servers_count = 0
    for server in response_servers.json()["servers"]:
        if server["locked"]:
            locked_servers_count += 1
    ratelimit_limit = response_servers.headers['ratelimit-limit']
    ratelimit_remaining = response_servers.headers['ratelimit-remaining']
    with open(metrics_file, "w") as f:
        f.write(
            "hetzner_api_ratelimit_remaining " + str(ratelimit_remaining) + "\n"
            + "hetzner_api_ratelimit_limit " + str(ratelimit_limit) + "\n"
            + "hetzner_api_unattached_volumes " + str(unattached_volume_count) + "\n"
            + "hetzner_api_locked_servers " + str(locked_servers_count) + "\n"
        )
else:
    with open(metrics_file, "w") as f:
        f.write("\n")
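When both API calls succeed, the script writes a node-exporter textfile of the following shape (values are illustrative):

  hetzner_api_ratelimit_remaining 3598
  hetzner_api_ratelimit_limit 3600
  hetzner_api_unattached_volumes 0
  hetzner_api_locked_servers 0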

@@ -21,9 +21,6 @@ prometheus_docker: {
    },
  ],
  volumes: [
    {
      name: "{{ prometheus_id }}-data"
    },
    {
      name: "{{ alertmanager_id }}-data"
    },
@@ -56,7 +53,7 @@ prometheus_docker: {
  ],
  volumes: [
    '"./config/prometheus/:/etc/prometheus/:ro"',
    '"{{ prometheus_id }}-data:/prometheus"',
    '"{{ (prometheus_datadir + "/_data") | default(prometheus_id + "-data") }}:/prometheus"',
  ],
  networks: [
    '"back-tier"',

@@ -4,8 +4,6 @@
    path: '{{ selfsigned_ca_cert_private_key | dirname }}'
    state: directory
    mode: '0755'
    owner: root
    group: root

- name: "Generate an OpenSSL private key"
  community.crypto.openssl_privatekey:

@@ -5,7 +5,7 @@ dev-backup-01
dev-management-01
[pdns]
dev-pdns-01
#dev-pdns-01
[elastic]
dev-elastic-stack-elastic-01

@@ -1,23 +1,3 @@
[backup]
prodwork01-backup-01
[elastic]
prodwork01-elastic-stack-elastic-01
prodwork01-elastic-stack-elastic-02
prodwork01-elastic-stack-elastic-03
[kibana]
prodwork01-elastic-stack-kibana-01
[logstash]
prodwork01-elastic-stack-logstash-01
[maria]
prodwork01-maria-01
[postgres]
prodwork01-postgres-01
prodwork01-postgres-02
[kube_control_plane]
prodwork01-kube-cpl-01
prodwork01-kube-cpl-02
@@ -38,13 +18,7 @@ kube_control_plane
kube_node
[stage_prodwork01:children]
backup
elastic
k8s_cluster
kibana
logstash
maria
postgres
[all:children]
stage_prodwork01

@@ -5,7 +5,7 @@ qa-backup-01
qa-management-01
[pdns]
qa-pdns-01
#qa-pdns-01
[elastic]
qa-elastic-stack-elastic-01

@@ -350,3 +350,63 @@ groups:
        summary: "Elasticsearch health status is not green. Please Check"
        description: "Alert for Elasticsearch health status"

    - alert: awx job failed with status error
      expr: changes(awx_status_total{status="error"}[2m]) > 0
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: "awx job failed with status error"
        description: "An AWX job reported status error"
    - alert: awx job failed with status failed
      expr: changes(awx_status_total{status="failed"}[2m]) > 0
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: "awx job failed with status failed"
        description: "An AWX job reported status failed"
    - alert: postgres backup zombies
      expr: 100 - ((node_filesystem_avail_bytes{instance=~"{{ stage }}-postgres-01.smardigo.digital",job=~"node-exporter",device='/dev/mapper/vg.postgres_backup-lv.postgres_backup'} * 100) / node_filesystem_size_bytes{instance=~"{{ stage }}-postgres-01.smardigo.digital",job=~"node-exporter",device='/dev/mapper/vg.postgres_backup-lv.postgres_backup'}) > 10
      for: 2h
      labels:
        severity: critical
      annotations:
        identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
        summary: "postgres backup zombies have not been deleted"
        description: "postgres backup zombies have not been deleted"
    - alert: hetzner unattached volumes
      expr: hetzner_api_unattached_volumes > 0 or absent(hetzner_api_unattached_volumes)
      for: 2h
      labels:
        severity: critical
      annotations:
        identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
        summary: "unattached volumes in hetzner"
        description: "unattached volumes in hetzner"
    - alert: hetzner ratelimit_remaining low
      expr: hetzner_api_ratelimit_remaining < 720 or absent(hetzner_api_ratelimit_remaining)
      for: 1h
      labels:
        severity: critical
      annotations:
        identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
        summary: "hetzner ratelimit_remaining below 720"
        description: "hetzner ratelimit_remaining below 720"
    - alert: hetzner locked server exists
      expr: hetzner_api_locked_servers > 0 or absent(hetzner_api_locked_servers)
      for: 1h
      labels:
        severity: critical
      annotations:
        identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
        summary: "hetzner locked server exists"
        description: "hetzner locked server exists"
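A quick way to validate the extended rule set before the rollout, assuming promtool is available and the template has been rendered to a local file (alert.rules.yml is just a placeholder name):

  promtool check rules alert.rules.yml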

@@ -452,3 +452,18 @@ scrape_configs:
    static_configs:
      - targets: ['{{ kubernetes_prometheus_endpoint }}']
{% endif %}

##############################################
### awx ####
##############################################
  - job_name: 'awx'
    metrics_path: '/api/v2/metrics'
    scrape_interval: 5s
    scheme: https
    basic_auth:
      username: '{{ awx_ansible_username }}'
      password: '{{ awx_ansible_password }}'
    static_configs:
      - targets: ['{{ shared_service_kube_awx_hostname }}']
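The new scrape target can be probed by hand with the same basic-auth credentials; the hostname below is a placeholder for whatever shared_service_kube_awx_hostname resolves to, and the credentials are the awx_ansible_username/awx_ansible_password pair from the group vars:

  curl -s -u "<awx_ansible_username>:<awx_ansible_password>" https://awx.example.org/api/v2/metrics | head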
