From e1d05f5e81b1503f734d8e171b267a810461f23c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=B6rz=2C=20Friedrich?= Date: Wed, 7 Dec 2022 15:50:31 +0000 Subject: [PATCH] DEV-721: exclude restore-servers from patchday - avoiding broken... --- patchday.yml | 2 +- roles/backup/files/pull_remote_backups.sh | 13 ++++++++++++- .../prometheus/config/prometheus/alert.rules.j2 | 8 ++++++-- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/patchday.yml b/patchday.yml index ad154e8..27036b4 100644 --- a/patchday.yml +++ b/patchday.yml @@ -235,7 +235,7 @@ state: present loop: '{{ docker_compose_services.files }}' -- hosts: all,!elastic,!postgres,!k8s_cluster,!iam +- hosts: all,!elastic,!postgres,!k8s_cluster,!iam,!restore serial: 10 become: yes tasks: diff --git a/roles/backup/files/pull_remote_backups.sh b/roles/backup/files/pull_remote_backups.sh index 2cb728b..0ffb791 100644 --- a/roles/backup/files/pull_remote_backups.sh +++ b/roles/backup/files/pull_remote_backups.sh @@ -14,7 +14,6 @@ DATABASE_SERVER_IP=$1 STAGE=$2 DATABASE_ENGINE=$3 DEST_DIR=${HOME}/backups/${STAGE}/${DATABASE_ENGINE} -BACKUP_STATUS_FILE=${DEST_DIR}/${DATE}/backup_finished_${DATE}_* METRICS_FILE=${HOME}/backup_status_${DATABASE_ENGINE}.prom LOG_FILE=${DEST_DIR}/backup_${DATE_TIME}.log @@ -31,19 +30,31 @@ mkdir -p ${DEST_DIR} find $DEST_DIR -type d -mtime +1 -print0 | xargs -I OLD_DIR -0 rm -rf "OLD_DIR" [ "$?" != "0" ] && exit 1 +echo "Removing logfiles older than 7d ..." +find $DEST_DIR -type f -mtime +7 -name "backup_*.log" -print0 | xargs -I OLD_FILES -0 rm -rf "OLD_FILES" + # Start rsync job from ${DATABASE_SERVER_IP} to ${DEST_DIR}/ rsync -av --remove-source-files -e "ssh -o StrictHostKeyChecking=no" ${REMOTE_SYSTEM_USER}@${DATABASE_SERVER_IP}:/backups/${DATABASE_ENGINE}/ ${DEST_DIR}/ [ "$?" -eq "0" ] && NIGHTLY_BACKUP_SUCCESSFUL="0" || NIGHTLY_BACKUP_SUCCESSFUL="1" +BACKUP_STATUS_FILE=$(ls -t1 ${DEST_DIR}/${DATE}/backup_finished_${DATE}_* | head -n1) # Check existence of current ${BACKUP_STATUS_FILE}, which is created by AWX, in case of succesful database backup only. [ -f ${BACKUP_STATUS_FILE} ] && NIGHTLY_BACKUP_SUCCESSFUL="0" || NIGHTLY_BACKUP_SUCCESSFUL="1" # Add backup status to Prometheus metrics file +if [ "$NIGHTLY_BACKUP_SUCCESSFUL" -eq "0" ]; then + echo "NIGHTLY_BACKUP_SUCCESSFUL=0 - writing METRICS_FILE" cat < $METRICS_FILE # HELP nightly_backup_successful_${DATABASE_ENGINE} # TYPE nightly_backup_successful_${DATABASE_ENGINE} gauge nightly_backup_successful_${DATABASE_ENGINE}{stage="$STAGE"} $NIGHTLY_BACKUP_SUCCESSFUL +nightly_backup_successful_${DATABASE_ENGINE}_finished_seconds{stage="$STAGE"} `date +%s` EOF +else + echo "NIGHTLY_BACKUP_SUCCESSFUL=1 - removing METRICS_FILE to trigger alert" + rm $METRICS_FILE +fi + # Log backup sync end time echo "----- End backup Sync - ${DATE_TIME} -----" diff --git a/templates/prometheus/config/prometheus/alert.rules.j2 b/templates/prometheus/config/prometheus/alert.rules.j2 index 273cc05..9d8f57b 100644 --- a/templates/prometheus/config/prometheus/alert.rules.j2 +++ b/templates/prometheus/config/prometheus/alert.rules.j2 @@ -130,7 +130,9 @@ groups: description: "backup failed." - alert: nightly_backup_failed_maria - expr: nightly_backup_successful_maria > 0 or absent(nightly_backup_successful_maria) + expr: | + (time() - nightly_backup_successful_maria_finished_seconds) > 30 * 3600 or + absent(nightly_backup_successful_maria_finished_seconds) for: 2m labels: severity: critical @@ -140,7 +142,9 @@ groups: description: "MariaDB backup failed." - alert: nightly_backup_failed_postgres - expr: nightly_backup_successful_postgres > 0 or absent(nightly_backup_successful_postgres) + expr: | + (time() - nightly_backup_successful_postgres_finished_seconds) > 30 * 3600 or + absent(nightly_backup_successful_postgres_finished_seconds) for: 2m labels: severity: critical