diff --git a/roles/backup/files/pull_remote_backups.sh b/roles/backup/files/pull_remote_backups.sh
index 24edf1f..2cb728b 100644
--- a/roles/backup/files/pull_remote_backups.sh
+++ b/roles/backup/files/pull_remote_backups.sh
@@ -3,20 +3,74 @@
 #
 #
 
+# Fail fast and be aware of exit codes
+set -euo pipefail
+
+# Abort with a usage hint when a positional argument is missing
+if [ "$#" -lt 3 ]; then
+  echo "Usage: $0 <database_server_ip> <stage> <database_engine>" >&2
+  exit 2
+fi
+
 # Define some variables
+DATE=$(date +%F)
+DATE_TIME=$(date +%F_%H:%M)
 REMOTE_SYSTEM_USER=backupuser
 DATABASE_SERVER_IP=$1
 STAGE=$2
 DATABASE_ENGINE=$3
 DEST_DIR=${HOME}/backups/${STAGE}/${DATABASE_ENGINE}
+# Path prefix of the status file which AWX creates after a successful database backup
+BACKUP_STATUS_PREFIX=${DEST_DIR}/${DATE}/backup_finished_${DATE}_
+METRICS_FILE=${HOME}/backup_status_${DATABASE_ENGINE}.prom
+LOG_FILE=${DEST_DIR}/backup_${DATE_TIME}.log
+# 0 = backup successful, 1 = backup failed (the alert rules fire on values > 0).
+# Start pessimistic so that every early exit is reported as a failure.
+NIGHTLY_BACKUP_SUCCESSFUL=1
+
+# Write the Prometheus metrics file and the closing log line on every exit
+# path, including aborts caused by "set -e"
+write_backup_status() {
+  # Write to a temporary file and rename it, so that node_exporter never reads
+  # a half-written metrics file
+  cat <<EOF > "${METRICS_FILE}.tmp"
+# HELP nightly_backup_successful_${DATABASE_ENGINE} Nightly backup pull status (0 = success, 1 = failure)
+# TYPE nightly_backup_successful_${DATABASE_ENGINE} gauge
+nightly_backup_successful_${DATABASE_ENGINE}{stage="$STAGE"} $NIGHTLY_BACKUP_SUCCESSFUL
+EOF
+  mv "${METRICS_FILE}.tmp" "${METRICS_FILE}"
+  echo "----- End backup Sync - $(date +%F_%H:%M) -----"
+}
+trap write_backup_status EXIT
+
+# Create backup directory ${DEST_DIR} if not exist. This has to happen before
+# the redirect below, because ${LOG_FILE} is located inside ${DEST_DIR}
+mkdir -p "${DEST_DIR}"
+
+# Redirect stderr to stdout and save everything to log file
+exec > "${LOG_FILE}" 2>&1
 
-# Exit on ${DEST_DIR} not found
-[ ! -d "$DEST_DIR" ] && exit 1
+# Log backup sync start time
+echo "----- Start backup Sync - ${DATE_TIME} -----"
 
-# Remove files oder than 48h in ${DEST_DIR}
-find $DEST_DIR -type d -mtime +1 -print0 | xargs -I OLD_DIR -0 rm -rf "OLD_DIR"
-[ "$?" != "0" ] && exit 1
+# Remove directories older than 48h below ${DEST_DIR}. "-mindepth 1" protects
+# ${DEST_DIR} itself, "-prune" keeps find from descending into directories
+# that are about to be deleted. A failure aborts the script via "set -e".
+find "${DEST_DIR}" -mindepth 1 -type d -mtime +1 -prune -exec rm -rf -- {} +
 
-# Create backup directory ${DEST_DIR} if not exist and rsync from ${DATABASE_SERVER_IP} to ${DEST_DIR}/
-mkdir -p ${DEST_DIR}
-rsync -av --remove-source-files -e "ssh -o StrictHostKeyChecking=no" ${REMOTE_SYSTEM_USER}@${DATABASE_SERVER_IP}:/backups/${DATABASE_ENGINE}/ ${DEST_DIR}/
+# Start rsync job from ${DATABASE_SERVER_IP} to ${DEST_DIR}/. A non-zero rsync
+# exit code aborts the script ("set -e"); the EXIT trap then reports a failure.
+# NOTE(review): "StrictHostKeyChecking=no" disables SSH host key verification;
+# consider a managed known_hosts entry instead.
+rsync -av --remove-source-files -e "ssh -o StrictHostKeyChecking=no" \
+  "${REMOTE_SYSTEM_USER}@${DATABASE_SERVER_IP}:/backups/${DATABASE_ENGINE}/" "${DEST_DIR}/"
+
+# Check existence of the current status file, which is created by AWX in case
+# of a successful database backup only. Globbing happens in the for loop, so
+# zero or multiple matching files are handled without a test(1) syntax error.
+for status_file in "${BACKUP_STATUS_PREFIX}"*; do
+  if [ -f "${status_file}" ]; then
+    NIGHTLY_BACKUP_SUCCESSFUL=0
+    break
+  fi
+done
diff --git a/roles/backup/tasks/main.yml b/roles/backup/tasks/main.yml
index 4db1552..aa66181 100644
--- a/roles/backup/tasks/main.yml
+++ b/roles/backup/tasks/main.yml
@@ -8,7 +8,6 @@
     shell: /bin/bash
   register: create_user
 
-
 - name: "Create .ssh dir and backups dir"
   become: yes
   file:
@@ -32,8 +31,6 @@
     lvm_with_hetzner_volumes__volcount: "{{ backup_lvm_hcloudvol_count }}"
     lvm_with_hetzner_volumes__mountpath: "{{ backup_lvm_hcloudvol_mountpath }}"
 
-
-
 - name: "Providing SSH priv.key"
   no_log: true
   become: yes
@@ -64,9 +61,36 @@
     owner: '{{ system_user }}'
     group: '{{ system_user }}'
 
+- name: Touch backup_status_maria.prom if not exists
+  file:
+    path: "/home/{{ system_user }}/backup_status_maria.prom"
+    state: touch
+    mode: '0744'
+    owner: '{{ system_user }}'
+    group: '{{ system_user }}'
+
+- name: Touch backup_status_postgres.prom if not exists
+  file:
+    path: "/home/{{ system_user }}/backup_status_postgres.prom"
+    state: touch
+    mode: '0744'
+    owner: '{{ system_user }}'
+    group: '{{ system_user }}'
+
 - name: Create symbolic link for node_exporter text metrics
   file:
     src: "/home/{{ system_user }}/metrics.prom"
     dest: "/var/lib/prometheus/node-exporter/offsite-metrics.prom"
     state: link
 
+- name: Create symbolic link for node_exporter text metrics backup_status_maria
+  file:
+    src: "/home/{{ system_user }}/backup_status_maria.prom"
+    dest: "/var/lib/prometheus/node-exporter/backup_status_maria.prom"
+    state: link
+
+- name: Create symbolic link for node_exporter text metrics backup_status_postgres
+  file:
+    src: "/home/{{ system_user }}/backup_status_postgres.prom"
+    dest: "/var/lib/prometheus/node-exporter/backup_status_postgres.prom"
+    state: link
diff --git a/templates/prometheus/config/prometheus/alert.rules.j2 b/templates/prometheus/config/prometheus/alert.rules.j2
index 12a91a0..4495365 100644
--- a/templates/prometheus/config/prometheus/alert.rules.j2
+++ b/templates/prometheus/config/prometheus/alert.rules.j2
@@ -129,6 +129,26 @@ groups:
       summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed."
       description: "backup failed."
 
+  - alert: nightly_backup_failed_maria
+    expr: nightly_backup_successful_maria > 0 or absent(nightly_backup_successful_maria)
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
+      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> nightly backup failed."
+      description: "MariaDB backup failed."
+
+  - alert: nightly_backup_failed_postgres
+    expr: nightly_backup_successful_postgres > 0 or absent(nightly_backup_successful_postgres)
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
+      summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> nightly backup failed."
+      description: "PostgreSQL backup failed."
+
   - alert: megaraid_smart_errors
     expr: megaraid_smart_errors > 1
     for: 2m