DEV-624 New alert for failed db backups

feature/DEV-655
Michael Hähnel 3 years ago committed by Görz, Friedrich
parent f754404845
commit 87a286dd60

@ -3,20 +3,47 @@
# #
# #
# Fail fast and be aware of exit codes:
#   -e           abort as soon as a command exits with a non-zero status
#   -u           treat the expansion of an unset variable as an error
#   -o pipefail  a pipeline fails if any command inside it fails
set -euo pipefail
# Define some variables # Define some variables
# Take a single timestamp and derive the date from it, so that DATE and
# DATE_TIME cannot disagree when the script starts right around midnight.
DATE_TIME=$(date +%F_%H:%M)
# Strip the "_HH:MM" suffix, leaving only the YYYY-MM-DD part.
DATE=${DATE_TIME%%_*}
REMOTE_SYSTEM_USER=backupuser REMOTE_SYSTEM_USER=backupuser
DATABASE_SERVER_IP=$1 DATABASE_SERVER_IP=$1
STAGE=$2 STAGE=$2
DATABASE_ENGINE=$3 DATABASE_ENGINE=$3
DEST_DIR=${HOME}/backups/${STAGE}/${DATABASE_ENGINE} DEST_DIR=${HOME}/backups/${STAGE}/${DATABASE_ENGINE}
# Glob pattern for the status file of today's backup. The pattern is stored
# unexpanded (no pathname expansion happens in an assignment) and is only
# expanded at the place where it is used.
BACKUP_STATUS_FILE=${DEST_DIR}/${DATE}/backup_finished_${DATE}_*
# Prometheus text-format file this script writes the backup status to
METRICS_FILE=${HOME}/backup_status_${DATABASE_ENGINE}.prom
# Per-run log file, stored inside the backup destination directory
LOG_FILE=${DEST_DIR}/backup_${DATE_TIME}.log
# The log file lives inside ${DEST_DIR}: make sure that directory exists first,
# otherwise the redirect below fails and the script aborts on a fresh host.
mkdir -p "${DEST_DIR}"
# Redirect stderr to stdout and save everything to log file
exec > "${LOG_FILE}" 2>&1
# Exit on ${DEST_DIR} not found # Log backup sync start time
[ ! -d "$DEST_DIR" ] && exit 1 echo "----- Start backup Sync - ${DATE_TIME} -----"
# Create backup directory ${DEST_DIR} if it does not exist. The path is quoted
# so that a value containing whitespace is passed as one single argument.
mkdir -p "${DEST_DIR}"
# Remove files older than 48h in ${DEST_DIR} # Remove files older than 48h in ${DEST_DIR}
find $DEST_DIR -type d -mtime +1 -print0 | xargs -I OLD_DIR -0 rm -rf "OLD_DIR" find $DEST_DIR -type d -mtime +1 -print0 | xargs -I OLD_DIR -0 rm -rf "OLD_DIR"
[ "$?" != "0" ] && exit 1 [ "$?" != "0" ] && exit 1
# Create backup directory ${DEST_DIR} if not exist and rsync from ${DATABASE_SERVER_IP} to ${DEST_DIR}/ # Start rsync job from ${DATABASE_SERVER_IP} to ${DEST_DIR}/
mkdir -p ${DEST_DIR}
rsync -av --remove-source-files -e "ssh -o StrictHostKeyChecking=no" ${REMOTE_SYSTEM_USER}@${DATABASE_SERVER_IP}:/backups/${DATABASE_ENGINE}/ ${DEST_DIR}/ rsync -av --remove-source-files -e "ssh -o StrictHostKeyChecking=no" ${REMOTE_SYSTEM_USER}@${DATABASE_SERVER_IP}:/backups/${DATABASE_ENGINE}/ ${DEST_DIR}/
# Exit status of the command directly above (the rsync job).
# NOTE(review): if `set -e` is active, a failing rsync aborts the script before
# this line is reached, so the metrics file would keep its previous value in
# that case - confirm whether that is intended.
RSYNC_EXIT_CODE=$?
# Value convention used by this script: 0 = backup OK, 1 = backup failed.
# Start pessimistic and only report success when every check passed.
NIGHTLY_BACKUP_SUCCESSFUL="1"
# Check existence of the current ${BACKUP_STATUS_FILE}, which is created by AWX in case of a successful database backup only.
# Success requires BOTH a successful rsync AND an existing status file; the
# rsync result must not be overwritten by the status file check.
if [ "${RSYNC_EXIT_CODE}" -eq 0 ]; then
  # Intentionally unquoted so the glob expands. Looping over the matches keeps
  # the check working when more than one status file exists for the day.
  for status_file in ${BACKUP_STATUS_FILE}; do
    if [ -f "${status_file}" ]; then
      NIGHTLY_BACKUP_SUCCESSFUL="0"
      break
    fi
  done
fi
# Add backup status to Prometheus metrics file (the previous content is overwritten).
# The redirect target is quoted so that an unusual path cannot turn into an
# "ambiguous redirect" error.
cat <<EOF > "${METRICS_FILE}"
# HELP nightly_backup_successful_${DATABASE_ENGINE}
# TYPE nightly_backup_successful_${DATABASE_ENGINE} gauge
nightly_backup_successful_${DATABASE_ENGINE}{stage="$STAGE"} $NIGHTLY_BACKUP_SUCCESSFUL
EOF
# Log backup sync end time. The timestamp is taken now, because ${DATE_TIME}
# still holds the time at which the script was started.
echo "----- End backup Sync - $(date +%F_%H:%M) -----"

@ -8,7 +8,6 @@
shell: /bin/bash shell: /bin/bash
register: create_user register: create_user
- name: "Create .ssh dir and backups dir" - name: "Create .ssh dir and backups dir"
become: yes become: yes
file: file:
@ -32,8 +31,6 @@
lvm_with_hetzner_volumes__volcount: "{{ backup_lvm_hcloudvol_count }}" lvm_with_hetzner_volumes__volcount: "{{ backup_lvm_hcloudvol_count }}"
lvm_with_hetzner_volumes__mountpath: "{{ backup_lvm_hcloudvol_mountpath }}" lvm_with_hetzner_volumes__mountpath: "{{ backup_lvm_hcloudvol_mountpath }}"
- name: "Providing SSH priv.key" - name: "Providing SSH priv.key"
no_log: true no_log: true
become: yes become: yes
@ -64,9 +61,36 @@
owner: '{{ system_user }}' owner: '{{ system_user }}'
group: '{{ system_user }}' group: '{{ system_user }}'
# Make sure the MariaDB backup status file exists, so that the symlink created
# for node_exporter has a target.
- name: Touch backup_status_maria.prom if not exists
  file:
    path: "/home/{{ system_user }}/backup_status_maria.prom"
    state: touch
    # Plain data file: no execute bit required
    mode: '0644'
    owner: '{{ system_user }}'
    group: '{{ system_user }}'
    # Keep the existing timestamps so that repeated runs report no change
    modification_time: preserve
    access_time: preserve
# Make sure the PostgreSQL backup status file exists, so that the symlink
# created for node_exporter has a target.
- name: Touch backup_status_postgres.prom if not exists
  file:
    path: "/home/{{ system_user }}/backup_status_postgres.prom"
    state: touch
    # Plain data file: no execute bit required
    mode: '0644'
    owner: '{{ system_user }}'
    group: '{{ system_user }}'
    # Keep the existing timestamps so that repeated runs report no change
    modification_time: preserve
    access_time: preserve
- name: Create symbolic link for node_exporter text metrics - name: Create symbolic link for node_exporter text metrics
file: file:
src: "/home/{{ system_user }}/metrics.prom" src: "/home/{{ system_user }}/metrics.prom"
dest: "/var/lib/prometheus/node-exporter/offsite-metrics.prom" dest: "/var/lib/prometheus/node-exporter/offsite-metrics.prom"
state: link state: link
# Link the MariaDB backup status file from the system user's home directory
# into the node-exporter directory.
- name: Create symbolic link for node_exporter text metrics backup_status_maria
  file:
    state: link
    src: '/home/{{ system_user }}/backup_status_maria.prom'
    dest: '/var/lib/prometheus/node-exporter/backup_status_maria.prom'
# Link the PostgreSQL backup status file from the system user's home directory
# into the node-exporter directory.
- name: Create symbolic link for node_exporter text metrics backup_status_postgres
  file:
    state: link
    src: '/home/{{ system_user }}/backup_status_postgres.prom'
    dest: '/var/lib/prometheus/node-exporter/backup_status_postgres.prom'

@ -129,6 +129,26 @@ groups:
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed." summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed."
description: "backup failed." description: "backup failed."
# Fires when the MariaDB backup metric is greater than 0 or when the metric is
# not exposed at all, for at least 2 minutes.
# NOTE(review): absent() yields a series without an instance label, so
# $labels.instance is probably empty in the annotations for that case - confirm.
- alert: nightly_backup_failed_maria
  expr: nightly_backup_successful_maria > 0 or absent(nightly_backup_successful_maria)
  for: 2m
  annotations:
    description: 'MariaDB backup failed.'
    summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> nightly backup failed."
    identifier: "{{ '{{' }} $labels.instance {{ '}}' }}"
  labels:
    severity: critical
# Fires when the PostgreSQL backup metric is greater than 0 or when the metric
# is not exposed at all, for at least 2 minutes.
# NOTE(review): absent() yields a series without an instance label, so
# $labels.instance is probably empty in the annotations for that case - confirm.
- alert: nightly_backup_failed_postgres
  expr: nightly_backup_successful_postgres > 0 or absent(nightly_backup_successful_postgres)
  for: 2m
  annotations:
    description: 'PostgreSQL backup failed.'
    summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> nightly backup failed."
    identifier: "{{ '{{' }} $labels.instance {{ '}}' }}"
  labels:
    severity: critical
- alert: megaraid_smart_errors - alert: megaraid_smart_errors
expr: megaraid_smart_errors > 1 expr: megaraid_smart_errors > 1
for: 2m for: 2m

Loading…
Cancel
Save