Files
office_translator/scripts/verify-backups.sh
sepehr 3f980ad537
Some checks failed
Deploy to Production / Build and Deploy (push) Has been cancelled
feat: add NAS backup, verification, and DR scripts
2026-06-07 11:12:01 +02:00

371 lines
12 KiB
Bash

#!/bin/bash
# ==============================================================================
# Wordly.art - Backup Verification & Telegram Alerts
# ==============================================================================
# Runs after every backup to validate integrity and alert on failure.
# CRON: 30 */6 * * * (30 minutes after each backup)
#
# Checks:
# - Recent snapshot exists (< 8h)
# - Snapshot size > 1MB (not empty)
# - Snapshot gzip integrity
# - PostgreSQL container is running and healthy
# - DB contains tables in the public schema (COUNT > 0)
# - NAS is mounted and writable
# - Disk usage < 85%
# - App HTTP health check
# ==============================================================================
set -euo pipefail

# Absolute directory of this script, and the project root one level above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Taken once at start-up so every log line of a single run carries the same stamp.
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")

# ANSI colour codes, stored as literal backslash sequences; they are turned
# into real escape characters only when a log helper prints them.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Logging helpers. Each takes one message ($1) and writes a single line to
# stdout, prefixed with "[Verify <TIMESTAMP>]".

# Plain, uncoloured progress line (backslashes in the message stay literal).
log() {
  printf '%s\n' "[Verify ${TIMESTAMP}] $1"
}

# Green line for a passed check.
log_success() {
  printf '%b\n' "[Verify ${TIMESTAMP}] ${GREEN}$1${NC}"
}

# Yellow line with a WARNING marker.
log_warning() {
  printf '%b\n' "[Verify ${TIMESTAMP}] ${YELLOW}⚠️ WARNING: $1${NC}"
}

# Red line with an ERROR marker.
log_error() {
  printf '%b\n' "[Verify ${TIMESTAMP}] ${RED}❌ ERROR: $1${NC}"
}
# ==============================================================================
# 1. LOAD CONFIGURATION
# ==============================================================================
ENV_FILE="${PROJECT_ROOT}/.env"
if [[ -f "${ENV_FILE}" ]]; then
  # With allexport on, every variable assigned while sourcing the .env file
  # is exported to child processes as well.
  set -o allexport
  . "${ENV_FILE}"
  set +o allexport
fi

# Directories (values from the environment / .env win; otherwise defaults).
: "${NAS_MOUNT:=/mnt/nas-wordly}"
LOCAL_BACKUP_DIR="${BACKUP_DIR:-/opt/wordly/backups}"

# PostgreSQL connection settings.
: "${POSTGRES_CONTAINER:=wordly-postgres}"
: "${POSTGRES_USER:=translate}"
: "${POSTGRES_DB:=translate_db}"
: "${POSTGRES_PASSWORD:=}"

# URL probed by the application health check.
: "${APP_HEALTH_URL:=http://localhost:8001/health}"

# Thresholds used by the individual checks.
MAX_SNAPSHOT_AGE_HOURS=8
MIN_SNAPSHOT_SIZE_MB=1
MAX_DISK_USAGE_PERCENT=85

# Telegram credentials; both default to empty, which disables alerting.
: "${TELEGRAM_BOT_TOKEN:=}"
: "${TELEGRAM_CHAT_ID:=}"

# Running totals updated by the check functions.
FAILURES=0
WARNINGS=0
# ==============================================================================
# 2. TELEGRAM
# ==============================================================================
#######################################
# Send a Markdown-formatted message to the configured Telegram chat.
# Best effort: network or API errors are deliberately ignored so that an
# unreachable Telegram never changes the outcome of the verification run.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text
# Outputs:   a warning via log_warning when Telegram is not configured
# Returns:   always 0
#######################################
send_telegram() {
  local message="$1"

  if [ -z "${TELEGRAM_BOT_TOKEN}" ] || [ -z "${TELEGRAM_CHAT_ID}" ]; then
    log_warning "Telegram not configured (TELEGRAM_BOT_TOKEN or TELEGRAM_CHAT_ID missing)"
    return 0
  fi

  # --data-urlencode keeps characters such as '&', '+' and '%' in the text
  # from corrupting the form body; the timeouts stop a stalled API call from
  # hanging the cron job.
  curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --connect-timeout 10 \
    --max-time 30 \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${message}" \
    -d "parse_mode=Markdown" \
    >/dev/null 2>&1 || true
}
# ==============================================================================
# 3. CHECK FUNCTIONS
# ==============================================================================
#######################################
# Check 1/8: find the newest *.gz snapshot and verify it is recent enough.
# Searches ${LOCAL_BACKUP_DIR}/daily and, when the NAS is mounted,
# ${NAS_MOUNT}/snapshots, and picks the most recently modified file across
# both (on equal modification times the local copy wins).
# Globals:   LOCAL_BACKUP_DIR, NAS_MOUNT, MAX_SNAPSHOT_AGE_HOURS (read),
#            FAILURES (incremented when no snapshot exists, its age cannot
#            be determined, or it is too old)
# Arguments: none
# Outputs:   stdout - ONLY the path of the newest snapshot (nothing when none
#            was found), so callers can capture it;
#            stderr - all log lines
# Returns:   always 0
#######################################
check_recent_snapshot() {
  # Every log call in this function is sent to stderr: stdout is reserved for
  # the snapshot path, which callers read back.
  log "Check 1/8: Recent snapshot exists (< ${MAX_SNAPSHOT_AGE_HOURS}h)..." >&2

  # Look in both NAS and local backup directories
  local search_dirs=("${LOCAL_BACKUP_DIR}/daily")
  if mountpoint -q "${NAS_MOUNT}" 2>/dev/null; then
    search_dirs+=("${NAS_MOUNT}/snapshots")
  fi

  local newest_snapshot="" dir candidate
  for dir in "${search_dirs[@]}"; do
    [ -d "${dir}" ] || continue
    for candidate in "${dir}"/*.gz; do
      # Also skips the literal pattern left behind by an unmatched glob.
      [ -f "${candidate}" ] || continue
      if [ -z "${newest_snapshot}" ] || [ "${candidate}" -nt "${newest_snapshot}" ]; then
        newest_snapshot="${candidate}"
      fi
    done
  done

  if [ -z "${newest_snapshot}" ]; then
    log_error "No snapshot found in backup directories!" >&2
    FAILURES=$((FAILURES + 1))
    return 0
  fi

  # Check age. GNU stat is tried first, BSD stat second; a failure of both
  # is reported as a failed check instead of aborting the script under set -e.
  local snapshot_time
  snapshot_time=$(stat -c %Y "${newest_snapshot}" 2>/dev/null \
    || stat -f %m "${newest_snapshot}" 2>/dev/null) || snapshot_time=""

  if ! [[ "${snapshot_time}" =~ ^[0-9]+$ ]]; then
    log_error "Could not determine age of snapshot: $(basename "${newest_snapshot}")" >&2
    FAILURES=$((FAILURES + 1))
  else
    local now
    now=$(date +%s)
    local age_hours=$(((now - snapshot_time) / 3600))
    if [ "${age_hours}" -ge "${MAX_SNAPSHOT_AGE_HOURS}" ]; then
      log_error "Newest snapshot is ${age_hours}h old (max: ${MAX_SNAPSHOT_AGE_HOURS}h): $(basename "${newest_snapshot}")" >&2
      FAILURES=$((FAILURES + 1))
    else
      log_success "Snapshot found: $(basename "${newest_snapshot}") (${age_hours}h old)" >&2
    fi
  fi

  printf '%s\n' "${newest_snapshot}"
}
#######################################
# Check 2/8: verify the snapshot is at least MIN_SNAPSHOT_SIZE_MB in size.
# Globals:   MIN_SNAPSHOT_SIZE_MB (read),
#            FAILURES (incremented when the file is too small or its size
#            cannot be determined)
# Arguments: $1 - path of the snapshot file (may be empty)
# Outputs:   log lines
# Returns:   always 0
#######################################
check_snapshot_size() {
  local snapshot_path="$1"
  log "Check 2/8: Snapshot size > ${MIN_SNAPSHOT_SIZE_MB}MB..."

  if [ -z "${snapshot_path}" ] || [ ! -f "${snapshot_path}" ]; then
    log_warning "No snapshot to size-check."
    return 0
  fi

  # GNU stat first, BSD stat second. A failure of both must count as a failed
  # check; it must not kill the whole run under set -e before any alert is sent.
  local size_bytes
  size_bytes=$(stat -c %s "${snapshot_path}" 2>/dev/null \
    || stat -f %z "${snapshot_path}" 2>/dev/null) || size_bytes=""
  if ! [[ "${size_bytes}" =~ ^[0-9]+$ ]]; then
    log_error "Could not determine size of snapshot: $(basename "${snapshot_path}")"
    FAILURES=$((FAILURES + 1))
    return 0
  fi

  # numfmt is GNU-only; fall back to the raw byte count where it is missing.
  local human_size
  human_size=$(numfmt --to=iec "${size_bytes}" 2>/dev/null) \
    || human_size="${size_bytes} bytes"

  local min_bytes=$((MIN_SNAPSHOT_SIZE_MB * 1024 * 1024))
  if [ "${size_bytes}" -lt "${min_bytes}" ]; then
    log_error "Snapshot size is ${human_size} which is below minimum ${MIN_SNAPSHOT_SIZE_MB}MB — likely empty dump!"
    FAILURES=$((FAILURES + 1))
  else
    log_success "Snapshot size: ${human_size}"
  fi
}
#######################################
# Check 3/8: test the snapshot archive with `gzip -t`.
# Globals:   FAILURES (incremented when the archive fails the gzip test)
# Arguments: $1 - path of the snapshot file (may be empty)
# Outputs:   log lines
#######################################
check_snapshot_integrity() {
  local archive="$1"
  log "Check 3/8: Snapshot gzip integrity..."

  # Nothing to test when no snapshot path was handed over.
  if [[ -z "${archive}" || ! -f "${archive}" ]]; then
    log_warning "No snapshot to integrity-check."
    return
  fi

  if ! gzip -t "${archive}" 2>/dev/null; then
    log_error "Snapshot is CORRUPTED: $(basename "${archive}")"
    FAILURES=$((FAILURES + 1))
    return
  fi

  log_success "Snapshot gzip integrity OK"
}
#######################################
# Check 4/8: verify the PostgreSQL container is running and reports healthy.
# Globals:   POSTGRES_CONTAINER (read),
#            FAILURES (incremented when the container is not running or its
#            health status is anything other than healthy/unknown),
#            WARNINGS (incremented when the health status is unknown)
# Arguments: none
# Outputs:   log lines
# Returns:   always 0
#######################################
check_postgres_running() {
  log "Check 4/8: PostgreSQL container is running and healthy..."

  if ! command -v docker &>/dev/null; then
    log_warning "Docker not found — skipping PostgreSQL check."
    return 0
  fi

  # Capture the list first and match it as a fixed, whole-line string: the
  # container name must not act as a regex, and no `grep -q` early exit can
  # fail a pipeline under pipefail.
  local running_names
  running_names=$(docker ps --format '{{.Names}}' 2>/dev/null) || running_names=""
  if ! grep -Fxq -- "${POSTGRES_CONTAINER}" <<<"${running_names}"; then
    log_error "PostgreSQL container '${POSTGRES_CONTAINER}' is NOT running!"
    FAILURES=$((FAILURES + 1))
    return 0
  fi

  # A failing inspect or an empty status is treated as "unknown" rather than
  # as a hard failure.
  local health
  health=$(docker inspect --format='{{.State.Health.Status}}' "${POSTGRES_CONTAINER}" 2>/dev/null) \
    || health="unknown"
  health="${health:-unknown}"

  case "${health}" in
    healthy)
      log_success "PostgreSQL container is healthy"
      ;;
    unknown)
      log_warning "PostgreSQL health status unknown (no healthcheck configured?)"
      WARNINGS=$((WARNINGS + 1))
      ;;
    *)
      log_error "PostgreSQL container health status: ${health}"
      FAILURES=$((FAILURES + 1))
      ;;
  esac
}
#######################################
# Check 5/8: verify the database is not empty by counting the tables in the
# public schema (via psql inside the PostgreSQL container).
# Globals:   POSTGRES_CONTAINER, POSTGRES_USER, POSTGRES_DB,
#            POSTGRES_PASSWORD (read),
#            FAILURES (incremented when the query fails, returns something
#            that is not a number, or reports zero tables)
# Arguments: none
# Outputs:   log lines
# Returns:   always 0
#######################################
check_db_has_data() {
  log "Check 5/8: Database contains data (COUNT > 0)..."

  if ! command -v docker &>/dev/null; then
    log_warning "Docker not found — skipping DB data check."
    return 0
  fi

  # Fixed-string, whole-line match on the captured list (no regex
  # interpretation of the name, no `grep -q` inside a pipefail pipeline).
  local running_names
  running_names=$(docker ps --format '{{.Names}}' 2>/dev/null) || running_names=""
  if ! grep -Fxq -- "${POSTGRES_CONTAINER}" <<<"${running_names}"; then
    log_warning "PostgreSQL container not running — skipping data check."
    return 0
  fi

  # Count the tables in the public schema. A failed query is reported as
  # such instead of being disguised as an empty database.
  local count
  if ! count=$(docker exec -e PGPASSWORD="${POSTGRES_PASSWORD}" "${POSTGRES_CONTAINER}" \
    psql -U "${POSTGRES_USER}" -d "${POSTGRES_DB}" -t -A \
    -c "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';" \
    2>/dev/null); then
    log_error "Could not query database '${POSTGRES_DB}' (psql failed)."
    FAILURES=$((FAILURES + 1))
    return 0
  fi

  # Strip all whitespace, then insist on a plain number so that unexpected
  # output can never be reported as a healthy table count.
  count="${count//[[:space:]]/}"
  if ! [[ "${count}" =~ ^[0-9]+$ ]]; then
    log_error "Unexpected table count from database: '${count}'"
    FAILURES=$((FAILURES + 1))
    return 0
  fi

  if [ "${count}" -eq 0 ]; then
    log_error "Database appears to be empty (no public tables found)!"
    FAILURES=$((FAILURES + 1))
  else
    log_success "Database has ${count} tables in public schema"
  fi
}
#######################################
# Check 6/8: verify the NAS is mounted at NAS_MOUNT and accepts writes.
# When it is not mounted, one remount is attempted before giving up.
# Globals:   NAS_MOUNT, TIMESTAMP (read),
#            FAILURES (incremented when the remount fails or the write
#            probe fails),
#            WARNINGS (incremented when the remount was needed and succeeded)
# Arguments: none
# Outputs:   log lines
#######################################
check_nas_mounted() {
  log "Check 6/8: NAS is mounted and writable at ${NAS_MOUNT}..."

  if ! mountpoint -q "${NAS_MOUNT}" 2>/dev/null; then
    log_error "NAS is NOT mounted at ${NAS_MOUNT}!"
    log "Attempting emergency remount..."
    # The outcome of mount itself is ignored; the mountpoint re-test decides.
    mount "${NAS_MOUNT}" 2>/dev/null || true

    if mountpoint -q "${NAS_MOUNT}" 2>/dev/null; then
      log_warning "NAS remounted successfully (was temporarily unmounted)."
      WARNINGS=$((WARNINGS + 1))
    else
      log_error "Emergency remount FAILED. NAS is unavailable."
      FAILURES=$((FAILURES + 1))
      return
    fi
  fi

  # Write probe: create a marker file on the share and remove it again.
  local probe="${NAS_MOUNT}/.write_test_${TIMESTAMP}"
  if ! { touch "${probe}" 2>/dev/null && rm -f "${probe}" 2>/dev/null; }; then
    log_error "NAS is mounted but NOT writable!"
    FAILURES=$((FAILURES + 1))
    return
  fi

  log_success "NAS is mounted and writable"
}
#######################################
# Print the usage percentage (digits only) of the filesystem holding a path.
# Uses `df -P` so each filesystem is reported on a single line even when the
# device name is long (e.g. an NFS export).
# Arguments: $1 - path on the filesystem to inspect
# Outputs:   stdout - the percentage without the '%' sign
# Returns:   0 on success, 1 when df fails or yields no numeric value
#######################################
_disk_usage_percent() {
  local df_out pct
  df_out=$(df -P "$1" 2>/dev/null) || return 1
  pct=$(awk 'NR==2 {gsub(/%/, "", $5); print $5}' <<<"${df_out}")
  [[ "${pct}" =~ ^[0-9]+$ ]] || return 1
  printf '%s\n' "${pct}"
}

#######################################
# Check 7/8: verify disk usage is below MAX_DISK_USAGE_PERCENT on the NAS
# (when mounted) and on the local disk (/opt, falling back to /).
# Globals:   NAS_MOUNT, MAX_DISK_USAGE_PERCENT (read),
#            FAILURES (incremented when NAS usage reaches the threshold),
#            WARNINGS (incremented when local usage reaches the threshold,
#            or when a usage figure cannot be determined)
# Arguments: none
# Outputs:   log lines
# Returns:   always 0
#######################################
check_disk_space() {
  log "Check 7/8: Disk usage < ${MAX_DISK_USAGE_PERCENT}%..."

  # Check NAS disk if mounted
  if mountpoint -q "${NAS_MOUNT}" 2>/dev/null; then
    local nas_usage
    nas_usage=$(_disk_usage_percent "${NAS_MOUNT}") || nas_usage=""
    if [ -z "${nas_usage}" ]; then
      # An unreadable figure must never be reported as a passed check.
      log_warning "Could not determine NAS disk usage for ${NAS_MOUNT}"
      WARNINGS=$((WARNINGS + 1))
    elif [ "${nas_usage}" -ge "${MAX_DISK_USAGE_PERCENT}" ]; then
      log_error "NAS disk usage is ${nas_usage}% (threshold: ${MAX_DISK_USAGE_PERCENT}%)"
      FAILURES=$((FAILURES + 1))
    else
      log_success "NAS disk usage: ${nas_usage}%"
    fi
  fi

  # Check local disk
  local local_usage
  local_usage=$(_disk_usage_percent /opt) \
    || local_usage=$(_disk_usage_percent /) \
    || local_usage=""
  if [ -z "${local_usage}" ]; then
    log_warning "Could not determine local disk usage"
    WARNINGS=$((WARNINGS + 1))
  elif [ "${local_usage}" -ge "${MAX_DISK_USAGE_PERCENT}" ]; then
    # A full local disk is logged as an error but only counted as a warning.
    log_error "Local disk usage is ${local_usage}% (threshold: ${MAX_DISK_USAGE_PERCENT}%)"
    WARNINGS=$((WARNINGS + 1))
  else
    log_success "Local disk usage: ${local_usage}%"
  fi
}
#######################################
# Check 8/8: request APP_HEALTH_URL and expect HTTP 200.
# Globals:   APP_HEALTH_URL (read),
#            FAILURES (incremented when the app is unreachable or answers
#            with anything other than 200)
# Arguments: none
# Outputs:   log lines
# Returns:   always 0
#######################################
check_app_health() {
  log "Check 8/8: App HTTP health check at ${APP_HEALTH_URL}..."

  if ! command -v curl &>/dev/null; then
    log_warning "curl not found — skipping HTTP health check."
    return 0
  fi

  # On a connection failure curl itself already prints "000" through -w and
  # then exits non-zero. Appending a second "000" in that case would produce
  # "000000" and skip the "unreachable" branch, so the exit status is only
  # neutralised here and the captured output is normalised afterwards.
  local http_code
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout 5 \
    --max-time 10 \
    "${APP_HEALTH_URL}" 2>/dev/null) || true
  [[ "${http_code}" =~ ^[0-9]{3}$ ]] || http_code="000"

  case "${http_code}" in
    200)
      log_success "App health check passed (HTTP ${http_code})"
      ;;
    000)
      log_error "App is unreachable (connection timeout)"
      FAILURES=$((FAILURES + 1))
      ;;
    *)
      log_error "App health check returned HTTP ${http_code}"
      FAILURES=$((FAILURES + 1))
      ;;
  esac
}
# ==============================================================================
# 4. MAIN
# ==============================================================================
#######################################
# Run all eight checks, print a summary and send the Telegram report.
# Globals:   FAILURES, WARNINGS (read after the checks; the check functions
#            update them)
# Arguments: none used
# Outputs:   banner, check logs and summary on stdout
# Returns:   exits with status 1 when at least one check failed
#######################################
main() {
  echo ""
  echo "========================================================="
  echo " Wordly.art — Backup Verification"
  echo " $(date '+%Y-%m-%d %H:%M:%S')"
  echo "========================================================="
  echo ""

  # Run all checks.
  #
  # check_recent_snapshot hands the snapshot path back on stdout. Capturing
  # it with $(...) would run the check in a subshell, so its FAILURES
  # increments would be lost and any log lines it prints to stdout would end
  # up inside the captured "path". It is therefore run in the current shell
  # with stdout redirected to a temp file; the last line is taken as the path
  # if it names an existing file, and everything else is replayed as log output.
  local newest_snapshot="" snapshot_out last_line
  if snapshot_out=$(mktemp 2>/dev/null); then
    check_recent_snapshot >"${snapshot_out}"
    last_line=$(tail -n 1 "${snapshot_out}")
    if [ -n "${last_line}" ] && [ -f "${last_line}" ]; then
      newest_snapshot="${last_line}"
      sed '$d' "${snapshot_out}"
    else
      cat "${snapshot_out}"
    fi
    rm -f "${snapshot_out}"
  else
    log_error "Could not create a temporary file; snapshot checks cannot run."
    FAILURES=$((FAILURES + 1))
  fi

  check_snapshot_size "${newest_snapshot}"
  check_snapshot_integrity "${newest_snapshot}"
  check_postgres_running
  check_db_has_data
  check_nas_mounted
  check_disk_space
  check_app_health

  echo ""
  echo "========================================================="
  echo " Results: ${FAILURES} failure(s), ${WARNINGS} warning(s)"
  echo "========================================================="
  echo ""

  # Send Telegram report
  local msg
  if [ "${FAILURES}" -gt 0 ]; then
    msg="🚨 *Wordly.art — Backup Verification FAILED*
Date: $(date '+%Y-%m-%d %H:%M:%S')
Failures: ${FAILURES}
Warnings: ${WARNINGS}
Check logs on 192.168.1.151:
\`cat /var/log/wordly-verify.log\`"
    send_telegram "${msg}"
    log_error "Verification FAILED with ${FAILURES} error(s). Telegram alert sent."
    exit 1
  elif [ "${WARNINGS}" -gt 0 ]; then
    msg="⚠️ *Wordly.art — Backup Verification passed with warnings*
Date: $(date '+%Y-%m-%d %H:%M:%S')
Failures: 0
Warnings: ${WARNINGS}"
    send_telegram "${msg}"
    log_warning "Verification passed with ${WARNINGS} warning(s)."
  else
    # Only send the success alert for runs that start during the 06:00 hour,
    # i.e. once per day with the documented cron schedule.
    local hour
    hour=$(date +%H)
    if [ "${hour}" = "06" ]; then
      msg="✅ *Wordly.art — Daily backup check OK*
Date: $(date '+%Y-%m-%d %H:%M:%S')
All 8 checks passed."
      send_telegram "${msg}"
    fi
    log_success "All checks passed."
  fi
}
# Script entry point: run the verification, passing the command-line
# arguments through to main (which does not read them).
main "$@"