#!/bin/bash
# ==============================================================================
# Wordly.art - Backup Verification & Telegram Alerts
# ==============================================================================
# Runs after every backup to validate integrity and alert on failure.
# CRON: 30 */6 * * * (30 minutes after each backup)
#
# Checks:
#   1. Recent snapshot exists (< 8h)
#   2. Snapshot size > 1MB (not empty)
#   3. Snapshot gzip integrity
#   4. PostgreSQL container is running and healthy
#   5. DB contains data (number of public tables > 0)
#   6. NAS is mounted and writable
#   7. Disk usage < 85%
#   8. App HTTP health check
#
# Configuration is read from "<project root>/.env" when that file exists
# (NAS_MOUNT, BACKUP_DIR, POSTGRES_*, APP_HEALTH_URL, TELEGRAM_BOT_TOKEN,
# TELEGRAM_CHAT_ID); every setting has a default.
#
# Exit status: 1 when at least one check failed, 0 otherwise.
# ==============================================================================

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")

# Real escape characters (ANSI-C quoting) so the log helpers can use a plain
# '%s' format and never interpret backslashes that appear inside a message.
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
NC=$'\033[0m'

# Log helpers. All of them write one line to stdout.
log()         { printf '%s\n' "[Verify ${TIMESTAMP}] $1"; }
log_success() { printf '%s\n' "[Verify ${TIMESTAMP}] ${GREEN}✅ $1${NC}"; }
log_warning() { printf '%s\n' "[Verify ${TIMESTAMP}] ${YELLOW}⚠️ WARNING: $1${NC}"; }
log_error()   { printf '%s\n' "[Verify ${TIMESTAMP}] ${RED}❌ ERROR: $1${NC}"; }

# ==============================================================================
# 1. LOAD CONFIGURATION
# ==============================================================================

ENV_FILE="${PROJECT_ROOT}/.env"
if [[ -f "${ENV_FILE}" ]]; then
  # Export everything the file defines; tolerate references to unset
  # variables inside the file itself.
  set -a
  set +u
  # shellcheck disable=SC1090
  source "${ENV_FILE}"
  set -u
  set +a
fi

# Directories
NAS_MOUNT="${NAS_MOUNT:-/mnt/nas-wordly}"
LOCAL_BACKUP_DIR="${BACKUP_DIR:-/opt/wordly/backups}"

# PostgreSQL
POSTGRES_CONTAINER="${POSTGRES_CONTAINER:-wordly-postgres}"
POSTGRES_USER="${POSTGRES_USER:-translate}"
POSTGRES_DB="${POSTGRES_DB:-translate_db}"
POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-}"

# App health check
APP_HEALTH_URL="${APP_HEALTH_URL:-http://localhost:8001/health}"

# Thresholds
readonly MAX_SNAPSHOT_AGE_HOURS=8
readonly MIN_SNAPSHOT_SIZE_MB=1
readonly MAX_DISK_USAGE_PERCENT=85

# Telegram
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"

# Track failures. The check functions mutate these, so they must be called
# directly and never inside $(...) (a subshell would discard the updates).
FAILURES=0
WARNINGS=0

# Path of the snapshot selected by check_recent_snapshot ("" when none).
NEWEST_SNAPSHOT=""

# ==============================================================================
# 2. TELEGRAM
# ==============================================================================

#######################################
# Send a Markdown message to the configured Telegram chat (best effort).
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text
# Returns:   always 0; a delivery problem is only logged as a warning
#######################################
send_telegram() {
  local message="$1"

  if [[ -z "${TELEGRAM_BOT_TOKEN}" || -z "${TELEGRAM_CHAT_ID}" ]]; then
    log_warning "Telegram not configured (TELEGRAM_BOT_TOKEN or TELEGRAM_CHAT_ID missing)"
    return 0
  fi

  # --data-urlencode keeps '&', '+', '%' and newlines in the text intact;
  # --fail turns an HTTP error into a non-zero exit status; the timeouts keep
  # an unreachable API from hanging the cron job.
  if ! curl -s --fail -X POST \
    --connect-timeout 5 \
    --max-time 15 \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${message}" \
    --data-urlencode "parse_mode=Markdown" \
    >/dev/null 2>&1; then
    log_warning "Telegram notification could not be delivered"
  fi
  return 0
}

# ==============================================================================
# 3. HELPERS
# ==============================================================================

# Print the modification time (epoch seconds) of file $1; GNU stat, then BSD.
file_mtime() {
  stat -c %Y "$1" 2>/dev/null || stat -f %m "$1" 2>/dev/null
}

# Print the size in bytes of file $1; GNU stat, then BSD.
file_size() {
  stat -c %s "$1" 2>/dev/null || stat -f %z "$1" 2>/dev/null
}

# Print byte count $1 in human-readable form; plain bytes when numfmt is absent.
human_size() {
  if command -v numfmt >/dev/null 2>&1; then
    numfmt --to=iec "$1"
  else
    printf '%s bytes' "$1"
  fi
}

# Print the used-space percentage (digits only) of the filesystem holding $1.
# 'df -P' guarantees one output line per filesystem even for long device names.
disk_usage_percent() {
  local df_output
  df_output=$(df -P "$1" 2>/dev/null) || return 1
  awk 'NR==2 {gsub(/%/,""); print $5}' <<<"${df_output}"
}

# Print the most recently modified *.gz file found in the given directories
# (nothing when there is none). Pure function: no logging, no counters, so it
# is safe to call inside $(...).
find_newest_snapshot() {
  local dir file newest=""
  for dir in "$@"; do
    [[ -d "${dir}" ]] || continue
    for file in "${dir}"/*.gz; do
      # An unmatched glob stays literal; -f filters that out.
      [[ -f "${file}" ]] || continue
      if [[ -z "${newest}" || "${file}" -nt "${newest}" ]]; then
        newest="${file}"
      fi
    done
  done
  printf '%s' "${newest}"
}

# Succeed when a running container is named exactly POSTGRES_CONTAINER.
# The listing is captured first so that an early-exiting grep cannot make the
# pipeline fail under 'pipefail'; -xF matches the whole name literally.
postgres_container_running() {
  local names
  names=$(docker ps --format '{{.Names}}' 2>/dev/null) || return 1
  grep -qxF -- "${POSTGRES_CONTAINER}" <<<"${names}"
}

# ==============================================================================
# 4. CHECK FUNCTIONS
# ==============================================================================

#######################################
# Check 1: a snapshot younger than MAX_SNAPSHOT_AGE_HOURS exists.
# Searches the local daily directory and, when the NAS is mounted, the NAS
# snapshot directory, and selects the newest *.gz across both.
# Globals:   NEWEST_SNAPSHOT (written), FAILURES (written)
# Returns:   always 0
#######################################
check_recent_snapshot() {
  log "Check 1/8: Recent snapshot exists (< ${MAX_SNAPSHOT_AGE_HOURS}h)..."
  NEWEST_SNAPSHOT=""

  # Look in both NAS and local backup directories
  local search_dirs=("${LOCAL_BACKUP_DIR}/daily")
  if mountpoint -q "${NAS_MOUNT}" 2>/dev/null; then
    search_dirs+=("${NAS_MOUNT}/snapshots")
  fi

  NEWEST_SNAPSHOT=$(find_newest_snapshot "${search_dirs[@]}")
  if [[ -z "${NEWEST_SNAPSHOT}" ]]; then
    log_error "No snapshot found in backup directories!"
    FAILURES=$((FAILURES + 1))
    return 0
  fi

  # Check age
  local snapshot_time
  snapshot_time=$(file_mtime "${NEWEST_SNAPSHOT}") || snapshot_time=""
  if [[ ! "${snapshot_time}" =~ ^[0-9]+$ ]]; then
    log_error "Could not read modification time of snapshot: $(basename "${NEWEST_SNAPSHOT}")"
    FAILURES=$((FAILURES + 1))
    return 0
  fi

  local now age_hours
  now=$(date +%s)
  age_hours=$(( (now - snapshot_time) / 3600 ))

  if (( age_hours >= MAX_SNAPSHOT_AGE_HOURS )); then
    log_error "Newest snapshot is ${age_hours}h old (max: ${MAX_SNAPSHOT_AGE_HOURS}h): $(basename "${NEWEST_SNAPSHOT}")"
    FAILURES=$((FAILURES + 1))
  else
    log_success "Snapshot found: $(basename "${NEWEST_SNAPSHOT}") (${age_hours}h old)"
  fi
  return 0
}

#######################################
# Check 2: the snapshot is at least MIN_SNAPSHOT_SIZE_MB large.
# Arguments: $1 - snapshot path (may be empty)
# Globals:   FAILURES (written)
# Returns:   always 0
#######################################
check_snapshot_size() {
  local snapshot_path="$1"
  log "Check 2/8: Snapshot size > ${MIN_SNAPSHOT_SIZE_MB}MB..."

  if [[ -z "${snapshot_path}" || ! -f "${snapshot_path}" ]]; then
    log_warning "No snapshot to size-check."
    return 0
  fi

  local size_bytes
  size_bytes=$(file_size "${snapshot_path}") || size_bytes=""
  if [[ ! "${size_bytes}" =~ ^[0-9]+$ ]]; then
    log_error "Could not determine size of snapshot: $(basename "${snapshot_path}")"
    FAILURES=$((FAILURES + 1))
    return 0
  fi

  local min_bytes=$((MIN_SNAPSHOT_SIZE_MB * 1024 * 1024))
  if (( size_bytes < min_bytes )); then
    log_error "Snapshot size is $(human_size "${size_bytes}") which is below minimum ${MIN_SNAPSHOT_SIZE_MB}MB — likely empty dump!"
    FAILURES=$((FAILURES + 1))
  else
    log_success "Snapshot size: $(human_size "${size_bytes}")"
  fi
  return 0
}

#######################################
# Check 3: the snapshot passes 'gzip -t'.
# Arguments: $1 - snapshot path (may be empty)
# Globals:   FAILURES (written)
# Returns:   always 0
#######################################
check_snapshot_integrity() {
  local snapshot_path="$1"
  log "Check 3/8: Snapshot gzip integrity..."

  if [[ -z "${snapshot_path}" || ! -f "${snapshot_path}" ]]; then
    log_warning "No snapshot to integrity-check."
    return 0
  fi

  if gzip -t "${snapshot_path}" 2>/dev/null; then
    log_success "Snapshot gzip integrity OK"
  else
    log_error "Snapshot is CORRUPTED: $(basename "${snapshot_path}")"
    FAILURES=$((FAILURES + 1))
  fi
  return 0
}

#######################################
# Check 4: the PostgreSQL container is running and reports "healthy".
# An undeterminable health status counts as a warning, not a failure.
# Globals:   FAILURES, WARNINGS (written)
# Returns:   always 0
#######################################
check_postgres_running() {
  log "Check 4/8: PostgreSQL container is running and healthy..."

  if ! command -v docker &>/dev/null; then
    log_warning "Docker not found — skipping PostgreSQL check."
    return 0
  fi

  if ! postgres_container_running; then
    log_error "PostgreSQL container '${POSTGRES_CONTAINER}' is NOT running!"
    FAILURES=$((FAILURES + 1))
    return 0
  fi

  # docker inspect exits non-zero when the template cannot be evaluated (for
  # example no healthcheck configured). Strip stray whitespace so that case
  # reliably maps to "unknown".
  local health
  health=$(docker inspect --format='{{.State.Health.Status}}' "${POSTGRES_CONTAINER}" 2>/dev/null) || health=""
  health=${health//[[:space:]]/}
  health=${health:-unknown}

  if [[ "${health}" == "healthy" ]]; then
    log_success "PostgreSQL container is healthy"
  elif [[ "${health}" == "unknown" ]]; then
    log_warning "PostgreSQL health status unknown (no healthcheck configured?)"
    WARNINGS=$((WARNINGS + 1))
  else
    log_error "PostgreSQL container health status: ${health}"
    FAILURES=$((FAILURES + 1))
  fi
  return 0
}

#######################################
# Check 5: the database has at least one table in the public schema.
# Globals:   FAILURES (written)
# Returns:   always 0
#######################################
check_db_has_data() {
  log "Check 5/8: Database contains data (COUNT > 0)..."

  if ! command -v docker &>/dev/null; then
    log_warning "Docker not found — skipping DB data check."
    return 0
  fi

  if ! postgres_container_running; then
    log_warning "PostgreSQL container not running — skipping data check."
    return 0
  fi

  # Count the tables in the public schema (this is a table count, not a row
  # count). A failed query leaves the value empty so it can be told apart
  # from a database that really is empty.
  local count
  count=$(docker exec -e PGPASSWORD="${POSTGRES_PASSWORD}" "${POSTGRES_CONTAINER}" \
    psql -U "${POSTGRES_USER}" -d "${POSTGRES_DB}" -t -A \
    -c "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';" \
    2>/dev/null) || count=""
  count=${count//[[:space:]]/}

  if [[ ! "${count}" =~ ^[0-9]+$ ]]; then
    log_error "Could not query database '${POSTGRES_DB}' (psql failed or returned unexpected output)!"
    FAILURES=$((FAILURES + 1))
  elif (( count == 0 )); then
    log_error "Database appears to be empty (no public tables found)!"
    FAILURES=$((FAILURES + 1))
  else
    log_success "Database has ${count} tables in public schema"
  fi
  return 0
}

#######################################
# Check 6: the NAS is mounted at NAS_MOUNT and writable.
# Tries one remount when it is not mounted; a successful remount is a warning.
# Globals:   FAILURES, WARNINGS (written)
# Returns:   always 0
#######################################
check_nas_mounted() {
  log "Check 6/8: NAS is mounted and writable at ${NAS_MOUNT}..."

  if ! mountpoint -q "${NAS_MOUNT}" 2>/dev/null; then
    log_error "NAS is NOT mounted at ${NAS_MOUNT}!"
    log "Attempting emergency remount..."
    # Best effort: the outcome is judged by the mountpoint test below.
    mount "${NAS_MOUNT}" 2>/dev/null || true

    if ! mountpoint -q "${NAS_MOUNT}" 2>/dev/null; then
      log_error "Emergency remount FAILED. NAS is unavailable."
      FAILURES=$((FAILURES + 1))
      return 0
    fi
    log_warning "NAS remounted successfully (was temporarily unmounted)."
    WARNINGS=$((WARNINGS + 1))
  fi

  # Test write access
  local test_file="${NAS_MOUNT}/.write_test_${TIMESTAMP}"
  if touch "${test_file}" 2>/dev/null && rm -f "${test_file}" 2>/dev/null; then
    log_success "NAS is mounted and writable"
  else
    log_error "NAS is mounted but NOT writable!"
    FAILURES=$((FAILURES + 1))
  fi
  return 0
}

#######################################
# Check 7: disk usage is below MAX_DISK_USAGE_PERCENT.
# A full NAS counts as a failure; a full local disk counts as a warning
# (the original script made this distinction, it is kept as is).
# Globals:   FAILURES, WARNINGS (written)
# Returns:   always 0
#######################################
check_disk_space() {
  log "Check 7/8: Disk usage < ${MAX_DISK_USAGE_PERCENT}%..."

  # Check NAS disk if mounted
  if mountpoint -q "${NAS_MOUNT}" 2>/dev/null; then
    local nas_usage
    nas_usage=$(disk_usage_percent "${NAS_MOUNT}") || nas_usage=""
    if [[ ! "${nas_usage}" =~ ^[0-9]+$ ]]; then
      log_warning "Could not determine NAS disk usage"
      WARNINGS=$((WARNINGS + 1))
    elif (( nas_usage >= MAX_DISK_USAGE_PERCENT )); then
      log_error "NAS disk usage is ${nas_usage}% (threshold: ${MAX_DISK_USAGE_PERCENT}%)"
      FAILURES=$((FAILURES + 1))
    else
      log_success "NAS disk usage: ${nas_usage}%"
    fi
  fi

  # Check local disk (/opt, falling back to / when /opt cannot be queried)
  local local_usage
  local_usage=$(disk_usage_percent /opt) \
    || local_usage=$(disk_usage_percent /) \
    || local_usage=""
  if [[ ! "${local_usage}" =~ ^[0-9]+$ ]]; then
    log_warning "Could not determine local disk usage"
    WARNINGS=$((WARNINGS + 1))
  elif (( local_usage >= MAX_DISK_USAGE_PERCENT )); then
    log_error "Local disk usage is ${local_usage}% (threshold: ${MAX_DISK_USAGE_PERCENT}%)"
    WARNINGS=$((WARNINGS + 1))
  else
    log_success "Local disk usage: ${local_usage}%"
  fi
  return 0
}

#######################################
# Check 8: APP_HEALTH_URL answers with HTTP 200.
# Globals:   FAILURES (written)
# Returns:   always 0
#######################################
check_app_health() {
  log "Check 8/8: App HTTP health check at ${APP_HEALTH_URL}..."

  if ! command -v curl &>/dev/null; then
    log_warning "curl not found — skipping HTTP health check."
    return 0
  fi

  # On a connection failure curl already prints "000" through -w and exits
  # non-zero, so the exit status is ignored instead of appending a second
  # "000" (which would produce "000000" and miss the branch below).
  local http_code
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout 5 \
    --max-time 10 \
    "${APP_HEALTH_URL}" 2>/dev/null) || true
  http_code=${http_code:-000}

  if [[ "${http_code}" == "200" ]]; then
    log_success "App health check passed (HTTP ${http_code})"
  elif [[ "${http_code}" == "000" ]]; then
    log_error "App is unreachable (connection timeout)"
    FAILURES=$((FAILURES + 1))
  else
    log_error "App health check returned HTTP ${http_code}"
    FAILURES=$((FAILURES + 1))
  fi
  return 0
}

# ==============================================================================
# 5. MAIN
# ==============================================================================

#######################################
# Run all eight checks, print a summary and send the Telegram report.
# Exits with status 1 when at least one check failed.
#######################################
main() {
  echo ""
  echo "========================================================="
  echo " Wordly.art — Backup Verification"
  echo " $(date '+%Y-%m-%d %H:%M:%S')"
  echo "========================================================="
  echo ""

  # Run all checks. check_recent_snapshot is called directly (not in a
  # command substitution) so its log output is shown and its FAILURES update
  # survives; the selected path is handed over through NEWEST_SNAPSHOT.
  check_recent_snapshot
  check_snapshot_size "${NEWEST_SNAPSHOT}"
  check_snapshot_integrity "${NEWEST_SNAPSHOT}"
  check_postgres_running
  check_db_has_data
  check_nas_mounted
  check_disk_space
  check_app_health

  echo ""
  echo "========================================================="
  echo " Results: ${FAILURES} failure(s), ${WARNINGS} warning(s)"
  echo "========================================================="
  echo ""

  # Send Telegram report
  local msg
  if (( FAILURES > 0 )); then
    msg="🚨 *Wordly.art — Backup Verification FAILED*
Date: $(date '+%Y-%m-%d %H:%M:%S')
Failures: ${FAILURES}
Warnings: ${WARNINGS}

Check logs on 192.168.1.151:
\`cat /var/log/wordly-verify.log\`"
    send_telegram "${msg}"
    log_error "Verification FAILED with ${FAILURES} error(s). Telegram alert sent."
    exit 1
  elif (( WARNINGS > 0 )); then
    msg="⚠️ *Wordly.art — Backup Verification passed with warnings*
Date: $(date '+%Y-%m-%d %H:%M:%S')
Failures: 0
Warnings: ${WARNINGS}"
    send_telegram "${msg}"
    log_warning "Verification passed with ${WARNINGS} warning(s)."
  else
    # Only send success alert once per day (at 06:30)
    local hour
    hour=$(date +%H)
    if [[ "${hour}" == "06" ]]; then
      msg="✅ *Wordly.art — Daily backup check OK*
Date: $(date '+%Y-%m-%d %H:%M:%S')
All 8 checks passed."
      send_telegram "${msg}"
    fi
    log_success "All checks passed."
  fi
}

main "$@"