All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 2m35s
373 lines
12 KiB
Bash
373 lines
12 KiB
Bash
#!/bin/bash
# ==============================================================================
# Wordly.art - Backup Verification & Telegram Alerts
# ==============================================================================
# Runs after every backup to validate integrity and alert on failure.
# CRON: 30 */6 * * * (30 minutes after each backup)
#
# Checks:
#   - Recent snapshot exists (< 8h)
#   - Snapshot size > 1MB (not empty)
#   - Snapshot gzip integrity
#   - PostgreSQL container is running and healthy
#   - DB is not empty (public schema contains at least one table)
#   - NAS is mounted and writable
#   - Disk usage < 85%
#   - App HTTP health check
# ==============================================================================

# Strict mode: abort on unhandled errors, unset variables and failing pipelines.
set -euo pipefail

# Absolute directory of this script, and the project root one level above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Run timestamp (YYYYmmdd_HHMMSS); used as the log prefix and in temp file names.
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")

# ANSI colour escape sequences, stored unexpanded (backslash form).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
|
|
|
|
# Logging helpers. Each prints one line to stdout prefixed with
# "[Verify <TIMESTAMP>]". Only the colour variables are expanded as escape
# sequences (%b); the message itself is printed verbatim (%s), so backslashes
# in file names or command output are never interpreted.
#   $1 - message text
log() { printf '[Verify %s] %s\n' "${TIMESTAMP}" "${1-}"; }
log_success() { printf '[Verify %s] %b✅ %s%b\n' "${TIMESTAMP}" "${GREEN}" "${1-}" "${NC}"; }
log_warning() { printf '[Verify %s] %b⚠️ WARNING: %s%b\n' "${TIMESTAMP}" "${YELLOW}" "${1-}" "${NC}"; }
log_error() { printf '[Verify %s] %b❌ ERROR: %s%b\n' "${TIMESTAMP}" "${RED}" "${1-}" "${NC}"; }
|
|
|
|
# ==============================================================================
|
|
# 1. LOAD CONFIGURATION
|
|
# ==============================================================================
|
|
ENV_FILE="${PROJECT_ROOT}/.env"
if [[ -f "${ENV_FILE}" ]]; then
    # set -a exports every variable the file defines; set +u tolerates
    # references to unset variables inside the file while it is sourced.
    set -a
    set +u
    # shellcheck disable=SC1090
    source "${ENV_FILE}"
    set -u
    set +a
fi

# Directories
NAS_MOUNT="${NAS_MOUNT:-/mnt/nas-wordly}"
LOCAL_BACKUP_DIR="${BACKUP_DIR:-/opt/wordly/backups}"

# PostgreSQL
POSTGRES_CONTAINER="${POSTGRES_CONTAINER:-wordly-postgres}"
POSTGRES_USER="${POSTGRES_USER:-translate}"
POSTGRES_DB="${POSTGRES_DB:-translate_db}"
POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-}"

# App health check
APP_HEALTH_URL="${APP_HEALTH_URL:-http://localhost:8001/health}"

# Thresholds (overridable from the environment or .env; defaults unchanged)
MAX_SNAPSHOT_AGE_HOURS="${MAX_SNAPSHOT_AGE_HOURS:-8}"
MIN_SNAPSHOT_SIZE_MB="${MIN_SNAPSHOT_SIZE_MB:-1}"
MAX_DISK_USAGE_PERCENT="${MAX_DISK_USAGE_PERCENT:-85}"

# Telegram (alerts are skipped with a warning when either value is empty)
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"

# Track failures
FAILURES=0
WARNINGS=0
|
|
|
|
# ==============================================================================
|
|
# 2. TELEGRAM
|
|
# ==============================================================================
|
|
#######################################
# Send a Markdown-formatted message through the Telegram Bot API.
# Best effort: never fails the caller; problems are reported via log_warning.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text
# Returns:   0 always
#######################################
send_telegram() {
    local message="$1"

    if [[ -z "${TELEGRAM_BOT_TOKEN}" || -z "${TELEGRAM_CHAT_ID}" ]]; then
        log_warning "Telegram not configured (TELEGRAM_BOT_TOKEN or TELEGRAM_CHAT_ID missing)"
        return 0
    fi

    # --data-urlencode keeps '&', '%', '+' and newlines in the message intact
    # (plain -d would send them unescaped and corrupt the form body).
    # -f turns an HTTP error from the API into a non-zero exit status, and the
    # timeouts keep a hung API call from blocking the cron run.
    if ! curl -s -f -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
        --connect-timeout 5 \
        --max-time 15 \
        --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
        --data-urlencode "text=${message}" \
        --data-urlencode "parse_mode=Markdown" \
        >/dev/null 2>&1; then
        log_warning "Telegram alert could not be delivered"
    fi
    return 0
}
|
|
|
|
# ==============================================================================
|
|
# 3. CHECK FUNCTIONS
|
|
# ==============================================================================
|
|
|
|
#######################################
# Check 1/8: locate the newest *.gz snapshot and verify it is recent enough.
# Searches ${LOCAL_BACKUP_DIR}/daily and, when the NAS is mounted,
# ${NAS_MOUNT}/snapshots, and picks the newest file across BOTH directories.
# Globals:   LOCAL_BACKUP_DIR, NAS_MOUNT, MAX_SNAPSHOT_AGE_HOURS (read)
#            FAILURES (incremented on failure)
# Outputs:   stdout - path of the newest snapshot only (nothing if none found)
#            stderr - all log lines, so a caller capturing stdout gets a clean
#                     path
# Returns:   0
# Note:      when called inside $(...), the FAILURES increment happens in a
#            subshell and is lost; call it in the current shell (redirecting
#            stdout to a file) if the counter must be preserved.
#######################################
check_recent_snapshot() {
    log "Check 1/8: Recent snapshot exists (< ${MAX_SNAPSHOT_AGE_HOURS}h)..." >&2

    # Look in both NAS and local backup directories
    local -a search_dirs=("${LOCAL_BACKUP_DIR}/daily")
    if mountpoint -q "${NAS_MOUNT}" 2>/dev/null; then
        search_dirs+=("${NAS_MOUNT}/snapshots")
    fi

    local newest_snapshot="" dir candidate
    for dir in "${search_dirs[@]}"; do
        [[ -d "${dir}" ]] || continue
        for candidate in "${dir}"/*.gz; do
            # An unmatched glob stays literal; -f filters that case out.
            [[ -f "${candidate}" ]] || continue
            if [[ -z "${newest_snapshot}" || "${candidate}" -nt "${newest_snapshot}" ]]; then
                newest_snapshot="${candidate}"
            fi
        done
    done

    if [[ -z "${newest_snapshot}" ]]; then
        log_error "No snapshot found in backup directories!" >&2
        FAILURES=$((FAILURES + 1))
        return 0
    fi

    # Check age (GNU stat first, BSD stat as fallback)
    local snapshot_time now age_hours
    if ! snapshot_time=$(stat -c %Y "${newest_snapshot}" 2>/dev/null \
        || stat -f %m "${newest_snapshot}" 2>/dev/null); then
        log_error "Cannot read modification time of snapshot: $(basename "${newest_snapshot}")" >&2
        FAILURES=$((FAILURES + 1))
        return 0
    fi
    now=$(date +%s)
    age_hours=$(((now - snapshot_time) / 3600))

    if [ "${age_hours}" -ge "${MAX_SNAPSHOT_AGE_HOURS}" ]; then
        log_error "Newest snapshot is ${age_hours}h old (max: ${MAX_SNAPSHOT_AGE_HOURS}h): $(basename "${newest_snapshot}")" >&2
        FAILURES=$((FAILURES + 1))
    else
        log_success "Snapshot found: $(basename "${newest_snapshot}") (${age_hours}h old)" >&2
    fi

    # The path is still reported for a stale snapshot so that size and
    # integrity checks can run on it.
    printf '%s\n' "${newest_snapshot}"
}
|
|
|
|
#######################################
# Check 2/8: verify the snapshot is at least MIN_SNAPSHOT_SIZE_MB MiB.
# Globals:   MIN_SNAPSHOT_SIZE_MB (read); FAILURES (incremented on failure)
# Arguments: $1 - snapshot path (may be empty)
# Outputs:   log lines on stdout
# Returns:   0
#######################################
check_snapshot_size() {
    local snapshot_path="${1-}"
    log "Check 2/8: Snapshot size > ${MIN_SNAPSHOT_SIZE_MB}MB..."

    if [[ -z "${snapshot_path}" || ! -f "${snapshot_path}" ]]; then
        log_warning "No snapshot to size-check."
        return 0
    fi

    # GNU stat first, BSD stat as fallback.
    local size_bytes
    if ! size_bytes=$(stat -c %s "${snapshot_path}" 2>/dev/null \
        || stat -f %z "${snapshot_path}" 2>/dev/null); then
        log_error "Cannot read size of snapshot: $(basename "${snapshot_path}")"
        FAILURES=$((FAILURES + 1))
        return 0
    fi

    local min_bytes=$((MIN_SNAPSHOT_SIZE_MB * 1024 * 1024))

    # numfmt is GNU-only; fall back to a plain byte count where it is missing.
    local human_size
    human_size=$(numfmt --to=iec "${size_bytes}" 2>/dev/null) || human_size="${size_bytes} bytes"

    if [ "${size_bytes}" -lt "${min_bytes}" ]; then
        log_error "Snapshot size is ${human_size} which is below minimum ${MIN_SNAPSHOT_SIZE_MB}MB — likely empty dump!"
        FAILURES=$((FAILURES + 1))
    else
        log_success "Snapshot size: ${human_size}"
    fi
}
|
|
|
|
#######################################
# Check 3/8: verify the snapshot passes `gzip -t`.
# Globals:   FAILURES (incremented when the archive is corrupted)
# Arguments: $1 - snapshot path (may be empty)
# Outputs:   log lines on stdout
# Returns:   0
#######################################
check_snapshot_integrity() {
    local snapshot_path="${1-}"
    log "Check 3/8: Snapshot gzip integrity..."

    if [[ -z "${snapshot_path}" || ! -f "${snapshot_path}" ]]; then
        log_warning "No snapshot to integrity-check."
        return 0
    fi

    # gzip -t decompresses to nowhere and only reports whether the stream is valid.
    if gzip -t -- "${snapshot_path}" 2>/dev/null; then
        log_success "Snapshot gzip integrity OK"
    else
        log_error "Snapshot is CORRUPTED: $(basename "${snapshot_path}")"
        FAILURES=$((FAILURES + 1))
    fi
}
|
|
|
|
#######################################
# Check 4/8: verify the PostgreSQL container is running and healthy.
# Globals:   POSTGRES_CONTAINER (read)
#            FAILURES (incremented when not running or unhealthy)
#            WARNINGS (incremented when the health status is unknown)
# Outputs:   log lines on stdout
# Returns:   0
#######################################
check_postgres_running() {
    log "Check 4/8: PostgreSQL container is running and healthy..."

    if ! command -v docker &>/dev/null; then
        log_warning "Docker not found — skipping PostgreSQL check."
        return 0
    fi

    # Capture the list first instead of piping into `grep -q`: under pipefail
    # an early grep exit can kill `docker ps` with SIGPIPE and make a running
    # container look stopped. -F -x matches the name literally and exactly
    # (the name is not a regex).
    local running_names
    running_names=$(docker ps --format '{{.Names}}' 2>/dev/null || true)
    if ! grep -Fxq -- "${POSTGRES_CONTAINER}" <<<"${running_names}"; then
        log_error "PostgreSQL container '${POSTGRES_CONTAINER}' is NOT running!"
        FAILURES=$((FAILURES + 1))
        return 0
    fi

    # Any docker inspect failure is reported as "unknown" rather than as an error.
    local health
    health=$(docker inspect --format='{{.State.Health.Status}}' "${POSTGRES_CONTAINER}" 2>/dev/null || echo "unknown")
    case "${health}" in
        healthy)
            log_success "PostgreSQL container is healthy"
            ;;
        unknown)
            log_warning "PostgreSQL health status unknown (no healthcheck configured?)"
            WARNINGS=$((WARNINGS + 1))
            ;;
        *)
            log_error "PostgreSQL container health status: ${health}"
            FAILURES=$((FAILURES + 1))
            ;;
    esac
}
|
|
|
|
#######################################
# Check 5/8: verify the database is not empty by counting the tables in the
# public schema (it does not count rows).
# Globals:   POSTGRES_CONTAINER, POSTGRES_USER, POSTGRES_DB,
#            POSTGRES_PASSWORD (read)
#            FAILURES (incremented when the query fails or finds no tables)
# Outputs:   log lines on stdout
# Returns:   0
#######################################
check_db_has_data() {
    log "Check 5/8: Database contains data (COUNT > 0)..."

    if ! command -v docker &>/dev/null; then
        log_warning "Docker not found — skipping DB data check."
        return 0
    fi

    # Literal, exact name match without a pipe into `grep -q` (which can
    # SIGPIPE `docker ps` and fail the pipeline under pipefail).
    local running_names
    running_names=$(docker ps --format '{{.Names}}' 2>/dev/null || true)
    if ! grep -Fxq -- "${POSTGRES_CONTAINER}" <<<"${running_names}"; then
        log_warning "PostgreSQL container not running — skipping data check."
        return 0
    fi

    # Count tables in the public schema. A failed query is reported as such
    # instead of being mistaken for an empty database.
    local count
    if ! count=$(docker exec -e PGPASSWORD="${POSTGRES_PASSWORD}" "${POSTGRES_CONTAINER}" \
        psql -U "${POSTGRES_USER}" -d "${POSTGRES_DB}" -t -A \
        -c "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';" \
        2>/dev/null); then
        log_error "Could not query database '${POSTGRES_DB}' — unable to verify its contents!"
        FAILURES=$((FAILURES + 1))
        return 0
    fi

    # Strip all whitespace psql may emit around the number.
    count=${count//[[:space:]]/}

    # Empty, non-numeric or zero output all mean no tables were confirmed.
    if [[ ! "${count}" =~ ^[0-9]+$ || "${count}" =~ ^0+$ ]]; then
        log_error "Database appears to be empty (no public tables found)!"
        FAILURES=$((FAILURES + 1))
    else
        log_success "Database has ${count} tables in public schema"
    fi
}
|
|
|
|
#######################################
# Check 6/8: verify the NAS is mounted at NAS_MOUNT and writable.
# If it is not mounted, one remount is attempted with `mount <mountpoint>`.
# Globals:   NAS_MOUNT, TIMESTAMP (read)
#            FAILURES (incremented when unavailable or not writable)
#            WARNINGS (incremented when the remount was needed and worked)
# Outputs:   log lines on stdout
# Returns:   0
#######################################
check_nas_mounted() {
    log "Check 6/8: NAS is mounted and writable at ${NAS_MOUNT}..."

    if ! mountpoint -q "${NAS_MOUNT}" 2>/dev/null; then
        log_error "NAS is NOT mounted at ${NAS_MOUNT}!"
        log "Attempting emergency remount..."
        # Best effort: the result is judged by re-running mountpoint below.
        mount "${NAS_MOUNT}" 2>/dev/null || true

        if ! mountpoint -q "${NAS_MOUNT}" 2>/dev/null; then
            log_error "Emergency remount FAILED. NAS is unavailable."
            FAILURES=$((FAILURES + 1))
            return 0
        fi
        log_warning "NAS remounted successfully (was temporarily unmounted)."
        WARNINGS=$((WARNINGS + 1))
    fi

    # Test write access by creating and removing a per-run marker file.
    local test_file="${NAS_MOUNT}/.write_test_${TIMESTAMP}"
    if touch "${test_file}" 2>/dev/null && rm -f "${test_file}" 2>/dev/null; then
        log_success "NAS is mounted and writable"
    else
        log_error "NAS is mounted but NOT writable!"
        FAILURES=$((FAILURES + 1))
    fi
}
|
|
|
|
# Print the used-space percentage (digits only) of the filesystem holding $1.
# Uses `df -P` so each filesystem is reported on exactly one line, even when
# the device name is long. Returns non-zero if df fails.
_disk_usage_percent() {
    local df_out
    df_out=$(df -P "$1" 2>/dev/null) || return 1
    awk 'NR==2 {gsub(/%/,""); print $5}' <<<"${df_out}"
}

#######################################
# Check 7/8: verify NAS and local disk usage are below MAX_DISK_USAGE_PERCENT.
# Globals:   NAS_MOUNT, MAX_DISK_USAGE_PERCENT (read)
#            FAILURES (incremented when the NAS is over the threshold)
#            WARNINGS (incremented when the local disk is over the threshold
#                      or a usage figure cannot be determined)
# Outputs:   log lines on stdout
# Returns:   0
#######################################
check_disk_space() {
    log "Check 7/8: Disk usage < ${MAX_DISK_USAGE_PERCENT}%..."

    # Check NAS disk if mounted
    if mountpoint -q "${NAS_MOUNT}" 2>/dev/null; then
        local nas_usage
        nas_usage=$(_disk_usage_percent "${NAS_MOUNT}") || nas_usage=""
        if [[ ! "${nas_usage}" =~ ^[0-9]+$ ]]; then
            log_warning "Could not determine NAS disk usage"
            WARNINGS=$((WARNINGS + 1))
        elif [ "${nas_usage}" -ge "${MAX_DISK_USAGE_PERCENT}" ]; then
            log_error "NAS disk usage is ${nas_usage}% (threshold: ${MAX_DISK_USAGE_PERCENT}%)"
            FAILURES=$((FAILURES + 1))
        else
            log_success "NAS disk usage: ${nas_usage}%"
        fi
    fi

    # Check local disk (/opt, falling back to / when /opt cannot be queried)
    local local_usage
    local_usage=$(_disk_usage_percent /opt || _disk_usage_percent /) || local_usage=""
    if [[ ! "${local_usage}" =~ ^[0-9]+$ ]]; then
        log_warning "Could not determine local disk usage"
        WARNINGS=$((WARNINGS + 1))
    elif [ "${local_usage}" -ge "${MAX_DISK_USAGE_PERCENT}" ]; then
        # NOTE(review): logged as an error but counted as a warning, unlike the
        # NAS branch. Kept as-is; confirm which severity is intended.
        log_error "Local disk usage is ${local_usage}% (threshold: ${MAX_DISK_USAGE_PERCENT}%)"
        WARNINGS=$((WARNINGS + 1))
    else
        log_success "Local disk usage: ${local_usage}%"
    fi
}
|
|
|
|
#######################################
# Check 8/8: request APP_HEALTH_URL and require HTTP 200.
# Globals:   APP_HEALTH_URL (read)
#            FAILURES (incremented when unreachable or not 200)
# Outputs:   log lines on stdout
# Returns:   0
#######################################
check_app_health() {
    log "Check 8/8: App HTTP health check at ${APP_HEALTH_URL}..."

    if ! command -v curl &>/dev/null; then
        log_warning "curl not found — skipping HTTP health check."
        return 0
    fi

    # When the connection fails curl still prints "000" via -w and exits
    # non-zero. Appending a second "000" on failure would yield "000000" and
    # skip the "unreachable" branch, so the exit status is ignored and only an
    # empty result is defaulted.
    local http_code
    http_code=$(curl -s -o /dev/null -w "%{http_code}" \
        --connect-timeout 5 \
        --max-time 10 \
        "${APP_HEALTH_URL}" 2>/dev/null) || true
    http_code="${http_code:-000}"

    case "${http_code}" in
        200)
            log_success "App health check passed (HTTP ${http_code})"
            ;;
        000)
            log_error "App is unreachable (connection timeout)"
            FAILURES=$((FAILURES + 1))
            ;;
        *)
            log_error "App health check returned HTTP ${http_code}"
            FAILURES=$((FAILURES + 1))
            ;;
    esac
}
|
|
|
|
# ==============================================================================
|
|
# 4. MAIN
|
|
# ==============================================================================
|
|
#######################################
# Run all eight checks, print a summary and send the Telegram report.
# Globals:   FAILURES, WARNINGS (read; updated by the check functions)
# Outputs:   banner, check logs and summary on stdout
# Returns:   exits 1 when any check failed; otherwise returns 0
#######################################
main() {
    echo ""
    echo "========================================================="
    echo " Wordly.art — Backup Verification"
    echo " $(date '+%Y-%m-%d %H:%M:%S')"
    echo "========================================================="
    echo ""

    # Run check 1 in the CURRENT shell with stdout sent to a temp file.
    # Capturing it with $(...) would run it in a subshell, losing its FAILURES
    # increment, and would mix any log lines it prints on stdout into the path.
    local snapshot_out newest_snapshot
    snapshot_out=$(mktemp)
    check_recent_snapshot >"${snapshot_out}"

    # The snapshot path, when present, is the last line; anything before it is
    # log output and is replayed to stdout.
    newest_snapshot=$(tail -n 1 "${snapshot_out}")
    if [[ -n "${newest_snapshot}" && -f "${newest_snapshot}" ]]; then
        sed '$d' "${snapshot_out}"
    else
        cat "${snapshot_out}"
        newest_snapshot=""
    fi
    rm -f -- "${snapshot_out}"

    check_snapshot_size "${newest_snapshot}"
    check_snapshot_integrity "${newest_snapshot}"
    check_postgres_running
    check_db_has_data
    check_nas_mounted
    check_disk_space
    check_app_health

    echo ""
    echo "========================================================="
    echo " Results: ${FAILURES} failure(s), ${WARNINGS} warning(s)"
    echo "========================================================="
    echo ""

    # Send Telegram report
    local msg now
    now=$(date '+%Y-%m-%d %H:%M:%S')

    if [ "${FAILURES}" -gt 0 ]; then
        printf -v msg '%s\nDate: %s\nFailures: %s\nWarnings: %s\n\nCheck logs on 192.168.1.151:\n`cat /var/log/wordly-verify.log`' \
            "🚨 *Wordly.art — Backup Verification FAILED*" \
            "${now}" "${FAILURES}" "${WARNINGS}"
        send_telegram "${msg}"
        log_error "Verification FAILED with ${FAILURES} error(s). Telegram alert sent."
        exit 1
    elif [ "${WARNINGS}" -gt 0 ]; then
        printf -v msg '%s\nDate: %s\nFailures: 0\nWarnings: %s' \
            "⚠️ *Wordly.art — Backup Verification passed with warnings*" \
            "${now}" "${WARNINGS}"
        send_telegram "${msg}"
        log_warning "Verification passed with ${WARNINGS} warning(s)."
    else
        # Only send success alert once per day (at 06:30)
        local hour
        hour=$(date +%H)
        if [ "${hour}" = "06" ]; then
            printf -v msg '%s\nDate: %s\nAll 8 checks passed.' \
                "✅ *Wordly.art — Daily backup check OK*" \
                "${now}"
            send_telegram "${msg}"
        fi
        log_success "All checks passed."
    fi
}
|
|
|
|
main "$@"
|