feat: P0 backup system (WAL+snapshot+restore+verify), monitoring stack, admin health API
This commit is contained in:
98
scripts/backup/verify-backups.sh
Executable file
98
scripts/backup/verify-backups.sh
Executable file
@@ -0,0 +1,98 @@
|
||||
#!/bin/bash
#
# verify-backups.sh — sanity-check the newest database snapshot, the live
# database, the app endpoint and disk usage; log every result and send a
# Telegram alert for each problem found.
#
# Optional environment:
#   BACKUP_DIR          backup root              (default /opt/memento/backups)
#   PG_CONTAINER        postgres container name  (default memento-postgres)
#   POSTGRES_USER       database user            (default memento)
#   POSTGRES_DB         database name            (default memento)
#   TELEGRAM_BOT_TOKEN  bot token; alerts are skipped when empty
#   TELEGRAM_CHAT_ID    chat id;   alerts are skipped when empty
set -euo pipefail

# Every setting follows the same "environment override, else default"
# pattern so the script can be pointed at a non-production location.
BACKUP_DIR="${BACKUP_DIR:-/opt/memento/backups}"
SNAPSHOT_DIR="$BACKUP_DIR/snapshots"
PG_CONTAINER="${PG_CONTAINER:-memento-postgres}"
PG_USER="${POSTGRES_USER:-memento}"
PG_DB="${POSTGRES_DB:-memento}"
LOG_FILE="$BACKUP_DIR/backup.log"

# Default to empty so the references stay valid under `set -u`.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"

#######################################
# Print a timestamped message to stdout and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in LOG_FILE
# Returns:   always 0
#######################################
log() {
  # Logging is best-effort: under `set -e -o pipefail` a tee failure (log
  # directory missing or unwritable) would otherwise abort the whole run
  # before any check or alert executes. tee still writes to stdout when it
  # cannot open the file, and its own error message stays visible on stderr.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" \
    | tee -a "$LOG_FILE" || true
}

#######################################
# Send a Telegram message; does nothing when the bot token or chat id is
# not configured.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text (sent with parse_mode=HTML)
# Returns:   always 0 — delivery is best-effort and must never abort the
#            verification run.
#######################################
send_alert() {
  if [ -z "$TELEGRAM_BOT_TOKEN" ] || [ -z "$TELEGRAM_CHAT_ID" ]; then
    return 0
  fi

  # --data-urlencode: alert texts contain '%', '&'-capable values and
  #   non-ASCII characters; plain -d posts them unencoded, which corrupts
  #   the form body (e.g. "90%!" is not a valid percent-escape).
  # --max-time: a stalled API call must not hang the job.
  # Output and failures are discarded on purpose (best-effort delivery).
  curl -s --max-time 10 -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=$1" \
    --data-urlencode "parse_mode=HTML" >/dev/null 2>&1 || true
}

# Number of failed checks; a non-zero value makes the script exit 1 at the end.
ISSUES=0

log "=== Backup verification ==="

# Find the most recently modified snapshot with a glob instead of parsing
# `ls`: with no matching file `ls` exits non-zero, and under
# `set -e -o pipefail` that kills the script inside the assignment, so the
# "no snapshots" alert below would never be sent.
LATEST_SNAPSHOT=""
for candidate in "$SNAPSHOT_DIR"/*.sql.gz; do
  # An unmatched glob stays literal; skip anything that is not a real file.
  [ -f "$candidate" ] || continue
  if [ -z "$LATEST_SNAPSHOT" ] || [ "$candidate" -nt "$LATEST_SNAPSHOT" ]; then
    LATEST_SNAPSHOT="$candidate"
  fi
done

if [ -z "$LATEST_SNAPSHOT" ]; then
  log "CRITICAL: No snapshots found!"
  send_alert "🚨 MEMENTO: No database backups found! Data at risk!"
  # Nothing else can be verified without a snapshot.
  exit 1
fi

# Freshness check: age of the newest snapshot in whole hours (integer
# division of the mtime delta), flagged when it exceeds 8.
now_epoch=$(date +%s)
snapshot_mtime=$(stat -c%Y "$LATEST_SNAPSHOT")
SNAPSHOT_AGE=$(( (now_epoch - snapshot_mtime) / 3600 ))

if [ "$SNAPSHOT_AGE" -le 8 ]; then
  log "OK: Latest snapshot is ${SNAPSHOT_AGE}h old"
else
  log "CRITICAL: Latest snapshot is ${SNAPSHOT_AGE}h old (> 8h threshold)"
  send_alert "🚨 MEMENTO: Latest DB backup is ${SNAPSHOT_AGE}h old! Backup cron may be broken!"
  ISSUES=$((ISSUES + 1))
fi

# Size check: a snapshot under 1 MiB (1048576 bytes) is treated as a
# failed dump. An unreadable file counts as size 0.
SNAPSHOT_SIZE=$(stat -c%s "$LATEST_SNAPSHOT" 2>/dev/null) || SNAPSHOT_SIZE=0
snapshot_kb=$(( SNAPSHOT_SIZE / 1024 ))

if [ "$SNAPSHOT_SIZE" -ge 1048576 ]; then
  log "OK: Snapshot size ${snapshot_kb}KB"
else
  log "CRITICAL: Snapshot is only ${snapshot_kb}KB (< 1MB)"
  send_alert "🚨 MEMENTO: DB backup is suspiciously small (${snapshot_kb}KB)! Possible failure!"
  ISSUES=$((ISSUES + 1))
fi

# Integrity check: `gzip -t` tests the compressed stream without
# extracting it; its own diagnostics are discarded.
if gzip -t "$LATEST_SNAPSHOT" 2>/dev/null; then
  log "OK: Snapshot integrity check passed"
else
  log "CRITICAL: Snapshot integrity check FAILED"
  send_alert "🚨 MEMENTO: DB backup integrity check FAILED! Backup is corrupted!"
  ISSUES=$((ISSUES + 1))
fi

#######################################
# Print the row count of one table in the live database, or the literal
# string ERROR when the query fails or returns anything but a plain
# non-negative integer.
# Globals:   PG_CONTAINER, PG_USER, PG_DB (read)
# Arguments: $1 - table name (double-quoted inside the SQL, so case is kept)
# Outputs:   the count, or "ERROR", on stdout
# Returns:   always 0
#######################################
_db_count() {
  local table=$1
  local raw
  # psql/docker diagnostics are discarded; failure is reported via "ERROR".
  raw=$(docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -t \
    -c "SELECT COUNT(*) FROM \"${table}\";" 2>/dev/null \
    | tr -d '[:space:]') || raw=""

  # Validate instead of trusting the output: an empty or garbled result
  # must not be reported as a healthy database.
  if [[ "$raw" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$raw"
  else
    printf 'ERROR\n'
  fi
}

DB_NOTES=$(_db_count "Note")
DB_USERS=$(_db_count "User")

# Only the notes count gates the check; the users count is informational.
if [ "$DB_NOTES" = "ERROR" ] || [ "$DB_NOTES" = "0" ]; then
  log "CRITICAL: DB query returned $DB_NOTES notes"
  send_alert "🚨 MEMENTO: Database appears empty or unreachable! Notes count: $DB_NOTES"
  ISSUES=$((ISSUES + 1))
else
  log "OK: Database has $DB_NOTES notes, $DB_USERS users"
fi

# App health check: HTTP status of the build-info endpoint.
#
# On a connection failure or timeout curl still prints "000" for
# %{http_code} and then exits non-zero. Appending a fallback inside the
# substitution (`curl ... || echo 000`) therefore produced "000000", which
# is neither equal to "000" nor >= 500, so a dead app was logged as
# healthy. Assign the fallback outside the substitution instead.
APP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 \
  "http://localhost:3000/api/build-info" 2>/dev/null) || APP_CODE="000"

# Anything that is not a three-digit status is treated as "no response" so
# the numeric comparison below can never see a malformed value.
if ! [[ "$APP_CODE" =~ ^[0-9]{3}$ ]]; then
  APP_CODE="000"
fi

if [ "$APP_CODE" = "000" ] || [ "$APP_CODE" -ge 500 ]; then
  log "CRITICAL: App health check failed (HTTP $APP_CODE)"
  send_alert "🚨 MEMENTO: App is down! Health check returned HTTP $APP_CODE"
  ISSUES=$((ISSUES + 1))
else
  log "OK: App is healthy (HTTP $APP_CODE)"
fi

# Disk usage check for the filesystem holding /opt/memento.
#
# `df -P` forces the POSIX one-line-per-filesystem format; plain `df -h`
# may wrap a long device name onto its own line, which shifts the columns
# awk sees on line 2. A df failure falls back to an empty value rather
# than aborting the script under `set -e -o pipefail`.
DISK_PCT=$(df -P /opt/memento 2>/dev/null \
  | awk 'NR==2 {print $5}' \
  | tr -d '%') || DISK_PCT=""

if ! [[ "$DISK_PCT" =~ ^[0-9]+$ ]]; then
  # Previously an empty value made `[ -gt ]` error out and fall through to
  # the "OK" branch; report it as a problem instead.
  log "WARNING: Could not determine disk usage for /opt/memento"
  send_alert "⚠️ MEMENTO: Could not determine disk usage for /opt/memento!"
  ISSUES=$((ISSUES + 1))
elif [ "$DISK_PCT" -gt 85 ]; then
  log "WARNING: Disk usage at ${DISK_PCT}%"
  send_alert "⚠️ MEMENTO: Disk usage at ${DISK_PCT}%! Consider cleanup."
  ISSUES=$((ISSUES + 1))
else
  log "OK: Disk usage at ${DISK_PCT}%"
fi

# Final verdict: exit 1 when any check above recorded an issue.
if [ "$ISSUES" -ne 0 ]; then
  log "=== $ISSUES issue(s) found ==="
  exit 1
fi

log "=== All checks PASSED ==="
Reference in New Issue
Block a user