feat: P0 backup system (WAL+snapshot+restore+verify), monitoring stack, admin health API
This commit is contained in:
114
scripts/backup/restore.sh
Executable file
114
scripts/backup/restore.sh
Executable file
@@ -0,0 +1,114 @@
|
||||
#!/bin/bash
#
# restore.sh - restore the Memento PostgreSQL database from a backup.
#
# Usage:
#   restore.sh [snapshot [snapshot_file.sql.gz]]   restore a gzip'd dump
#   restore.sh pitr "YYYY-MM-DD HH:MM:SS"          prepare point-in-time recovery
#
# Environment:
#   POSTGRES_USER  role used for psql/pg_dump/pg_restore (default: memento)
#   POSTGRES_DB    database that is restored             (default: memento)
#   APP_CONTAINER  app container taken offline meanwhile (default: memento-web)

# Abort on failing commands, unset variables and failures inside pipelines.
set -euo pipefail

# Root of all backup artefacts; snapshots and the shared log live beneath it.
BACKUP_DIR="/opt/memento/backups"
SNAPSHOT_DIR="$BACKUP_DIR/snapshots"

# Docker container that runs PostgreSQL; all database commands go through it.
PG_CONTAINER="memento-postgres"
PG_USER="${POSTGRES_USER:-memento}"
PG_DB="${POSTGRES_DB:-memento}"

# File that log() appends every message to.
LOG_FILE="$BACKUP_DIR/backup.log"

#######################################
# Print a timestamped message and append the same line to the backup log.
# Globals:   LOG_FILE (read) - file the line is appended to
# Arguments: $* - message text; several arguments are joined with spaces and
#                 calling it without arguments emits a timestamp-only line
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in LOG_FILE
# Returns:   exit status of the printf | tee pipeline
#######################################
log() {
  # "$*" (unlike "$1") is exempt from `set -u`, so a bare `log` cannot abort
  # the script, and printf keeps the message out of echo's option parsing.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG_FILE"
}

# Print the command-line synopsis on stderr.
usage() {
  {
    echo "Usage: $0 pitr \"2026-05-17 14:30:00\""
    echo "       $0 snapshot [snapshot_file.sql.gz]"
  } >&2
}

#######################################
# Read and validate the command line before anything destructive happens.
# Globals:   RESTORE_TYPE (written) - "snapshot" (default) or "pitr"
#            PITR_TIME    (written) - second argument, empty when absent
# Arguments: $1 - restore type (optional)
#            $2 - recovery timestamp for pitr, snapshot file for snapshot
# Exits:     1 with the usage text for an unknown type or a pitr request
#            that has no timestamp
#######################################
parse_restore_args() {
  RESTORE_TYPE="${1:-snapshot}"
  PITR_TIME="${2:-}"

  case "$RESTORE_TYPE" in
    snapshot)
      ;;
    pitr)
      if [[ -z "$PITR_TIME" ]]; then
        usage
        exit 1
      fi
      ;;
    *)
      # Reject typos here, before the later steps of the script run at all.
      printf 'ERROR: unknown restore type: %s\n' "$RESTORE_TYPE" >&2
      usage
      exit 1
      ;;
  esac
}

# The positional parameters are passed on untouched; later code still reads $2.
parse_restore_args "$@"

# Safety net: dump the current database before anything is replaced.
# The dump is pg_dump's custom format, gzip-compressed; the .sql.gz suffix
# matches the naming of the snapshot files this script restores.
log "=== EMERGENCY PRE-RESTORE: Dumping current state ==="
PRE_RESTORE="$BACKUP_DIR/pre-restore-$(date +%Y%m%d-%H%M%S).sql.gz"

# Best effort: the restore still goes ahead when the dump fails (the database
# may be the very thing that is broken), but the failure is now reported and
# pg_dump's own error text is kept in the log instead of being discarded.
# The pipeline status relies on `set -o pipefail` from the top of the script.
if docker exec "$PG_CONTAINER" pg_dump -U "$PG_USER" -d "$PG_DB" --format=custom \
    2>>"$LOG_FILE" | gzip > "$PRE_RESTORE"; then
  log "Pre-restore dump saved: $PRE_RESTORE"
else
  # A truncated or empty archive would look like a usable backup; remove it.
  rm -f -- "$PRE_RESTORE"
  log "WARNING: pre-restore dump failed; continuing WITHOUT a copy of the current state"
fi

# Application container that is taken offline while the database is replaced;
# overridable through the APP_CONTAINER environment variable.
APP_CONTAINER="${APP_CONTAINER:-memento-web}"
log "Stopping app container: $APP_CONTAINER"

# Best effort, as before, but a failure is now recorded: if the app keeps
# running it can hold connections to the database that is about to be renamed.
if ! docker stop "$APP_CONTAINER" 2>/dev/null; then
  log "WARNING: could not stop $APP_CONTAINER (container missing or Docker unavailable?)"
fi

# Run a single SQL statement with psql inside the PostgreSQL container.
#   $1 - database to connect to
#   $2 - SQL text
pg_sql() {
  docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$1" -c "$2"
}

# Give up before the live database has been replaced: log the reason, remind
# the operator that the app container is still down, and exit non-zero.
#   $1 - error message
fail_before_swap() {
  log "ERROR: $1"
  log "Live database was not replaced. App container $APP_CONTAINER is still stopped; start it with: docker start $APP_CONTAINER"
  exit 1
}

if [[ "$RESTORE_TYPE" == "snapshot" ]]; then
  # Scratch database the snapshot is loaded into, and the name the current
  # database is parked under once the swap happens.
  RESTORE_DB="${PG_DB}_restore"
  OLD_DB="${PG_DB}_old"

  # An explicit snapshot may be given as a path or as a bare file name that
  # exists inside SNAPSHOT_DIR.
  SNAPSHOT_FILE="${2:-}"
  if [[ -n "$SNAPSHOT_FILE" && ! -f "$SNAPSHOT_FILE" && -f "$SNAPSHOT_DIR/$SNAPSHOT_FILE" ]]; then
    SNAPSHOT_FILE="$SNAPSHOT_DIR/$SNAPSHOT_FILE"
  fi

  # Otherwise pick the most recently modified snapshot. A glob plus -nt avoids
  # parsing `ls`, whose failure on an empty directory used to abort the script
  # under `set -e` before the error below could be logged.
  if [[ -z "$SNAPSHOT_FILE" ]]; then
    for candidate in "$SNAPSHOT_DIR"/*.sql.gz; do
      [[ -f "$candidate" ]] || continue
      if [[ -z "$SNAPSHOT_FILE" || "$candidate" -nt "$SNAPSHOT_FILE" ]]; then
        SNAPSHOT_FILE="$candidate"
      fi
    done
  fi

  if [[ -z "$SNAPSHOT_FILE" || ! -f "$SNAPSHOT_FILE" ]]; then
    fail_before_swap "No snapshot file found!"
  fi

  log "Restoring from snapshot: $SNAPSHOT_FILE"
  FILE_SIZE=$(stat -c%s "$SNAPSHOT_FILE")
  log "File size: $(( FILE_SIZE / 1024 ))KB"

  # Catch a truncated or corrupt archive before any database is touched.
  if ! gzip -t "$SNAPSHOT_FILE" 2>>"$LOG_FILE"; then
    fail_before_swap "Snapshot is not a valid gzip archive: $SNAPSHOT_FILE"
  fi

  # Both terminations are best effort: the database may be unreachable.
  log "Dropping existing connections..."
  pg_sql "$PG_DB" \
    "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='$PG_DB' AND pid <> pg_backend_pid();" 2>/dev/null || true

  log "Terminating connections to template..."
  pg_sql postgres \
    "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='$PG_DB';" 2>/dev/null || true

  # Identifiers are double-quoted everywhere so the name used in SQL always
  # matches the case-sensitive name passed to `-d`.
  log "Recreating database..."
  pg_sql postgres "DROP DATABASE IF EXISTS \"$RESTORE_DB\";" \
    || fail_before_swap "Could not drop scratch database $RESTORE_DB"
  pg_sql postgres "CREATE DATABASE \"$RESTORE_DB\";" \
    || fail_before_swap "Could not create scratch database $RESTORE_DB"

  log "Restoring data..."
  # pg_restore exits non-zero even for errors it skipped over, so a failure is
  # only a warning here; the table check below decides whether to go on.
  # Its stderr is kept in the log instead of being thrown away.
  if ! gunzip -c "$SNAPSHOT_FILE" \
      | docker exec -i "$PG_CONTAINER" pg_restore -U "$PG_USER" -d "$RESTORE_DB" \
          --no-owner --no-privileges 2>>"$LOG_FILE"; then
    log "WARNING: pg_restore reported errors (details in $LOG_FILE)"
  fi

  # Refuse to swap an empty database into place.
  RESTORED_TABLES=$(docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$RESTORE_DB" -At -c \
    "SELECT count(*) FROM information_schema.tables WHERE table_schema NOT IN ('pg_catalog', 'information_schema');" \
    2>>"$LOG_FILE") || RESTORED_TABLES=""
  if [[ ! "$RESTORED_TABLES" =~ ^[0-9]+$ ]] || (( 10#$RESTORED_TABLES == 0 )); then
    fail_before_swap "Restored database $RESTORE_DB contains no tables"
  fi
  log "Restored database contains $RESTORED_TABLES tables"

  log "Swapping databases..."
  # The restore can take a while; clear out any session opened in the meantime
  # so the rename is not refused because the database is in use.
  pg_sql postgres \
    "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='$PG_DB';" >/dev/null 2>&1 || true
  pg_sql postgres "DROP DATABASE IF EXISTS \"$OLD_DB\";" \
    || fail_before_swap "Could not drop previous backup database $OLD_DB"
  pg_sql postgres "ALTER DATABASE \"$PG_DB\" RENAME TO \"$OLD_DB\";" \
    || fail_before_swap "Could not rename $PG_DB to $OLD_DB"
  if ! pg_sql postgres "ALTER DATABASE \"$RESTORE_DB\" RENAME TO \"$PG_DB\";"; then
    # Do not leave the server without a live database: put the old one back.
    log "ERROR: Could not rename $RESTORE_DB to $PG_DB; rolling back"
    pg_sql postgres "ALTER DATABASE \"$OLD_DB\" RENAME TO \"$PG_DB\";" \
      || log "ERROR: Rollback failed; previous data is in database $OLD_DB"
    exit 1
  fi

  log "Running migrations..."
  # NOTE(review): `docker exec` needs a running container, and this script
  # stops $APP_CONTAINER before the restore, so this step presumably cannot
  # succeed at this point. The failure is now logged rather than hidden;
  # confirm whether the app applies migrations itself on start-up.
  if ! docker exec "$APP_CONTAINER" node ./node_modules/prisma/build/index.js migrate deploy 2>>"$LOG_FILE"; then
    log "WARNING: migrations were not applied (is $APP_CONTAINER running?); run 'prisma migrate deploy' once the app is up"
  fi

elif [[ "$RESTORE_TYPE" == "pitr" ]]; then
  # Point-in-time recovery is not automated: PostgreSQL is stopped and the
  # remaining steps are printed for the operator.
  log "PITR restore to: $PITR_TIME"
  log "Stopping PostgreSQL..."
  docker stop "$PG_CONTAINER"

  log "WARNING: PITR restore requires manual intervention."
  log "Steps:"
  log " 1. Copy latest base backup to PGDATA"
  log " 2. Create recovery.signal in PGDATA"
  log " 3. Set restore_command and recovery_target_time in postgresql.conf"
  log " 4. Start PostgreSQL"
  # The app container was stopped by this script, so it has to be started
  # again by hand; it will not reconnect on its own.
  log " 5. Start the app container: docker start $APP_CONTAINER"
  log ""
  log "Recovery target: $PITR_TIME"
  log "WAL files location: $BACKUP_DIR/wal/"
  exit 0

else
  fail_before_swap "Unknown restore type: $RESTORE_TYPE (expected 'snapshot' or 'pitr')"
fi

# Print the number of rows in one table of the restored database.
#   $1 - table name; it is double-quoted in the query, so case matters
# Prints nothing and returns non-zero when the query fails.
count_rows() {
  docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -At -c \
    "SELECT COUNT(*) FROM \"$1\";" 2>/dev/null
}

log "Verifying restore..."
# `|| VAR=""` keeps a failed query from aborting the script silently under
# `set -e`; the failure is reported explicitly below instead.
NOTE_COUNT=$(count_rows Note) || NOTE_COUNT=""
NOTEBOOK_COUNT=$(count_rows Notebook) || NOTEBOOK_COUNT=""
USER_COUNT=$(count_rows User) || USER_COUNT=""

log "After restore: ${NOTE_COUNT:-?} notes, ${NOTEBOOK_COUNT:-?} notebooks, ${USER_COUNT:-?} users"

# A missing or non-numeric count means the query itself failed; the old
# `[ "$NOTE_COUNT" -eq 0 ]` test errored out on such a value and the script
# carried on as if the restore had been verified.
if [[ ! "$NOTE_COUNT" =~ ^[0-9]+$ ]]; then
  log "ERROR: Could not count notes after restore! Something went wrong."
  log "App container $APP_CONTAINER has not been started."
  exit 1
fi

if (( 10#$NOTE_COUNT == 0 )); then
  log "ERROR: 0 notes after restore! Something went wrong."
  log "App container $APP_CONTAINER has not been started."
  exit 1
fi

if [[ ! "$NOTEBOOK_COUNT" =~ ^[0-9]+$ || ! "$USER_COUNT" =~ ^[0-9]+$ ]]; then
  log "WARNING: Could not count notebooks and/or users; check the restored schema."
fi

log "Starting app container..."
if ! docker start "$APP_CONTAINER" 2>/dev/null; then
  log "WARNING: docker start failed for $APP_CONTAINER"
fi

# Poll the app until it answers: up to 36 probes, each limited to 5 seconds
# and followed by a 5 second pause.
HEALTH_URL="http://localhost:3000/api/build-info"
HEALTHY=false
for (( attempt = 1; attempt <= 36; attempt++ )); do
  # When the connection fails curl still writes "000" for %{http_code} and
  # exits non-zero. Appending another "000" inside the substitution (the old
  # `|| echo "000"`) produced "000000", which passed both the `!= "000"` and
  # the `-lt 500` test, so a dead app was reported healthy on the first probe.
  # Assigning the fallback outside the substitution avoids that.
  CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$HEALTH_URL" 2>/dev/null) || CODE="000"

  # Healthy means a real HTTP status below 500, i.e. 100-499.
  if [[ "$CODE" =~ ^[1-4][0-9]{2}$ ]]; then
    HEALTHY=true
    break
  fi
  sleep 5
done

# An unhealthy app is reported as a warning only; the script still exits 0.
if [[ "$HEALTHY" == true ]]; then
  log "=== RESTORE SUCCESSFUL === App is healthy"
else
  log "WARNING: App not healthy after restore. Check manually."
fi
Reference in New Issue
Block a user