feat: P0 backup system (WAL+snapshot+restore+verify), monitoring stack, admin health API
This commit is contained in:
60
scripts/backup/backup.sh
Executable file
60
scripts/backup/backup.sh
Executable file
@@ -0,0 +1,60 @@
|
||||
#!/bin/bash
# backup.sh - take a gzip-compressed pg_dump snapshot of the Memento database
# from the PostgreSQL container, sanity-check it, and prune old snapshots.
#
# Environment:
#   POSTGRES_USER  database role used for pg_dump (default: memento)
#   POSTGRES_DB    database to dump               (default: memento)
set -euo pipefail

BACKUP_DIR="/opt/memento/backups"
WAL_DIR="$BACKUP_DIR/wal"
SNAPSHOT_DIR="$BACKUP_DIR/snapshots"
# Embedded in the snapshot file name, so names sort chronologically.
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
RETENTION_DAYS=7
RETENTION_WEEKS=4
PG_CONTAINER="memento-postgres"
PG_USER="${POSTGRES_USER:-memento}"
PG_DB="${POSTGRES_DB:-memento}"
LOG_FILE="$BACKUP_DIR/backup.log"

# Must exist before the first log() call, which appends to $LOG_FILE.
mkdir -p "$WAL_DIR" "$SNAPSHOT_DIR"
|
||||
|
||||
# log MESSAGE
# Writes MESSAGE, prefixed with a "[YYYY-MM-DD HH:MM:SS]" local timestamp,
# to stdout and appends the same line to $LOG_FILE.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${1:-}" | tee -a "$LOG_FILE"
}
|
||||
|
||||
log "=== Starting backup ==="

SNAPSHOT_FILE="$SNAPSHOT_DIR/memento-$TIMESTAMP.sql.gz"
# The dump is built under a temporary name and only renamed to *.sql.gz once
# every check has passed. Other scripts pick "the newest *.sql.gz", so a
# truncated or corrupt dump must never be visible under that pattern.
PARTIAL_FILE="$SNAPSHOT_FILE.partial"
trap 'rm -f -- "$PARTIAL_FILE"' EXIT

log "Creating snapshot: $SNAPSHOT_FILE"
# pipefail (set at the top of the script) makes this pipeline fail when
# pg_dump fails, even though gzip itself succeeds.
if ! docker exec "$PG_CONTAINER" pg_dump -U "$PG_USER" -d "$PG_DB" --format=custom \
  | gzip > "$PARTIAL_FILE"; then
  log "ERROR: pg_dump failed; no snapshot was written"
  exit 1
fi

SNAPSHOT_SIZE=$(stat -c%s "$PARTIAL_FILE" 2>/dev/null || echo 0)
if [ "$SNAPSHOT_SIZE" -lt 1048576 ]; then
  log "ERROR: Snapshot is only $SNAPSHOT_SIZE bytes (< 1MB). Possible failure!"
  exit 1
fi

log "Snapshot OK: $(( SNAPSHOT_SIZE / 1024 ))KB"

log "Verifying snapshot integrity..."
if gzip -t "$PARTIAL_FILE" 2>/dev/null; then
  log "Integrity check: PASSED"
else
  log "ERROR: Snapshot integrity check FAILED!"
  exit 1
fi

# Publish the verified snapshot under its final name.
mv -- "$PARTIAL_FILE" "$SNAPSHOT_FILE"
|
||||
|
||||
log "Cleaning old snapshots (keeping $RETENTION_DAYS days + $RETENTION_WEEKS weekly)..."

# Retention policy:
#   * every snapshot whose mtime is inside the daily window (find -mtime
#     +RETENTION_DAYS does not match it) is kept;
#   * among older snapshots, the newest one of each ISO week is kept as long
#     as the date in its file name is within RETENTION_WEEKS further weeks;
#   * everything else is deleted.
# Kept files are never touch(1)ed: refreshing the mtime of an old snapshot
# would make it look like the most recent backup to anything that selects the
# newest file by mtime. Midnight ("-000000") snapshots get no special
# treatment, so the directory cannot grow without bound.
WEEKLY_CUTOFF=$(date -d "$(( RETENTION_DAYS + RETENTION_WEEKS * 7 )) days ago" +%Y%m%d)
declare -A KEPT_WEEKS=()
DELETED=0

# File names embed YYYYMMDD-HHMMSS, so a reverse name sort is newest-first and
# the first snapshot seen for a given ISO week is that week's newest one.
while IFS= read -r OLD_FILE; do
  [ -n "$OLD_FILE" ] || continue

  FILE_DATE=${OLD_FILE#memento-}
  FILE_DATE=${FILE_DATE%%-*}
  WEEK_KEY=""
  if [[ "$FILE_DATE" =~ ^[0-9]{8}$ ]] && [ "$FILE_DATE" -ge "$WEEKLY_CUTOFF" ]; then
    # %G-%V = ISO year and ISO week number; empty if the date does not parse.
    WEEK_KEY=$(date -d "$FILE_DATE" +%G-%V 2>/dev/null || true)
  fi

  if [ -n "$WEEK_KEY" ] && [ -z "${KEPT_WEEKS[$WEEK_KEY]:-}" ]; then
    KEPT_WEEKS[$WEEK_KEY]=1
    continue
  fi

  rm -f -- "$SNAPSHOT_DIR/$OLD_FILE"
  DELETED=$(( DELETED + 1 ))
done < <(
  find "$SNAPSHOT_DIR" -maxdepth 1 -type f -name '*.sql.gz' \
    -mtime "+$RETENTION_DAYS" -printf '%f\n' | sort -r
)

SNAPSHOT_COUNT=$(find "$SNAPSHOT_DIR" -maxdepth 1 -type f -name '*.sql.gz' | wc -l)
log "Cleanup done. $SNAPSHOT_COUNT snapshots remaining"

log "=== Backup complete ==="
|
||||
25
scripts/backup/install-crontab.sh
Executable file
25
scripts/backup/install-crontab.sh
Executable file
@@ -0,0 +1,25 @@
|
||||
#!/bin/bash
# Generate the crontab file for automated backups.
# This script only writes the file; it prints the command that installs it.
# Usage: sudo bash install-crontab.sh
set -euo pipefail

CRON_FILE="/opt/memento/backups/crontab"

# The redirect below fails if the backup directory has not been created yet.
mkdir -p "$(dirname "$CRON_FILE")"

cat > "$CRON_FILE" << 'EOF'
# Memento Backup Schedule
# The scripts already append their own messages to backup.log, so stdout is
# discarded here (otherwise every line is logged twice) and only stderr from
# failing commands is appended.

# Snapshots every 6 hours
0 */6 * * * /opt/memento/scripts/backup/backup.sh >/dev/null 2>>/opt/memento/backups/backup.log

# Backup verification every 6 hours (offset by 30min)
30 */6 * * * /opt/memento/scripts/backup/verify-backups.sh >/dev/null 2>>/opt/memento/backups/backup.log

# Offsite sync daily at 03:00
0 3 * * * /opt/memento/scripts/backup/offsite-sync.sh >/dev/null 2>>/opt/memento/backups/backup.log
EOF

echo "Crontab file created at $CRON_FILE"
echo ""
echo "To install, run as the memento user:"
echo " crontab $CRON_FILE"
echo ""
echo "To verify:"
echo " crontab -l"
|
||||
47
scripts/backup/offsite-sync.sh
Executable file
47
scripts/backup/offsite-sync.sh
Executable file
@@ -0,0 +1,47 @@
|
||||
#!/bin/bash
# offsite-sync.sh - copy the local snapshot directory to a remote host with
# rsync.
#
# Environment:
#   BACKUP_REMOTE_HOST  rsync destination host, e.g. user@server; when empty
#                       the sync is skipped
#   BACKUP_REMOTE_PATH  destination directory on the remote host
#                       (default: /opt/memento-offsite-backups)
set -euo pipefail

SNAPSHOT_DIR="/opt/memento/backups/snapshots"
REMOTE_HOST="${BACKUP_REMOTE_HOST:-}"
REMOTE_PATH="${BACKUP_REMOTE_PATH:-/opt/memento-offsite-backups}"
LOG_FILE="/opt/memento/backups/backup.log"
|
||||
|
||||
# log MESSAGE
# Writes MESSAGE, prefixed with a "[YYYY-MM-DD HH:MM:SS]" local timestamp,
# to stdout and appends the same line to $LOG_FILE.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${1:-}" | tee -a "$LOG_FILE"
}
|
||||
|
||||
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"

# send_alert MESSAGE
# Posts MESSAGE to the Telegram Bot API sendMessage endpoint. Does nothing
# unless both TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID are non-empty.
# Delivery is best-effort: curl failures are ignored and the function always
# returns 0, so a failed alert never aborts the calling script.
send_alert() {
  local message=${1:-}
  if [ -n "$TELEGRAM_BOT_TOKEN" ] && [ -n "$TELEGRAM_CHAT_ID" ]; then
    # --data-urlencode keeps characters such as '%', '&' and '+' in the
    # message intact; --max-time stops a hung connection from blocking cron.
    curl -s --max-time 15 -X POST \
      "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
      --data-urlencode "chat_id=$TELEGRAM_CHAT_ID" \
      --data-urlencode "text=$message" >/dev/null 2>&1 || true
  fi
}
|
||||
|
||||
log "=== Offsite backup sync ==="

if [ -z "$REMOTE_HOST" ]; then
  log "WARNING: No BACKUP_REMOTE_HOST configured. Skipping offsite sync."
  log "Set BACKUP_REMOTE_HOST=user@server in /opt/memento/.env.docker for offsite backups"
  exit 0
fi

# "|| true": with pipefail an unmatched glob makes ls (and so the pipeline)
# fail, which would abort the script under set -e before the alert below.
LATEST=$(ls -t "$SNAPSHOT_DIR"/*.sql.gz 2>/dev/null | head -n 1 || true)
if [ -z "$LATEST" ]; then
  log "ERROR: No snapshots to sync"
  send_alert "🚨 MEMENTO: Offsite backup failed - no snapshots found!"
  exit 1
fi

# rsync mirrors the whole snapshot directory, not just the newest file.
log "Syncing $SNAPSHOT_DIR/ (latest: $LATEST) to $REMOTE_HOST:$REMOTE_PATH"

# The pipeline is tested directly: under set -e a separate "$?" check after it
# would never run on failure, and pipefail makes the status reflect rsync
# rather than tee.
if rsync -az --chmod=F600 "$SNAPSHOT_DIR/" "${REMOTE_HOST}:${REMOTE_PATH}/" 2>&1 \
  | tee -a "$LOG_FILE"; then
  log "Offsite sync complete"
else
  log "ERROR: Offsite sync failed!"
  send_alert "🚨 MEMENTO: Offsite backup sync FAILED! Check rsync connectivity."
  exit 1
fi
|
||||
114
scripts/backup/restore.sh
Executable file
114
scripts/backup/restore.sh
Executable file
@@ -0,0 +1,114 @@
|
||||
#!/bin/bash
# restore.sh - restore the Memento database from a snapshot, or print the
# manual steps for a point-in-time recovery.
#
# Usage:
#   restore.sh snapshot [snapshot_file.sql.gz]
#   restore.sh pitr "YYYY-MM-DD HH:MM:SS"
#
# Environment:
#   POSTGRES_USER  database role (default: memento)
#   POSTGRES_DB    database name (default: memento)
set -euo pipefail

BACKUP_DIR="/opt/memento/backups"
SNAPSHOT_DIR="$BACKUP_DIR/snapshots"
PG_CONTAINER="memento-postgres"
PG_USER="${POSTGRES_USER:-memento}"
PG_DB="${POSTGRES_DB:-memento}"
LOG_FILE="$BACKUP_DIR/backup.log"
|
||||
|
||||
# log MESSAGE
# Writes MESSAGE, prefixed with a "[YYYY-MM-DD HH:MM:SS]" local timestamp,
# to stdout and appends the same line to $LOG_FILE.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${1:-}" | tee -a "$LOG_FILE"
}
|
||||
|
||||
# $1: restore type, "snapshot" (default) or "pitr".
# $2: recovery target time for pitr; for snapshot it is read again later as
#     the optional snapshot file.
RESTORE_TYPE="${1:-snapshot}"
PITR_TIME="${2:-}"

# Prints the command-line synopsis.
usage() {
  echo "Usage: $0 pitr \"2026-05-17 14:30:00\""
  echo " $0 snapshot [snapshot_file.sql.gz]"
}

# Reject bad invocations up front, before anything is dumped or stopped.
case "$RESTORE_TYPE" in
  snapshot)
    ;;
  pitr)
    if [ -z "$PITR_TIME" ]; then
      usage
      exit 1
    fi
    ;;
  *)
    echo "ERROR: Unknown restore type: $RESTORE_TYPE" >&2
    usage
    exit 1
    ;;
esac
|
||||
|
||||
# ---------------------------------------------------------------------------
# Restore workflow.
#   1. Validate the inputs (nothing has been touched yet).
#   2. Best-effort safety dump of the current database.
#   3. Stop the app container.
#   4. snapshot: restore into "<db>_restore", validate it, then swap it in
#      (the previous database is kept as "<db>_old").
#      pitr: stop PostgreSQL, print the manual steps and exit.
#   5. Verify the live database, start the app, run migrations, health-check.
# ---------------------------------------------------------------------------

# Runs one SQL statement against the maintenance database "postgres".
_pg_admin() {
  docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d postgres -c "$1"
}

# _count_rows DATABASE TABLE
# Prints the row count of TABLE in DATABASE, or nothing if the query fails.
_count_rows() {
  docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$1" -t \
    -c "SELECT COUNT(*) FROM \"$2\";" 2>/dev/null | tr -d '[:space:]' || true
}

# Prints the HTTP status of the app's build-info endpoint, or "000" when the
# request fails. On a failed request curl itself already prints "000", so the
# fallback is an assignment rather than a second echo; appending would
# produce "000000", which compares as a healthy status.
_app_http_code() {
  local code
  code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 \
    "http://localhost:3000/api/build-info" 2>/dev/null) || code="000"
  printf '%s\n' "$code"
}

# --- 1. Validate inputs -----------------------------------------------------
SNAPSHOT_FILE=""
case "$RESTORE_TYPE" in
  snapshot)
    SNAPSHOT_FILE="${2:-}"
    if [ -z "$SNAPSHOT_FILE" ]; then
      # "|| true": with pipefail an unmatched glob would otherwise abort the
      # script before the error below can be logged.
      SNAPSHOT_FILE=$(ls -t "$SNAPSHOT_DIR"/*.sql.gz 2>/dev/null | head -n 1 || true)
    fi
    if [ -z "$SNAPSHOT_FILE" ] || [ ! -f "$SNAPSHOT_FILE" ]; then
      log "ERROR: No snapshot file found!"
      exit 1
    fi
    if ! gzip -t "$SNAPSHOT_FILE" 2>/dev/null; then
      log "ERROR: Snapshot integrity check FAILED: $SNAPSHOT_FILE"
      exit 1
    fi
    ;;
  pitr)
    ;;
  *)
    log "ERROR: Unknown restore type: $RESTORE_TYPE"
    exit 1
    ;;
esac

# --- 2. Safety dump ---------------------------------------------------------
log "=== EMERGENCY PRE-RESTORE: Dumping current state ==="
PRE_RESTORE="$BACKUP_DIR/pre-restore-$(date +%Y%m%d-%H%M%S).sql.gz"
# Best-effort on purpose: the database may be too damaged to dump, and that
# must not block the restore. The outcome is logged either way.
if docker exec "$PG_CONTAINER" pg_dump -U "$PG_USER" -d "$PG_DB" --format=custom 2>/dev/null \
  | gzip > "$PRE_RESTORE"; then
  log "Pre-restore dump saved: $PRE_RESTORE"
else
  log "WARNING: Pre-restore dump failed; $PRE_RESTORE may be empty or incomplete"
fi

# --- 3. Stop the app --------------------------------------------------------
APP_CONTAINER="${APP_CONTAINER:-memento-web}"
log "Stopping app container: $APP_CONTAINER"
docker stop "$APP_CONTAINER" 2>/dev/null || true

# --- 4. Restore -------------------------------------------------------------
if [ "$RESTORE_TYPE" = "snapshot" ]; then
  log "Restoring from snapshot: $SNAPSHOT_FILE"
  FILE_SIZE=$(stat -c%s "$SNAPSHOT_FILE")
  log "File size: $(( FILE_SIZE / 1024 ))KB"

  log "Recreating database..."
  # Identifiers are double-quoted everywhere so the names used in SQL match
  # the name passed to pg_restore -d even if $PG_DB contains upper case.
  _pg_admin "DROP DATABASE IF EXISTS \"${PG_DB}_restore\";"
  _pg_admin "CREATE DATABASE \"${PG_DB}_restore\";"

  log "Restoring data..."
  # pg_restore's exit status is not treated as fatal (it is non-zero even
  # when it only ignored some errors); its stderr goes to the log and the
  # restored copy is validated by row count before it replaces anything.
  if ! gunzip -c "$SNAPSHOT_FILE" \
    | docker exec -i "$PG_CONTAINER" pg_restore -U "$PG_USER" -d "${PG_DB}_restore" \
        --no-owner --no-privileges 2>>"$LOG_FILE"; then
    log "WARNING: pg_restore reported errors; validating restored data before swapping"
  fi

  RESTORED_NOTES=$(_count_rows "${PG_DB}_restore" "Note")
  if ! [[ "$RESTORED_NOTES" =~ ^[0-9]+$ ]] || [ "$RESTORED_NOTES" -eq 0 ]; then
    log "ERROR: Restored copy contains no readable notes (count: '${RESTORED_NOTES}'). Live database left untouched."
    # Nothing has been swapped yet, so the app can go back up on the old data.
    docker start "$APP_CONTAINER" 2>/dev/null || true
    exit 1
  fi

  log "Dropping existing connections..."
  # ALTER DATABASE ... RENAME needs the database to have no sessions; this is
  # done right before the swap, from the "postgres" database.
  _pg_admin "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='$PG_DB' AND pid <> pg_backend_pid();" 2>/dev/null || true

  log "Swapping databases..."
  _pg_admin "DROP DATABASE IF EXISTS \"${PG_DB}_old\";"
  _pg_admin "ALTER DATABASE \"$PG_DB\" RENAME TO \"${PG_DB}_old\";"
  _pg_admin "ALTER DATABASE \"${PG_DB}_restore\" RENAME TO \"$PG_DB\";"

elif [ "$RESTORE_TYPE" = "pitr" ]; then
  log "PITR restore to: $PITR_TIME"
  log "Stopping PostgreSQL..."
  docker stop "$PG_CONTAINER"

  log "WARNING: PITR restore requires manual intervention."
  log "Steps:"
  log " 1. Copy latest base backup to PGDATA"
  log " 2. Create recovery.signal in PGDATA"
  log " 3. Set restore_command and recovery_target_time in postgresql.conf"
  log " 4. Start PostgreSQL"
  log " 5. App will reconnect automatically"
  log ""
  log "Recovery target: $PITR_TIME"
  log "WAL files location: $BACKUP_DIR/wal/"
  exit 0
fi

# --- 5. Verify, restart, migrate, health-check ------------------------------
log "Verifying restore..."
NOTE_COUNT=$(_count_rows "$PG_DB" "Note")
NOTEBOOK_COUNT=$(_count_rows "$PG_DB" "Notebook")
USER_COUNT=$(_count_rows "$PG_DB" "User")

log "After restore: ${NOTE_COUNT:-?} notes, ${NOTEBOOK_COUNT:-?} notebooks, ${USER_COUNT:-?} users"

# A failed query yields an empty count; that is treated as a failure too
# instead of being passed to an integer comparison.
if ! [[ "$NOTE_COUNT" =~ ^[0-9]+$ ]] || [ "$NOTE_COUNT" -eq 0 ]; then
  log "ERROR: 0 notes after restore! Something went wrong."
  log "The previous database is still available as ${PG_DB}_old"
  exit 1
fi

log "Starting app container..."
if ! docker start "$APP_CONTAINER" 2>/dev/null; then
  log "WARNING: Could not start app container: $APP_CONTAINER"
fi

log "Running migrations..."
# Runs after the container is started: docker exec cannot run inside a
# stopped container.
if ! docker exec "$APP_CONTAINER" node ./node_modules/prisma/build/index.js migrate deploy \
  >>"$LOG_FILE" 2>&1; then
  log "WARNING: Migrations did not complete; run 'prisma migrate deploy' manually if needed."
fi

HEALTHY=false
# Up to 36 probes, 5 s apart.
for _ in {1..36}; do
  CODE=$(_app_http_code)
  if [ "$CODE" != "000" ] && [ "$CODE" -lt 500 ]; then
    HEALTHY=true
    break
  fi
  sleep 5
done

if [ "$HEALTHY" = true ]; then
  log "=== RESTORE SUCCESSFUL === App is healthy"
else
  log "WARNING: App not healthy after restore. Check manually."
fi
|
||||
35
scripts/backup/setup-wal.sh
Executable file
35
scripts/backup/setup-wal.sh
Executable file
@@ -0,0 +1,35 @@
|
||||
#!/bin/bash
# setup-wal.sh - configure WAL archiving on the PostgreSQL container.
#
# Environment:
#   POSTGRES_USER      database role (default: memento)
#   POSTGRES_DB        database name (default: memento)
#   POSTGRES_PASSWORD  read into PG_PASSWORD, which is not referenced in this
#                      script
set -euo pipefail

BACKUP_DIR="/opt/memento/backups"
WAL_DIR="$BACKUP_DIR/wal"
PG_CONTAINER="memento-postgres"
PG_USER="${POSTGRES_USER:-memento}"
PG_DB="${POSTGRES_DB:-memento}"
PG_PASSWORD="${POSTGRES_PASSWORD:-}"
LOG_FILE="$BACKUP_DIR/backup.log"

# Host-side directory; must exist before the first log() call writes to
# $LOG_FILE underneath $BACKUP_DIR.
mkdir -p "$WAL_DIR"
|
||||
|
||||
# log MESSAGE
# Writes MESSAGE, prefixed with a "[YYYY-MM-DD HH:MM:SS]" local timestamp,
# to stdout and appends the same line to $LOG_FILE.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${1:-}" | tee -a "$LOG_FILE"
}
|
||||
|
||||
log "=== Setting up WAL archiving ==="

# Archive directory as seen from inside the container.
# NOTE(review): whether this path is backed by the host's $WAL_DIR depends on
# the container's volume mounts, which are not visible here - confirm.
WAL_ARCHIVE_DIR="/var/lib/postgresql/backups/wal"

# Create the directory before archive_command is pointed at it.
docker exec "$PG_CONTAINER" mkdir -p "$WAL_ARCHIVE_DIR"
# NOTE(review): assumes the server runs as the "postgres" OS user inside the
# container and that docker exec runs as a user allowed to chown - confirm.
if ! docker exec "$PG_CONTAINER" chown postgres:postgres "$WAL_ARCHIVE_DIR" 2>/dev/null; then
  log "WARNING: Could not chown $WAL_ARCHIVE_DIR; make sure the PostgreSQL server user can write to it"
fi

# One -c per statement: psql runs a multi-statement -c string as a single
# transaction, and ALTER SYSTEM cannot run inside a transaction block.
# archive_command refuses to overwrite a segment that is already archived.
docker exec "$PG_CONTAINER" psql -v ON_ERROR_STOP=1 -U "$PG_USER" -d "$PG_DB" \
  -c "ALTER SYSTEM SET wal_level = replica;" \
  -c "ALTER SYSTEM SET archive_mode = on;" \
  -c "ALTER SYSTEM SET archive_command = 'test ! -f $WAL_ARCHIVE_DIR/%f && cp %p $WAL_ARCHIVE_DIR/%f';" \
  -c "ALTER SYSTEM SET max_wal_senders = 3;" \
  -c "ALTER SYSTEM SET wal_keep_size = '1GB';"

docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -c "SELECT pg_reload_conf();"

# archive_mode only changes at server start, so a reload alone is not enough
# the first time; report what the running server actually uses.
ARCHIVE_MODE=$(docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -t -A \
  -c "SHOW archive_mode;" 2>/dev/null | tr -d '[:space:]' || true)
case "$ARCHIVE_MODE" in
  on|always)
    log "WAL archiving enabled. Archives stored in /var/lib/postgresql/backups/wal/"
    ;;
  *)
    log "WARNING: archive_mode is currently '${ARCHIVE_MODE:-unknown}'. Restart PostgreSQL (docker restart $PG_CONTAINER) for WAL archiving to take effect."
    ;;
esac

WAL_COUNT=$(docker exec "$PG_CONTAINER" ls "$WAL_ARCHIVE_DIR/" 2>/dev/null | wc -l || true)
log "Existing WAL segments: $WAL_COUNT"
|
||||
98
scripts/backup/verify-backups.sh
Executable file
98
scripts/backup/verify-backups.sh
Executable file
@@ -0,0 +1,98 @@
|
||||
#!/bin/bash
# verify-backups.sh - check the newest snapshot, the live database, the app
# endpoint and disk usage, and send a Telegram alert for each problem found.
#
# Environment:
#   POSTGRES_USER       database role (default: memento)
#   POSTGRES_DB         database name (default: memento)
#   TELEGRAM_BOT_TOKEN  bot token for alerts; alerts are skipped when empty
#   TELEGRAM_CHAT_ID    chat to alert;        alerts are skipped when empty
set -euo pipefail

BACKUP_DIR="/opt/memento/backups"
SNAPSHOT_DIR="$BACKUP_DIR/snapshots"
PG_CONTAINER="memento-postgres"
PG_USER="${POSTGRES_USER:-memento}"
PG_DB="${POSTGRES_DB:-memento}"
LOG_FILE="$BACKUP_DIR/backup.log"

TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"
|
||||
|
||||
# log MESSAGE
# Writes MESSAGE, prefixed with a "[YYYY-MM-DD HH:MM:SS]" local timestamp,
# to stdout and appends the same line to $LOG_FILE.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${1:-}" | tee -a "$LOG_FILE"
}
|
||||
|
||||
# send_alert MESSAGE
# Posts MESSAGE (parse_mode=HTML) to the Telegram Bot API sendMessage
# endpoint. Does nothing unless both TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID
# are non-empty. Delivery is best-effort: curl failures are ignored and the
# function always returns 0, so a failed alert never aborts the checks.
send_alert() {
  local message=${1:-}
  if [ -n "$TELEGRAM_BOT_TOKEN" ] && [ -n "$TELEGRAM_CHAT_ID" ]; then
    # --data-urlencode keeps characters such as '%' (e.g. "90%!") and '&' in
    # the message intact; --max-time stops a hung connection from blocking
    # the remaining checks.
    curl -s --max-time 15 -X POST \
      "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
      --data-urlencode "chat_id=$TELEGRAM_CHAT_ID" \
      --data-urlencode "text=$message" \
      --data-urlencode "parse_mode=HTML" >/dev/null 2>&1 || true
  fi
}
|
||||
|
||||
# Number of failed checks; decides the exit status at the end of the script.
ISSUES=0

log "=== Backup verification ==="

# "|| true": with pipefail an unmatched glob makes ls (and so the pipeline)
# fail, which would abort the script under set -e before the alert below.
LATEST_SNAPSHOT=$(ls -t "$SNAPSHOT_DIR"/*.sql.gz 2>/dev/null | head -n 1 || true)
if [ -z "$LATEST_SNAPSHOT" ]; then
  log "CRITICAL: No snapshots found!"
  send_alert "🚨 MEMENTO: No database backups found! Data at risk!"
  exit 1
fi
|
||||
|
||||
# --- Snapshot age: whole hours since the newest snapshot was last modified.
SNAPSHOT_AGE=$(( ( $(date +%s) - $(stat -c%Y "$LATEST_SNAPSHOT") ) / 3600 ))
if [ "$SNAPSHOT_AGE" -gt 8 ]; then
  log "CRITICAL: Latest snapshot is ${SNAPSHOT_AGE}h old (> 8h threshold)"
  send_alert "🚨 MEMENTO: Latest DB backup is ${SNAPSHOT_AGE}h old! Backup cron may be broken!"
  ISSUES=$((ISSUES + 1))
else
  log "OK: Latest snapshot is ${SNAPSHOT_AGE}h old"
fi

# --- Snapshot size: anything under 1 MiB (1048576 bytes) is flagged.
SNAPSHOT_SIZE=$(stat -c%s "$LATEST_SNAPSHOT" 2>/dev/null || echo 0)
if [ "$SNAPSHOT_SIZE" -lt 1048576 ]; then
  log "CRITICAL: Snapshot is only $(( SNAPSHOT_SIZE / 1024 ))KB (< 1MB)"
  send_alert "🚨 MEMENTO: DB backup is suspiciously small ($(( SNAPSHOT_SIZE / 1024 ))KB)! Possible failure!"
  ISSUES=$((ISSUES + 1))
else
  log "OK: Snapshot size $(( SNAPSHOT_SIZE / 1024 ))KB"
fi

# --- Snapshot integrity: gzip -t checks the compressed stream only.
if ! gzip -t "$LATEST_SNAPSHOT" 2>/dev/null; then
  log "CRITICAL: Snapshot integrity check FAILED"
  send_alert "🚨 MEMENTO: DB backup integrity check FAILED! Backup is corrupted!"
  ISSUES=$((ISSUES + 1))
else
  log "OK: Snapshot integrity check passed"
fi
|
||||
|
||||
# --- Live database: row counts of "Note" and "User".
# A failed query, or any output that is not a plain number, becomes "ERROR".
DB_NOTES=$(docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -t \
  -c "SELECT COUNT(*) FROM \"Note\";" 2>/dev/null | tr -d '[:space:]' || true)
[[ "$DB_NOTES" =~ ^[0-9]+$ ]] || DB_NOTES="ERROR"

DB_USERS=$(docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -t \
  -c "SELECT COUNT(*) FROM \"User\";" 2>/dev/null | tr -d '[:space:]' || true)
[[ "$DB_USERS" =~ ^[0-9]+$ ]] || DB_USERS="ERROR"

if [ "$DB_NOTES" = "ERROR" ] || [ "$DB_NOTES" = "0" ]; then
  log "CRITICAL: DB query returned $DB_NOTES notes"
  send_alert "🚨 MEMENTO: Database appears empty or unreachable! Notes count: $DB_NOTES"
  ISSUES=$((ISSUES + 1))
else
  log "OK: Database has $DB_NOTES notes, $DB_USERS users"
fi
|
||||
|
||||
# --- App health: HTTP status of the build-info endpoint.
# On a failed request curl itself already prints "000" for %{http_code}, so
# the fallback is an assignment; echoing a second "000" inside the
# substitution would yield "000000", which passes both comparisons below and
# reports a dead app as healthy.
APP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 \
  "http://localhost:3000/api/build-info" 2>/dev/null) || APP_CODE="000"
if [ "$APP_CODE" = "000" ] || [ "$APP_CODE" -ge 500 ]; then
  log "CRITICAL: App health check failed (HTTP $APP_CODE)"
  send_alert "🚨 MEMENTO: App is down! Health check returned HTTP $APP_CODE"
  ISSUES=$((ISSUES + 1))
else
  log "OK: App is healthy (HTTP $APP_CODE)"
fi
|
||||
|
||||
# --- Disk usage of the filesystem holding /opt/memento.
# df -P keeps each filesystem on a single line so field 5 is always the
# percentage; "|| true" keeps a df failure from aborting the script before
# the summary.
DISK_PCT=$(df -P /opt/memento 2>/dev/null \
  | awk 'NR==2 { sub(/%/, "", $5); print $5 }' || true)
if ! [[ "$DISK_PCT" =~ ^[0-9]+$ ]]; then
  log "WARNING: Could not determine disk usage for /opt/memento"
  ISSUES=$((ISSUES + 1))
elif [ "$DISK_PCT" -gt 85 ]; then
  log "WARNING: Disk usage at ${DISK_PCT}%"
  send_alert "⚠️ MEMENTO: Disk usage at ${DISK_PCT}%! Consider cleanup."
  ISSUES=$((ISSUES + 1))
else
  log "OK: Disk usage at ${DISK_PCT}%"
fi

# --- Summary: non-zero exit status when any check failed.
if [ "$ISSUES" -eq 0 ]; then
  log "=== All checks PASSED ==="
else
  log "=== $ISSUES issue(s) found ==="
  exit 1
fi
|
||||
Reference in New Issue
Block a user