Files
office_translator/scripts/disaster-recovery.sh
sepehr ddf6b8f6bc
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 2m35s
fix: ignore unbound variables when sourcing .env in all backup/DR scripts
2026-06-07 11:14:04 +02:00

404 lines
15 KiB
Bash
Executable File

#!/bin/bash
# ==============================================================================
# Wordly.art - Disaster Recovery (DR) Backup & Restore Playbook (V3)
# ==============================================================================
# Archives app configs (.env, docker-compose), database backup, and exports
# to the NAS at 192.168.1.146.
#
# On RESTORE: deploys app on the new server and automatically updates NPM
# (192.168.1.184) to reroute traffic via API — no manual intervention needed.
#
# Usage:
# ./disaster-recovery.sh --backup # Create DR archive → NAS
# ./disaster-recovery.sh --restore <archive> # Restore on THIS machine
# ==============================================================================
# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "${SCRIPT_DIR}" && pwd)"

# The project root is the parent directory of the scripts directory.
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Single timestamp for the whole run; reused in log prefixes and file names.
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"
# ANSI colour escape sequences (interpreted by `echo -e` in the helpers below).
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'

# Shared emitter: prints "[DR <TIMESTAMP>] <prefix><message><suffix>".
#   $1 - text placed before the message (colour code and/or severity label)
#   $2 - the message itself
#   $3 - text placed after the message (colour reset)
_dr_emit() {
  echo -e "[DR ${TIMESTAMP}] ${1}${2}${3}"
}

# Plain informational message.
log() {
  _dr_emit "" "$1" ""
}

# Green success message.
log_success() {
  _dr_emit "${GREEN}" "$1" "${NC}"
}

# Yellow message prefixed with "WARNING: ".
log_warning() {
  _dr_emit "${YELLOW}WARNING: " "$1" "${NC}"
}

# Red message prefixed with "ERROR: ".
log_error() {
  _dr_emit "${RED}ERROR: " "$1" "${NC}"
}
# Load the project's .env file when it exists.
# allexport is enabled so every variable it defines is exported, and nounset
# is disabled while sourcing so references to undefined variables inside the
# file do not abort the script. Both options are put back afterwards.
ENV_FILE="${PROJECT_ROOT}/.env"
if [[ -f "${ENV_FILE}" ]]; then
  set -o allexport
  set +o nounset
  # shellcheck disable=SC1090
  . "${ENV_FILE}"
  set -o nounset
  set +o allexport
fi
# NAS SSH settings (same configuration as backup-to-nas.sh).
# Each value keeps what the environment / .env provided and only falls back
# to the default when the variable is unset or empty.
: "${NAS_HOST:=192.168.1.146}"
: "${NAS_USER:=wordly-backup}"
: "${NAS_PATH:=/volume1/backups/wordly}"
: "${NAS_SSH_PORT:=22}"
: "${NAS_SSH_KEY:=/root/.ssh/wordly_nas_key}"

# Remote directory that receives the DR archives.
BACKUP_DEST_PATH="${NAS_PATH}/snapshots"

# Age threshold, in days, used by the archive rotation on the NAS.
: "${DR_RETENTION_DAYS:=30}"

# IP of THIS server (used during restore to configure NPM failover).
: "${SERVER_IP:=}"

# Telegram credentials; empty values disable notifications.
: "${TELEGRAM_BOT_TOKEN:=}"
: "${TELEGRAM_CHAT_ID:=}"
# ==============================================================================
# SEND TELEGRAM NOTIFICATION
# ==============================================================================
# Send a Markdown-formatted message to the configured Telegram chat.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text
# Returns:   always 0; notification is best-effort and must never abort the
#            backup/restore. Does nothing when either credential is empty.
send_telegram() {
  local message="${1:-}"

  if [[ -z "${TELEGRAM_BOT_TOKEN:-}" || -z "${TELEGRAM_CHAT_ID:-}" ]]; then
    return 0
  fi

  # --data-urlencode keeps '&', '+', '%', '=' and newlines in the message
  # intact; plain -d would send them unescaped and corrupt the form body.
  # The timeouts stop an unreachable Telegram API from stalling the DR run.
  curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --connect-timeout 5 \
    --max-time 15 \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${message}" \
    --data-urlencode "parse_mode=Markdown" \
    >/dev/null 2>&1 || true
}
# ==============================================================================
# DESTINATION PREPARATION (backup mode)
# ==============================================================================
# Verify that the NAS is reachable over SSH and make sure the remote snapshot
# directory exists.
# Globals:   NAS_SSH_KEY, NAS_SSH_PORT, NAS_USER, NAS_HOST, NAS_PATH,
#            BACKUP_DEST_PATH (read)
# Outputs:   progress messages through the log helpers
# Exits:     1 when the SSH connectivity probe fails
prepare_destination() {
  # Built as an array so a key path containing spaces stays a single argument.
  local -a ssh_cmd=(
    ssh -i "${NAS_SSH_KEY}" -p "${NAS_SSH_PORT}"
    -o BatchMode=yes -o ConnectTimeout=10
  )
  local remote="${NAS_USER}@${NAS_HOST}"

  log "Vérification de la connectivité SSH vers le NAS ${NAS_HOST}..."
  if ! "${ssh_cmd[@]}" "${remote}" "echo OK" >/dev/null 2>&1; then
    log_error "Impossible de joindre le NAS ${NAS_HOST} via SSH."
    log_error "Lancez d'abord : sudo bash scripts/setup-nas.sh"
    exit 1
  fi

  # Make sure the snapshots folder exists on the NAS. A failure here stays
  # non-fatal, but is reported because the later transfer depends on it.
  if ! "${ssh_cmd[@]}" "${remote}" "mkdir -p '${BACKUP_DEST_PATH}'" 2>/dev/null; then
    log_warning "Impossible de créer ${BACKUP_DEST_PATH} sur le NAS — le transfert risque d'échouer."
  fi

  log_success "NAS SSH OK — Destination : ${NAS_USER}@${NAS_HOST}:${NAS_PATH}/snapshots"
}
# ==============================================================================
# BACKUP ACTION
# ==============================================================================
# Build a disaster-recovery archive (app configuration + latest database dump)
# and ship it to the NAS.
# Globals:   SCRIPT_DIR, PROJECT_ROOT, TIMESTAMP, BACKUP_DIR (optional),
#            BACKUP_DEST_TYPE (optional), BACKUP_DEST_PATH, DR_RETENTION_DAYS,
#            NAS_SSH_KEY, NAS_SSH_PORT, NAS_USER, NAS_HOST, NAS_PATH (read)
# Outputs:   progress through the log helpers; Telegram notification on
#            transfer failure and on success
# Exits:     1 when the database dump, the packaging or the transfer fails
# Note:      installs a temporary EXIT trap while the staging directory
#            exists and clears the EXIT trap once it has been removed.
perform_backup() {
  prepare_destination

  # BACKUP_DEST_TYPE is not defined anywhere in this script; without a
  # default, `set -u` would abort the backup on this very line.
  log "Starting Disaster Recovery backup (Destination Mode: ${BACKUP_DEST_TYPE:-nas-ssh})..."

  # 1. Trigger DB Backup
  log "Triggering database dump..."
  if ! bash "${SCRIPT_DIR}/backup-database.sh" --full; then
    log_error "Database backup failed. Aborting DR packaging."
    exit 1
  fi

  # 2. Locate the newest DB backup file (by modification time)
  local local_backup_dir="${BACKUP_DIR:-${PROJECT_ROOT}/backups}"
  local latest_db_backup=""
  local candidate
  for candidate in "${local_backup_dir}/daily/"*.gz; do
    # An unmatched glob stays literal; skip anything that is not a real file.
    [[ -f "${candidate}" ]] || continue
    if [[ -z "${latest_db_backup}" || "${candidate}" -nt "${latest_db_backup}" ]]; then
      latest_db_backup="${candidate}"
    fi
  done
  if [[ -z "${latest_db_backup}" ]]; then
    log_error "Could not find database backup file."
    exit 1
  fi
  log "Database backup file loaded: $(basename "${latest_db_backup}")"

  # 3. Create temp packaging folder
  local packing_dir="${PROJECT_ROOT}/temp_dr_pack_${TIMESTAMP}"
  # The staging directory receives a copy of .env (secrets): guarantee its
  # removal on every exit path. The path is expanded now, on purpose, because
  # the local variable is out of scope once the function has returned.
  # shellcheck disable=SC2064
  trap "rm -rf -- $(printf '%q' "${packing_dir}")" EXIT
  mkdir -p "${packing_dir}"
  chmod 700 "${packing_dir}"

  # 4. Pack Configurations
  log "Packing application configuration (.env & docker-compose)..."
  if [[ -f "${PROJECT_ROOT}/.env" ]]; then
    cp "${PROJECT_ROOT}/.env" "${packing_dir}/.env.production"
  fi
  local f
  for f in docker-compose.yml docker-compose.local.yml docker-compose.monitoring.yml docker-compose.dev.yml; do
    if [[ -f "${PROJECT_ROOT}/${f}" ]]; then
      cp "${PROJECT_ROOT}/${f}" "${packing_dir}/"
    fi
  done
  if [[ -d "${PROJECT_ROOT}/docker" ]]; then
    cp -r "${PROJECT_ROOT}/docker" "${packing_dir}/"
  fi
  if [[ -d "${PROJECT_ROOT}/scripts" ]]; then
    cp -r "${PROJECT_ROOT}/scripts" "${packing_dir}/"
  fi
  mkdir -p "${packing_dir}/db_backup"
  cp "${latest_db_backup}" "${packing_dir}/db_backup/"

  # 5. Note: NPM config is NOT backed up here.
  # NPM runs on its own dedicated server (192.168.1.184) and is stable.
  # Only the forward_host IP needs to change during failover, which is
  # done automatically via the NPM API by npm-failover.sh during restore.
  log "NPM is on dedicated server 192.168.1.184 — no NPM config to backup."

  # 6. Compress DR Archive
  local dr_archive_name="wordly_dr_${TIMESTAMP}.tar.gz"
  local local_archive_path="${PROJECT_ROOT}/${dr_archive_name}"
  log "Compressing configurations and database into DR archive..."
  # umask 077 in a subshell: the archive embeds .env, keep it owner-only.
  if ! (umask 077 && tar -czf "${local_archive_path}" -C "${packing_dir}" .); then
    log_error "Failed to compress archive."
    exit 1
  fi
  rm -rf -- "${packing_dir}"
  trap - EXIT
  if [[ ! -s "${local_archive_path}" ]]; then
    log_error "Failed to compress archive."
    exit 1
  fi
  local size
  size=$(du -h "${local_archive_path}" | cut -f1)

  # 7. Send the archive to the NAS with rsync over SSH
  local -a ssh_cmd=(
    ssh -i "${NAS_SSH_KEY}" -p "${NAS_SSH_PORT}"
    -o BatchMode=yes -o ConnectTimeout=30
  )
  # rsync -e takes the remote shell as one string and splits it itself.
  local rsync_rsh="ssh -i ${NAS_SSH_KEY} -p ${NAS_SSH_PORT} -o BatchMode=yes -o ConnectTimeout=30"
  local remote="${NAS_USER}@${NAS_HOST}"
  local dest_path="${BACKUP_DEST_PATH}/${dr_archive_name}"
  log "Transfert de l'archive DR vers le NAS via rsync SSH..."
  if ! rsync -az -e "${rsync_rsh}" "${local_archive_path}" "${remote}:${dest_path}"; then
    log_error "rsync SSH vers le NAS a échoué !"
    log_warning "Archive conservée localement : ${local_archive_path}"
    # Variable parts are wrapped in backticks: the archive name contains
    # underscores, which Telegram's Markdown parser treats as italic markers
    # and would reject as unbalanced, dropping the alert.
    send_telegram "🚨 *Wordly DR Backup FAILED*
rsync NAS échoué : \`${NAS_HOST}\`
Fichier local : \`${local_archive_path}\`
Date: $(date '+%Y-%m-%d %H:%M:%S')"
    exit 1
  fi
  rm -f -- "${local_archive_path}"
  log_success "Archive DR transférée (${size}) → ${NAS_USER}@${NAS_HOST}:${dest_path}"

  # Retention policy on the NAS (best-effort). The value ends up inside a
  # remote `find ... -delete`, so only a plain non-negative integer is used.
  if [[ "${DR_RETENTION_DAYS}" =~ ^[0-9]+$ ]]; then
    log "Rotation des archives (>${DR_RETENTION_DAYS} jours) sur le NAS..."
    if ! "${ssh_cmd[@]}" "${remote}" \
      "find '${BACKUP_DEST_PATH}' -name 'wordly_dr_*.tar.gz' -mtime +${DR_RETENTION_DAYS} -delete" \
      2>/dev/null; then
      log_warning "Rotation des archives DR sur le NAS échouée (non bloquant)."
    fi
  else
    log_warning "DR_RETENTION_DAYS invalide ('${DR_RETENTION_DAYS}') — rotation ignorée."
  fi

  # Sync the scripts directory to the NAS (best-effort)
  if ! rsync -az -e "${rsync_rsh}" --exclude="__pycache__" \
    "${SCRIPT_DIR}/" "${remote}:${NAS_PATH}/scripts/" 2>/dev/null; then
    log_warning "Synchronisation des scripts vers le NAS échouée (non bloquant)."
  fi

  send_telegram "✅ *Wordly.art DR Backup OK*
Archive: \`${dr_archive_name}\`
Taille: ${size}
NAS: \`${dest_path}\`
Date: $(date '+%Y-%m-%d %H:%M:%S')"
  log_success "Disaster Recovery backup complete."
}
# ==============================================================================
# RESTORE ACTION
# ==============================================================================
# Restore a DR archive on THIS machine: extract configuration, start the
# Docker stack, restore the database, wait for the app to become healthy and
# finally ask npm-failover.sh to point NPM at this server.
# Globals:   SCRIPT_DIR, PROJECT_ROOT, TIMESTAMP, SERVER_IP, BACKUP_DIR
#            (optional), DATABASE_URL / POSTGRES_CONTAINER /
#            NPM_PROXY_HOST_DOMAIN (optional, typically from the restored .env)
# Arguments: $1 - path to the DR archive (.tar.gz)
# Outputs:   progress through the log helpers; Telegram notifications
# Exits:     0 when the operator cancels; 1 on missing/corrupt archive,
#            missing .env, missing embedded DB dump or unhealthy app
perform_restore() {
  local dr_package="${1:-}"

  if [[ -z "${dr_package}" ]]; then
    log_error "No DR package archive specified."
    echo "Usage: $0 --restore <path_to_archive.tar.gz>"
    exit 1
  fi
  if [[ ! -f "${dr_package}" ]]; then
    log_error "DR Archive file not found: ${dr_package}"
    exit 1
  fi
  # Check the archive is readable BEFORE anything on disk is overwritten.
  if ! tar -tzf "${dr_package}" >/dev/null 2>&1; then
    log_error "DR archive is corrupt or not a gzip tarball: ${dr_package}"
    exit 1
  fi

  echo ""
  log_warning "RESTORE DISASTER RECOVERY PACKAGE - THIS WILL OVERWRITE ENVIRONMENT CONFIGURATIONS AND DATABASES, AND REROUTE NPM TRAFFIC TO THIS SERVER!"
  echo " Archive: ${dr_package}"
  echo ""
  local confirm_val=""
  # `|| true`: on EOF (non-interactive stdin) fall through to the normal
  # "cancelled" path instead of dying silently under `set -e`.
  read -r -p "Type 'RESTORE-ALL' to confirm complete system restore: " confirm_val || true
  if [[ "${confirm_val}" != "RESTORE-ALL" ]]; then
    log "System restore cancelled."
    exit 0
  fi

  log "Extracting DR archive contents..."
  # Safety backup of existing .env
  if [[ -f "${PROJECT_ROOT}/.env" ]]; then
    cp "${PROJECT_ROOT}/.env" "${PROJECT_ROOT}/.env.bak_before_dr_restore_${TIMESTAMP}"
    log "Created backup of existing .env: .env.bak_before_dr_restore_${TIMESTAMP}"
  fi

  # Drop leftovers of a previous, interrupted restore so that an old dump can
  # never be selected instead of the one shipped in this archive.
  rm -rf -- "${PROJECT_ROOT:?}/db_backup"

  # Extract all
  tar -xzf "${dr_package}" -C "${PROJECT_ROOT}"

  # Restore .env
  if [[ -f "${PROJECT_ROOT}/.env.production" ]]; then
    mv "${PROJECT_ROOT}/.env.production" "${PROJECT_ROOT}/.env"
    log "Restored .env configuration"
  fi

  # Reload variables from the restored .env, using the same set +u guard as
  # the initial load so unset references inside .env do not abort the restore.
  if [[ ! -f "${PROJECT_ROOT}/.env" ]]; then
    log_error "No .env available after extraction: ${PROJECT_ROOT}/.env"
    exit 1
  fi
  set -a
  set +u
  # shellcheck disable=SC1091
  source "${PROJECT_ROOT}/.env"
  set -u
  set +a
  log_success "Docker configurations and env keys restored."

  # Boot Docker Compose Services
  log "Spinning up Docker containers (database, redis, backend, frontend, NPM if configured)..."
  local -a compose_cmd=(docker compose)
  if ! docker compose version &>/dev/null; then
    compose_cmd=(docker-compose)
  fi
  # Run from PROJECT_ROOT so the restored compose files are used no matter
  # which directory the script was started from.
  (cd "${PROJECT_ROOT}" && "${compose_cmd[@]}" up -d)

  # Locate the embedded database backup (first match in glob order)
  local db_backup_archive=""
  local candidate
  for candidate in "${PROJECT_ROOT}/db_backup/"*.gz; do
    if [[ -f "${candidate}" ]]; then
      db_backup_archive="${candidate}"
      break
    fi
  done
  if [[ -z "${db_backup_archive}" ]]; then
    log_error "Database backup archive not found inside the DR package extraction."
    exit 1
  fi
  log "Database backup located: $(basename "${db_backup_archive}")"

  # Wait for database container to be healthy (PostgreSQL)
  local db_type="sqlite"
  if [[ "${DATABASE_URL:-}" =~ ^postgres ]]; then
    db_type="postgres"
  fi
  local i
  if [[ "${db_type}" == "postgres" ]]; then
    local postgres_container="${POSTGRES_CONTAINER:-wordly-postgres}"
    local db_ready=false
    log "Waiting for PostgreSQL container (${postgres_container}) to be healthy..."
    for ((i = 1; i <= 30; i++)); do
      if docker inspect --format='{{.State.Health.Status}}' "${postgres_container}" 2>/dev/null | grep -q "healthy"; then
        log_success "Database container is healthy."
        db_ready=true
        break
      fi
      echo " Waiting for database... ($i/30)"
      sleep 2
    done
    # Not fatal (the container may simply define no healthcheck), but the
    # operator must know the restore is proceeding without confirmation.
    if [[ "${db_ready}" == "false" ]]; then
      log_warning "PostgreSQL container (${postgres_container}) did not report healthy after 60s; attempting restore anyway."
    fi
  else
    sleep 2
  fi

  # Restore the database using the database backup script
  log "Triggering database restore..."
  local local_backup_dir="${BACKUP_DIR:-${PROJECT_ROOT}/backups}"
  mkdir -p "${local_backup_dir}/daily"
  cp "${db_backup_archive}" "${local_backup_dir}/daily/"
  local db_archive_filename
  db_archive_filename=$(basename "${db_backup_archive}")

  # Run DB restore
  log "Restoring DB contents..."
  bash "${SCRIPT_DIR}/backup-database.sh" --restore "${db_archive_filename}"

  # Clean up extracted temporary folder
  rm -rf -- "${PROJECT_ROOT:?}/db_backup"

  # Restart app to clear connection caches
  log "Restarting application backend..."
  (cd "${PROJECT_ROOT}" && "${compose_cmd[@]}" restart backend)

  # HTTP Health check (wait up to 3 minutes)
  log "Waiting for application health check (max 180s)..."
  local app_url="http://localhost:8001/health"
  local health_ok=false
  local http_code
  for ((i = 1; i <= 36; i++)); do
    # On connection failure curl already prints "000" through -w; assigning
    # the fallback (instead of echoing it) avoids reporting "000000".
    http_code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 5 "${app_url}" 2>/dev/null) || http_code="000"
    if [[ "${http_code}" == "200" ]]; then
      health_ok=true
      # (i - 1) sleeps of 5s have elapsed before attempt number i.
      log_success "App is healthy (HTTP 200) after $(((i - 1) * 5))s"
      break
    fi
    echo " Health check attempt ${i}/36... (HTTP ${http_code})"
    sleep 5
  done
  if [[ "${health_ok}" == "false" ]]; then
    log_error "App did NOT become healthy within 180s!"
    log_error "NPM failover will NOT be triggered automatically."
    log_error "Investigate: docker compose logs backend"
    send_telegram "🚨 *Wordly.art DR FAILED — App unhealthy*
Serveur: \`$(hostname -I | awk '{print $1}')\`
Date: $(date '+%Y-%m-%d %H:%M:%S')
Action: vérifiez les logs Docker"
    exit 1
  fi

  # ============================================================================
  # NPM AUTOMATIC FAILOVER
  # ============================================================================
  log "App is healthy. Triggering NPM failover..."
  local this_server_ip
  # Prefer the configured SERVER_IP; otherwise take the first address
  # printed by `hostname -I`.
  this_server_ip="${SERVER_IP:-$(hostname -I | awk '{print $1}')}"
  if bash "${SCRIPT_DIR}/npm-failover.sh" --target-ip "${this_server_ip}"; then
    log_success "NPM now routes traffic to this server (${this_server_ip})"
    send_telegram "✅ *Wordly.art DR COMPLET*
Serveur actif: \`${this_server_ip}\`
NPM redirigé automatiquement
Date: $(date '+%Y-%m-%d %H:%M:%S')"
  else
    log_error "NPM failover script FAILED."
    log_warning "Manual failover required:"
    log_warning " → Go to http://192.168.1.184:81"
    log_warning " → Edit proxy host for ${NPM_PROXY_HOST_DOMAIN:-wordly.art}"
    log_warning " → Change Forward Hostname to: ${this_server_ip}"
    send_telegram "⚠️ *Wordly.art DR — NPM manuel requis*
App OK sur: \`${this_server_ip}\`
NPM failover automatique a échoué
Action: http://192.168.1.184:81 → modifier Forward Host"
  fi

  log_success "=========================================================================="
  log_success "DISASTER RECOVERY SYSTEM RESTORE COMPLETE!"
  log_success "=========================================================================="
  log_success " App: http://${this_server_ip}:8001/health"
  log_success " NPM: http://192.168.1.184:81"
  echo ""
}
# ==============================================================================
# MAIN ENTRY
# ==============================================================================
# Print the command-line help. The banner version matches the file header
# (V3) and the text reflects that NPM configuration is not part of the archive.
_dr_usage() {
  echo "Wordly Disaster Recovery Utility (V3)"
  echo "Usage:"
  echo " $0 --backup # Package configs and db dump, then export to the NAS"
  echo " $0 --restore <archive.tar.gz> # Extract and restore full stack on new machine"
}

# Command-line dispatcher.
# Arguments: $1 - --backup | --restore | -h | --help
#            $2 - archive path (only with --restore)
# Exits:     0 after -h/--help; 1 for a missing or unknown mode
main() {
  case "${1:-}" in
    --backup)
      perform_backup
      ;;
    --restore)
      perform_restore "${2:-}"
      ;;
    -h | --help)
      _dr_usage
      exit 0
      ;;
    *)
      _dr_usage
      exit 1
      ;;
  esac
}
# Script entry point: forward all command-line arguments to main unchanged.
main "$@"