office_translator/scripts/disaster-recovery.sh

#!/bin/bash
# ==============================================================================
# Wordly.art - Disaster Recovery (DR) Backup & Restore Playbook (V3)
# ==============================================================================
# Archives app configs (.env, docker-compose), database backup, and exports
# to the NAS at 192.168.1.146.
#
# On RESTORE: deploys app on the new server and automatically updates NPM
# (192.168.1.184) to reroute traffic via API — no manual intervention needed.
#
# Usage:
#   ./disaster-recovery.sh --backup              # Create DR archive → NAS
#   ./disaster-recovery.sh --restore <archive>  # Restore on THIS machine
# ==============================================================================

set -o errexit -o nounset -o pipefail

# Absolute path of the directory containing this script, and of the project
# root located one level above it.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
PROJECT_ROOT=$(cd "${SCRIPT_DIR}/.." && pwd)
# Captured once at start-up; reused in log prefixes and generated file names.
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"

# ANSI escape sequences used to colorize log output (NC resets the color).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Shared emitter: prints one line prefixed with the run tag, expanding
# backslash escapes (such as the color codes above) found in the text.
_dr_emit() { printf '%b\n' "[DR ${TIMESTAMP}] $1"; }

# log <message>          plain informational line
log() { _dr_emit "$1"; }
# log_success <message>  green line
log_success() { _dr_emit "${GREEN}$1${NC}"; }
# log_warning <message>  yellow line prefixed with "WARNING: "
log_warning() { _dr_emit "${YELLOW}WARNING: $1${NC}"; }
# log_error <message>    red line prefixed with "ERROR: "
log_error() { _dr_emit "${RED}ERROR: $1${NC}"; }

# Load the project's .env when it exists. allexport (-a) is switched on so
# every variable the file assigns is exported, and nounset (-u) is suspended
# while the file is read; both options are put back afterwards.
ENV_FILE="${PROJECT_ROOT}/.env"
if [[ -f "${ENV_FILE}" ]]; then
    set -a +u
    . "${ENV_FILE}"
    set -u +a
fi

# NAS reached over SSH (same configuration as backup-to-nas.sh). A value that
# is already set and non-empty is kept; otherwise the default applies.
: "${NAS_HOST:=192.168.1.146}"
: "${NAS_USER:=wordly-backup}"
: "${NAS_PATH:=/volume1/backups/wordly}"
: "${NAS_SSH_PORT:=22}"
: "${NAS_SSH_KEY:=/root/.ssh/wordly_nas_key}"
BACKUP_DEST_PATH="${NAS_PATH}/snapshots"
: "${DR_RETENTION_DAYS:=30}"

# IP of THIS server (used during restore to configure NPM failover);
# empty unless provided.
: "${SERVER_IP:=}"

# Telegram credentials; notifications are skipped while either one is empty.
: "${TELEGRAM_BOT_TOKEN:=}"
: "${TELEGRAM_CHAT_ID:=}"

# ==============================================================================
# SEND TELEGRAM NOTIFICATION
# ==============================================================================
# send_telegram <message>
#
# Posts <message> (Markdown parse mode) to the Telegram chat identified by
# TELEGRAM_CHAT_ID using the bot TELEGRAM_BOT_TOKEN. Does nothing when either
# of those is empty. Best-effort: curl output is discarded and a delivery
# failure never fails the caller (always returns 0).
send_telegram() {
    local message="${1:-}"

    if [ -z "${TELEGRAM_BOT_TOKEN}" ] || [ -z "${TELEGRAM_CHAT_ID}" ]; then
        return 0
    fi

    # --data-urlencode keeps characters such as '&', '+', '%' and newlines in
    # the message from corrupting the form body. The timeouts stop a stalled
    # network from blocking the backup/restore run on a mere notification.
    curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
        --connect-timeout 5 --max-time 15 \
        -d "chat_id=${TELEGRAM_CHAT_ID}" \
        --data-urlencode "text=${message}" \
        -d "parse_mode=Markdown" \
        >/dev/null 2>&1 || true
}

# ==============================================================================
# DESTINATION PREPARATION (backup mode)
# ==============================================================================
# prepare_destination
#
# Checks that the NAS answers over key-based, non-interactive SSH and makes
# sure the snapshots directory exists there.
# Globals read: NAS_SSH_KEY, NAS_SSH_PORT, NAS_USER, NAS_HOST, NAS_PATH.
# Exits 1 when the NAS cannot be reached. A failure to create the directory
# is only reported as a warning.
prepare_destination() {
    # Array form so a key path containing spaces stays a single argument.
    local -a ssh_cmd=(
        ssh -i "${NAS_SSH_KEY}" -p "${NAS_SSH_PORT}"
        -o BatchMode=yes -o ConnectTimeout=10
    )
    local remote="${NAS_USER}@${NAS_HOST}"

    log "Vérification de la connectivité SSH vers le NAS ${NAS_HOST}..."
    if ! "${ssh_cmd[@]}" "${remote}" "echo OK" >/dev/null 2>&1; then
        log_error "Impossible de joindre le NAS ${NAS_HOST} via SSH."
        log_error "Lancez d'abord : sudo bash scripts/setup-nas.sh"
        exit 1
    fi

    # Make sure the snapshots folder exists on the NAS. Non-fatal, but no
    # longer silent: a failure here usually means the later transfer will fail.
    if ! "${ssh_cmd[@]}" "${remote}" "mkdir -p ${NAS_PATH}/snapshots" 2>/dev/null; then
        log_warning "Impossible de créer ${NAS_PATH}/snapshots sur le NAS — le transfert risque d'échouer."
    fi

    log_success "NAS SSH OK — Destination : ${NAS_USER}@${NAS_HOST}:${NAS_PATH}/snapshots"
}

# ==============================================================================
# BACKUP ACTION
# ==============================================================================
# perform_backup
#
# Builds a DR archive (the .env, docker-compose files, docker/ and scripts/
# directories, plus the newest database dump) and sends it to the NAS with
# rsync over SSH, then prunes old archives there and mirrors the scripts.
#
# Globals read: SCRIPT_DIR, PROJECT_ROOT, TIMESTAMP, BACKUP_DEST_PATH,
#   NAS_HOST, NAS_USER, NAS_PATH, NAS_SSH_KEY, NAS_SSH_PORT,
#   DR_RETENTION_DAYS; optional: BACKUP_DIR, BACKUP_DEST_TYPE (log only).
# Exits 1 when the database dump, the packaging or the transfer fails. On a
# failed transfer the archive is kept in PROJECT_ROOT.
# Note: installs an EXIT trap while the staging directory exists (replacing
# any EXIT trap already set) and clears it once staging is removed.
perform_backup() {
    prepare_destination
    # BACKUP_DEST_TYPE is not defined by this script; default it so the log
    # line cannot abort the run under 'set -u'.
    log "Starting Disaster Recovery backup (Destination Mode: ${BACKUP_DEST_TYPE:-nas-ssh})..."

    # 1. Trigger DB Backup
    log "Triggering database dump..."
    if ! bash "${SCRIPT_DIR}/backup-database.sh" --full; then
        log_error "Database backup failed. Aborting DR packaging."
        exit 1
    fi

    # 2. Locate the most recently modified dump (glob + -nt, no ls parsing)
    local local_backup_dir="${BACKUP_DIR:-${PROJECT_ROOT}/backups}"
    local latest_db_backup=""
    local candidate
    for candidate in "${local_backup_dir}/daily/"*.gz; do
        [ -f "${candidate}" ] || continue
        if [ -z "${latest_db_backup}" ] || [ "${candidate}" -nt "${latest_db_backup}" ]; then
            latest_db_backup="${candidate}"
        fi
    done

    if [ -z "${latest_db_backup}" ]; then
        log_error "Could not find database backup file."
        exit 1
    fi
    log "Database backup file loaded: $(basename "${latest_db_backup}")"

    # 3. Create temp packaging folder. It will hold a copy of .env, so make
    #    sure it is removed even if a later step aborts the script.
    local packing_dir="${PROJECT_ROOT}/temp_dr_pack_${TIMESTAMP}"
    mkdir -p "${packing_dir}"
    local cleanup_cmd
    printf -v cleanup_cmd 'rm -rf -- %q' "${packing_dir}"
    # shellcheck disable=SC2064 -- expanded now on purpose (local variable)
    trap "${cleanup_cmd}" EXIT

    # 4. Pack Configurations
    log "Packing application configuration (.env & docker-compose)..."
    if [ -f "${PROJECT_ROOT}/.env" ]; then
        cp "${PROJECT_ROOT}/.env" "${packing_dir}/.env.production"
    fi

    local f
    for f in docker-compose.yml docker-compose.local.yml docker-compose.monitoring.yml docker-compose.dev.yml; do
        if [ -f "${PROJECT_ROOT}/${f}" ]; then
            cp "${PROJECT_ROOT}/${f}" "${packing_dir}/"
        fi
    done

    if [ -d "${PROJECT_ROOT}/docker" ]; then
        cp -r "${PROJECT_ROOT}/docker" "${packing_dir}/"
    fi
    if [ -d "${PROJECT_ROOT}/scripts" ]; then
        cp -r "${PROJECT_ROOT}/scripts" "${packing_dir}/"
    fi

    mkdir -p "${packing_dir}/db_backup"
    cp "${latest_db_backup}" "${packing_dir}/db_backup/"

    # 5. Note: NPM config is NOT backed up here.
    #    NPM runs on its own dedicated server (192.168.1.184) and is stable.
    #    Only the forward_host IP needs to change during failover, which is
    #    done automatically via the NPM API by npm-failover.sh during restore.
    log "NPM is on dedicated server 192.168.1.184 — no NPM config to backup."

    # 6. Compress DR Archive
    local dr_archive_name="wordly_dr_${TIMESTAMP}.tar.gz"
    local local_archive_path="${PROJECT_ROOT}/${dr_archive_name}"

    log "Compressing configurations and database into DR archive..."
    if ! tar -czf "${local_archive_path}" -C "${packing_dir}" .; then
        rm -f -- "${local_archive_path}"
        log_error "Failed to compress archive."
        exit 1
    fi
    rm -rf -- "${packing_dir}"
    trap - EXIT

    if [ ! -s "${local_archive_path}" ]; then
        log_error "Failed to compress archive."
        exit 1
    fi

    local size
    size=$(du -h "${local_archive_path}" | cut -f1)

    # 7. Send the archive to the NAS via rsync over SSH
    local -a ssh_cmd=(
        ssh -i "${NAS_SSH_KEY}" -p "${NAS_SSH_PORT}"
        -o BatchMode=yes -o ConnectTimeout=30
    )
    # rsync -e takes the remote shell as a single string.
    local rsync_rsh="ssh -i ${NAS_SSH_KEY} -p ${NAS_SSH_PORT} -o BatchMode=yes -o ConnectTimeout=30"
    local remote="${NAS_USER}@${NAS_HOST}"
    local dest_path="${BACKUP_DEST_PATH}/${dr_archive_name}"

    log "Transfert de l'archive DR vers le NAS via rsync SSH..."
    if ! rsync -az -e "${rsync_rsh}" "${local_archive_path}" "${remote}:${dest_path}"; then
        log_error "rsync SSH vers le NAS a échoué !"
        log_warning "Archive conservée localement : ${local_archive_path}"
        send_telegram "🚨 *Wordly DR Backup FAILED*
rsync NAS échoué : ${NAS_HOST}
Fichier local : ${local_archive_path}
Date: $(date '+%Y-%m-%d %H:%M:%S')"
        exit 1
    fi

    rm -f -- "${local_archive_path}"
    log_success "Archive DR transférée (${size}) → ${NAS_USER}@${NAS_HOST}:${dest_path}"

    # Retention policy on the NAS (best-effort: a failure is only reported)
    log "Rotation des archives (>${DR_RETENTION_DAYS} jours) sur le NAS..."
    if ! "${ssh_cmd[@]}" "${remote}" \
        "find ${BACKUP_DEST_PATH} -name 'wordly_dr_*.tar.gz' -mtime +${DR_RETENTION_DAYS} -delete" 2>/dev/null; then
        log_warning "Rotation des archives sur le NAS échouée (non bloquant)."
    fi

    # Mirror the scripts directory to the NAS (best-effort as well)
    if ! rsync -az -e "${rsync_rsh}" \
        --exclude="__pycache__" \
        "${SCRIPT_DIR}/" \
        "${remote}:${NAS_PATH}/scripts/" 2>/dev/null; then
        log_warning "Synchronisation des scripts vers le NAS échouée (non bloquant)."
    fi

    send_telegram "✅ *Wordly.art DR Backup OK*
Archive: \`${dr_archive_name}\`
Taille: ${size}
NAS: \`${dest_path}\`
Date: $(date '+%Y-%m-%d %H:%M:%S')"

    log_success "Disaster Recovery backup complete."
}

# ==============================================================================
# RESTORE ACTION
# ==============================================================================
# perform_restore <archive.tar.gz>
#
# Restores a DR archive on this machine: asks for confirmation, extracts the
# archive into PROJECT_ROOT, reinstates .env, starts the compose stack,
# restores the embedded database dump through backup-database.sh, restarts the
# backend, waits for http://localhost:8001/health to answer 200 and finally
# calls npm-failover.sh with this server's IP.
#
# Globals read: PROJECT_ROOT, SCRIPT_DIR, TIMESTAMP, SERVER_IP; optional
#   (possibly coming from the restored .env): DATABASE_URL, POSTGRES_CONTAINER,
#   BACKUP_DIR, NPM_PROXY_HOST_DOMAIN.
# Exits 1 when the archive is missing, no .env is available after extraction,
# the database dump is absent from the archive, or the app never becomes
# healthy. Exits 0 when the operator does not confirm.
perform_restore() {
    local dr_package="${1:-}"

    if [ -z "${dr_package}" ]; then
        log_error "No DR package archive specified."
        echo "Usage: $0 --restore <path_to_archive.tar.gz>"
        exit 1
    fi

    if [ ! -f "${dr_package}" ]; then
        log_error "DR Archive file not found: ${dr_package}"
        exit 1
    fi

    echo ""
    log_warning "RESTORE DISASTER RECOVERY PACKAGE - THIS WILL OVERWRITE ENVIRONMENT CONFIGURATIONS, DATABASES, AND NPM FILES!"
    echo "  Archive: ${dr_package}"
    echo ""
    # -r: take the answer literally; EOF on stdin counts as "not confirmed".
    local confirm_val=""
    read -r -p "Type 'RESTORE-ALL' to confirm complete system restore: " confirm_val || true
    if [ "${confirm_val}" != "RESTORE-ALL" ]; then
        log "System restore cancelled."
        exit 0
    fi

    log "Extracting DR archive contents..."

    # Safety backup of existing .env
    if [ -f "${PROJECT_ROOT}/.env" ]; then
        cp "${PROJECT_ROOT}/.env" "${PROJECT_ROOT}/.env.bak_before_dr_restore_${TIMESTAMP}"
        log "Created backup of existing .env: .env.bak_before_dr_restore_${TIMESTAMP}"
    fi

    # Extract all
    tar -xzf "${dr_package}" -C "${PROJECT_ROOT}"

    # Restore .env
    if [ -f "${PROJECT_ROOT}/.env.production" ]; then
        mv "${PROJECT_ROOT}/.env.production" "${PROJECT_ROOT}/.env"
        log "Restored .env configuration"
    fi

    # Reload variables from the restored .env. nounset is suspended while the
    # file is read, exactly as for the start-up load, so a reference to an
    # unset variable inside .env cannot abort a restore half-way through.
    if [ ! -f "${PROJECT_ROOT}/.env" ]; then
        log_error "No .env found in ${PROJECT_ROOT} after extraction — cannot configure the stack."
        exit 1
    fi
    set -a
    set +u
    # shellcheck disable=SC1091
    source "${PROJECT_ROOT}/.env"
    set -u
    set +a

    log_success "Docker configurations and env keys restored."

    # Boot Docker Compose Services (plugin form first, legacy binary otherwise)
    log "Spinning up Docker containers (database, redis, backend, frontend, NPM if configured)..."
    local -a compose_cmd=(docker compose)
    if ! docker compose version &>/dev/null; then
        compose_cmd=(docker-compose)
    fi

    "${compose_cmd[@]}" up -d

    # Locate the embedded database backup (first match in glob order)
    local db_backup_archive=""
    local candidate
    for candidate in "${PROJECT_ROOT}/db_backup/"*.gz; do
        if [ -f "${candidate}" ]; then
            db_backup_archive="${candidate}"
            break
        fi
    done

    if [ -z "${db_backup_archive}" ]; then
        log_error "Database backup archive not found inside the DR package extraction."
        exit 1
    fi

    log "Database backup located: $(basename "${db_backup_archive}")"

    # Wait for database container to be healthy (PostgreSQL)
    local db_type="sqlite"
    if [[ "${DATABASE_URL:-}" =~ ^postgres ]]; then
        db_type="postgres"
    fi

    local i
    if [ "${db_type}" = "postgres" ]; then
        local postgres_container="${POSTGRES_CONTAINER:-wordly-postgres}"
        local db_ready=false
        log "Waiting for PostgreSQL container (${postgres_container}) to be healthy..."
        for ((i = 1; i <= 30; i++)); do
            if docker inspect --format='{{.State.Health.Status}}' "${postgres_container}" 2>/dev/null | grep -q "healthy"; then
                db_ready=true
                log_success "Database container is healthy."
                break
            fi
            echo "  Waiting for database... ($i/30)"
            sleep 2
        done
        # Still attempt the restore (the container may simply have no
        # healthcheck), but make the timeout visible in the log.
        if [ "${db_ready}" = "false" ]; then
            log_warning "PostgreSQL container (${postgres_container}) not reported healthy after 60s — attempting restore anyway."
        fi
    else
        sleep 2
    fi

    # Restore the database using the database backup script, which is given
    # the dump's file name after it has been copied into <backup dir>/daily
    log "Triggering database restore..."
    local local_backup_dir="${BACKUP_DIR:-${PROJECT_ROOT}/backups}"
    mkdir -p "${local_backup_dir}/daily"
    cp "${db_backup_archive}" "${local_backup_dir}/daily/"

    local db_archive_filename
    db_archive_filename=$(basename "${db_backup_archive}")

    # Run DB restore
    log "Restoring DB contents..."
    bash "${SCRIPT_DIR}/backup-database.sh" --restore "${db_archive_filename}"

    # Clean up extracted temporary folder
    rm -rf -- "${PROJECT_ROOT}/db_backup"

    # Restart app to clear connection caches
    log "Restarting application backend..."
    "${compose_cmd[@]}" restart backend

    # HTTP Health check (wait up to 3 minutes)
    log "Waiting for application health check (max 180s)..."
    local app_url="http://localhost:8001/health"
    local health_ok=false
    local http_code
    for ((i = 1; i <= 36; i++)); do
        http_code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 5 "${app_url}" 2>/dev/null || echo "000")
        if [ "${http_code}" = "200" ]; then
            health_ok=true
            # Attempt i starts after (i - 1) sleeps of 5s.
            log_success "App is healthy (HTTP 200) after ~$(((i - 1) * 5))s"
            break
        fi
        echo "  Health check attempt ${i}/36... (HTTP ${http_code})"
        sleep 5
    done

    if [ "${health_ok}" = "false" ]; then
        log_error "App did NOT become healthy within 180s!"
        log_error "NPM failover will NOT be triggered automatically."
        log_error "Investigate: docker compose logs backend"
        send_telegram "🚨 *Wordly.art DR FAILED — App unhealthy*
Serveur: \`$(hostname -I | awk '{print $1}')\`
Date: $(date '+%Y-%m-%d %H:%M:%S')
Action: vérifiez les logs Docker"
        exit 1
    fi

    # ==============================================================================
    # NPM AUTOMATIC FAILOVER
    # ==============================================================================
    log "App is healthy. Triggering NPM failover..."
    # SERVER_IP wins when set; otherwise the first address from 'hostname -I'.
    local this_server_ip
    this_server_ip="${SERVER_IP:-$(hostname -I | awk '{print $1}')}"

    if bash "${SCRIPT_DIR}/npm-failover.sh" --target-ip "${this_server_ip}"; then
        log_success "NPM now routes traffic to this server (${this_server_ip})"
        send_telegram "✅ *Wordly.art DR COMPLET*
Serveur actif: \`${this_server_ip}\`
NPM redirigé automatiquement
Date: $(date '+%Y-%m-%d %H:%M:%S')"
    else
        log_error "NPM failover script FAILED."
        log_warning "Manual failover required:"
        log_warning "  → Go to http://192.168.1.184:81"
        log_warning "  → Edit proxy host for ${NPM_PROXY_HOST_DOMAIN:-wordly.art}"
        log_warning "  → Change Forward Hostname to: ${this_server_ip}"
        send_telegram "⚠️ *Wordly.art DR — NPM manuel requis*
App OK sur: \`${this_server_ip}\`
NPM failover automatique a échoué
Action: http://192.168.1.184:81 → modifier Forward Host"
    fi

    log_success "=========================================================================="
    log_success "DISASTER RECOVERY SYSTEM RESTORE COMPLETE!"
    log_success "=========================================================================="
    log_success "  App: http://${this_server_ip}:8001/health"
    log_success "  NPM: http://192.168.1.184:81"
    echo ""
}

# ==============================================================================
# MAIN ENTRY
# ==============================================================================
# Prints the command-line help on stdout.
_dr_usage() {
    echo "Wordly Disaster Recovery Utility (V3)"
    echo "Usage:"
    echo "  $0 --backup                 # Package configs and db dump, then export to the NAS"
    echo "  $0 --restore <archive.tar.gz> # Extract and restore full stack on new machine"
}

# main [--backup | --restore <archive> | -h | --help]
#
# Dispatches to perform_backup or perform_restore. -h/--help prints the usage
# and exits 0; any other (or missing) argument prints the usage and exits 1.
main() {
    case "${1:-}" in
        --backup)
            perform_backup
            ;;
        --restore)
            perform_restore "${2:-}"
            ;;
        -h | --help)
            _dr_usage
            exit 0
            ;;
        *)
            _dr_usage
            exit 1
            ;;
    esac
}

main "$@"