From 9c9b6fe36284357f916ef83716e05e5713684ff2 Mon Sep 17 00:00:00 2001
From: Antigravity <antigravity@gemini.ai>
Date: Sun, 17 May 2026 14:18:25 +0000
Subject: [PATCH] docs: complete production guide + fix WAL setup transaction
 error

---
 docs/production-guide.md    | 342 ++++++++++++++++++++++++++++++++++++
 scripts/backup/setup-wal.sh |  15 +-
 2 files changed, 349 insertions(+), 8 deletions(-)
 create mode 100644 docs/production-guide.md

diff --git a/docs/production-guide.md b/docs/production-guide.md
new file mode 100644
index 0000000..f462c6a
--- /dev/null
+++ b/docs/production-guide.md
@@ -0,0 +1,342 @@
+# Memento — Guide Production : Monitoring, Backups & Résilience
+
+## Architecture de production
+
+```
+┌─────────────────────────────────────────────────────────┐
+│  Serveur 192.168.1.190 (Proxmox)                        │
+│                                                         │
+│  ┌──────────────┐  ┌──────────────┐  ┌──────────────┐  │
+│  │ memento-note │  │ memento-     │  │ memento-     │  │
+│  │  :3000       │  │ postgres     │  │ redis        │  │
+│  │  (Next.js)   │  │  :5433       │  │  :6379       │  │
+│  └──────────────┘  └──────────────┘  └──────────────┘  │
+│                                                         │
+│  ┌──────────────────────────────────────────────────┐   │
+│  │  Stack Monitoring                                 │   │
+│  │  Prometheus:9090 / Grafana:3001 / Alertmanager   │   │
+│  │  node-exporter / postgres-exporter / redis-exp.  │   │
+│  │  cadvisor                                        │   │
+│  └──────────────────────────────────────────────────┘   │
+│                                                         │
+│  ┌──────────────────────────────────────────────────┐   │
+│  │  Backups                                          │   │
+│  │  /opt/memento/backups/snapshots/  (pg_dump 6h)   │   │
+│  │  /opt/memento/backups/wal/        (WAL continu)   │   │
+│  │  Serveur externe               (rsync journalier)│   │
+│  └──────────────────────────────────────────────────┘   │
+└─────────────────────────────────────────────────────────┘
+         │                                    │
+         │ Cloudflare                         │ rsync chiffré
+         ▼                                    ▼
+   note.parsanet.org                    Serveur hors-site
+```
+
+---
+
+## 1. Protection des données
+
+### 1.1 Niveaux de sauvegarde
+
+| Niveau | Méthode | Fréquence | RPO | Rétention |
+|--------|---------|-----------|-----|-----------|
+| 1 | WAL archiving (PITR) | Continu | Dernier segment WAL archivé (16 Mo par défaut ; à borner avec `archive_timeout`) | 30 jours WAL |
+| 2 | pg_dump --format=custom | Toutes les 6h | 6h max | 7 jours + 4 snapshots hebdo |
+| 3 | rsync chiffré hors-site | Journalier 03h00 | 24h max | Même rétention que niveau 2 |
+| 4 | Pre-migration dump | Avant chaque deploy | Déploiement | 10 derniers |
+
+**RPO** (Recovery Point Objective) : quantité maximale de données qu'on peut perdre.
+**RTO** (Recovery Time Objective) : temps pour retrouver un service fonctionnel.
+
+### 1.2 Scénarios de reprise
+
+| Scénario | RPO | RTO | Procédure |
+|----------|-----|-----|-----------|
+| Crash container app | 0 | ~30s | Docker restart automatique |
+| Crash process PostgreSQL | 0-5s | ~1min | Autorestart + WAL replay |
+| Corruption DB partielle | 0-6h | ~5min | `restore.sh snapshot` |
+| Corruption DB totale | 0-24h | ~15min | `restore.sh snapshot` + backup externe |
+| Serveur physique perdu | 0-24h | ~2h | Restauration depuis le backup externe |
+| Erreur humaine (DROP) | 0-6h | ~5min | `restore.sh pitr "YYYY-MM-DD HH:MM:SS"` |
+
+### 1.3 Installation des sauvegardes
+
+```bash
+# Sur le serveur 192.168.1.190
+cd /opt/memento
+git pull
+
+# 1. Configurer les variables dans .env.docker
+# Ajouter :
+#   TELEGRAM_BOT_TOKEN=your_bot_token
+#   TELEGRAM_CHAT_ID=your_chat_id
+#   BACKUP_REMOTE_HOST=user@backup-server  (optionnel, pour hors-site)
+
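+# 2. Activer le WAL archiving (puis redémarrer le conteneur PostgreSQL : `archive_mode` n'est lu qu'au démarrage du serveur, un simple reload ne suffit pas)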
+bash scripts/backup/setup-wal.sh
+
+# 3. Tester un snapshot manuel
+bash scripts/backup/backup.sh
+
+# 4. Vérifier le snapshot
+bash scripts/backup/verify-backups.sh
+
+# 5. Installer les crons (attention : `crontab <fichier>` remplace toute la crontab existante de l'utilisateur)
+bash scripts/backup/install-crontab.sh
+crontab /opt/memento/backups/crontab
+
+# 6. Vérifier les crons
+crontab -l
+```
+
+### 1.4 Crons installés
+
+```
+# Snapshots toutes les 6h (00h00, 06h00, 12h00, 18h00)
+0 */6 * * * /opt/memento/scripts/backup/backup.sh
+
+# Vérification 30min après chaque snapshot
+30 */6 * * * /opt/memento/scripts/backup/verify-backups.sh
+
+# Synchronisation hors-site quotidienne à 03h00
+0 3 * * * /opt/memento/scripts/backup/offsite-sync.sh
+```
+
+### 1.5 Restauration
+
+```bash
+# Restaurer le dernier snapshot (automatique)
+bash scripts/backup/restore.sh snapshot
+
+# Restaurer un snapshot spécifique
+bash scripts/backup/restore.sh snapshot /opt/memento/backups/snapshots/memento-20260517-120000.sql.gz
+
+# Point-in-time recovery (PITR)
+bash scripts/backup/restore.sh pitr "2026-05-17 14:30:00"
+```
+
+Le script `restore.sh` :
+1. Dump la DB actuelle (safety net)
+2. Stoppe l'app
+3. Restore la DB
+4. Vérifie le count de notes (abort si 0)
+5. Redémarre l'app
+6. Health check (180s timeout)
+7. Log le résultat
+
+### 1.6 Script de vérification
+
+`verify-backups.sh` vérifie toutes les 6h :
+
+- ✅ Un snapshot récent existe (< 8h)
+- ✅ Taille du snapshot > 1MB
+- ✅ Intégrité gzip du snapshot
+- ✅ DB contient des notes (count > 0)
+- ✅ App répond (HTTP < 500)
+- ✅ Utilisation du disque < 85%
+
+Si un check échoue → **alerte Telegram immédiate**.
+
+---
+
+## 2. Monitoring
+
+### 2.1 Installation
+
+```bash
+# Sur le serveur 192.168.1.190
+cd /opt/memento/monitoring
+
+# Configurer le mot de passe Grafana
+# Dans .env.docker, ajouter :
+#   GRAFANA_ADMIN_PASSWORD=your_secure_password
+
+# Lancer la stack
+docker compose -f docker-compose.monitoring.yml up -d
+
+# Vérifier que tout est up
+docker compose -f docker-compose.monitoring.yml ps
+```
+
+### 2.2 Accès
+
+| Service | URL | Identifiants |
+|---------|-----|-------------|
+| Grafana | http://192.168.1.190:3001 | admin / (mot de passe configuré) |
+| Prometheus | http://192.168.1.190:9090 | Aucun |
+| Alertmanager | http://192.168.1.190:9093 | Aucun |
+| cAdvisor | http://192.168.1.190:8080 | Aucun |
+
+### 2.3 Exporters
+
+| Exporter | Port | Ce qu'il surveille |
+|----------|------|-----------------|
+| node-exporter | 9100 | CPU, RAM, disque, réseau du serveur |
+| postgres-exporter | 9187 | Connexions, requêtes, locks, taille DB |
+| redis-exporter | 9121 | Hit rate, mémoire, clés, TTL |
+| cadvisor | 8080 | CPU/RAM par conteneur Docker |
+
+### 2.4 Alertes configurées
+
+| Alerte | Condition | Sévérité |
+|--------|-----------|----------|
+| MementoAppDown | App injoignable > 2min | Critical |
+| PostgresDown | PostgreSQL down > 1min | Critical |
+| RedisDown | Redis down > 1min | Critical |
+| DiskSpaceLow | Disque < 15% libre > 5min | Warning |
+| HighMemoryUsage | RAM > 90% > 5min | Warning |
+| PostgresConnectionsHigh | > 80 connexions > 5min | Warning |
+| PostgresSlowQueries | Requête moyenne > 5s | Warning |
+| HighErrorRate | Erreurs 5xx > 5% > 3min | Warning |
+| ContainerRestarted | Restart dans la dernière heure | Warning |
+
+### 2.5 Configuration Grafana
+
+1. Accéder à http://192.168.1.190:3001
+2. Ajouter la datasource Prometheus : `http://prometheus:9090`
+3. Importer les dashboards (IDs Grafana) :
+   - **1860** — Node Exporter Full (serveur)
+   - **9628** — PostgreSQL Database (DB)
+   - **763** — Redis Dashboard (cache)
+   - **193** — Docker Monitoring (conteneurs)
+
+### 2.6 Health API
+
+```
+GET https://note.parsanet.org/api/admin/health
+```
+
+Retourne :
+```json
+{
+  "status": "healthy",
+  "uptime": 86400,
+  "version": "0.2.0",
+  "components": {
+    "database": { "status": "healthy", "latency": "12ms", "notes": 107 },
+    "redis": { "status": "healthy", "latency": "2ms", "keys": 847 },
+    "ai": { "status": "configured", "embedding": {"provider": "openrouter"} },
+    "storage": { "status": "healthy", "usagePercent": "23%" }
+  }
+}
+```
+
+Utilisable par Prometheus, Grafana, ou tout outil de monitoring externe.
+
+---
+
+## 3. Déploiement
+
+### 3.1 Pipeline CI/CD
+
+```
+Push → Gitea Actions CI → Lint + Test + Build
+                            ↓ (si succès)
+                       Deploy.yaml
+                            ↓
+                       Dump DB (vérif > 1MB)
+                            ↓
+                       Git pull + Docker build
+                            ↓
+                       Health check (180s)
+                            ↓ échec
+                       Rollback automatique
+                            ↓ succès
+                       Notification Telegram
+```
+
+### 3.2 Variables à configurer dans Gitea
+
+**Secrets :**
+- `SSH_PRIVATE_KEY` — clé SSH pour le serveur
+- `POSTGRES_PASSWORD`
+- `NEXTAUTH_SECRET`
+- `TELEGRAM_BOT_TOKEN`
+- `TELEGRAM_CHAT_ID`
+- `CUSTOM_OPENAI_API_KEY`
+
+**Variables :**
+- `APP_URL` — https://note.parsanet.org
+- `ADMIN_EMAIL`
+- `POSTGRES_USER`, `POSTGRES_DB`, `POSTGRES_PORT`
+- Toutes les variables AI provider
+
+### 3.3 Rollback
+
+La CI tague l'image Docker actuelle en `:rollback` avant chaque déploiement.
+Si le health check échoue → restauration automatique de l'image `:rollback`.
+
+En cas de problème DB :
+```bash
+bash scripts/backup/restore.sh snapshot
+```
+
+---
+
+## 4. Checklist production
+
+### Premier setup (à faire une seule fois)
+
+- [ ] `bash scripts/backup/setup-wal.sh` — activer WAL archiving
+- [ ] `bash scripts/backup/backup.sh` — tester le premier snapshot
+- [ ] `bash scripts/backup/verify-backups.sh` — vérifier le setup complet
+- [ ] `crontab /opt/memento/backups/crontab` — installer les crons
+- [ ] Configurer `TELEGRAM_BOT_TOKEN` + `TELEGRAM_CHAT_ID` dans `.env.docker`
+- [ ] `cd monitoring && docker compose -f docker-compose.monitoring.yml up -d`
+- [ ] Configurer Grafana (datasource + dashboards)
+- [ ] Configurer `BACKUP_REMOTE_HOST` pour le backup hors-site
+
+### Vérification quotidienne (automatique)
+
+- [ ] Snapshots créés toutes les 6h ✓ (cron)
+- [ ] Vérification automatique ✓ (cron)
+- [ ] Sync hors-site journalier ✓ (cron)
+- [ ] Alertes Telegram si problème ✓ (Alertmanager)
+
+### Vérification hebdomadaire (manuelle)
+
+- [ ] Ouvrir Grafana, vérifier les dashboards
+- [ ] Tester un restore sur une DB temporaire
+- [ ] Vérifier l'espace disque et la rotation des backups
+- [ ] Vérifier que les WAL ne grossissent pas trop
+
+---
+
+## 5. Sécurité
+
+### 5.1 PostgreSQL
+
+- WAL archiving activé (PITR possible)
+- `archive_command` stocke les WAL dans `/var/lib/postgresql/backups/wal/`
+- Pas de connexion TRUST — md5/scram-sha-256 uniquement
+- Container non exposé sur internet (port 5433 interne uniquement)
+
+### 5.2 Backups
+
+- Snapshots chiffrés au repos uniquement si le système de fichiers est chiffré (LUKS recommandé)
+- rsync hors-site via SSH (chiffré en transit)
+- Permissions fichiers : 600 sur les snapshots
+- Les scripts vérifient l'intégrité d'un backup avant de lui faire confiance
+
+### 5.3 App
+
+- `DATABASE_URL` jamais exposé côté client
+- Secrets dans Gitea (encrypted at rest)
+- `.env.docker` en permission 600 sur le serveur
+- Rate limiting à configurer (P3)
+
+---
+
+## 6. Scripts de référence
+
+| Script | Usage | Fréquence |
+|--------|-------|-----------|
+| `scripts/backup/setup-wal.sh` | Activer WAL archiving | Once |
+| `scripts/backup/backup.sh` | Créer un snapshot | 6h (cron) |
+| `scripts/backup/verify-backups.sh` | Vérifier les backups | 6h (cron) |
+| `scripts/backup/restore.sh` | Restaurer la DB | Manuel (urgence) |
+| `scripts/backup/offsite-sync.sh` | Sync hors-site | 24h (cron) |
+| `scripts/backup/install-crontab.sh` | Installer les crons | Once |
+| `dump-db.sh` | Dump rapide (dev) | Manuel |
+| `scripts/deploy-prod.sh` | Deploy manuel | Manuel |
+| `scripts/migrate-docker.sh` | Migrer la DB | Après deploy |
diff --git a/scripts/backup/setup-wal.sh b/scripts/backup/setup-wal.sh
index 400d1ac..a9ca0e5 100755
--- a/scripts/backup/setup-wal.sh
+++ b/scripts/backup/setup-wal.sh
@@ -17,16 +17,15 @@ log() {
 
 log "=== Setting up WAL archiving ==="
 
-docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -c "
-ALTER SYSTEM SET wal_level = replica;
-ALTER SYSTEM SET archive_mode = on;
-ALTER SYSTEM SET archive_command = 'cp %p /var/lib/postgresql/backups/wal/%f';
-ALTER SYSTEM SET max_wal_senders = 3;
-ALTER SYSTEM SET wal_keep_size = '1GB';
-"
+docker exec "$PG_CONTAINER" bash -c "mkdir -p /var/lib/postgresql/backups/wal"
 
-docker exec "$PG_CONTAINER" mkdir -p /var/lib/postgresql/backups/wal
+docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -c "ALTER SYSTEM SET wal_level = replica;"
+docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -c "ALTER SYSTEM SET archive_mode = on;"
+docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -c "ALTER SYSTEM SET archive_command = 'cp %p /var/lib/postgresql/backups/wal/%f';"
+docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -c "ALTER SYSTEM SET max_wal_senders = 3;"
+docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -c "ALTER SYSTEM SET wal_keep_size = '1GB';"
 
+log "Reloading PostgreSQL configuration..."
 docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -c "SELECT pg_reload_conf();"
 
 log "WAL archiving enabled. Archives stored in /var/lib/postgresql/backups/wal/"