feat(glossaries): add backup + delete scripts and Gitea workflow for duplicate cleanup
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 6m31s

- scripts/backup_duplicate_glossaries.py : exporte en JSON les doublons
  (meme user_id + template_id) sans rien supprimer. Schema validation,
  tri stable, mode degrade si colonne template_id absente.
- scripts/delete_duplicate_glossaries.py : lit un backup JSON et supprime
  les doublons listes. Validation IDs, confirmation interactive,
  commit par user, mode --dry-run / --yes.
- .gitea/workflows/cleanup-glossaries.yml : workflow_dispatch qui SSH
  sur le serveur de prod et execute le script dans le conteneur backend
  (postgres demarre, .env charge, env_file docker-compose).
This commit is contained in:
Sepehr
2026-06-03 21:21:11 +02:00
parent e1f9f3db04
commit cd32a42b1a
3 changed files with 715 additions and 0 deletions

View File

@@ -0,0 +1,87 @@
# Manually triggered workflow: connects to the production host over SSH and
# runs scripts/delete_duplicate_glossaries.py for one user inside a one-off
# "backend" container (the script generates a fresh backup before deleting).
name: Cleanup Duplicate Glossaries

on:
  workflow_dispatch:
    inputs:
      user_id:
        description: 'User ID dont on nettoie les doublons (obligatoire).'
        required: true
        type: string
      dry_run:
        description: 'Mode relecture seule — aucune suppression.'
        required: false
        default: true
        type: boolean
      yes:
        description: 'Confirme la suppression (ignoré si dry_run=true).'
        required: false
        default: false
        type: boolean

jobs:
  cleanup:
    name: Backup and delete duplicate glossaries on production
    runs-on: ubuntu-24.04
    steps:
      - name: Setup SSH
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa
          chmod 600 ~/.ssh/id_rsa
          ssh-keyscan -H 192.168.1.151 >> ~/.ssh/known_hosts

      - name: Run cleanup on production server
        # printf '%q' below is a bash builtin feature, so pin the shell.
        shell: bash
        env:
          USER_ID: ${{ inputs.user_id }}
          DRY_RUN: ${{ inputs.dry_run }}
          YES: ${{ inputs.yes }}
        run: |
          # USER_ID / DRY_RUN / YES exist only in the runner's environment,
          # not on the remote host. Forward them explicitly, shell-quoted with
          # printf %q, as the environment of the remote `bash -s`. The heredoc
          # delimiter is quoted so the script body reaches the remote shell
          # verbatim (no local expansion, no backslash escaping needed).
          ssh root@192.168.1.151 \
            "USER_ID=$(printf '%q' "${USER_ID}") DRY_RUN=$(printf '%q' "${DRY_RUN}") YES=$(printf '%q' "${YES}") bash -s" << 'ENDSSH'
          set -euo pipefail
          cd /opt/wordly

          # Make sure the checkout is current (the scripts are part of the repo).
          git config --global --add safe.directory /opt/wordly
          git fetch origin production-deployment
          git reset --hard origin/production-deployment

          # Make sure postgres is running (the script reads DATABASE_URL from .env).
          docker compose up -d postgres redis

          # Wait for postgres (max 60s).
          for i in $(seq 1 30); do
            if docker compose exec -T postgres pg_isready -U translate >/dev/null 2>&1; then
              echo "Postgres ready after $((i * 2))s"
              break
            fi
            [ "$i" -eq 30 ] && { echo "Postgres not ready after 60s"; exit 1; }
            sleep 2
          done

          # Build the CLI flags as an array so the user id is passed as a
          # single argument and is never re-split by the shell.
          FLAGS=(--user "${USER_ID}" --allow-missing-template-id)
          if [ "${DRY_RUN}" = "true" ]; then
            FLAGS+=(--dry-run)
          fi
          if [ "${YES}" = "true" ]; then
            FLAGS+=(--yes)
          fi

          echo "════════════════════════════════════════"
          echo " USER_ID=${USER_ID}"
          echo " DRY_RUN=${DRY_RUN}"
          echo " YES=${YES}"
          echo " FLAGS=${FLAGS[*]}"
          echo "════════════════════════════════════════"

          # The "backend" service loads .env via env_file and its entrypoint
          # runs `exec "$@"` when arguments are passed. DATABASE_URL resolves
          # to the docker hostname "postgres:5432" through that env var.
          # NOTE(review): the backup JSON is written under backups/ inside this
          # one-off container; confirm that directory is mounted from
          # /opt/wordly/backups, otherwise the file disappears with --rm.
          docker compose run --rm backend \
            python scripts/delete_duplicate_glossaries.py "${FLAGS[@]}"
          ENDSSH

      - name: List backup artifacts
        if: always()
        run: |
          ssh root@192.168.1.151 'ls -la /opt/wordly/backups/ 2>/dev/null || echo "(no backups dir)"'

View File

@@ -0,0 +1,305 @@
#!/usr/bin/env python3
"""
Sauvegarde en JSON les glossaires dupliqués (même user_id + template_id) créés
avant la mise en place de la garde anti-doublon dans le backend.
⚠️ Ce script ne supprime RIEN — il produit uniquement un fichier de backup
contenant l'intégralité des doublons (métadonnées + termes) en vue d'une
analyse ou d'une suppression manuelle ultérieure.
Pour chaque couple (user_id, template_id) avec > 1 glossaire, le plus ancien
(premier créé) est marqué "keeper" et les copies sont listées dans "duplicates"
avec tous leurs termes.
Usage:
# Cible la base de prod PostgreSQL (lu via DATABASE_URL) :
DATABASE_URL=postgresql://user:pass@host:5432/db python scripts/backup_duplicate_glossaries.py
# Ou préciser une base SQLite spécifique :
SQLITE_PATH=/path/to/translate.db python scripts/backup_duplicate_glossaries.py
# Limiter à un seul utilisateur :
DATABASE_URL=... python scripts/backup_duplicate_glossaries.py --user <user_id>
# Choisir le fichier de sortie :
DATABASE_URL=... python scripts/backup_duplicate_glossaries.py --output backups/dupes.json
# Forcer l'exécution même si la colonne template_id est absente du schéma
# (utile pour un dump partiel des glossaires sans template_id) :
DATABASE_URL=... python scripts/backup_duplicate_glossaries.py --allow-missing-template-id
"""
import argparse
import json
import logging
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
# Directory two levels above this file; it is pushed to the front of sys.path
# so the `database` package imported just below resolves when this file is
# executed directly as a script.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
# These imports must stay after the sys.path tweak above.
from sqlalchemy import inspect, text
from database.connection import sync_engine
from database.models import Glossary
# INFO-level logging with a timestamp and level prefix on every record.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
)
# Logger shared by every function in this script.
logger = logging.getLogger("backup_dup_glossaries")
def _has_template_id_column() -> bool:
    """Report whether the `glossaries` table exposes a `template_id` column.

    Returns:
        True when the column is listed by the schema inspector. False when it
        is not listed, and also when the inspection itself fails (the failure
        is logged), so a False result does not distinguish the two cases.
    """
    try:
        column_names = [
            column["name"]
            for column in inspect(sync_engine).get_columns("glossaries")
        ]
    except Exception as e:
        logger.error("Impossible d'inspecter le schéma : %s", e)
        return False
    return "template_id" in column_names
def find_duplicates(
    session, user_id: str | None = None, include_no_template: bool = False,
    use_raw_query: bool = False,
) -> dict[tuple[str, str | None], list[Glossary]]:
    """Bucket glossaries by (user_id, template_id) and keep buckets of 2 or more.

    Args:
        session: Database session used to run the query.
        user_id: When truthy, only this user's glossaries are considered.
        include_no_template: ORM mode only. When False, glossaries whose
            template_id is NULL are filtered out; when True they are bucketed
            under the key (user_id, None).
        use_raw_query: When True, skip the ORM and read an explicit column list
            that does not mention `template_id`. Every row is then bucketed
            under (user_id, None), i.e. all of a user's glossaries land in the
            same bucket.

    Returns:
        A plain dict mapping each (user_id, template_id) key to its glossaries,
        restricted to keys holding more than one glossary.
    """
    buckets = defaultdict(list)

    if use_raw_query:
        # Explicit column list: the statement never references `template_id`,
        # so it still works when that column is missing from the schema.
        statement = (
            "SELECT id, user_id, name, source_language, target_language, "
            "created_at, updated_at FROM glossaries"
        )
        bind_values: dict = {}
        if user_id:
            statement += " WHERE user_id = :user_id"
            bind_values["user_id"] = user_id

        for row in session.execute(text(statement), bind_values).fetchall():
            # NOTE(review): these objects are constructed here rather than
            # loaded through the session; presumably their `terms` relationship
            # is empty, so backups made in this mode may contain no terms.
            # Confirm before relying on term data in raw mode.
            glossary = Glossary(
                id=row.id,
                user_id=row.user_id,
                name=row.name,
                source_language=row.source_language,
                target_language=row.target_language,
                created_at=row.created_at,
                updated_at=row.updated_at,
            )
            buckets[(glossary.user_id, None)].append(glossary)
    else:
        query = session.query(Glossary)
        if not include_no_template:
            query = query.filter(Glossary.template_id.isnot(None))
        if user_id:
            query = query.filter(Glossary.user_id == user_id)
        for glossary in query.all():
            buckets[(glossary.user_id, glossary.template_id)].append(glossary)

    # Only buckets holding more than one glossary contain duplicates.
    return {key: members for key, members in buckets.items() if len(members) > 1}
def _stable_sort(glossaries: list[Glossary]) -> tuple[list[Glossary], int]:
    """Sort glossaries by (created_at ASC, id ASC) for deterministic ordering.

    Glossaries whose `created_at` is None sort first, ordered by id among
    themselves. A warning is logged when at least one such glossary exists.

    Args:
        glossaries: Objects exposing `created_at` and `id` attributes.

    Returns:
        A tuple (sorted_list, none_count) where `none_count` is the number of
        entries whose `created_at` is None.
    """
    none_count = sum(1 for g in glossaries if g.created_at is None)
    if none_count:
        logger.warning(
            "⚠️ %d glossaire(s) ont un created_at NULL — tri secondaire par id.",
            none_count,
        )

    def sort_key(g):
        # The leading boolean puts None timestamps first and guarantees that a
        # None is never ordered against a real timestamp. Substituting an
        # aware datetime.min for None (as was done before) raised TypeError
        # whenever the real timestamps were naive.
        return (g.created_at is not None, g.created_at, g.id)

    return sorted(glossaries, key=sort_key), none_count
def serialize_group(user_id: str, template_id: str | None, glossaries: list[Glossary]) -> dict:
    """Turn one duplicate group into a JSON-serializable dict.

    The group is ordered with `_stable_sort`; its first element becomes the
    keeper (serialized with a term count only) and every other element is
    serialized as a duplicate together with its full term list.

    Args:
        user_id: Owner id stored in the output under "user_id".
        template_id: Template id stored in the output under "template_id".
        glossaries: Members of the group (must not be empty).

    Returns:
        A dict with the keys "user_id", "template_id", "keep",
        "duplicates_count" and "duplicates".
    """
    ordered, _ = _stable_sort(glossaries)
    keeper, extras = ordered[0], ordered[1:]

    def _iso(value) -> str | None:
        # Normalize a timestamp-like value to an ISO-8601 string.
        if value is None:
            return None
        if isinstance(value, datetime):
            return value.isoformat()
        if not isinstance(value, str):
            return str(value)
        # String timestamps are re-parsed when possible and otherwise kept
        # exactly as received.
        try:
            return datetime.fromisoformat(value).isoformat()
        except ValueError:
            return value

    def _dump(glossary: Glossary, include_terms: bool) -> dict:
        # Serialize one glossary, with either its terms or just their count.
        record = {
            "id": glossary.id,
            "name": glossary.name,
            "source_language": glossary.source_language,
            "target_language": glossary.target_language,
            # getattr: the attribute may be missing on the object.
            "template_id": getattr(glossary, "template_id", None),
            "created_at": _iso(glossary.created_at),
            "updated_at": _iso(glossary.updated_at),
        }
        terms = glossary.terms
        if include_terms:
            record["terms"] = [
                {
                    "id": term.id,
                    "source": term.source,
                    "target": term.target,
                    "translations": term.translations or {},
                }
                for term in (terms or [])
            ]
        else:
            record["terms_count"] = len(terms) if terms else 0
        return record

    return {
        "user_id": user_id,
        "template_id": template_id,
        "keep": _dump(keeper, include_terms=False),
        "duplicates_count": len(extras),
        "duplicates": [_dump(extra, include_terms=True) for extra in extras],
    }
def write_backup(groups: dict[tuple[str, str | None], list[Glossary]], output_path: Path) -> dict:
    """Serialize every duplicate group to `output_path` as indented UTF-8 JSON.

    Missing parent directories of `output_path` are created. Groups are written
    ordered by template id (a missing one counts as the empty string), then by
    user id.

    Args:
        groups: Mapping of (user_id, template_id) to the glossaries of a group.
        output_path: Destination file; overwritten if it already exists.

    Returns:
        The payload dict that was written to disk.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    def _ordering(item):
        (owner_id, tpl_id), _members = item
        return (tpl_id or "", owner_id)

    generated_at = datetime.now(timezone.utc).isoformat()
    serialized_groups = [
        serialize_group(owner_id, tpl_id, members)
        for (owner_id, tpl_id), members in sorted(groups.items(), key=_ordering)
    ]
    payload = {
        "generated_at": generated_at,
        "schema_version": 1,
        "note": "Aucun glossaire n'a été supprimé. Ce fichier documente les doublons.",
        "total_groups": len(groups),
        # One member of every group is kept, hence the "- 1".
        "total_duplicates": sum(len(members) - 1 for members in groups.values()),
        "groups": serialized_groups,
    }

    with output_path.open("w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)
    return payload
def print_report(payload: dict) -> None:
    """Log a human-readable summary of a backup payload.

    The summary is emitted through the module logger at INFO level: generation
    time, number of groups, number of duplicates, then one line per template
    with its duplicate count (groups without a template are reported under
    "(no template)").

    Args:
        payload: The dict returned by `write_backup`.
    """
    separator = "=" * 78
    logger.info(separator)
    logger.info("Récapitulatif du backup")
    logger.info(separator)
    logger.info("Généré le : %s", payload["generated_at"])
    logger.info("Groupes concernés: %d", payload["total_groups"])
    logger.info("Doublons totaux : %d", payload["total_duplicates"])

    per_template: dict[str, int] = defaultdict(int)
    for group in payload["groups"]:
        label = group["template_id"] or "(no template)"
        per_template[label] += group["duplicates_count"]

    if per_template:
        logger.info("")
        logger.info("Par template :")
        for label, duplicates in sorted(per_template.items()):
            logger.info(" %-14s %d doublon(s)", label, duplicates)
    logger.info(separator)
def main() -> int:
    """Command-line entry point: find duplicate glossaries and back them up.

    Nothing is ever deleted. When duplicates exist they are written to a JSON
    file (see `write_backup`) and a summary is logged.

    Returns:
        0 on success (including "no duplicates found"), 2 when the
        `glossaries.template_id` column is missing and neither
        --allow-missing-template-id nor --include-no-template was given.
    """
    parser = argparse.ArgumentParser(
        description="Sauvegarde (sans suppression) les glossaires dupliqués en JSON."
    )
    parser.add_argument(
        "--user",
        metavar="USER_ID",
        help="Limite le backup à un seul utilisateur.",
    )
    parser.add_argument(
        "--output",
        metavar="PATH",
        help="Chemin du fichier JSON de sortie (défaut : backups/glossary_duplicates_<timestamp>.json).",
    )
    parser.add_argument(
        "--include-no-template",
        action="store_true",
        help="Inclut aussi les glossaires sans template_id dans la recherche de doublons.",
    )
    parser.add_argument(
        "--allow-missing-template-id",
        action="store_true",
        help="Continue sans erreur si la colonne `template_id` est absente du schéma "
             "(équivaut à --include-no-template, mais le script ne plantera pas).",
    )
    args = parser.parse_args()

    has_column = _has_template_id_column()
    if not has_column:
        if args.allow_missing_template_id or args.include_no_template:
            logger.warning("⚠️ Colonne `template_id` absente — bascule en mode sans-template.")
            args.include_no_template = True
        else:
            logger.error(
                "❌ La colonne `glossaries.template_id` est absente du schéma actuel. "
                "Appliquez d'abord la migration Alembic (alembic upgrade head) ou relancez "
                "avec --allow-missing-template-id pour ne sauvegarder que les glossaires sans template_id."
            )
            return 2

    from database.connection import get_sync_session

    # The raw-SQL path ignores template_id and buckets ALL of a user's
    # glossaries together, so it is only used as the degraded mode when the
    # column really is missing. It was previously enabled by the flags alone,
    # which flagged every extra glossary of a user as a duplicate even on a
    # fully migrated schema. With the column present,
    # --allow-missing-template-id has nothing to work around and is a no-op.
    use_raw = not has_column

    logger.info(
        "🔍 Recherche de doublons%s%s",
        f" pour user_id={args.user}" if args.user else "",
        " (incl. sans template)" if args.include_no_template else "",
    )

    with get_sync_session() as session:
        groups = find_duplicates(
            session,
            user_id=args.user,
            include_no_template=args.include_no_template,
            use_raw_query=use_raw,
        )
        if not groups:
            if args.user:
                logger.info("✅ Aucun doublon trouvé pour user_id=%s.", args.user)
            else:
                logger.info("✅ Aucun doublon trouvé.")
            return 0

        if args.output:
            output_path = Path(args.output)
        else:
            ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
            output_path = ROOT / "backups" / f"glossary_duplicates_{ts}.json"

        logger.info("💾 Écriture du backup vers %s", output_path)
        # Serialization reads each glossary's `terms`; it is done while the
        # session is still open so those attributes can be loaded on access.
        payload = write_backup(groups, output_path)

    print_report(payload)
    logger.info("✅ Backup écrit : %s (%d octets)", output_path, output_path.stat().st_size)
    logger.info(" Aucune suppression effectuée. Relire le JSON pour décider de l'action manuelle.")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,323 @@
#!/usr/bin/env python3
"""
Supprime les glossaires dupliqués en se basant sur un fichier JSON de backup
produit par `scripts/backup_duplicate_glossaries.py`.
⚠️ DESTRUCTIF. Par défaut, demande confirmation interactive avant chaque
suppression. Utiliser --yes pour les exécutions automatisées.
Pour chaque groupe dans le JSON, le glossaire listé dans "keep" est conservé,
ceux listés dans "duplicates" sont supprimés (ainsi que leurs termes via cascade).
Usage:
# Dry-run (relecture) :
python scripts/delete_duplicate_glossaries.py backups/glossary_duplicates_xxx.json --dry-run
# Confirmation interactive :
python scripts/delete_duplicate_glossaries.py backups/glossary_duplicates_xxx.json
# Sans confirmation (CI / cron) :
python scripts/delete_duplicate_glossaries.py backups/glossary_duplicates_xxx.json --yes
# Re-génère le backup à la volée (si --input absent) :
DATABASE_URL=... python scripts/delete_duplicate_glossaries.py --user <USER_ID> --dry-run
# Back-up automatique avant suppression (recommandé) :
python scripts/delete_duplicate_glossaries.py --user <USER_ID> --yes
"""
import argparse
import json
import logging
import sys
from datetime import datetime, timezone
from pathlib import Path
# Directory two levels above this file; it is pushed to the front of sys.path
# so the `database` package imported just below resolves when this file is
# executed directly as a script.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
# These imports must stay after the sys.path tweak above.
# NOTE(review): `inspect`, `sync_engine`, `Glossary` and `User` are not
# referenced by any function visible in this script.
from sqlalchemy import inspect, text
from database.connection import get_sync_session, sync_engine
from database.models import Glossary, User
# INFO-level logging with a timestamp and level prefix on every record.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
)
# Logger shared by every function in this script.
logger = logging.getLogger("delete_dup_glossaries")
def load_backup(path: Path) -> dict:
    """Load a backup JSON file and validate the structure the deletion relies on.

    Checks performed: the file exists, is readable, is valid JSON, is a JSON
    object with a list under "groups", and every group is an object carrying a
    "user_id" plus a "duplicates" list whose items are objects with an "id".

    Args:
        path: Location of the backup file.

    Returns:
        The parsed backup as a dict.

    Raises:
        SystemExit: With code 2 (after logging the reason) on any failed check.
    """
    if not path.exists():
        logger.error("❌ Fichier de backup introuvable : %s", path)
        sys.exit(2)

    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError: a
        # corrupt file is reported with the same exit code as a missing one
        # instead of surfacing as a traceback.
        logger.error("❌ Impossible de lire le backup %s : %s", path, e)
        sys.exit(2)

    if not isinstance(data, dict) or not isinstance(data.get("groups"), list):
        logger.error("❌ Le fichier %s n'a pas de clé 'groups' — format invalide.", path)
        sys.exit(2)

    for index, group in enumerate(data["groups"]):
        duplicates = group.get("duplicates") if isinstance(group, dict) else None
        well_formed = (
            isinstance(group, dict)
            and "user_id" in group
            and isinstance(duplicates, list)
            and all(isinstance(dup, dict) and "id" in dup for dup in duplicates)
        )
        if not well_formed:
            logger.error(
                "❌ Groupe #%d invalide dans %s — 'user_id' et 'duplicates[].id' sont requis.",
                index, path,
            )
            sys.exit(2)
    return data
def validate_ids(session, backup: dict) -> tuple[list[dict], list[str], list[str]]:
    """Check every group of the backup against the current database content.

    For each group:
      * a missing owner row in `users` produces a warning (orphaned data);
      * when the backup names a keeper ("keep" -> "id"), the keeper must not
        also be listed among the duplicates and must still exist for that
        user — otherwise deleting the duplicates would leave no copy at all;
      * every duplicate id must exist in `glossaries` for that user.

    Raw SQL is used so the checks do not depend on the ORM mapping (which
    would fail if the `template_id` column is absent from the schema).

    Args:
        session: Database session used to run the lookups.
        backup: Parsed backup, as returned by `load_backup`.

    Returns:
        A tuple (valid_groups, errors, warnings). `valid_groups` holds only the
        groups that produced no error; `errors` are blocking; `warnings` are
        informational.
    """
    errors: list[str] = []
    warnings: list[str] = []
    valid: list[dict] = []

    def glossary_belongs_to(glossary_id, owner_id) -> bool:
        # True when the glossary exists and is owned by `owner_id`.
        row = session.execute(
            text("SELECT id FROM glossaries WHERE id = :gid AND user_id = :uid"),
            {"gid": glossary_id, "uid": owner_id},
        ).first()
        return bool(row)

    for group in backup["groups"]:
        user_id = group["user_id"]
        group_errors: list[str] = []

        user_exists = session.execute(
            text("SELECT id FROM users WHERE id = :uid"), {"uid": user_id}
        ).first()
        if not user_exists:
            # The two halves of this message used to be concatenated without
            # any separator ("user_id=<id>la suppression ...").
            warnings.append(
                f"User parent introuvable (données orphelines) : user_id={user_id}"
                f" — la suppression des glossaires associés sera tentée."
            )

        duplicate_ids = [dup["id"] for dup in group["duplicates"]]

        # The keeper check is skipped for backups that do not name one.
        keep = group.get("keep")
        keep_id = keep.get("id") if isinstance(keep, dict) else None
        if keep_id is not None:
            if keep_id in duplicate_ids:
                group_errors.append(
                    f"Glossary à conserver listé parmi les doublons : id={keep_id} user={user_id}"
                )
            elif not glossary_belongs_to(keep_id, user_id):
                group_errors.append(
                    f"Glossary à conserver introuvable ou owner mismatch : id={keep_id} user={user_id}"
                )

        for dup_id in duplicate_ids:
            if not glossary_belongs_to(dup_id, user_id):
                group_errors.append(
                    f"Glossary introuvable ou owner mismatch : id={dup_id} user={user_id}"
                )

        if group_errors:
            errors.extend(group_errors)
        else:
            valid.append(group)

    return valid, errors, warnings
def print_preview(valid_groups: list[dict]) -> tuple[int, int]:
    """Log a summary of what a deletion run would remove.

    Totals come from the backup data itself: the "duplicates_count" field of
    each group, and the length of each duplicate's "terms" list (0 when the
    list is absent).

    Args:
        valid_groups: Groups taken from the backup.

    Returns:
        A tuple (total_dupes, total_terms).
    """
    separator = "=" * 78
    logger.info(separator)
    logger.info("Aperçu de la suppression")
    logger.info(separator)

    per_template: dict[str, int] = {}
    for group in valid_groups:
        label = group.get("template_id") or "(no template)"
        per_template[label] = per_template.get(label, 0) + group["duplicates_count"]

    total_dupes = sum(group["duplicates_count"] for group in valid_groups)
    total_terms = sum(
        len(duplicate.get("terms", []))
        for group in valid_groups
        for duplicate in group["duplicates"]
    )

    logger.info("Groupes à traiter : %d", len(valid_groups))
    logger.info("Glossaires à supprimer : %d", total_dupes)
    logger.info("Termes concernés (estim.) : %d", total_terms)
    logger.info("")
    logger.info("Détail par template :")
    for label, duplicates in sorted(per_template.items()):
        logger.info(" %-14s %d doublon(s) à supprimer", label, duplicates)
    logger.info(separator)
    return total_dupes, total_terms
def delete_group(session, group: dict) -> tuple[int, int]:
    """Delete every duplicate glossary of one group, terms first.

    Raw SQL is used so the deletion does not depend on the ORM mapping (which
    would fail if the `template_id` column is absent from the schema). The
    caller is responsible for committing.

    Args:
        session: Database session used to run the statements.
        group: One group of the backup; its "duplicates" items must carry "id".

    Returns:
        A tuple (glossaries_deleted, terms_deleted).

    Raises:
        RuntimeError: When a DELETE fails. The session is rolled back first.
    """
    deleted = 0
    terms_deleted = 0
    for dup in group["duplicates"]:
        glossary_id = dup["id"]

        # Count the terms before deleting them, for the log line below.
        term_count = session.execute(
            text("SELECT COUNT(*) FROM glossary_terms WHERE glossary_id = :gid"),
            {"gid": glossary_id},
        ).scalar() or 0

        # The template id is only used for logging, so it is read from the
        # backup rather than queried. A "best effort" SELECT of template_id
        # fails when the column is missing, and on PostgreSQL a failed
        # statement aborts the whole transaction, which made the DELETEs
        # below fail in exactly the degraded mode this script supports.
        tpl = dup.get("template_id", group.get("template_id"))

        # Terms go first because they reference the glossary.
        try:
            session.execute(
                text("DELETE FROM glossary_terms WHERE glossary_id = :gid"),
                {"gid": glossary_id},
            )
            session.execute(
                text("DELETE FROM glossaries WHERE id = :gid"),
                {"gid": glossary_id},
            )
            session.flush()
        except Exception as e:
            session.rollback()
            raise RuntimeError(f"Échec suppression glossary {glossary_id}: {e}") from e

        deleted += 1
        terms_deleted += term_count
        logger.info(
            " 🗑️ Supprimé id=%s (template=%s, %d termes)",
            glossary_id, tpl, term_count,
        )
    return deleted, terms_deleted
def perform_deletion(backup: dict, dry_run: bool) -> int:
    """Validate the backup against the database, preview, then delete.

    Each group is deleted and committed on its own, so a failure in one group
    is rolled back without undoing the groups already committed.

    Args:
        backup: Parsed backup, as returned by `load_backup`.
        dry_run: When True, stop after the preview without modifying anything.

    Returns:
        Process exit code: 0 on success (or nothing to do / dry run), 3 when
        validation found blocking errors, 4 when at least one group failed to
        be deleted.
    """
    with get_sync_session() as session:
        valid_groups, errors, warnings = validate_ids(session, backup)
        if errors:
            logger.error("%d erreur(s) de validation :", len(errors))
            for e in errors:
                logger.error(" - %s", e)
            logger.error("Annulation. Corrigez le backup ou la DB puis ré-essayez.")
            return 3
        for w in warnings:
            logger.warning("⚠️ %s", w)
        if not valid_groups:
            logger.info("✅ Aucun groupe à supprimer.")
            return 0

        total_dupes, _total_terms = print_preview(valid_groups)
        if dry_run:
            logger.info("⚠️ Mode --dry-run : aucune suppression effectuée.")
            return 0
        if total_dupes == 0:
            return 0

        # One commit per group, to limit the impact of a partial failure.
        grand_deleted = 0
        grand_terms = 0
        failed_groups = 0
        for group in valid_groups:
            user_id = group["user_id"]
            tid = group.get("template_id")
            logger.info("👤 user=%s template=%s — suppression…", user_id, tid)
            try:
                deleted, terms = delete_group(session, group)
                session.commit()
            except Exception as e:
                session.rollback()
                failed_groups += 1
                logger.error("❌ Échec pour user=%s template=%s : %s", user_id, tid, e)
                logger.error(" Transaction annulée pour ce groupe, on continue.")
            else:
                grand_deleted += deleted
                grand_terms += terms

        logger.info("=" * 78)
        logger.info(
            "✅ Terminé : %d glossaire(s) supprimé(s), %d termes supprimé(s).",
            grand_deleted, grand_terms,
        )
        logger.info("=" * 78)

        if failed_groups:
            # Report the partial failure through the exit code so automated
            # runs (CI, cron) do not show up as successful.
            logger.error("❌ %d groupe(s) en échec — voir les erreurs ci-dessus.", failed_groups)
            return 4
        return 0
def confirm(prompt: str) -> bool:
    """Ask a yes/no question on standard input.

    Args:
        prompt: Question shown to the user; " [oui/non] : " is appended to it.

    Returns:
        True when the trimmed, lower-cased answer is one of "oui", "o", "yes"
        or "y". False for any other answer, and when standard input is
        exhausted (EOF).
    """
    accepted_answers = ("oui", "o", "yes", "y")
    try:
        reply = input(f"{prompt} [oui/non] : ")
    except EOFError:
        return False
    return reply.strip().lower() in accepted_answers
def regenerate_backup(user_id: str | None, allow_missing_template_id: bool = False) -> Path:
    """Produce a fresh backup JSON by running the backup script in a subprocess.

    The child process is started with the current interpreter and a copy of
    the current environment, and is asked to write to a timestamped file under
    `ROOT/backups`.

    Args:
        user_id: When truthy, forwarded to the backup script as `--user`.
        allow_missing_template_id: When True, forwards
            `--allow-missing-template-id` to the backup script.

    Returns:
        The path the backup script was asked to write to.

    Raises:
        SystemExit: With code 2 when the backup script exits with a non-zero
            status.
    """
    import os
    import subprocess

    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out_path = ROOT / "backups" / f"glossary_duplicates_{stamp}.json"
    backup_script = ROOT / "scripts" / "backup_duplicate_glossaries.py"

    argv = [sys.executable, str(backup_script)]
    if user_id:
        argv.extend(("--user", user_id))
    if allow_missing_template_id:
        argv.append("--allow-missing-template-id")
    argv.extend(("--output", str(out_path)))

    logger.info("🔄 Génération d'un backup frais : %s", " ".join(argv))
    completed = subprocess.run(argv, env=os.environ.copy())
    if completed.returncode != 0:
        logger.error("❌ Échec de la génération du backup (code=%d).", completed.returncode)
        sys.exit(2)
    return out_path
def main() -> int:
    """Command-line entry point: delete the duplicates listed in a backup JSON.

    The backup is either given as a positional argument or regenerated on the
    fly for `--user`. Unless `--dry-run` or `--yes` is given, a single
    interactive confirmation is requested before anything is deleted.

    Returns:
        Process exit code: 0 on success or when there is nothing to delete,
        1 when the user declines the confirmation, otherwise the code returned
        by `perform_deletion`. Invalid input exits with code 2 through
        `load_backup`, `regenerate_backup` or argparse.
    """
    parser = argparse.ArgumentParser(
        description="Supprime les doublons de glossaires en se basant sur un backup JSON."
    )
    parser.add_argument(
        "input",
        nargs="?",
        metavar="BACKUP_JSON",
        help="Fichier JSON de backup. Si absent, --user doit être fourni pour en générer un.",
    )
    parser.add_argument(
        "--user",
        metavar="USER_ID",
        help="Génère un backup frais limité à cet utilisateur (utilisé si BACKUP_JSON absent).",
    )
    parser.add_argument(
        "--allow-missing-template-id",
        action="store_true",
        help="Transmis au script de backup si le schéma DB n'a pas la colonne template_id.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Affiche ce qui serait supprimé sans rien modifier.",
    )
    parser.add_argument(
        "--yes",
        action="store_true",
        help="Ne demande pas de confirmation interactive.",
    )
    args = parser.parse_args()

    # Resolve the input file.
    if args.input:
        backup_path = Path(args.input)
    elif args.user:
        backup_path = regenerate_backup(args.user, args.allow_missing_template_id)
        if not backup_path.exists():
            # The backup script exits successfully WITHOUT writing a file when
            # it finds no duplicates; that is a clean "nothing to do", not a
            # missing-backup error.
            logger.info("✅ Aucun doublon trouvé pour user_id=%s — rien à supprimer.", args.user)
            return 0
    else:
        # parser.error() prints the message and exits with status 2.
        parser.error("Fournissez un BACKUP_JSON ou bien --user USER_ID.")

    backup = load_backup(backup_path)
    logger.info("📄 Backup chargé : %s (généré le %s)", backup_path, backup.get("generated_at"))

    if not args.dry_run and not args.yes:
        total = backup.get("total_duplicates", 0)
        if total == 0:
            logger.info("✅ Aucun doublon à supprimer dans ce backup.")
            return 0
        if not confirm(f"Supprimer {total} glossaire(s) listé(s) dans le backup ?"):
            logger.info("Annulé par l'utilisateur.")
            return 1

    return perform_deletion(backup, dry_run=args.dry_run)


if __name__ == "__main__":
    sys.exit(main())