Files
office_translator/scripts/backup_duplicate_glossaries.py
Sepehr cd32a42b1a
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 6m31s
feat(glossaries): add backup + delete scripts and Gitea workflow for duplicate cleanup
- scripts/backup_duplicate_glossaries.py : exporte en JSON les doublons
  (même user_id + template_id) sans rien supprimer. Validation du schéma,
  tri stable, mode dégradé si la colonne template_id est absente.
- scripts/delete_duplicate_glossaries.py : lit un backup JSON et supprime
  les doublons listés. Validation des IDs, confirmation interactive,
  commit par utilisateur, modes --dry-run / --yes.
- .gitea/workflows/cleanup-glossaries.yml : workflow_dispatch qui se connecte en SSH
  au serveur de prod et exécute le script dans le conteneur backend
  (postgres démarré, .env chargé, env_file docker-compose).
2026-06-03 21:21:11 +02:00

306 lines
11 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Sauvegarde en JSON les glossaires dupliqués (même user_id + template_id) créés
avant la mise en place de la garde anti-doublon dans le backend.
⚠️ Ce script ne supprime RIEN — il produit uniquement un fichier de backup
contenant l'intégralité des doublons (métadonnées + termes) en vue d'une
analyse ou d'une suppression manuelle ultérieure.
Pour chaque couple (user_id, template_id) avec > 1 glossaire, le plus ancien
(premier créé) est marqué "keeper" et les copies sont listées dans "duplicates"
avec tous leurs termes.
Usage:
# Cible la base de prod PostgreSQL (lu via DATABASE_URL) :
DATABASE_URL=postgresql://user:pass@host:5432/db python scripts/backup_duplicate_glossaries.py
# Ou préciser une base SQLite spécifique :
SQLITE_PATH=/path/to/translate.db python scripts/backup_duplicate_glossaries.py
# Limiter à un seul utilisateur :
DATABASE_URL=... python scripts/backup_duplicate_glossaries.py --user <user_id>
# Choisir le fichier de sortie :
DATABASE_URL=... python scripts/backup_duplicate_glossaries.py --output backups/dupes.json
# Forcer l'exécution même si la colonne template_id est absente du schéma
# (utile pour un dump partiel des glossaires sans template_id) :
DATABASE_URL=... python scripts/backup_duplicate_glossaries.py --allow-missing-template-id
"""
import argparse
import json
import logging
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
from sqlalchemy import inspect, text
from database.connection import sync_engine
from database.models import Glossary
# Configure logging once at import time: INFO level, with a timestamp and the
# level name prefixed to every message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
# Named logger shared by every function in this script.
logger = logging.getLogger("backup_dup_glossaries")
def _has_template_id_column() -> bool:
    """Report whether the `glossaries` table exposes a `template_id` column.

    Returns:
        True when the inspected column list contains `template_id`. False when
        it does not, and also when schema inspection raised: the error is
        logged and swallowed, so a False result does not distinguish a missing
        column from a failed inspection.
    """
    try:
        column_names = [
            column["name"]
            for column in inspect(sync_engine).get_columns("glossaries")
        ]
        found = "template_id" in column_names
    except Exception as exc:
        logger.error("Impossible d'inspecter le schéma : %s", exc)
        return False
    return found
def find_duplicates(
    session, user_id: str | None = None, include_no_template: bool = False,
    use_raw_query: bool = False,
) -> dict[tuple[str, str | None], list[Glossary]]:
    """Collect glossaries that share the same (user_id, template_id) key.

    Args:
        session: Database session. Used through `execute()` in raw mode and
            through `query()` otherwise.
        user_id: When truthy, only this user's glossaries are considered.
        include_no_template: ORM mode only. When False, rows whose
            `template_id` is NULL are filtered out; when True they are kept
            and grouped under the key (user_id, None).
        use_raw_query: When True, run a textual SELECT that never references
            `template_id` (for schemas where the column does not exist).
            Every row is then keyed (user_id, None), so all glossaries of a
            user end up in the same group.

    Returns:
        A plain dict mapping each key to its glossaries, restricted to keys
        that hold more than one glossary.
    """
    buckets: dict[tuple[str, str | None], list[Glossary]] = defaultdict(list)

    if use_raw_query:
        # Explicit column list: the statement must not mention template_id.
        statement = (
            "SELECT id, user_id, name, source_language, target_language, "
            "created_at, updated_at FROM glossaries"
        )
        bind_values: dict = {}
        if user_id:
            statement += " WHERE user_id = :user_id"
            bind_values["user_id"] = user_id

        for row in session.execute(text(statement), bind_values).fetchall():
            # NOTE(review): these instances are built by hand rather than
            # loaded through the session, so their `terms` relationship is
            # presumably empty - confirm before relying on raw-mode backups
            # to contain terms.
            glossary = Glossary(
                id=row.id,
                user_id=row.user_id,
                name=row.name,
                source_language=row.source_language,
                target_language=row.target_language,
                created_at=row.created_at,
                updated_at=row.updated_at,
            )
            buckets[(glossary.user_id, None)].append(glossary)
    else:
        query = session.query(Glossary)
        if not include_no_template:
            query = query.filter(Glossary.template_id.isnot(None))
        if user_id:
            query = query.filter(Glossary.user_id == user_id)
        for glossary in query.all():
            buckets[(glossary.user_id, glossary.template_id)].append(glossary)

    # Only keys with at least two glossaries are duplicates.
    return {key: members for key, members in buckets.items() if len(members) > 1}
def _stable_sort(glossaries: list[Glossary]) -> tuple[list[Glossary], int]:
"""Sort glossaries by (created_at ASC, id ASC) for deterministic ordering.
Returns the sorted list and the number of entries with None created_at.
"""
none_count = sum(1 for g in glossaries if g.created_at is None)
if none_count:
logger.warning(
"⚠️ %d glossaire(s) ont un created_at NULL — tri secondaire par id.",
none_count,
)
return sorted(glossaries, key=lambda g: (g.created_at or datetime.min.replace(tzinfo=timezone.utc), g.id)), none_count
def serialize_group(user_id: str, template_id: str | None, glossaries: list[Glossary]) -> dict:
    """Turn one duplicate group into a JSON-serializable dict.

    The glossaries are ordered with `_stable_sort`; the first one is reported
    under "keep" (metadata plus a term count), every following one under
    "duplicates" (metadata plus the full term list).

    Args:
        user_id: Owner of the group, copied verbatim into the result.
        template_id: Template shared by the group (or None), copied verbatim.
        glossaries: Members of the group; must not be empty.

    Returns:
        A dict with the keys "user_id", "template_id", "keep",
        "duplicates_count" and "duplicates".
    """
    ordered, _ = _stable_sort(glossaries)
    keeper = ordered[0]
    copies = ordered[1:]

    def to_iso(value) -> str | None:
        # None stays None; datetimes become ISO-8601 text.
        if value is None:
            return None
        if isinstance(value, datetime):
            return value.isoformat()
        if not isinstance(value, str):
            return str(value)
        # Strings are re-emitted in normalised ISO form when they parse,
        # otherwise passed through unchanged.
        try:
            parsed = datetime.fromisoformat(value)
        except ValueError:
            return value
        return parsed.isoformat()

    def serialize_glossary(g: Glossary, include_terms: bool) -> dict:
        record = {
            "id": g.id,
            "name": g.name,
            "source_language": g.source_language,
            "target_language": g.target_language,
            # getattr with a default: the attribute may be missing on the object.
            "template_id": getattr(g, "template_id", None),
            "created_at": to_iso(g.created_at),
            "updated_at": to_iso(g.updated_at),
        }
        # A falsy `terms` (None or empty) is treated as "no terms".
        members = g.terms or []
        if not include_terms:
            record["terms_count"] = len(members)
            return record
        record["terms"] = [
            {
                "id": term.id,
                "source": term.source,
                "target": term.target,
                "translations": term.translations or {},
            }
            for term in members
        ]
        return record

    kept = serialize_glossary(keeper, include_terms=False)
    dumped_copies = [serialize_glossary(copy, include_terms=True) for copy in copies]
    return {
        "user_id": user_id,
        "template_id": template_id,
        "keep": kept,
        "duplicates_count": len(copies),
        "duplicates": dumped_copies,
    }
def write_backup(groups: dict[tuple[str, str | None], list[Glossary]], output_path: Path) -> dict:
    """Write the full backup to `output_path` and return the written payload.

    The JSON is first written to a temporary sibling file (`<name>.tmp`) and
    then renamed onto `output_path`, so a failure during serialisation never
    leaves a truncated backup at the final location.

    Args:
        groups: Duplicate groups keyed by (user_id, template_id).
        output_path: Destination file; missing parent directories are created.

    Returns:
        The payload dict that was serialised: generation timestamp, schema
        version, counters, and one serialised entry per group, ordered by
        template_id (None sorts as "") and then user_id.

    Raises:
        OSError: If the directory or file cannot be written.
        TypeError / ValueError: If a value in the payload is not
            JSON-serializable. The temporary file is removed before the
            exception propagates.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    ordered_groups = sorted(
        groups.items(), key=lambda item: (item[0][1] or "", item[0][0])
    )
    payload = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "schema_version": 1,
        "note": "Aucun glossaire n'a été supprimé. Ce fichier documente les doublons.",
        "total_groups": len(groups),
        # Each group keeps one glossary; the rest are the duplicates.
        "total_duplicates": sum(len(members) - 1 for members in groups.values()),
        "groups": [
            serialize_group(uid, tid, members)
            for (uid, tid), members in ordered_groups
        ],
    }

    tmp_path = output_path.with_name(output_path.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
        # Same-directory rename: the final file appears only once complete.
        tmp_path.replace(output_path)
    except BaseException:
        tmp_path.unlink(missing_ok=True)
        raise
    return payload
def print_report(payload: dict) -> None:
    """Log a human-readable summary of a backup payload.

    All output goes through the module logger at INFO level.

    Args:
        payload: Dict providing "generated_at", "total_groups",
            "total_duplicates" and "groups"; each group provides
            "template_id" and "duplicates_count".
    """
    rule = "=" * 78
    logger.info(rule)
    logger.info("Récapitulatif du backup")
    logger.info(rule)
    logger.info("Généré le : %s", payload["generated_at"])
    logger.info("Groupes concernés: %d", payload["total_groups"])
    logger.info("Doublons totaux : %d", payload["total_duplicates"])

    # Sum duplicate counts per template; groups without one share a label.
    per_template: dict[str, int] = {}
    for group in payload["groups"]:
        label = group["template_id"] or "(no template)"
        per_template[label] = per_template.get(label, 0) + group["duplicates_count"]

    if per_template:
        logger.info("")
        logger.info("Par template :")
        for label, total in sorted(per_template.items()):
            logger.info(" %-14s %d doublon(s)", label, total)
    logger.info(rule)
def main() -> int:
    """Command-line entry point: find duplicate glossaries and back them up.

    Parses the CLI flags, checks the schema, looks up duplicate groups, writes
    them to a JSON file and logs a summary. Nothing is deleted.

    The raw-SQL lookup (which ignores `template_id` and groups every glossary
    of a user together) is used only when the `template_id` column is
    reported missing. When the column exists, `--include-no-template` and
    `--allow-missing-template-id` merely widen the ORM lookup to glossaries
    without a template.

    Returns:
        0 on success (including when no duplicate is found), 2 when the
        `template_id` column is reported missing and neither opt-in flag was
        given.
    """
    parser = argparse.ArgumentParser(
        description="Sauvegarde (sans suppression) les glossaires dupliqués en JSON."
    )
    parser.add_argument(
        "--user",
        metavar="USER_ID",
        help="Limite le backup à un seul utilisateur.",
    )
    parser.add_argument(
        "--output",
        metavar="PATH",
        help="Chemin du fichier JSON de sortie (défaut : backups/glossary_duplicates_<timestamp>.json).",
    )
    parser.add_argument(
        "--include-no-template",
        action="store_true",
        help="Inclut aussi les glossaires sans template_id dans la recherche de doublons.",
    )
    parser.add_argument(
        "--allow-missing-template-id",
        action="store_true",
        help="Continue sans erreur si la colonne `template_id` est absente du schéma "
        "(équivaut à --include-no-template, mais le script ne plantera pas).",
    )
    args = parser.parse_args()

    has_template_column = _has_template_id_column()
    # Either flag opts in to handling glossaries without a template.
    include_no_template = args.allow_missing_template_id or args.include_no_template

    if not has_template_column:
        if not include_no_template:
            logger.error(
                "❌ La colonne `glossaries.template_id` est absente du schéma actuel. "
                "Appliquez d'abord la migration Alembic (alembic upgrade head) ou relancez "
                "avec --allow-missing-template-id pour ne sauvegarder que les glossaires sans template_id."
            )
            return 2
        logger.warning("⚠️ Colonne `template_id` absente — bascule en mode sans-template.")

    # The raw query drops template_id from the grouping key, so it must be
    # reserved for schemas that really lack the column.
    use_raw = not has_template_column

    from database.connection import get_sync_session

    logger.info(
        "🔍 Recherche de doublons%s%s",
        f" pour user_id={args.user}" if args.user else "",
        " (incl. sans template)" if include_no_template else "",
    )

    with get_sync_session() as session:
        groups = find_duplicates(
            session,
            user_id=args.user,
            include_no_template=include_no_template,
            use_raw_query=use_raw,
        )
        if not groups:
            if args.user:
                logger.info("✅ Aucun doublon trouvé pour user_id=%s.", args.user)
            else:
                logger.info("✅ Aucun doublon trouvé.")
            return 0

        if args.output:
            output_path = Path(args.output)
        else:
            ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
            output_path = ROOT / "backups" / f"glossary_duplicates_{ts}.json"

        logger.info("💾 Écriture du backup vers %s", output_path)
        # Serialisation reads each glossary's terms, so it stays inside the
        # session scope.
        payload = write_backup(groups, output_path)

    print_report(payload)
    logger.info("✅ Backup écrit : %s (%d octets)", output_path, output_path.stat().st_size)
    logger.info(" Aucune suppression effectuée. Relire le JSON pour décider de l'action manuelle.")
    return 0
# Run the CLI only when the file is executed as a script; the integer returned
# by main() becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())