All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 6m31s
- scripts/backup_duplicate_glossaries.py : exporte en JSON les doublons (meme user_id + template_id) sans rien supprimer. Schema validation, tri stable, mode degrade si colonne template_id absente. - scripts/delete_duplicate_glossaries.py : lit un backup JSON et supprime les doublons listes. Validation IDs, confirmation interactive, commit par user, mode --dry-run / --yes. - .gitea/workflows/cleanup-glossaries.yml : workflow_dispatch qui SSH sur le serveur de prod et execute le script dans le conteneur backend (postgres demarre, .env charge, env_file docker-compose).
306 lines
11 KiB
Python
306 lines
11 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Sauvegarde en JSON les glossaires dupliqués (même user_id + template_id) créés
avant la mise en place de la garde anti-doublon dans le backend.

⚠️ Ce script ne supprime RIEN — il produit uniquement un fichier de backup
contenant l'intégralité des doublons (métadonnées + termes) en vue d'une
analyse ou d'une suppression manuelle ultérieure.

Pour chaque couple (user_id, template_id) avec > 1 glossaire, le plus ancien
(premier créé) est enregistré sous la clé "keep" et les copies sont listées
sous la clé "duplicates" avec tous leurs termes.

Usage:
    # Cible la base de prod PostgreSQL (lu via DATABASE_URL) :
    DATABASE_URL=postgresql://user:pass@host:5432/db python scripts/backup_duplicate_glossaries.py

    # Ou préciser une base SQLite spécifique :
    SQLITE_PATH=/path/to/translate.db python scripts/backup_duplicate_glossaries.py

    # Limiter à un seul utilisateur :
    DATABASE_URL=... python scripts/backup_duplicate_glossaries.py --user <user_id>

    # Choisir le fichier de sortie :
    DATABASE_URL=... python scripts/backup_duplicate_glossaries.py --output backups/dupes.json

    # Forcer l'exécution même si la colonne template_id est absente du schéma
    # (utile pour un dump partiel des glossaires sans template_id) :
    DATABASE_URL=... python scripts/backup_duplicate_glossaries.py --allow-missing-template-id
|
||
"""
|
||
|
||
import argparse
|
||
import json
|
||
import logging
|
||
import sys
|
||
from collections import defaultdict
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
|
||
ROOT = Path(__file__).resolve().parent.parent
|
||
sys.path.insert(0, str(ROOT))
|
||
|
||
from sqlalchemy import inspect, text
|
||
|
||
from database.connection import sync_engine
|
||
from database.models import Glossary
|
||
|
||
# Root logging configuration: INFO level, timestamped and level-tagged lines.
logging.basicConfig(format="%(asctime)s [%(levelname)s] %(message)s", level=logging.INFO)

# Dedicated named logger used by every function in this script.
logger = logging.getLogger("backup_dup_glossaries")
|
||
|
||
|
||
def _has_template_id_column() -> bool:
    """Report whether the `glossaries` table has a `template_id` column.

    The schema is read through the SQLAlchemy inspector bound to `sync_engine`.
    Any exception raised while inspecting is logged and reported as ``False``,
    so a ``False`` result means either "column missing" or "inspection failed".
    """
    try:
        column_names = [
            column["name"]
            for column in inspect(sync_engine).get_columns("glossaries")
        ]
    except Exception as exc:
        logger.error("Impossible d'inspecter le schéma : %s", exc)
        return False
    return "template_id" in column_names
|
||
|
||
|
||
def find_duplicates(
    session, user_id: str | None = None, include_no_template: bool = False,
    use_raw_query: bool = False,
) -> dict[tuple[str, str | None], list[Glossary]]:
    """Group glossaries by (user_id, template_id) and keep groups of size > 1.

    Args:
        session: Database session used to run the query.
        user_id: When truthy, restrict the search to this user.
        include_no_template: ORM mode only. When False, glossaries whose
            `template_id` is NULL are filtered out; when True they are kept
            and end up grouped under the key ``(user_id, None)``.
        use_raw_query: When True, bypass the ORM query and run a textual
            SELECT that does not reference `template_id` (useful when that
            column does not exist in the schema). Every row is then grouped
            under ``(user_id, None)`` and `include_no_template` is not used.

    Returns:
        A plain dict mapping ``(user_id, template_id)`` to the list of
        glossaries sharing that key, restricted to keys with more than one
        glossary.
    """
    buckets: dict[tuple[str, str | None], list[Glossary]] = defaultdict(list)

    if use_raw_query:
        # Explicit column list (no template_id), so the statement does not
        # depend on the ORM model mapping.
        statement = (
            "SELECT id, user_id, name, source_language, target_language, "
            "created_at, updated_at FROM glossaries"
        )
        bind_params: dict = {}
        if user_id:
            statement += " WHERE user_id = :user_id"
            bind_params["user_id"] = user_id

        for row in session.execute(text(statement), bind_params).fetchall():
            # NOTE(review): these Glossary objects are built by hand rather
            # than loaded through the session, so their `terms` are presumably
            # empty when serialized — confirm before relying on raw-mode
            # backups to preserve terms.
            glossary = Glossary(
                id=row.id,
                user_id=row.user_id,
                name=row.name,
                source_language=row.source_language,
                target_language=row.target_language,
                created_at=row.created_at,
                updated_at=row.updated_at,
            )
            # Without a template_id column, group by (user_id, None).
            buckets[(glossary.user_id, None)].append(glossary)
    else:
        query = session.query(Glossary)
        if not include_no_template:
            query = query.filter(Glossary.template_id.isnot(None))
        if user_id:
            query = query.filter(Glossary.user_id == user_id)

        for glossary in query.all():
            buckets[(glossary.user_id, glossary.template_id)].append(glossary)

    # Only keys with at least two glossaries are duplicates.
    return {key: members for key, members in buckets.items() if len(members) > 1}
|
||
|
||
|
||
def _stable_sort(glossaries: list[Glossary]) -> tuple[list[Glossary], int]:
    """Sort glossaries by (created_at ASC, id ASC) for deterministic ordering.

    The sort key is normalized so that heterogeneous `created_at` values can
    be compared without raising ``TypeError``:

    * ``None`` (or any value that cannot be interpreted as a timestamp) sorts
      first, as the earliest possible instant;
    * ISO-8601 strings are parsed into datetimes;
    * naive datetimes are treated as UTC.

    Args:
        glossaries: Glossaries to order; each must expose `created_at` and `id`.

    Returns:
        A tuple ``(sorted_list, none_count)`` where `none_count` is the number
        of entries whose `created_at` is ``None``.
    """
    earliest = datetime.min.replace(tzinfo=timezone.utc)

    def sort_key(g):
        created = g.created_at
        if isinstance(created, str):
            try:
                created = datetime.fromisoformat(created)
            except ValueError:
                created = None
        if not isinstance(created, datetime):
            # NULL / unparsable timestamp: same fallback as for None.
            created = earliest
        elif created.utcoffset() is None:
            # Naive and aware datetimes cannot be compared with each other;
            # interpret naive values as UTC so every key is aware.
            created = created.replace(tzinfo=timezone.utc)
        return (created, g.id)

    none_count = sum(1 for g in glossaries if g.created_at is None)
    if none_count:
        logger.warning(
            "⚠️ %d glossaire(s) ont un created_at NULL — tri secondaire par id.",
            none_count,
        )
    return sorted(glossaries, key=sort_key), none_count
|
||
|
||
|
||
def serialize_group(user_id: str, template_id: str | None, glossaries: list[Glossary]) -> dict:
    """Build the JSON-serializable description of one duplicate group.

    The glossaries are ordered with `_stable_sort`. The first one is reported
    under ``"keep"`` (metadata plus a term count); the remaining ones are
    reported under ``"duplicates"`` (metadata plus every term).

    Args:
        user_id: Owner shared by every glossary of the group.
        template_id: Template shared by the group, or ``None``.
        glossaries: Members of the group (expected to be non-empty).

    Returns:
        A dict with the keys ``user_id``, ``template_id``, ``keep``,
        ``duplicates_count`` and ``duplicates``.
    """

    def to_iso(value) -> str | None:
        # Timestamps may be datetimes or strings (SQLite sometimes returns
        # strings); strings are normalized when they parse as ISO-8601 and
        # passed through unchanged otherwise.
        if isinstance(value, datetime):
            return value.isoformat()
        if isinstance(value, str):
            try:
                parsed = datetime.fromisoformat(value)
            except ValueError:
                return value
            return parsed.isoformat()
        return None if value is None else str(value)

    def serialize_glossary(g: Glossary, include_terms: bool) -> dict:
        record = {
            "id": g.id,
            "name": g.name,
            "source_language": g.source_language,
            "target_language": g.target_language,
            "template_id": getattr(g, "template_id", None),
            "created_at": to_iso(g.created_at),
            "updated_at": to_iso(g.updated_at),
        }
        terms = g.terms
        if not include_terms:
            record["terms_count"] = len(terms) if terms else 0
            return record
        record["terms"] = [
            {
                "id": term.id,
                "source": term.source,
                "target": term.target,
                "translations": term.translations or {},
            }
            for term in (terms or [])
        ]
        return record

    ordered = _stable_sort(glossaries)[0]
    kept = ordered[0]
    extra_copies = ordered[1:]

    return {
        "user_id": user_id,
        "template_id": template_id,
        "keep": serialize_glossary(kept, include_terms=False),
        "duplicates_count": len(extra_copies),
        "duplicates": [
            serialize_glossary(copy, include_terms=True) for copy in extra_copies
        ],
    }
|
||
|
||
|
||
def write_backup(groups: dict[tuple[str, str | None], list[Glossary]], output_path: Path) -> dict:
    """Serialize every duplicate group to `output_path` as JSON.

    Missing parent directories of `output_path` are created. Groups are written
    ordered by template id (``None`` treated as an empty string), then by user
    id.

    Args:
        groups: Mapping ``(user_id, template_id) -> glossaries``.
        output_path: Destination file, written as UTF-8 with 2-space indent.

    Returns:
        The payload that was written (metadata, totals and serialized groups).
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    def group_order(item):
        (uid, tid), _members = item
        return (tid or "", uid)

    generated_at = datetime.now(timezone.utc).isoformat()
    # Each group contributes all of its members except the one that is kept.
    duplicate_total = sum(len(members) - 1 for members in groups.values())
    serialized_groups = [
        serialize_group(uid, tid, members)
        for (uid, tid), members in sorted(groups.items(), key=group_order)
    ]

    payload = {
        "generated_at": generated_at,
        "schema_version": 1,
        "note": "Aucun glossaire n'a été supprimé. Ce fichier documente les doublons.",
        "total_groups": len(groups),
        "total_duplicates": duplicate_total,
        "groups": serialized_groups,
    }

    with output_path.open("w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)

    return payload
|
||
|
||
|
||
def print_report(payload: dict) -> None:
    """Log a human-readable summary of a backup payload.

    Emits the generation timestamp, the number of groups, the total number of
    duplicates and, when there is at least one group, the number of duplicates
    per template id (groups without a template are labelled "(no template)").
    All output goes through the module logger at INFO level.
    """
    separator = "=" * 78

    logger.info(separator)
    logger.info("Récapitulatif du backup")
    logger.info(separator)
    logger.info("Généré le : %s", payload["generated_at"])
    logger.info("Groupes concernés: %d", payload["total_groups"])
    logger.info("Doublons totaux : %d", payload["total_duplicates"])

    # Accumulate the duplicate count per template label.
    per_template: dict[str, int] = {}
    for group in payload["groups"]:
        label = group["template_id"] or "(no template)"
        per_template[label] = per_template.get(label, 0) + group["duplicates_count"]

    if per_template:
        logger.info("")
        logger.info("Par template :")
        for label, total in sorted(per_template.items()):
            logger.info(" %-14s %d doublon(s)", label, total)
    logger.info(separator)
|
||
|
||
|
||
def main() -> int:
    """Command-line entry point: find duplicate glossaries and back them up.

    Parses the CLI options, checks that `glossaries.template_id` exists,
    searches for duplicates and, if any are found, writes the JSON backup and
    logs a summary. Nothing is deleted.

    The ORM is bypassed (raw query) only when the `template_id` column is
    really absent from the schema. When the column exists,
    `--include-no-template` and `--allow-missing-template-id` go through the
    ORM query, so glossaries are still grouped by their actual `template_id`
    and only template-less glossaries are grouped under ``(user_id, None)``.

    Returns:
        0 on success (including "no duplicates found"), 2 when the
        `template_id` column is missing and no option allows continuing.
    """
    parser = argparse.ArgumentParser(
        description="Sauvegarde (sans suppression) les glossaires dupliqués en JSON."
    )
    parser.add_argument(
        "--user",
        metavar="USER_ID",
        help="Limite le backup à un seul utilisateur.",
    )
    parser.add_argument(
        "--output",
        metavar="PATH",
        help="Chemin du fichier JSON de sortie (défaut : backups/glossary_duplicates_<timestamp>.json).",
    )
    parser.add_argument(
        "--include-no-template",
        action="store_true",
        help="Inclut aussi les glossaires sans template_id dans la recherche de doublons.",
    )
    parser.add_argument(
        "--allow-missing-template-id",
        action="store_true",
        help="Continue sans erreur si la colonne `template_id` est absente du schéma "
             "(équivaut à --include-no-template, mais le script ne plantera pas).",
    )
    args = parser.parse_args()

    column_present = _has_template_id_column()
    if not column_present:
        if args.allow_missing_template_id or args.include_no_template:
            logger.warning("⚠️ Colonne `template_id` absente — bascule en mode sans-template.")
            args.include_no_template = True
        else:
            logger.error(
                "❌ La colonne `glossaries.template_id` est absente du schéma actuel. "
                "Appliquez d'abord la migration Alembic (alembic upgrade head) ou relancez "
                "avec --allow-missing-template-id pour ne sauvegarder que les glossaires sans template_id."
            )
            return 2

    from database.connection import get_sync_session

    # The raw query ignores template_id and groups everything per user, so it
    # must only be used when the column is actually missing. Using it whenever
    # --include-no-template is passed would report unrelated glossaries of the
    # same user as duplicates.
    use_raw = not column_present
    if args.allow_missing_template_id:
        # Documented as equivalent to --include-no-template.
        args.include_no_template = True

    logger.info(
        "🔍 Recherche de doublons%s%s…",
        f" pour user_id={args.user}" if args.user else "",
        " (incl. sans template)" if args.include_no_template else "",
    )

    with get_sync_session() as session:
        groups = find_duplicates(
            session,
            user_id=args.user,
            include_no_template=args.include_no_template,
            use_raw_query=use_raw,
        )

        if not groups:
            if args.user:
                logger.info("✅ Aucun doublon trouvé pour user_id=%s.", args.user)
            else:
                logger.info("✅ Aucun doublon trouvé.")
            return 0

        if args.output:
            output_path = Path(args.output)
        else:
            ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
            output_path = ROOT / "backups" / f"glossary_duplicates_{ts}.json"

        logger.info("💾 Écriture du backup vers %s …", output_path)
        # Serialization reads each glossary's `terms`; the backup is written
        # while the session is still open in case that attribute is loaded
        # lazily (assumption — the model definition is not visible here).
        payload = write_backup(groups, output_path)

    print_report(payload)
    logger.info("✅ Backup écrit : %s (%d octets)", output_path, output_path.stat().st_size)
    logger.info("ℹ️ Aucune suppression effectuée. Relire le JSON pour décider de l'action manuelle.")

    return 0
|
||
|
||
|
||
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|