fix(glossaries): dedup by (user_id, name) instead of (user_id, template_id)
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 3m1s
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 3m1s
Le groupement par template_id était faux sur la prod :
- Les doublons historiques ont template_id=NULL (créés avant la migration).
- Deux glossaires 'Finance - FR->Anglais' et 'Finance - FR->Multilingue'
partagent le même template_id mais DOIVENT être conservés séparément.
Changements :
- Groupement par (user_id, name) -> c'est ce que l'utilisateur voit dans l'UI
et la définition réelle d'un doublon.
- Les glossaires multilingues ('-> Multilingue') ont un nom distinct des
versions '-> Anglais' : ils ne sont jamais fusionnés (préservés par design).
- Fallback automatique si la colonne template_id est absente du schéma
(dev DB) : warning + requête sans la colonne, aucun crash.
- Suppression du flag --allow-missing-template-id, devenu inutile.
- Nettoyage des imports ORM inutiles (SQL brut via `text` uniquement, plus rapide).
This commit is contained in:
@@ -59,7 +59,7 @@ jobs:
|
||||
done
|
||||
|
||||
# Construire les flags.
|
||||
FLAGS="--user \${USER_ID} --allow-missing-template-id"
|
||||
FLAGS="--user \${USER_ID}"
|
||||
if [ "\${DRY_RUN}" = "true" ]; then
|
||||
FLAGS="\${FLAGS} --dry-run"
|
||||
fi
|
||||
|
||||
@@ -1,16 +1,19 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Sauvegarde en JSON les glossaires dupliqués (même user_id + template_id) créés
|
||||
Sauvegarde en JSON les glossaires dupliqués (même user_id + même nom) créés
|
||||
avant la mise en place de la garde anti-doublon dans le backend.
|
||||
|
||||
⚠️ Ce script ne supprime RIEN — il produit uniquement un fichier de backup
|
||||
contenant l'intégralité des doublons (métadonnées + termes) en vue d'une
|
||||
analyse ou d'une suppression manuelle ultérieure.
|
||||
|
||||
Pour chaque couple (user_id, template_id) avec > 1 glossaire, le plus ancien
|
||||
Pour chaque couple (user_id, name) avec > 1 glossaire, le plus ancien
|
||||
(premier créé) est marqué "keeper" et les copies sont listées dans "duplicates"
|
||||
avec tous leurs termes.
|
||||
|
||||
Les glossaires multilingues (« Français → Multilingue ») ont un nom distinct
|
||||
de leurs homologues « Français → Anglais » : ils ne sont jamais fusionnés.
|
||||
|
||||
Usage:
|
||||
# Cible la base de prod PostgreSQL (lu via DATABASE_URL) :
|
||||
DATABASE_URL=postgresql://user:pass@host:5432/db python scripts/backup_duplicate_glossaries.py
|
||||
@@ -23,10 +26,6 @@ Usage:
|
||||
|
||||
# Choisir le fichier de sortie :
|
||||
DATABASE_URL=... python scripts/backup_duplicate_glossaries.py --output backups/dupes.json
|
||||
|
||||
# Forcer l'exécution même si la colonne template_id est absente du schéma
|
||||
# (utile pour un dump partiel des glossaires sans template_id) :
|
||||
DATABASE_URL=... python scripts/backup_duplicate_glossaries.py --allow-missing-template-id
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@@ -40,10 +39,9 @@ from pathlib import Path
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from sqlalchemy import inspect, text
|
||||
from sqlalchemy import text
|
||||
|
||||
from database.connection import sync_engine
|
||||
from database.models import Glossary
|
||||
from database.connection import get_sync_session
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -52,80 +50,71 @@ logging.basicConfig(
|
||||
logger = logging.getLogger("backup_dup_glossaries")
|
||||
|
||||
|
||||
def _has_template_id_column() -> bool:
|
||||
"""Vérifie que la colonne `template_id` existe sur la table `glossaries`."""
|
||||
try:
|
||||
inspector = inspect(sync_engine)
|
||||
cols = {c["name"] for c in inspector.get_columns("glossaries")}
|
||||
return "template_id" in cols
|
||||
except Exception as e:
|
||||
logger.error("Impossible d'inspecter le schéma : %s", e)
|
||||
return False
|
||||
|
||||
|
||||
def _is_missing_template_id_error(exc: Exception) -> bool:
    """Return True when *exc* looks like a "column does not exist" database error.

    The check is textual because the concrete exception class depends on the
    driver in use. The markers cover the messages produced by the common
    backends once wrapped by SQLAlchemy.
    """
    message = str(exc).lower()
    markers = (
        "no such column",                        # SQLite
        "undefined column",                      # historical marker, kept for compatibility
        "undefinedcolumn",                       # psycopg error class name (no space)
        'column "template_id" does not exist',   # PostgreSQL server message
        "unknown column",                        # MySQL / MariaDB
    )
    return any(marker in message for marker in markers)


def find_duplicates(
    session, user_id: str | None = None,
) -> dict[tuple[str, str], list[dict]]:
    """Group glossaries by (user_id, name) and keep only groups with duplicates.

    Raw SQL is used so the function keeps working when the `template_id`
    column is absent from the schema (e.g. an old dev database). In that case
    a warning is logged and the query is retried without `template_id`.

    Args:
        session: SQLAlchemy session used to run the queries.
        user_id: When given, restrict the search to this user's glossaries.

    Returns:
        A mapping ``(user_id, name) -> [glossary dict, ...]`` containing only
        the keys that have more than one glossary. Each glossary is a plain
        dict (not an ORM object); its ``template_id`` is ``None`` when the
        column does not exist.

    Raises:
        Exception: any database error other than a missing `template_id`
            column is re-raised unchanged.
    """
    base_cols = "id, user_id, name, source_language, target_language, created_at, updated_at"

    def _select(columns: str):
        # Single place where the statement is built, so the primary query and
        # the fallback cannot drift apart.
        sql = f"SELECT {columns} FROM glossaries"
        params: dict = {}
        if user_id:
            sql += " WHERE user_id = :user_id"
            params["user_id"] = user_id
        sql += " ORDER BY user_id, name, created_at"
        return session.execute(text(sql), params).fetchall()

    try:
        rows = _select(f"{base_cols}, template_id")
        has_template_id = True
    except Exception as e:
        if not _is_missing_template_id_error(e):
            raise
        logger.warning("⚠️ Colonne `template_id` absente du schéma — fallback sans template_id.")
        # A failed statement leaves a PostgreSQL transaction in the aborted
        # state; without a rollback the retry below would fail as well.
        # NOTE(review): this discards any pending work on `session`; the
        # visible caller only reads, so nothing is lost there.
        session.rollback()
        rows = _select(base_cols)
        has_template_id = False

    groups: dict[tuple[str, str], list[dict]] = defaultdict(list)
    for r in rows:
        groups[(r.user_id, r.name)].append({
            "id": r.id,
            "user_id": r.user_id,
            "name": r.name,
            "source_language": r.source_language,
            "target_language": r.target_language,
            "template_id": r.template_id if has_template_id else None,
            "created_at": r.created_at,
            "updated_at": r.updated_at,
        })
    return {k: v for k, v in groups.items() if len(v) > 1}
|
||||
|
||||
|
||||
def _stable_sort(glossaries: list[Glossary]) -> tuple[list[Glossary], int]:
|
||||
def _stable_sort(glossaries: list[dict]) -> tuple[list[dict], int]:
|
||||
"""Sort glossaries by (created_at ASC, id ASC) for deterministic ordering.
|
||||
|
||||
Returns the sorted list and the number of entries with None created_at.
|
||||
"""
|
||||
none_count = sum(1 for g in glossaries if g.created_at is None)
|
||||
none_count = sum(1 for g in glossaries if g["created_at"] is None)
|
||||
if none_count:
|
||||
logger.warning(
|
||||
"⚠️ %d glossaire(s) ont un created_at NULL — tri secondaire par id.",
|
||||
none_count,
|
||||
)
|
||||
return sorted(glossaries, key=lambda g: (g.created_at or datetime.min.replace(tzinfo=timezone.utc), g.id)), none_count
|
||||
return sorted(
|
||||
glossaries,
|
||||
key=lambda g: (g["created_at"] or datetime.min.replace(tzinfo=timezone.utc), g["id"]),
|
||||
), none_count
|
||||
|
||||
|
||||
def serialize_group(user_id: str, template_id: str | None, glossaries: list[Glossary]) -> dict:
|
||||
def serialize_group(user_id: str, name: str, glossaries: list[dict]) -> dict:
|
||||
"""Convert a duplicate group to a JSON-serializable dict."""
|
||||
sorted_glossaries, _ = _stable_sort(glossaries)
|
||||
keeper = sorted_glossaries[0]
|
||||
@@ -136,7 +125,6 @@ def serialize_group(user_id: str, template_id: str | None, glossaries: list[Glos
|
||||
return None
|
||||
if isinstance(value, datetime):
|
||||
return value.isoformat()
|
||||
# SQLite renvoie parfois des strings ; on tente de normaliser.
|
||||
if isinstance(value, str):
|
||||
try:
|
||||
return datetime.fromisoformat(value).isoformat()
|
||||
@@ -144,51 +132,79 @@ def serialize_group(user_id: str, template_id: str | None, glossaries: list[Glos
|
||||
return value
|
||||
return str(value)
|
||||
|
||||
def serialize_glossary(g: Glossary, include_terms: bool) -> dict:
|
||||
data = {
|
||||
"id": g.id,
|
||||
"name": g.name,
|
||||
"source_language": g.source_language,
|
||||
"target_language": g.target_language,
|
||||
"template_id": getattr(g, "template_id", None),
|
||||
"created_at": to_iso(g.created_at),
|
||||
"updated_at": to_iso(g.updated_at),
|
||||
}
|
||||
if include_terms:
|
||||
data["terms"] = [
|
||||
{
|
||||
"id": t.id,
|
||||
"source": t.source,
|
||||
"target": t.target,
|
||||
"translations": t.translations or {},
|
||||
}
|
||||
for t in g.terms
|
||||
] if g.terms else []
|
||||
else:
|
||||
data["terms_count"] = len(g.terms) if g.terms else 0
|
||||
return data
|
||||
def count_terms(session, glossary_id: str) -> int:
|
||||
return session.execute(
|
||||
text("SELECT COUNT(*) FROM glossary_terms WHERE glossary_id = :id"),
|
||||
{"id": glossary_id},
|
||||
).scalar() or 0
|
||||
|
||||
with get_sync_session() as session:
|
||||
keeper_terms = count_terms(session, keeper["id"])
|
||||
duplicate_payload = []
|
||||
for d in duplicates:
|
||||
tcount = count_terms(session, d["id"])
|
||||
duplicate_payload.append({
|
||||
**d,
|
||||
"created_at": to_iso(d["created_at"]),
|
||||
"updated_at": to_iso(d["updated_at"]),
|
||||
"terms": _fetch_terms(session, d["id"]),
|
||||
"terms_count": tcount,
|
||||
})
|
||||
|
||||
return {
|
||||
"user_id": user_id,
|
||||
"template_id": template_id,
|
||||
"keep": serialize_glossary(keeper, include_terms=False),
|
||||
"name": name,
|
||||
"keep": {
|
||||
"id": keeper["id"],
|
||||
"name": keeper["name"],
|
||||
"source_language": keeper["source_language"],
|
||||
"target_language": keeper["target_language"],
|
||||
"template_id": keeper["template_id"],
|
||||
"created_at": to_iso(keeper["created_at"]),
|
||||
"updated_at": to_iso(keeper["updated_at"]),
|
||||
"terms_count": keeper_terms,
|
||||
},
|
||||
"duplicates_count": len(duplicates),
|
||||
"duplicates": [serialize_glossary(d, include_terms=True) for d in duplicates],
|
||||
"duplicates": duplicate_payload,
|
||||
}
|
||||
|
||||
|
||||
def write_backup(groups: dict[tuple[str, str | None], list[Glossary]], output_path: Path) -> dict:
|
||||
def _fetch_terms(session, glossary_id: str) -> list[dict]:
    """Return every term of one glossary as plain dicts, ordered by term id.

    Used to store the full content of duplicate glossaries in the backup.
    A falsy `translations` value is replaced by an empty dict.
    """
    query = text(
        "SELECT id, source, target, translations "
        "FROM glossary_terms WHERE glossary_id = :id ORDER BY id"
    )
    fetched = session.execute(query, {"id": glossary_id}).fetchall()

    terms: list[dict] = []
    for row in fetched:
        terms.append({
            "id": row.id,
            "source": row.source,
            "target": row.target,
            "translations": row.translations or {},
        })
    return terms
|
||||
|
||||
|
||||
def write_backup(groups: dict[tuple[str, str], list[dict]], output_path: Path) -> dict:
|
||||
"""Write the full backup to `output_path` and return a stats dict."""
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
payload = {
|
||||
"generated_at": datetime.now(timezone.utc).isoformat(),
|
||||
"schema_version": 1,
|
||||
"note": "Aucun glossaire n'a été supprimé. Ce fichier documente les doublons.",
|
||||
"schema_version": 2,
|
||||
"note": "Aucun glossaire n'a été supprimé. Ce fichier documente les doublons. "
|
||||
"Les glossaires multilingues (« → Multilingue ») ont un nom distinct "
|
||||
"et ne sont jamais fusionnés avec leurs homologues « → Anglais ».",
|
||||
"total_groups": len(groups),
|
||||
"total_duplicates": sum(len(v) - 1 for v in groups.values()),
|
||||
"groups": [serialize_group(uid, tid, gs) for (uid, tid), gs in
|
||||
sorted(groups.items(), key=lambda x: (x[0][1] or "", x[0][0]))],
|
||||
"groups": [
|
||||
serialize_group(uid, name, gs)
|
||||
for (uid, name), gs in sorted(groups.items(), key=lambda x: (x[0][1], x[0][0]))
|
||||
],
|
||||
}
|
||||
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
@@ -205,17 +221,10 @@ def print_report(payload: dict) -> None:
|
||||
logger.info("Généré le : %s", payload["generated_at"])
|
||||
logger.info("Groupes concernés: %d", payload["total_groups"])
|
||||
logger.info("Doublons totaux : %d", payload["total_duplicates"])
|
||||
|
||||
by_template: dict[str, int] = defaultdict(int)
|
||||
logger.info("")
|
||||
logger.info("Détail par nom :")
|
||||
for g in payload["groups"]:
|
||||
tid = g["template_id"] or "(no template)"
|
||||
by_template[tid] += g["duplicates_count"]
|
||||
|
||||
if by_template:
|
||||
logger.info("")
|
||||
logger.info("Par template :")
|
||||
for tid, count in sorted(by_template.items()):
|
||||
logger.info(" %-14s %d doublon(s)", tid, count)
|
||||
logger.info(" '%s' → %d doublon(s) à supprimer", g["name"], g["duplicates_count"])
|
||||
logger.info("=" * 78)
|
||||
|
||||
|
||||
@@ -233,50 +242,15 @@ def main() -> int:
|
||||
metavar="PATH",
|
||||
help="Chemin du fichier JSON de sortie (défaut : backups/glossary_duplicates_<timestamp>.json).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--include-no-template",
|
||||
action="store_true",
|
||||
help="Inclut aussi les glossaires sans template_id dans la recherche de doublons.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--allow-missing-template-id",
|
||||
action="store_true",
|
||||
help="Continue sans erreur si la colonne `template_id` est absente du schéma "
|
||||
"(équivaut à --include-no-template, mais le script ne plantera pas).",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if not _has_template_id_column():
|
||||
if args.allow_missing_template_id or args.include_no_template:
|
||||
logger.warning("⚠️ Colonne `template_id` absente — bascule en mode sans-template.")
|
||||
args.include_no_template = True
|
||||
else:
|
||||
logger.error(
|
||||
"❌ La colonne `glossaries.template_id` est absente du schéma actuel. "
|
||||
"Appliquez d'abord la migration Alembic (alembic upgrade head) ou relancez "
|
||||
"avec --allow-missing-template-id pour ne sauvegarder que les glossaires sans template_id."
|
||||
)
|
||||
return 2
|
||||
|
||||
from database.connection import get_sync_session
|
||||
|
||||
use_raw = args.allow_missing_template_id or args.include_no_template
|
||||
if args.allow_missing_template_id and not args.include_no_template:
|
||||
args.include_no_template = True
|
||||
|
||||
logger.info(
|
||||
"🔍 Recherche de doublons%s%s…",
|
||||
"🔍 Recherche de doublons (user_id, name)%s…",
|
||||
f" pour user_id={args.user}" if args.user else "",
|
||||
" (incl. sans template)" if args.include_no_template else "",
|
||||
)
|
||||
|
||||
with get_sync_session() as session:
|
||||
groups = find_duplicates(
|
||||
session,
|
||||
user_id=args.user,
|
||||
include_no_template=args.include_no_template,
|
||||
use_raw_query=use_raw,
|
||||
)
|
||||
groups = find_duplicates(session, user_id=args.user)
|
||||
|
||||
if not groups:
|
||||
if args.user:
|
||||
|
||||
@@ -3,11 +3,15 @@
|
||||
Supprime les glossaires dupliqués en se basant sur un fichier JSON de backup
|
||||
produit par `scripts/backup_duplicate_glossaries.py`.
|
||||
|
||||
⚠️ DESTRUCTIF. Par défaut, demande confirmation interactive avant chaque
|
||||
⚠️ DESTRUCTIF. Par défaut, demande confirmation interactive avant la
|
||||
suppression. Utiliser --yes pour les exécutions automatisées.
|
||||
|
||||
Pour chaque groupe dans le JSON, le glossaire listé dans "keep" est conservé,
|
||||
ceux listés dans "duplicates" sont supprimés (ainsi que leurs termes via cascade).
|
||||
Pour chaque groupe (user_id, name) dans le JSON, le glossaire listé dans "keep"
|
||||
est conservé, ceux listés dans "duplicates" sont supprimés (ainsi que leurs
|
||||
termes via cascade).
|
||||
|
||||
Les glossaires multilingues (« → Multilingue ») ont un nom distinct et ne
|
||||
peuvent pas être inclus dans un groupe de doublons : ils sont préservés.
|
||||
|
||||
Usage:
|
||||
# Dry-run (relecture) :
|
||||
@@ -19,16 +23,18 @@ Usage:
|
||||
# Sans confirmation (CI / cron) :
|
||||
python scripts/delete_duplicate_glossaries.py backups/glossary_duplicates_xxx.json --yes
|
||||
|
||||
# Re-génère le backup à la volée (si --input absent) :
|
||||
# Re-génère le backup à la volée (si BACKUP_JSON absent) :
|
||||
DATABASE_URL=... python scripts/delete_duplicate_glossaries.py --user <USER_ID> --dry-run
|
||||
|
||||
# Back-up automatique avant suppression (recommandé) :
|
||||
python scripts/delete_duplicate_glossaries.py --user <USER_ID> --yes
|
||||
# Backup auto + suppression :
|
||||
DATABASE_URL=... python scripts/delete_duplicate_glossaries.py --user <USER_ID> --yes
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
@@ -36,10 +42,9 @@ from pathlib import Path
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from sqlalchemy import inspect, text
|
||||
from sqlalchemy import text
|
||||
|
||||
from database.connection import get_sync_session, sync_engine
|
||||
from database.models import Glossary, User
|
||||
from database.connection import get_sync_session
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -67,9 +72,6 @@ def validate_ids(session, backup: dict) -> tuple[list[dict], list[str], list[str
|
||||
Returns (valid_groups, errors, warnings).
|
||||
- errors: bloquants (mismatch owner / id introuvable)
|
||||
- warnings: informatifs (user parent absent — données orphelines)
|
||||
|
||||
Utilise du SQL brut pour ne pas dépendre du mapping ORM (qui planterait
|
||||
si la colonne `template_id` n'existe pas dans le schéma).
|
||||
"""
|
||||
errors: list[str] = []
|
||||
warnings: list[str] = []
|
||||
@@ -104,7 +106,7 @@ def validate_ids(session, backup: dict) -> tuple[list[dict], list[str], list[str
|
||||
|
||||
|
||||
def print_preview(valid_groups: list[dict]) -> tuple[int, int]:
|
||||
"""Print what would be deleted. Returns (total_dupes, total_user_groups)."""
|
||||
"""Print what would be deleted. Returns (total_dupes, total_terms)."""
|
||||
total_dupes = 0
|
||||
total_terms = 0
|
||||
|
||||
@@ -112,50 +114,33 @@ def print_preview(valid_groups: list[dict]) -> tuple[int, int]:
|
||||
logger.info("Aperçu de la suppression")
|
||||
logger.info("=" * 78)
|
||||
|
||||
by_template: dict[str, int] = {}
|
||||
for g in valid_groups:
|
||||
tid = g.get("template_id") or "(no template)"
|
||||
by_template[tid] = by_template.get(tid, 0) + g["duplicates_count"]
|
||||
total_dupes += g["duplicates_count"]
|
||||
total_terms += sum(len(d.get("terms", [])) for d in g["duplicates"])
|
||||
for d in g["duplicates"]:
|
||||
total_terms += d.get("terms_count", len(d.get("terms", [])))
|
||||
|
||||
logger.info("Groupes à traiter : %d", len(valid_groups))
|
||||
logger.info("Glossaires à supprimer : %d", total_dupes)
|
||||
logger.info("Termes concernés (estim.) : %d", total_terms)
|
||||
logger.info("Groupes à traiter : %d", len(valid_groups))
|
||||
logger.info("Glossaires à supprimer : %d", total_dupes)
|
||||
logger.info("Termes concernés : %d", total_terms)
|
||||
logger.info("")
|
||||
logger.info("Détail par template :")
|
||||
for tid, count in sorted(by_template.items()):
|
||||
logger.info(" %-14s %d doublon(s) à supprimer", tid, count)
|
||||
logger.info("Détail par nom :")
|
||||
for g in valid_groups:
|
||||
logger.info(" '%s' → %d doublon(s)", g["name"], g["duplicates_count"])
|
||||
logger.info("=" * 78)
|
||||
return total_dupes, total_terms
|
||||
|
||||
|
||||
def delete_group(session, group: dict) -> tuple[int, int]:
|
||||
"""Delete the duplicates of a single group. Returns (glossaries_deleted, terms_deleted).
|
||||
|
||||
Utilise SQL brut pour ne pas dépendre du mapping ORM (qui planterait
|
||||
si la colonne `template_id` n'existe pas dans le schéma).
|
||||
"""
|
||||
"""Delete the duplicates of a single group. Returns (glossaries_deleted, terms_deleted)."""
|
||||
deleted = 0
|
||||
terms_deleted = 0
|
||||
|
||||
for dup in group["duplicates"]:
|
||||
# 1. Compter les termes (avant suppression, pour les logs).
|
||||
term_count = session.execute(
|
||||
text("SELECT COUNT(*) FROM glossary_terms WHERE glossary_id = :gid"),
|
||||
{"gid": dup["id"]},
|
||||
).scalar() or 0
|
||||
|
||||
# 2. Récupérer le template_id pour le log (best effort).
|
||||
try:
|
||||
tpl = session.execute(
|
||||
text("SELECT template_id FROM glossaries WHERE id = :gid"),
|
||||
{"gid": dup["id"]},
|
||||
).scalar()
|
||||
except Exception:
|
||||
tpl = None
|
||||
|
||||
# 3. Supprimer d'abord les termes (FK), puis le glossaire.
|
||||
try:
|
||||
session.execute(
|
||||
text("DELETE FROM glossary_terms WHERE glossary_id = :gid"),
|
||||
@@ -172,10 +157,7 @@ def delete_group(session, group: dict) -> tuple[int, int]:
|
||||
|
||||
deleted += 1
|
||||
terms_deleted += term_count
|
||||
logger.info(
|
||||
" 🗑️ Supprimé id=%s (template=%s, %d termes)",
|
||||
dup["id"], tpl, term_count,
|
||||
)
|
||||
logger.info(" 🗑️ Supprimé id=%s (%d termes)", dup["id"], term_count)
|
||||
|
||||
return deleted, terms_deleted
|
||||
|
||||
@@ -206,13 +188,13 @@ def perform_deletion(backup: dict, dry_run: bool) -> int:
|
||||
if total_dupes == 0:
|
||||
return 0
|
||||
|
||||
# Commit par user pour limiter l'impact d'une erreur partielle (F6).
|
||||
# Commit par groupe (user + name) pour limiter l'impact d'une erreur partielle.
|
||||
grand_deleted = 0
|
||||
grand_terms = 0
|
||||
for group in valid_groups:
|
||||
user_id = group["user_id"]
|
||||
tid = group.get("template_id")
|
||||
logger.info("👤 user=%s template=%s — suppression…", user_id, tid)
|
||||
name = group["name"]
|
||||
logger.info("👤 user=%s name=%s — suppression…", user_id, name)
|
||||
try:
|
||||
deleted, terms = delete_group(session, group)
|
||||
session.commit()
|
||||
@@ -220,7 +202,7 @@ def perform_deletion(backup: dict, dry_run: bool) -> int:
|
||||
grand_terms += terms
|
||||
except Exception as e:
|
||||
session.rollback()
|
||||
logger.error("❌ Échec pour user=%s template=%s : %s", user_id, tid, e)
|
||||
logger.error("❌ Échec pour user=%s name=%s : %s", user_id, name, e)
|
||||
logger.error(" Transaction annulée pour ce groupe, on continue.")
|
||||
|
||||
logger.info("=" * 78)
|
||||
@@ -241,10 +223,8 @@ def confirm(prompt: str) -> bool:
|
||||
return answer in ("oui", "o", "yes", "y")
|
||||
|
||||
|
||||
def regenerate_backup(user_id: str | None, allow_missing_template_id: bool = False) -> Path:
|
||||
def regenerate_backup(user_id: str | None) -> Path:
|
||||
"""Run the backup script as a subprocess to get a fresh JSON."""
|
||||
import subprocess
|
||||
|
||||
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
|
||||
out_path = ROOT / "backups" / f"glossary_duplicates_{timestamp}.json"
|
||||
cmd = [
|
||||
@@ -253,11 +233,9 @@ def regenerate_backup(user_id: str | None, allow_missing_template_id: bool = Fal
|
||||
]
|
||||
if user_id:
|
||||
cmd += ["--user", user_id]
|
||||
if allow_missing_template_id:
|
||||
cmd += ["--allow-missing-template-id"]
|
||||
cmd += ["--output", str(out_path)]
|
||||
logger.info("🔄 Génération d'un backup frais : %s", " ".join(cmd))
|
||||
res = subprocess.run(cmd, env=__import__("os").environ.copy())
|
||||
res = subprocess.run(cmd, env=os.environ.copy())
|
||||
if res.returncode != 0:
|
||||
logger.error("❌ Échec de la génération du backup (code=%d).", res.returncode)
|
||||
sys.exit(2)
|
||||
@@ -279,11 +257,6 @@ def main() -> int:
|
||||
metavar="USER_ID",
|
||||
help="Génère un backup frais limité à cet utilisateur (utilisé si BACKUP_JSON absent).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--allow-missing-template-id",
|
||||
action="store_true",
|
||||
help="Transmis au script de backup si le schéma DB n'a pas la colonne template_id.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
@@ -300,7 +273,7 @@ def main() -> int:
|
||||
if args.input:
|
||||
backup_path = Path(args.input)
|
||||
elif args.user:
|
||||
backup_path = regenerate_backup(args.user, args.allow_missing_template_id)
|
||||
backup_path = regenerate_backup(args.user)
|
||||
else:
|
||||
parser.error("Fournissez un BACKUP_JSON ou bien --user USER_ID.")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user