All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 6m31s
- scripts/backup_duplicate_glossaries.py : exporte en JSON les doublons (meme user_id + template_id) sans rien supprimer. Schema validation, tri stable, mode degrade si colonne template_id absente. - scripts/delete_duplicate_glossaries.py : lit un backup JSON et supprime les doublons listes. Validation IDs, confirmation interactive, commit par user, mode --dry-run / --yes. - .gitea/workflows/cleanup-glossaries.yml : workflow_dispatch qui SSH sur le serveur de prod et execute le script dans le conteneur backend (postgres demarre, .env charge, env_file docker-compose).
324 lines
11 KiB
Python
324 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Supprime les glossaires dupliqués en se basant sur un fichier JSON de backup
|
|
produit par `scripts/backup_duplicate_glossaries.py`.
|
|
|
|
⚠️ DESTRUCTIF. Par défaut, demande une confirmation interactive unique avant toute
|
|
suppression. Utiliser --yes pour les exécutions automatisées.
|
|
|
|
Pour chaque groupe dans le JSON, le glossaire listé dans "keep" est conservé,
|
|
ceux listés dans "duplicates" sont supprimés (leurs termes sont supprimés explicitement juste avant).
|
|
|
|
Usage:
|
|
# Dry-run (relecture) :
|
|
python scripts/delete_duplicate_glossaries.py backups/glossary_duplicates_xxx.json --dry-run
|
|
|
|
# Confirmation interactive :
|
|
python scripts/delete_duplicate_glossaries.py backups/glossary_duplicates_xxx.json
|
|
|
|
# Sans confirmation (CI / cron) :
|
|
python scripts/delete_duplicate_glossaries.py backups/glossary_duplicates_xxx.json --yes
|
|
|
|
# Re-génère le backup à la volée (si BACKUP_JSON est absent, via --user) :
|
|
DATABASE_URL=... python scripts/delete_duplicate_glossaries.py --user <USER_ID> --dry-run
|
|
|
|
# Back-up automatique avant suppression (recommandé) :
|
|
python scripts/delete_duplicate_glossaries.py --user <USER_ID> --yes
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
ROOT = Path(__file__).resolve().parent.parent
|
|
sys.path.insert(0, str(ROOT))
|
|
|
|
from sqlalchemy import inspect, text
|
|
|
|
from database.connection import get_sync_session, sync_engine
|
|
from database.models import Glossary, User
|
|
|
|
# Root logging setup for the script: timestamped records, INFO and above.
logging.basicConfig(format="%(asctime)s [%(levelname)s] %(message)s", level=logging.INFO)

# Logger shared by every function in this script.
logger = logging.getLogger("delete_dup_glossaries")
|
|
|
|
|
|
def load_backup(path: Path) -> dict:
    """Load a backup JSON file and check its top-level structure.

    Args:
        path: Location of the JSON backup file to read.

    Returns:
        The decoded JSON document. It is guaranteed to be a dict whose
        ``"groups"`` entry is a list.

    Exits:
        Logs an error and calls ``sys.exit(2)`` when the file is missing,
        cannot be read, is not valid JSON, or does not have the expected
        top-level shape.
    """
    if not path.exists():
        logger.error("❌ Fichier de backup introuvable : %s", path)
        sys.exit(2)

    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; OSError covers directories and permission problems.
        logger.error("❌ Impossible de lire le backup %s : %s", path, exc)
        sys.exit(2)

    # The rest of the script indexes data["groups"] and iterates over it, so
    # reject anything that is not {"groups": [...]} right away.
    if not isinstance(data, dict) or "groups" not in data:
        logger.error("❌ Le fichier %s n'a pas de clé 'groups' — format invalide.", path)
        sys.exit(2)
    if not isinstance(data["groups"], list):
        logger.error("❌ La clé 'groups' de %s doit être une liste — format invalide.", path)
        sys.exit(2)

    return data
|
|
|
|
|
|
def validate_ids(session, backup: dict) -> tuple[list[dict], list[str], list[str]]:
    """Check every group of the backup against the current database content.

    Raw SQL is used on purpose so the check does not depend on the ORM
    mapping (which would break if the ``template_id`` column is missing from
    the schema).

    Args:
        session: Database session used to run the lookup queries.
        backup: Decoded backup document; ``backup["groups"]`` is iterated and
            each group must provide ``"user_id"`` and ``"duplicates"``.

    Returns:
        A ``(valid_groups, errors, warnings)`` tuple:

        - ``valid_groups``: groups for which no error was found;
        - ``errors``: blocking problems (glossary not found, owner mismatch,
          glossary to keep missing or also listed as a duplicate);
        - ``warnings``: informational messages (parent user missing, i.e.
          orphaned data).
    """
    errors: list[str] = []
    warnings: list[str] = []
    valid: list[dict] = []

    # Same ownership lookup for the kept glossary and for each duplicate.
    owned_glossary = text("SELECT id FROM glossaries WHERE id = :gid AND user_id = :uid")

    for group in backup["groups"]:
        user_id = group["user_id"]
        group_errors: list[str] = []

        user_exists = session.execute(
            text("SELECT id FROM users WHERE id = :uid"), {"uid": user_id}
        ).first()
        if not user_exists:
            warnings.append(
                f"User parent introuvable (données orphelines) : user_id={user_id} — "
                f"la suppression des glossaires associés sera tentée."
            )

        dup_ids = [dup["id"] for dup in group["duplicates"]]

        # Safety net: deleting the duplicates is only safe if the glossary to
        # keep still exists and is not itself scheduled for deletion.
        # NOTE(review): assumes "keep" is an object with an "id" field, like
        # the entries of "duplicates" — confirm against the backup script.
        # When it has another shape the check is skipped.
        keep = group.get("keep")
        keep_id = keep.get("id") if isinstance(keep, dict) else None
        if keep_id is not None:
            if keep_id in dup_ids:
                group_errors.append(
                    f"Le glossaire à conserver figure aussi dans les doublons : "
                    f"id={keep_id} user={user_id}"
                )
            elif not session.execute(
                owned_glossary, {"gid": keep_id, "uid": user_id}
            ).first():
                group_errors.append(
                    f"Glossaire à conserver introuvable ou owner mismatch : "
                    f"id={keep_id} user={user_id}"
                )

        for dup_id in dup_ids:
            row = session.execute(
                owned_glossary, {"gid": dup_id, "uid": user_id}
            ).first()
            if not row:
                group_errors.append(
                    f"Glossary introuvable ou owner mismatch : id={dup_id} user={user_id}"
                )

        # Only error-free groups are reported as valid.
        if group_errors:
            errors.extend(group_errors)
        else:
            valid.append(group)

    return valid, errors, warnings
|
|
|
|
|
|
def print_preview(valid_groups: list[dict]) -> tuple[int, int]:
    """Log a summary of what the deletion would remove.

    Counts are derived from each group's ``"duplicates"`` list (the entries
    that are actually deleted) rather than from the ``"duplicates_count"``
    metadata field, so the preview cannot drift from the real work.

    Args:
        valid_groups: Groups to summarise. Each may carry ``"template_id"``
            and a ``"duplicates"`` list whose items may have a ``"terms"``
            list.

    Returns:
        ``(total_dupes, total_terms)``: the number of glossaries that would
        be deleted and the number of terms listed for them in the backup.
    """
    total_dupes = 0
    total_terms = 0

    logger.info("=" * 78)
    logger.info("Aperçu de la suppression")
    logger.info("=" * 78)

    by_template: dict[str, int] = {}
    for g in valid_groups:
        duplicates = g.get("duplicates") or []
        # Normalise to str so that sorting never compares mixed types.
        tid = str(g.get("template_id") or "(no template)")
        by_template[tid] = by_template.get(tid, 0) + len(duplicates)
        total_dupes += len(duplicates)
        # "terms" may be absent or null in the backup; both count as zero.
        total_terms += sum(len(d.get("terms") or []) for d in duplicates)

    logger.info("Groupes à traiter         : %d", len(valid_groups))
    logger.info("Glossaires à supprimer    : %d", total_dupes)
    logger.info("Termes concernés (estim.) : %d", total_terms)
    logger.info("")
    logger.info("Détail par template :")
    for tid, count in sorted(by_template.items()):
        logger.info("  %-14s  %d doublon(s) à supprimer", tid, count)
    logger.info("=" * 78)

    return total_dupes, total_terms
|
|
|
|
|
|
def delete_group(session, group: dict) -> tuple[int, int]:
    """Delete every duplicate glossary listed in a single group.

    Raw SQL is used on purpose so the deletion does not depend on the ORM
    mapping (which would break if the ``template_id`` column is missing from
    the schema). Nothing is committed here; the caller owns the transaction.

    Args:
        session: Database session used to run the statements.
        group: One backup group; ``group["duplicates"]`` lists the glossaries
            to delete, each with an ``"id"``.

    Returns:
        ``(glossaries_deleted, terms_deleted)`` for this group.

    Raises:
        RuntimeError: If a DELETE fails. The session is rolled back first.
    """
    deleted = 0
    terms_deleted = 0

    for dup in group["duplicates"]:
        params = {"gid": dup["id"]}

        # 1. Count the terms before deleting them, for the log line.
        term_count = session.execute(
            text("SELECT COUNT(*) FROM glossary_terms WHERE glossary_id = :gid"),
            params,
        ).scalar() or 0

        # 2. Fetch template_id for the log line (best effort).
        #    The query runs inside a SAVEPOINT: on PostgreSQL a failed
        #    statement (e.g. the column does not exist) aborts the enclosing
        #    transaction, which would make the DELETEs below fail as well.
        try:
            with session.begin_nested():
                tpl = session.execute(
                    text("SELECT template_id FROM glossaries WHERE id = :gid"),
                    params,
                ).scalar()
        except Exception:
            # Deliberately broad: this lookup is informational only. Fall
            # back to the value recorded in the backup, if any.
            tpl = group.get("template_id")

        # 3. Delete the terms first (foreign key), then the glossary itself.
        try:
            session.execute(
                text("DELETE FROM glossary_terms WHERE glossary_id = :gid"),
                params,
            )
            session.execute(
                text("DELETE FROM glossaries WHERE id = :gid"),
                params,
            )
            session.flush()
        except Exception as e:
            session.rollback()
            raise RuntimeError(f"Échec suppression glossary {dup['id']}: {e}") from e

        deleted += 1
        terms_deleted += term_count
        logger.info(
            "  🗑️  Supprimé id=%s (template=%s, %d termes)",
            dup["id"], tpl, term_count,
        )

    return deleted, terms_deleted
|
|
|
|
|
|
def perform_deletion(backup: dict, dry_run: bool) -> int:
    """Validate the backup, show a preview and delete the duplicates.

    Each group is committed separately so that a failure only rolls back the
    group being processed; the remaining groups are still attempted.

    Args:
        backup: Decoded backup document (must contain ``"groups"``).
        dry_run: When true, stop after the preview without modifying data.

    Returns:
        Process exit code: ``0`` on success (including dry-run and "nothing
        to do"), ``3`` when validation found blocking errors, ``4`` when at
        least one group failed and was rolled back.
    """
    with get_sync_session() as session:
        valid_groups, errors, warnings = validate_ids(session, backup)
        if errors:
            logger.error("❌ %d erreur(s) de validation :", len(errors))
            for e in errors:
                logger.error("  - %s", e)
            logger.error("Annulation. Corrigez le backup ou la DB puis ré-essayez.")
            return 3
        for w in warnings:
            logger.warning("⚠️  %s", w)

        if not valid_groups:
            logger.info("✅ Aucun groupe à supprimer.")
            return 0

        total_dupes, total_terms = print_preview(valid_groups)

        if dry_run:
            logger.info("⚠️  Mode --dry-run : aucune suppression effectuée.")
            return 0

        if total_dupes == 0:
            return 0

        # One commit per group to limit the impact of a partial failure (F6).
        grand_deleted = 0
        grand_terms = 0
        failed_groups = 0
        for group in valid_groups:
            user_id = group["user_id"]
            tid = group.get("template_id")
            logger.info("👤 user=%s template=%s — suppression…", user_id, tid)
            try:
                deleted, terms = delete_group(session, group)
                session.commit()
            except Exception as e:
                session.rollback()
                failed_groups += 1
                logger.error("❌ Échec pour user=%s template=%s : %s", user_id, tid, e)
                logger.error("   Transaction annulée pour ce groupe, on continue.")
            else:
                # Only count what was actually committed.
                grand_deleted += deleted
                grand_terms += terms

        logger.info("=" * 78)
        logger.info(
            "✅ Terminé : %d glossaire(s) supprimé(s), %d termes supprimé(s).",
            grand_deleted, grand_terms,
        )
        if failed_groups:
            logger.error(
                "❌ %d groupe(s) en échec sur %d — voir les erreurs ci-dessus.",
                failed_groups, len(valid_groups),
            )
        logger.info("=" * 78)

        # A non-zero status lets automated runs detect a partial failure.
        return 4 if failed_groups else 0
|
|
|
|
|
|
def confirm(prompt: str) -> bool:
    """Prompt on stdin and report whether the operator agreed.

    The answer is trimmed and lower-cased before comparison; "oui", "o",
    "yes" and "y" count as acceptance. Anything else, or a closed stdin
    (EOF), counts as refusal.
    """
    accepted = ("oui", "o", "yes", "y")
    try:
        reply = input(f"{prompt} [oui/non] : ")
    except EOFError:
        return False
    normalized = reply.strip().lower()
    return normalized in accepted
|
|
|
|
|
|
def regenerate_backup(user_id: str | None, allow_missing_template_id: bool = False) -> Path:
    """Run the backup script as a subprocess to produce a fresh JSON file.

    Args:
        user_id: When given, forwarded to the backup script as ``--user``.
        allow_missing_template_id: When true, forwards
            ``--allow-missing-template-id`` to the backup script.

    Returns:
        Path passed to the backup script as ``--output``: a UTC-timestamped
        file under ``ROOT / "backups"``.

    Exits:
        Logs an error and calls ``sys.exit(2)`` when the backup script
        returns a non-zero status.
    """
    import os
    import subprocess

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out_path = ROOT / "backups" / f"glossary_duplicates_{timestamp}.json"
    # Make sure the target directory exists before asking the child process
    # to write into it.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    cmd = [
        sys.executable,
        str(ROOT / "scripts" / "backup_duplicate_glossaries.py"),
    ]
    if user_id:
        cmd += ["--user", user_id]
    if allow_missing_template_id:
        cmd += ["--allow-missing-template-id"]
    cmd += ["--output", str(out_path)]

    logger.info("🔄 Génération d'un backup frais : %s", " ".join(cmd))
    # The child receives a copy of the current environment; arguments are
    # passed as a list, so no shell is involved.
    res = subprocess.run(cmd, env=os.environ.copy())
    if res.returncode != 0:
        logger.error("❌ Échec de la génération du backup (code=%d).", res.returncode)
        sys.exit(2)
    return out_path
|
|
|
|
|
|
def main() -> int:
    """Parse the command line and run the deletion flow.

    The input is either an existing backup file (positional argument) or, if
    absent, a fresh backup generated for ``--user``. Unless ``--dry-run`` or
    ``--yes`` is given, a single interactive confirmation is requested
    before anything is deleted.

    Returns:
        Process exit code: ``1`` when the operator declines, otherwise the
        value returned by ``perform_deletion`` (or ``0`` when the backup
        lists nothing to delete).
    """
    parser = argparse.ArgumentParser(
        description="Supprime les doublons de glossaires en se basant sur un backup JSON."
    )
    parser.add_argument(
        "input",
        nargs="?",
        metavar="BACKUP_JSON",
        help="Fichier JSON de backup. Si absent, --user doit être fourni pour en générer un.",
    )
    parser.add_argument(
        "--user",
        metavar="USER_ID",
        help="Génère un backup frais limité à cet utilisateur (utilisé si BACKUP_JSON absent).",
    )
    parser.add_argument(
        "--allow-missing-template-id",
        action="store_true",
        help="Transmis au script de backup si le schéma DB n'a pas la colonne template_id.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Affiche ce qui serait supprimé sans rien modifier.",
    )
    parser.add_argument(
        "--yes",
        action="store_true",
        help="Ne demande pas de confirmation interactive.",
    )
    args = parser.parse_args()

    # Resolve the input file.
    if args.input:
        backup_path = Path(args.input)
    elif args.user:
        backup_path = regenerate_backup(args.user, args.allow_missing_template_id)
    else:
        # parser.error() prints the usage message and exits the process.
        parser.error("Fournissez un BACKUP_JSON ou bien --user USER_ID.")

    backup = load_backup(backup_path)
    logger.info("📄 Backup chargé : %s (généré le %s)", backup_path, backup.get("generated_at"))

    if not args.dry_run and not args.yes:
        # Count what is really listed for deletion instead of trusting the
        # optional "total_duplicates" header, so the interactive path and
        # the --yes path agree on whether there is work to do.
        total = sum(len(g.get("duplicates") or []) for g in backup["groups"])
        if total == 0:
            logger.info("✅ Aucun doublon à supprimer dans ce backup.")
            return 0
        if not confirm(f"Supprimer {total} glossaire(s) listé(s) dans le backup ?"):
            logger.info("Annulé par l'utilisateur.")
            return 1

    return perform_deletion(backup, dry_run=args.dry_run)
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|