feat(glossaries): add backup + delete scripts and Gitea workflow for duplicate cleanup
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 6m31s
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 6m31s
- scripts/backup_duplicate_glossaries.py : exporte en JSON les doublons (meme user_id + template_id) sans rien supprimer. Schema validation, tri stable, mode degrade si colonne template_id absente. - scripts/delete_duplicate_glossaries.py : lit un backup JSON et supprime les doublons listes. Validation IDs, confirmation interactive, commit par user, mode --dry-run / --yes. - .gitea/workflows/cleanup-glossaries.yml : workflow_dispatch qui SSH sur le serveur de prod et execute le script dans le conteneur backend (postgres demarre, .env charge, env_file docker-compose).
This commit is contained in:
87
.gitea/workflows/cleanup-glossaries.yml
Normal file
87
.gitea/workflows/cleanup-glossaries.yml
Normal file
@@ -0,0 +1,87 @@
|
||||
# Manually triggered workflow: connects to the production host over SSH and
# runs scripts/delete_duplicate_glossaries.py inside the backend container.
name: Cleanup Duplicate Glossaries

on:
  workflow_dispatch:
    inputs:
      user_id:
        description: 'User ID dont on nettoie les doublons (obligatoire).'
        required: true
        type: string
      dry_run:
        description: 'Mode relecture seule — aucune suppression.'
        required: false
        default: true
        type: boolean
      yes:
        description: 'Confirme la suppression (ignoré si dry_run=true).'
        required: false
        default: false
        type: boolean

jobs:
  cleanup:
    name: Backup and delete duplicate glossaries on production
    runs-on: ubuntu-24.04
    steps:
      - name: Setup SSH
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa
          chmod 600 ~/.ssh/id_rsa
          # NOTE(review): ssh-keyscan accepts whatever host key is presented at
          # run time; consider pinning the expected key in a secret instead.
          ssh-keyscan -H 192.168.1.151 >> ~/.ssh/known_hosts

      - name: Run cleanup on production server
        env:
          USER_ID: ${{ inputs.user_id }}
          DRY_RUN: ${{ inputs.dry_run }}
          YES: ${{ inputs.yes }}
        run: |
          # The ID is spliced into a script executed as root on the server, so
          # reject anything outside a conservative character set first.
          # NOTE(review): widen the set if real user IDs use other characters.
          case "${USER_ID}" in
            ''|*[!A-Za-z0-9._@-]*)
              echo "Invalid user_id: expected only characters in [A-Za-z0-9._@-]." >&2
              exit 1
              ;;
          esac

          # The heredoc delimiter is unquoted on purpose: unescaped variables are
          # expanded here on the runner, escaped ones on the remote host.
          # Keep dollar signs and backticks out of the comments below.
          ssh root@192.168.1.151 << ENDSSH
          set -euo pipefail

          # The step env exists only on the runner, so the three inputs are
          # injected here (expanded before the script is sent to the server).
          USER_ID='${USER_ID}'
          DRY_RUN='${DRY_RUN}'
          YES='${YES}'

          cd /opt/wordly

          # Make sure the checkout is current (the scripts live in the repo).
          git config --global --add safe.directory /opt/wordly
          git fetch origin production-deployment
          git reset --hard origin/production-deployment

          # Make sure postgres is running (the script reads DATABASE_URL from .env).
          docker compose up -d postgres redis

          # Wait for postgres (max 60s). stdin is redirected because this script
          # itself is read from stdin and the docker client may consume it.
          for i in \$(seq 1 30); do
            if docker compose exec -T postgres pg_isready -U translate </dev/null >/dev/null 2>&1; then
              echo "Postgres ready after \$((i * 2))s"
              break
            fi
            [ "\$i" -eq 30 ] && { echo "Postgres not ready after 60s"; exit 1; }
            sleep 2
          done

          # Build the flags.
          FLAGS="--user \${USER_ID} --allow-missing-template-id"
          if [ "\${DRY_RUN}" = "true" ]; then
            FLAGS="\${FLAGS} --dry-run"
          fi
          if [ "\${YES}" = "true" ]; then
            FLAGS="\${FLAGS} --yes"
          fi

          echo "════════════════════════════════════════"
          echo " USER_ID=\${USER_ID}"
          echo " DRY_RUN=\${DRY_RUN}"
          echo " YES=\${YES}"
          echo " FLAGS=\${FLAGS}"
          echo "════════════════════════════════════════"

          # The backend service loads .env through env_file and its entrypoint
          # executes the given arguments, so DATABASE_URL points at the docker
          # hostname postgres:5432.
          # NOTE(review): the script writes its backup under backups/ inside a
          # container removed by --rm; confirm that directory is mounted on
          # the host, otherwise the backup is lost before the deletion is reviewed.
          docker compose run --rm -T backend \
            python scripts/delete_duplicate_glossaries.py \${FLAGS} </dev/null
          ENDSSH

      - name: List backup artifacts
        if: always()
        run: |
          ssh root@192.168.1.151 'ls -la /opt/wordly/backups/ 2>/dev/null || echo "(no backups dir)"'
|
||||
305
scripts/backup_duplicate_glossaries.py
Normal file
305
scripts/backup_duplicate_glossaries.py
Normal file
@@ -0,0 +1,305 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Sauvegarde en JSON les glossaires dupliqués (même user_id + template_id) créés
|
||||
avant la mise en place de la garde anti-doublon dans le backend.
|
||||
|
||||
⚠️ Ce script ne supprime RIEN — il produit uniquement un fichier de backup
|
||||
contenant l'intégralité des doublons (métadonnées + termes) en vue d'une
|
||||
analyse ou d'une suppression manuelle ultérieure.
|
||||
|
||||
Pour chaque couple (user_id, template_id) avec > 1 glossaire, le plus ancien
|
||||
(premier créé) est marqué "keeper" et les copies sont listées dans "duplicates"
|
||||
avec tous leurs termes.
|
||||
|
||||
Usage:
|
||||
# Cible la base de prod PostgreSQL (lu via DATABASE_URL) :
|
||||
DATABASE_URL=postgresql://user:pass@host:5432/db python scripts/backup_duplicate_glossaries.py
|
||||
|
||||
# Ou préciser une base SQLite spécifique :
|
||||
SQLITE_PATH=/path/to/translate.db python scripts/backup_duplicate_glossaries.py
|
||||
|
||||
# Limiter à un seul utilisateur :
|
||||
DATABASE_URL=... python scripts/backup_duplicate_glossaries.py --user <user_id>
|
||||
|
||||
# Choisir le fichier de sortie :
|
||||
DATABASE_URL=... python scripts/backup_duplicate_glossaries.py --output backups/dupes.json
|
||||
|
||||
# Forcer l'exécution même si la colonne template_id est absente du schéma
|
||||
# (utile pour un dump partiel des glossaires sans template_id) :
|
||||
DATABASE_URL=... python scripts/backup_duplicate_glossaries.py --allow-missing-template-id
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from sqlalchemy import inspect, text
|
||||
|
||||
from database.connection import sync_engine
|
||||
from database.models import Glossary
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
)
|
||||
logger = logging.getLogger("backup_dup_glossaries")
|
||||
|
||||
|
||||
def _has_template_id_column() -> bool:
    """Report whether the `glossaries` table exposes a `template_id` column.

    Returns:
        True when the column is listed by the schema inspector. Any exception
        raised during inspection is logged and reported as False, so callers
        cannot tell "column absent" apart from "inspection failed".
    """
    try:
        column_names = [
            column["name"]
            for column in inspect(sync_engine).get_columns("glossaries")
        ]
        found = "template_id" in column_names
    except Exception as e:
        logger.error("Impossible d'inspecter le schéma : %s", e)
        return False
    return found
|
||||
|
||||
|
||||
def find_duplicates(
    session, user_id: str | None = None, include_no_template: bool = False,
    use_raw_query: bool = False,
) -> dict[tuple[str, str | None], list[Glossary]]:
    """Group glossaries by (user_id, template_id) and keep groups of size > 1.

    Args:
        session: Database session used for the lookup.
        user_id: When truthy, restrict the search to this user.
        include_no_template: ORM path only. When False, glossaries whose
            template_id is NULL are ignored; when True they are grouped under
            the key (user_id, None).
        use_raw_query: Bypass the ORM query and read an explicit column list
            that does not include `template_id` (for schemas lacking that
            column). In this mode `include_no_template` is not consulted and
            EVERY glossary of a user lands under (user_id, None).

    Returns:
        Mapping of (user_id, template_id) to the glossaries sharing that key,
        restricted to keys with more than one glossary.
    """
    buckets: dict[tuple[str, str | None], list[Glossary]] = defaultdict(list)

    if use_raw_query:
        # Explicit column list: the statement never mentions template_id.
        sql = "SELECT id, user_id, name, source_language, target_language, " \
              "created_at, updated_at FROM glossaries"
        params: dict = {}
        if user_id:
            sql += " WHERE user_id = :user_id"
            params["user_id"] = user_id

        for row in session.execute(text(sql), params).fetchall():
            # These instances are built by hand and never added to the session.
            # NOTE(review): their `terms` relationship is presumably empty, so
            # a backup produced in this mode would contain no terms — confirm.
            glossary = Glossary(
                id=row.id, user_id=row.user_id, name=row.name,
                source_language=row.source_language,
                target_language=row.target_language,
                created_at=row.created_at, updated_at=row.updated_at,
            )
            buckets[(glossary.user_id, None)].append(glossary)
    else:
        query = session.query(Glossary)
        if not include_no_template:
            query = query.filter(Glossary.template_id.isnot(None))
        if user_id:
            query = query.filter(Glossary.user_id == user_id)

        for glossary in query.all():
            buckets[(glossary.user_id, glossary.template_id)].append(glossary)

    return {key: members for key, members in buckets.items() if len(members) > 1}
|
||||
|
||||
|
||||
def _stable_sort(glossaries: list[Glossary]) -> tuple[list[Glossary], int]:
    """Sort glossaries by (created_at ASC, id ASC) for deterministic ordering.

    Entries whose created_at is None sort first and are ordered by id among
    themselves. A warning is logged when such entries exist.

    Returns:
        The sorted list and the number of entries with a None created_at.
    """
    none_count = sum(1 for g in glossaries if g.created_at is None)
    if none_count:
        logger.warning(
            "⚠️ %d glossaire(s) ont un created_at NULL — tri secondaire par id.",
            none_count,
        )

    def sort_key(g):
        # The leading flag decides the order between missing and present
        # timestamps, so a placeholder is never compared with a real value.
        # (A timezone-aware sentinel would raise TypeError against naive
        # datetimes.)
        if g.created_at is None:
            return (0, 0, g.id)
        return (1, g.created_at, g.id)

    return sorted(glossaries, key=sort_key), none_count
|
||||
|
||||
|
||||
def serialize_group(user_id: str, template_id: str | None, glossaries: list[Glossary]) -> dict:
    """Build the JSON-ready description of one duplicate group.

    The first glossary in `_stable_sort` order is reported under "keep" with
    only a term count; every other glossary is reported under "duplicates"
    with its full list of terms.

    Args:
        user_id: Owner shared by all glossaries of the group.
        template_id: Template shared by the group, or None.
        glossaries: Members of the group (must not be empty).

    Returns:
        A dict with the keys user_id, template_id, keep, duplicates_count and
        duplicates.
    """

    def to_iso(value) -> str | None:
        # Normalise timestamps to ISO-8601 text; None stays None.
        if value is None:
            return None
        if isinstance(value, datetime):
            return value.isoformat()
        if not isinstance(value, str):
            return str(value)
        # Some backends hand timestamps back as text; re-format when parseable.
        try:
            return datetime.fromisoformat(value).isoformat()
        except ValueError:
            return value

    def serialize_glossary(g: Glossary, include_terms: bool) -> dict:
        data = {
            "id": g.id,
            "name": g.name,
            "source_language": g.source_language,
            "target_language": g.target_language,
            # getattr: the attribute may be missing on the object.
            "template_id": getattr(g, "template_id", None),
            "created_at": to_iso(g.created_at),
            "updated_at": to_iso(g.updated_at),
        }
        terms = g.terms
        if not include_terms:
            data["terms_count"] = len(terms) if terms else 0
            return data
        data["terms"] = [
            {
                "id": term.id,
                "source": term.source,
                "target": term.target,
                "translations": term.translations or {},
            }
            for term in terms
        ] if terms else []
        return data

    ordered, _ = _stable_sort(glossaries)
    keeper = ordered[0]
    copies = ordered[1:]

    return {
        "user_id": user_id,
        "template_id": template_id,
        "keep": serialize_glossary(keeper, include_terms=False),
        "duplicates_count": len(copies),
        "duplicates": [serialize_glossary(copy, include_terms=True) for copy in copies],
    }
|
||||
|
||||
|
||||
def write_backup(groups: dict[tuple[str, str | None], list[Glossary]], output_path: Path) -> dict:
    """Serialize all duplicate groups to `output_path` as indented UTF-8 JSON.

    Parent directories are created when missing. Groups are emitted ordered by
    template_id (None sorts as the empty string), then by user_id.

    Returns:
        The payload that was written (metadata, totals and serialized groups).
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    def group_order(item):
        (uid, tid), _members = item
        return (tid or "", uid)

    generated_at = datetime.now(timezone.utc).isoformat()
    duplicate_total = sum(len(members) - 1 for members in groups.values())
    serialized = [
        serialize_group(uid, tid, members)
        for (uid, tid), members in sorted(groups.items(), key=group_order)
    ]

    payload = {
        "generated_at": generated_at,
        "schema_version": 1,
        "note": "Aucun glossaire n'a été supprimé. Ce fichier documente les doublons.",
        "total_groups": len(groups),
        "total_duplicates": duplicate_total,
        "groups": serialized,
    }

    with output_path.open("w", encoding="utf-8") as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)

    return payload
|
||||
|
||||
|
||||
def print_report(payload: dict) -> None:
    """Log a human-readable summary of a backup payload.

    Emits the generation time, the group and duplicate totals, and — when at
    least one group exists — the number of duplicates per template.
    """
    separator = "=" * 78
    logger.info(separator)
    logger.info("Récapitulatif du backup")
    logger.info(separator)
    logger.info("Généré le : %s", payload["generated_at"])
    logger.info("Groupes concernés: %d", payload["total_groups"])
    logger.info("Doublons totaux : %d", payload["total_duplicates"])

    # Aggregate duplicate counts per template; groups without a template share
    # a single placeholder label.
    per_template: dict[str, int] = {}
    for group in payload["groups"]:
        label = group["template_id"] or "(no template)"
        per_template[label] = per_template.get(label, 0) + group["duplicates_count"]

    if per_template:
        logger.info("")
        logger.info("Par template :")
        for label in sorted(per_template):
            logger.info(" %-14s %d doublon(s)", label, per_template[label])
    logger.info(separator)
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: find duplicate glossaries and dump them to a JSON file.

    The raw-SQL fallback (which cannot see `template_id` and therefore lumps
    every glossary of a user into one group) is used only when the column is
    really absent from the schema. Passing `--allow-missing-template-id` on a
    schema that has the column no longer changes how duplicates are detected.

    Returns:
        0 on success, including when no duplicate is found (no file is written
        in that case); 2 when the `template_id` column is missing and neither
        `--allow-missing-template-id` nor `--include-no-template` was given.
    """
    parser = argparse.ArgumentParser(
        description="Sauvegarde (sans suppression) les glossaires dupliqués en JSON."
    )
    parser.add_argument(
        "--user",
        metavar="USER_ID",
        help="Limite le backup à un seul utilisateur.",
    )
    parser.add_argument(
        "--output",
        metavar="PATH",
        help="Chemin du fichier JSON de sortie (défaut : backups/glossary_duplicates_<timestamp>.json).",
    )
    parser.add_argument(
        "--include-no-template",
        action="store_true",
        help="Inclut aussi les glossaires sans template_id dans la recherche de doublons.",
    )
    parser.add_argument(
        "--allow-missing-template-id",
        action="store_true",
        help="Continue sans erreur si la colonne `template_id` est absente du schéma "
             "(bascule alors en mode sans-template ; sans effet si la colonne existe).",
    )
    args = parser.parse_args()

    has_template_id = _has_template_id_column()
    if not has_template_id:
        if not (args.allow_missing_template_id or args.include_no_template):
            logger.error(
                "❌ La colonne `glossaries.template_id` est absente du schéma actuel. "
                "Appliquez d'abord la migration Alembic (alembic upgrade head) ou relancez "
                "avec --allow-missing-template-id pour ne sauvegarder que les glossaires sans template_id."
            )
            return 2
        logger.warning("⚠️ Colonne `template_id` absente — bascule en mode sans-template.")
        args.include_no_template = True

    from database.connection import get_sync_session

    # Only bypass the ORM when the column does not exist. Deriving this from
    # the flags alone would discard template_id on a healthy schema and mark
    # every glossary of a user (except the oldest) as a duplicate.
    use_raw = not has_template_id

    logger.info(
        "🔍 Recherche de doublons%s%s…",
        f" pour user_id={args.user}" if args.user else "",
        " (incl. sans template)" if args.include_no_template else "",
    )

    # Serialization reads each glossary's `terms`; keep it inside the session
    # scope so that access happens while the session is still open.
    with get_sync_session() as session:
        groups = find_duplicates(
            session,
            user_id=args.user,
            include_no_template=args.include_no_template,
            use_raw_query=use_raw,
        )

        if not groups:
            if args.user:
                logger.info("✅ Aucun doublon trouvé pour user_id=%s.", args.user)
            else:
                logger.info("✅ Aucun doublon trouvé.")
            return 0

        if args.output:
            output_path = Path(args.output)
        else:
            ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
            output_path = ROOT / "backups" / f"glossary_duplicates_{ts}.json"

        logger.info("💾 Écriture du backup vers %s …", output_path)
        payload = write_backup(groups, output_path)

    print_report(payload)
    logger.info("✅ Backup écrit : %s (%d octets)", output_path, output_path.stat().st_size)
    logger.info("ℹ️ Aucune suppression effectuée. Relire le JSON pour décider de l'action manuelle.")

    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
||||
323
scripts/delete_duplicate_glossaries.py
Normal file
323
scripts/delete_duplicate_glossaries.py
Normal file
@@ -0,0 +1,323 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Supprime les glossaires dupliqués en se basant sur un fichier JSON de backup
|
||||
produit par `scripts/backup_duplicate_glossaries.py`.
|
||||
|
||||
⚠️ DESTRUCTIF. Par défaut, demande une seule confirmation interactive avant
|
||||
de lancer les suppressions. Utiliser --yes pour les exécutions automatisées.
|
||||
|
||||
Pour chaque groupe dans le JSON, le glossaire listé dans "keep" est conservé,
|
||||
ceux listés dans "duplicates" sont supprimés (ainsi que leurs termes via cascade).
|
||||
|
||||
Usage:
|
||||
# Dry-run (relecture) :
|
||||
python scripts/delete_duplicate_glossaries.py backups/glossary_duplicates_xxx.json --dry-run
|
||||
|
||||
# Confirmation interactive :
|
||||
python scripts/delete_duplicate_glossaries.py backups/glossary_duplicates_xxx.json
|
||||
|
||||
# Sans confirmation (CI / cron) :
|
||||
python scripts/delete_duplicate_glossaries.py backups/glossary_duplicates_xxx.json --yes
|
||||
|
||||
# Re-génère le backup à la volée (si BACKUP_JSON absent) :
|
||||
DATABASE_URL=... python scripts/delete_duplicate_glossaries.py --user <USER_ID> --dry-run
|
||||
|
||||
# Back-up automatique avant suppression (recommandé) :
|
||||
python scripts/delete_duplicate_glossaries.py --user <USER_ID> --yes
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from sqlalchemy import inspect, text
|
||||
|
||||
from database.connection import get_sync_session, sync_engine
|
||||
from database.models import Glossary, User
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
)
|
||||
logger = logging.getLogger("delete_dup_glossaries")
|
||||
|
||||
|
||||
def load_backup(path: Path) -> dict:
    """Load a backup JSON file and check its top-level shape.

    The process exits with status 2 (after logging the reason) when the file
    does not exist, cannot be read or parsed, or is not a JSON object holding
    a "groups" list.

    Returns:
        The decoded backup document.
    """
    if not path.exists():
        logger.error("❌ Fichier de backup introuvable : %s", path)
        sys.exit(2)

    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.error("❌ Impossible de lire le backup %s : %s", path, e)
        sys.exit(2)

    # The deletion flow iterates data["groups"], so anything other than an
    # object with a list under that key is rejected here.
    if not isinstance(data, dict) or not isinstance(data.get("groups"), list):
        logger.error("❌ Le fichier %s n'a pas de clé 'groups' — format invalide.", path)
        sys.exit(2)
    return data
|
||||
|
||||
|
||||
def validate_ids(session, backup: dict) -> tuple[list[dict], list[str], list[str]]:
    """Check every group of the backup against the current database content.

    For each group this verifies that:
      * the glossary listed under "keep" still exists for the group's user
        and is not also listed among the duplicates (otherwise deleting the
        duplicates could remove the last remaining copy);
      * every duplicate id exists and belongs to the group's user.

    Raw SQL is used so the checks do not depend on the ORM mapping, which
    references a `template_id` column that may be missing from the schema.

    Returns:
        (valid_groups, errors, warnings)
        - valid_groups: groups that produced no error;
        - errors: blocking problems (missing id, owner mismatch, keeper
          missing or listed as duplicate);
        - warnings: informational (parent user missing, no "keep" entry).
    """
    errors: list[str] = []
    warnings: list[str] = []
    valid: list[dict] = []

    def glossary_exists(glossary_id, owner_id) -> bool:
        row = session.execute(
            text("SELECT id FROM glossaries WHERE id = :gid AND user_id = :uid"),
            {"gid": glossary_id, "uid": owner_id},
        ).first()
        return bool(row)

    for group in backup["groups"]:
        user_id = group["user_id"]
        group_errors: list[str] = []

        user_exists = session.execute(
            text("SELECT id FROM users WHERE id = :uid"), {"uid": user_id}
        ).first()
        if not user_exists:
            warnings.append(
                f"User parent introuvable (données orphelines) : user_id={user_id} — "
                f"la suppression des glossaires associés sera tentée."
            )

        dup_ids = [dup["id"] for dup in group["duplicates"]]

        # The backup may be stale: make sure the copy meant to survive is
        # still there before allowing its siblings to be deleted.
        keep_id = (group.get("keep") or {}).get("id")
        if keep_id is None:
            warnings.append(
                f"Aucun glossaire 'keep' dans le backup pour user_id={user_id} — "
                f"impossible de vérifier qu'un exemplaire sera conservé."
            )
        elif keep_id in dup_ids:
            group_errors.append(
                f"Le glossaire à conserver figure aussi dans les doublons : id={keep_id} user={user_id}"
            )
        elif not glossary_exists(keep_id, user_id):
            group_errors.append(
                f"Glossaire à conserver introuvable ou owner mismatch : id={keep_id} user={user_id}"
            )

        for dup_id in dup_ids:
            if not glossary_exists(dup_id, user_id):
                group_errors.append(
                    f"Glossary introuvable ou owner mismatch : id={dup_id} user={user_id}"
                )

        if group_errors:
            errors.extend(group_errors)
        else:
            valid.append(group)

    return valid, errors, warnings
|
||||
|
||||
|
||||
def print_preview(valid_groups: list[dict]) -> tuple[int, int]:
    """Log a summary of what a deletion run would remove.

    Counts come from the backup content itself: each group's
    "duplicates_count" and the length of each duplicate's "terms" list
    (treated as empty when absent).

    Returns:
        (total_dupes, total_terms): glossaries to delete and the estimated
        number of terms attached to them.
    """
    separator = "=" * 78
    total_dupes = 0
    total_terms = 0

    logger.info(separator)
    logger.info("Aperçu de la suppression")
    logger.info(separator)

    per_template: dict[str, int] = {}
    for group in valid_groups:
        label = group.get("template_id") or "(no template)"
        count = group["duplicates_count"]
        per_template[label] = per_template.get(label, 0) + count
        total_dupes += count
        for duplicate in group["duplicates"]:
            total_terms += len(duplicate.get("terms", []))

    logger.info("Groupes à traiter : %d", len(valid_groups))
    logger.info("Glossaires à supprimer : %d", total_dupes)
    logger.info("Termes concernés (estim.) : %d", total_terms)
    logger.info("")
    logger.info("Détail par template :")
    for label in sorted(per_template):
        logger.info(" %-14s %d doublon(s) à supprimer", label, per_template[label])
    logger.info(separator)
    return total_dupes, total_terms
|
||||
|
||||
|
||||
def delete_group(session, group: dict) -> tuple[int, int]:
    """Delete every duplicate of one group, terms first, then the glossary.

    Raw SQL is used so the deletion does not depend on the ORM mapping, which
    references a `template_id` column that may be missing from the schema.
    The caller is responsible for committing.

    Args:
        session: Open database session.
        group: One entry of the backup's "groups" list.

    Returns:
        (glossaries_deleted, terms_deleted) for this group.

    Raises:
        RuntimeError: when a delete statement fails; the session is rolled
            back before the error is raised.
    """
    deleted = 0
    terms_deleted = 0

    for dup in group["duplicates"]:
        # 1. Count the terms before deleting them (for the log line only).
        term_count = session.execute(
            text("SELECT COUNT(*) FROM glossary_terms WHERE glossary_id = :gid"),
            {"gid": dup["id"]},
        ).scalar() or 0

        # 2. Template id for the log line, taken from the backup entry.
        #    Querying the column is unsafe when it does not exist: on
        #    PostgreSQL the failed statement aborts the transaction and the
        #    DELETEs below would be rejected.
        tpl = dup.get("template_id", group.get("template_id"))

        # 3. Delete the terms first (foreign key), then the glossary.
        try:
            session.execute(
                text("DELETE FROM glossary_terms WHERE glossary_id = :gid"),
                {"gid": dup["id"]},
            )
            session.execute(
                text("DELETE FROM glossaries WHERE id = :gid"),
                {"gid": dup["id"]},
            )
            session.flush()
        except Exception as e:
            session.rollback()
            raise RuntimeError(f"Échec suppression glossary {dup['id']}: {e}") from e

        deleted += 1
        terms_deleted += term_count
        logger.info(
            " 🗑️ Supprimé id=%s (template=%s, %d termes)",
            dup["id"], tpl, term_count,
        )

    return deleted, terms_deleted
|
||||
|
||||
|
||||
def perform_deletion(backup: dict, dry_run: bool) -> int:
    """Validate the backup, preview the work, then delete group by group.

    Each group is committed on its own so that a failure only rolls back the
    group being processed; remaining groups are still attempted.

    Args:
        backup: Decoded backup document (see `load_backup`).
        dry_run: When True, stop after the preview without deleting anything.

    Returns:
        Process exit code: 0 on success (or nothing to do / dry run),
        3 when validation reported blocking errors, 4 when at least one
        group could not be deleted.
    """
    with get_sync_session() as session:
        valid_groups, errors, warnings = validate_ids(session, backup)
        if errors:
            logger.error("❌ %d erreur(s) de validation :", len(errors))
            for e in errors:
                logger.error(" - %s", e)
            logger.error("Annulation. Corrigez le backup ou la DB puis ré-essayez.")
            return 3
        for w in warnings:
            logger.warning("⚠️ %s", w)

        if not valid_groups:
            logger.info("✅ Aucun groupe à supprimer.")
            return 0

        total_dupes, _ = print_preview(valid_groups)

        if dry_run:
            logger.info("⚠️ Mode --dry-run : aucune suppression effectuée.")
            return 0

        if total_dupes == 0:
            return 0

        # One commit per group to limit the impact of a partial failure.
        grand_deleted = 0
        grand_terms = 0
        failed_groups = 0
        for group in valid_groups:
            user_id = group["user_id"]
            tid = group.get("template_id")
            logger.info("👤 user=%s template=%s — suppression…", user_id, tid)
            try:
                deleted, terms = delete_group(session, group)
                session.commit()
            except Exception as e:
                session.rollback()
                failed_groups += 1
                logger.error("❌ Échec pour user=%s template=%s : %s", user_id, tid, e)
                logger.error(" Transaction annulée pour ce groupe, on continue.")
                continue
            grand_deleted += deleted
            grand_terms += terms

        logger.info("=" * 78)
        if failed_groups:
            logger.error(
                "❌ Terminé avec %d groupe(s) en échec : %d glossaire(s) supprimé(s), %d termes supprimé(s).",
                failed_groups, grand_deleted, grand_terms,
            )
        else:
            logger.info(
                "✅ Terminé : %d glossaire(s) supprimé(s), %d termes supprimé(s).",
                grand_deleted, grand_terms,
            )
        logger.info("=" * 78)
        # Report partial failures to the caller (and to CI) instead of
        # exiting successfully.
        return 4 if failed_groups else 0
|
||||
|
||||
|
||||
def confirm(prompt: str) -> bool:
    """Prompt on stdin and report whether the answer is affirmative.

    Accepted answers (case-insensitive, surrounding whitespace ignored) are
    "oui", "o", "yes" and "y". End of input counts as a refusal.
    """
    try:
        raw = input(f"{prompt} [oui/non] : ")
    except EOFError:
        return False
    normalized = raw.strip().lower()
    return normalized in {"oui", "o", "yes", "y"}
|
||||
|
||||
|
||||
def regenerate_backup(user_id: str | None, allow_missing_template_id: bool = False) -> Path:
    """Run the backup script as a subprocess and return the file it produced.

    Args:
        user_id: When truthy, forwarded as `--user` to restrict the backup.
        allow_missing_template_id: When True, forwards
            `--allow-missing-template-id` to the backup script.

    Returns:
        Path of the freshly written backup file.

    Exits the process with status 2 when the backup script fails, and with
    status 0 when it succeeds without producing a file (the backup script
    writes nothing when it finds no duplicate, so there is nothing to delete).
    """
    import subprocess

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out_path = ROOT / "backups" / f"glossary_duplicates_{timestamp}.json"
    cmd = [
        sys.executable,
        str(ROOT / "scripts" / "backup_duplicate_glossaries.py"),
    ]
    if user_id:
        cmd += ["--user", user_id]
    if allow_missing_template_id:
        cmd += ["--allow-missing-template-id"]
    cmd += ["--output", str(out_path)]
    logger.info("🔄 Génération d'un backup frais : %s", " ".join(cmd))
    # The child inherits the current environment (DATABASE_URL, …) by default.
    res = subprocess.run(cmd)
    if res.returncode != 0:
        logger.error("❌ Échec de la génération du backup (code=%d).", res.returncode)
        sys.exit(2)
    if not out_path.exists():
        # Success without a file means "no duplicates"; stop here rather than
        # letting the caller fail on a missing backup.
        logger.info("✅ Aucun doublon trouvé — aucun backup généré, rien à supprimer.")
        sys.exit(0)
    return out_path
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point of the deletion script.

    Resolves the backup to use (the positional file, or a fresh one generated
    for `--user`), asks for a single confirmation unless `--dry-run` or
    `--yes` is given, then hands over to `perform_deletion`.

    Returns:
        0 when the backup lists no duplicate, 1 when the user declines,
        otherwise the exit code of `perform_deletion`.
    """
    parser = argparse.ArgumentParser(
        description="Supprime les doublons de glossaires en se basant sur un backup JSON."
    )
    parser.add_argument(
        "input",
        nargs="?",
        metavar="BACKUP_JSON",
        help="Fichier JSON de backup. Si absent, --user doit être fourni pour en générer un.",
    )
    parser.add_argument(
        "--user",
        metavar="USER_ID",
        help="Génère un backup frais limité à cet utilisateur (utilisé si BACKUP_JSON absent).",
    )
    parser.add_argument(
        "--allow-missing-template-id",
        action="store_true",
        help="Transmis au script de backup si le schéma DB n'a pas la colonne template_id.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Affiche ce qui serait supprimé sans rien modifier.",
    )
    parser.add_argument(
        "--yes",
        action="store_true",
        help="Ne demande pas de confirmation interactive.",
    )
    args = parser.parse_args()

    # Resolve the input file; parser.error() terminates the process.
    if not args.input and not args.user:
        parser.error("Fournissez un BACKUP_JSON ou bien --user USER_ID.")
    if args.input:
        backup_path = Path(args.input)
    else:
        backup_path = regenerate_backup(args.user, args.allow_missing_template_id)

    backup = load_backup(backup_path)
    logger.info("📄 Backup chargé : %s (généré le %s)", backup_path, backup.get("generated_at"))

    needs_confirmation = not (args.dry_run or args.yes)
    if needs_confirmation:
        total = backup.get("total_duplicates", 0)
        if total == 0:
            logger.info("✅ Aucun doublon à supprimer dans ce backup.")
            return 0
        accepted = confirm(f"Supprimer {total} glossaire(s) listé(s) dans le backup ?")
        if not accepted:
            logger.info("Annulé par l'utilisateur.")
            return 1

    return perform_deletion(backup, dry_run=args.dry_run)
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user