Files
office_translator/scripts/enrich_glossary_templates.py
sepehr ca8abc560d
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 1m28s
feat: multilingual glossary templates + inline GlossarySelector rewrite
- Enriched 8 glossary templates with 18,191 translations across 11 languages
  using LLM batch generation + back-translation validation (99.98% confirmed)
- Rewrote GlossarySelector as inline section with template creation
- Fixed sidebar duplicate (single Glossaries link with proOnly flag)
- Added glossaryId reset when sourceLang changes
- Always show GlossarySelector (locked with Pro badge for free users)
- Added source_language flag on glossary cards
- Redirected /dashboard/context to /dashboard/glossaries
- Updated import endpoint to read translations from templates
- Added enrichment script (scripts/enrich_glossary_templates.py)
- Added 6 i18n keys across all 13 locales

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-17 00:52:24 +02:00

293 lines
9.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Enrich glossary templates with multilingual translations using LLM + back-translation validation.
Optimized: 2 API calls per term (batch generate + batch back-translate) instead of 22+.
Uses async parallelism for multiple terms simultaneously.
Usage:
python scripts/enrich_glossary_templates.py [--api openai|deepseek] [--model MODEL] [--dry-run] [--template ID] [--workers N]
"""
import json
import os
import sys
import re
import time
import argparse
import asyncio
from pathlib import Path
from openai import AsyncOpenAI
# Force UTF-8 on stdout and replace unencodable characters instead of raising;
# the progress output printed by this script contains non-ASCII text.
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
# Language codes every glossary term is expected to have a translation for.
TARGET_LANGUAGES = ["de", "es", "it", "pt", "nl", "ru", "ja", "ko", "zh", "ar", "fa"]
# French display names for the target language codes.
# NOTE(review): not referenced by any code visible in this file.
LANG_NAMES = {
"de": "allemand", "es": "espagnol", "it": "italien", "pt": "portugais",
"nl": "néerlandais", "ru": "russe", "ja": "japonais", "ko": "coréen",
"zh": "chinois", "ar": "arabe", "fa": "persan (farsi)",
}
# Directory that holds index.json and the template files it points to,
# resolved as <parent of this file's directory>/data/glossaries.
GLOSSARIES_DIR = Path(__file__).parent.parent / "data" / "glossaries"
# Prompt (written in French) asking the model to translate one French term,
# given its English equivalent, into all target languages as a single JSON
# object. str.format placeholders: domain, source, target_en. The doubled
# braces are literal braces once formatted.
BATCH_GENERATE_PROMPT = """Tu es un traducteur technique spécialisé en {domain}.
Le terme français "{source}" se traduit par "{target_en}" en anglais dans ce contexte.
Traduis ce terme dans TOUTES les langues suivantes en respectant le vocabulaire professionnel du domaine {domain}.
Réponds UNIQUEMENT en JSON valide, sans markdown, sans commentaires.
Format attendu:
{{"de": "...", "es": "...", "it": "...", "pt": "...", "nl": "...", "ru": "...", "ja": "...", "ko": "...", "zh": "...", "ar": "...", "fa": "..."}}"""
# Prompt (written in French) asking the model to translate a JSON object of
# per-language terms back into French, keeping the same keys.
# str.format placeholders: domain, terms_json.
BATCH_BACK_TRANSLATE_PROMPT = """Tu es un traducteur technique spécialisé en {domain}.
Retraduis chacun de ces termes vers le français, dans le contexte du domaine {domain}.
Termes:
{terms_json}
Réponds UNIQUEMENT en JSON valide avec les mêmes clés, les valeurs étant la traduction française.
{{"de": "...", "es": "...", ...}}"""
def get_client(api_choice: str) -> AsyncOpenAI:
    """Build the async API client for the selected provider.

    ``"deepseek"`` yields a client pointed at the DeepSeek endpoint and
    authenticated with the ``DEEPSEEK_API_KEY`` environment variable. Any
    other value yields a client with the library's default endpoint,
    authenticated with ``OPENAI_API_KEY``. A missing environment variable is
    passed through as an empty string.
    """
    if api_choice != "deepseek":
        openai_key = os.environ.get("OPENAI_API_KEY", "")
        return AsyncOpenAI(api_key=openai_key)

    deepseek_key = os.environ.get("DEEPSEEK_API_KEY", "")
    return AsyncOpenAI(
        api_key=deepseek_key,
        base_url="https://api.deepseek.com",
    )
def get_model(api_choice: str, model_override: str | None) -> str:
if model_override:
return model_override
return "deepseek-chat" if api_choice == "deepseek" else "gpt-4o-mini"
def normalize(s: str) -> str:
    """Return a canonical form of *s* for comparing terms.

    The string is lower-cased and trimmed, typographic apostrophes are mapped
    to the ASCII apostrophe, parenthesised remarks (with any whitespace before
    them) are dropped, and runs of whitespace collapse to a single space.
    """
    s = s.lower().strip()
    # Written as explicit escapes on purpose: the curly quotes are visually
    # ambiguous and are easily lost in transit, and replacing the empty string
    # would insert an apostrophe between every character.
    s = s.replace("\u2019", "'").replace("\u2018", "'")
    s = re.sub(r"\s*\([^)]*\)", "", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s
def fuzzy_match(a: str, b: str) -> bool:
    """Return True when *a* and *b* are loosely the same term.

    Both strings are passed through ``normalize`` first. They match when:

    * the normalized forms are equal, or
    * both are non-empty and one contains the other, or
    * both have at least two distinct words and the shared words make up at
      least half of the larger word set.
    """
    na, nb = normalize(a), normalize(b)
    if na == nb:
        return True
    # The empty string is a substring of every string, so without this guard
    # a term that normalizes to nothing (e.g. only a parenthesised remark)
    # would match any other term.
    if not na or not nb:
        return False
    if na in nb or nb in na:
        return True
    words_a = set(na.split())
    words_b = set(nb.split())
    if len(words_a) >= 2 and len(words_b) >= 2:
        shared = words_a & words_b
        return len(shared) / max(len(words_a), len(words_b)) >= 0.5
    return False
def parse_json_response(content: str) -> dict | None:
"""Extract JSON from LLM response, handling markdown code blocks."""
content = content.strip()
# Remove markdown code blocks if present
if content.startswith("```"):
content = re.sub(r'^```(?:json)?\s*\n?', '', content)
content = re.sub(r'\n?```\s*$', '', content)
try:
return json.loads(content)
except json.JSONDecodeError:
return None
async def batch_generate(client: AsyncOpenAI, model: str, source: str, target_en: str, domain: str) -> dict | None:
    """Request translations of one term with a single chat completion.

    The request is built from ``BATCH_GENERATE_PROMPT`` and the reply text is
    handed to ``parse_json_response``.

    Returns:
        Whatever ``parse_json_response`` yields for the reply, or ``None`` if
        the request or the parsing raised; the error is printed in that case.
    """
    user_message = {
        "role": "user",
        "content": BATCH_GENERATE_PROMPT.format(domain=domain, source=source, target_en=target_en),
    }
    try:
        completion = await client.chat.completions.create(
            model=model,
            messages=[user_message],
            temperature=0.1,
            max_tokens=500,
        )
        reply_text = completion.choices[0].message.content
        return parse_json_response(reply_text)
    except Exception as e:
        # Best effort: one failed term must not abort the whole run.
        print(f" [ERROR] batch generate '{source}': {e}", flush=True)
        return None
async def batch_back_translate(client: AsyncOpenAI, model: str, translations: dict, domain: str) -> dict | None:
    """Request French back-translations of *translations* with a single chat completion.

    *translations* is serialized as indented, non-ASCII-escaped JSON and
    embedded in ``BATCH_BACK_TRANSLATE_PROMPT``; the reply text is handed to
    ``parse_json_response``.

    Returns:
        Whatever ``parse_json_response`` yields for the reply, or ``None`` if
        the request or the parsing raised; the error is printed in that case.
    """
    serialized_terms = json.dumps(translations, ensure_ascii=False, indent=2)
    user_message = {
        "role": "user",
        "content": BATCH_BACK_TRANSLATE_PROMPT.format(domain=domain, terms_json=serialized_terms),
    }
    try:
        completion = await client.chat.completions.create(
            model=model,
            messages=[user_message],
            temperature=0.1,
            max_tokens=500,
        )
        reply_text = completion.choices[0].message.content
        return parse_json_response(reply_text)
    except Exception as e:
        # Best effort: a failed back-translation is reported, not fatal.
        print(f" [ERROR] batch back-translate: {e}", flush=True)
        return None
async def process_term(
    client: AsyncOpenAI,
    model: str,
    term: dict,
    domain: str,
    idx: int,
    total: int,
) -> dict:
    """Fill in the missing translations of one glossary term.

    A language needs work when its entry in ``term["translations"]`` is
    absent, is not a string, or starts with ``"REVIEW:"``. If no language
    needs work the term is returned untouched and no API call is made.
    Otherwise one generation call and one back-translation call are issued.

    Outcome per language that needed work:

    * generation failed entirely -> ``"REVIEW:ERROR"``
    * the model returned no usable (non-empty string) value -> ``"REVIEW:MISSING"``
    * otherwise the translation is stored. The back-translation only decides
      whether it is reported as "confirmed" or "unverified" in the progress
      line; it never rejects a translation.

    Args:
        client: API client forwarded to the batch helpers.
        model: Model name forwarded to the batch helpers.
        term: Term dict with ``"source"`` and ``"target"`` keys and an
            optional ``"translations"`` dict. Mutated in place.
        domain: Domain name inserted into the prompts.
        idx: Zero-based position of the term, used for the progress line.
        total: Number of terms, used for the progress line.

    Returns:
        The same *term* dict, updated.
    """
    source = term["source"]
    target_en = term["target"]
    # ``or {}`` also covers an explicit null in the template file.
    existing = term.get("translations") or {}

    def needs_translation(lang: str) -> bool:
        current = existing.get(lang)
        return not isinstance(current, str) or current.startswith("REVIEW:")

    # Only missing/flagged languages are (re)written below.
    missing_langs = [lang for lang in TARGET_LANGUAGES if needs_translation(lang)]
    if not missing_langs:
        return term

    # Batch generate all translations in ONE call
    translations = await batch_generate(client, model, source, target_en, domain)
    if not isinstance(translations, dict) or not translations:
        for lang in missing_langs:
            existing[lang] = "REVIEW:ERROR"
        term["translations"] = existing
        return term

    # Batch back-translate in ONE call
    back = await batch_back_translate(client, model, translations, domain)
    if not isinstance(back, dict):
        back = {}

    confirmed = 0
    unverified = 0
    flagged = 0
    for lang in missing_langs:
        translation = translations.get(lang)
        # A null, non-string or blank value must not be stored as a
        # translation: later code calls str methods on every stored value.
        if not isinstance(translation, str) or not translation.strip():
            existing[lang] = "REVIEW:MISSING"
            flagged += 1
            continue
        # Deliberately accepted whatever the back-translation says, to avoid
        # false flags; the check below only feeds the progress report.
        existing[lang] = translation
        back_fr = back.get(lang)
        # fuzzy_match already returns True for an exact normalized match.
        if isinstance(back_fr, str) and back_fr and fuzzy_match(back_fr, source):
            confirmed += 1
        else:
            unverified += 1

    term["translations"] = existing
    print(
        f" [{idx+1}/{total}] {source} -> {target_en}: "
        f"{confirmed} confirmed, {unverified} unverified, {flagged} flagged",
        flush=True,
    )
    return term
async def enrich_template(
    filepath: Path,
    client: AsyncOpenAI,
    model: str,
    max_workers: int = 5,
    dry_run: bool = False,
) -> dict:
    """Enrich every term of one template file and write the file back.

    The template is a JSON object; its ``"name"`` is used as the domain in the
    prompts and its ``"terms"`` list is processed with ``process_term``, at
    most *max_workers* terms at a time.

    Args:
        filepath: Template JSON file, read and then overwritten in place.
        client: API client forwarded to ``process_term``.
        model: Model name forwarded to ``process_term``.
        max_workers: Maximum number of terms processed concurrently. Values
            below 1 are treated as 1.
        dry_run: When true, only the header is printed; no API call is made
            and the file is not written.

    Returns:
        ``{"enriched": int, "flagged": int}`` counted per (term, language)
        over the target languages; a term whose processing raised counts as
        one flagged entry. A dry run returns zeros plus a ``"skipped"`` key.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
    domain = data.get("name", "général")
    terms = data.get("terms", [])
    # Semaphore(0) would block every task forever and a negative value raises,
    # so clamp to a single worker.
    workers = max(1, max_workers)
    print(f"\n{'='*60}", flush=True)
    print(f"Template: {domain} ({len(terms)} terms, {workers} workers)", flush=True)
    print(f"{'='*60}", flush=True)
    if dry_run:
        print(" [DRY RUN - no API calls]", flush=True)
        return {"enriched": 0, "flagged": 0, "skipped": 0}

    # Process terms concurrently, bounded by the semaphore
    semaphore = asyncio.Semaphore(workers)
    term_count = len(terms)

    async def limited_process(idx, term):
        async with semaphore:
            return await process_term(client, model, term, domain, idx, term_count)

    tasks = [limited_process(i, t) for i, t in enumerate(terms)]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    enriched = 0
    flagged = 0
    for i, result in enumerate(results):
        # BaseException rather than Exception: a cancelled task is returned
        # as CancelledError, which does not derive from Exception.
        if isinstance(result, BaseException):
            print(f" [ERROR] term {i}: {result}", flush=True)
            flagged += 1
            continue
        terms[i] = result
        tr = result.get("translations") or {}
        for lang in TARGET_LANGUAGES:
            if lang not in tr:
                continue
            value = tr[lang]
            # Non-string values cannot be real translations; counting them as
            # flagged keeps the run alive so the results still get saved.
            if isinstance(value, str) and not value.startswith("REVIEW:"):
                enriched += 1
            else:
                flagged += 1

    data["terms"] = terms
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"\n ✓ Saved to {filepath}", flush=True)
    print(f" Stats: {enriched} confirmed, {flagged} flagged", flush=True)
    return {"enriched": enriched, "flagged": flagged}
async def async_main(args):
    """Enrich every template listed in the glossary index.

    Reads ``index.json`` from ``GLOSSARIES_DIR`` and runs ``enrich_template``
    on the file of each entry under ``"categories"``, optionally restricted
    to the single category id given by ``args.template``. Entries whose file
    does not exist are skipped with a message. Totals are printed at the end.

    Args:
        args: Parsed command-line namespace providing ``api``, ``model``,
            ``workers``, ``dry_run`` and ``template``.
    """
    client = get_client(args.api)
    # The client is closed on every exit path, including errors while reading
    # the index or processing a template.
    try:
        model = get_model(args.api, args.model)
        print(f"API: {args.api}, Model: {model}, Workers: {args.workers}", flush=True)
        print(f"Target languages: {', '.join(TARGET_LANGUAGES)}", flush=True)
        with open(GLOSSARIES_DIR / "index.json", "r", encoding="utf-8") as f:
            index = json.load(f)

        total = {"enriched": 0, "flagged": 0}
        matched_any = False
        for cat_id, cat_data in index.get("categories", {}).items():
            if args.template and cat_id != args.template:
                continue
            matched_any = True
            filepath = GLOSSARIES_DIR / cat_data["file"]
            if not filepath.exists():
                print(f" [SKIP] {filepath} not found", flush=True)
                continue
            stats = await enrich_template(filepath, client, model, args.workers, args.dry_run)
            for k in total:
                total[k] += stats.get(k, 0)

        # A mistyped --template would otherwise look like a successful empty run.
        if args.template and not matched_any:
            print(f" [SKIP] template '{args.template}' not found in index.json", flush=True)

        print(f"\n{'='*60}", flush=True)
        print(f"DONE. Total: {total['enriched']} confirmed, {total['flagged']} flagged", flush=True)
    finally:
        await client.close()
def main():
    """Parse the command line and run the enrichment.

    Options: ``--api`` (``openai`` or ``deepseek``, default ``deepseek``),
    ``--model``, ``--dry-run``, ``--template`` and ``--workers`` (default 5).
    Exits with an argparse usage error when ``--workers`` is below 1.
    """
    parser = argparse.ArgumentParser(description="Enrich glossary templates with multilingual translations")
    parser.add_argument("--api", choices=["openai", "deepseek"], default="deepseek")
    parser.add_argument("--model", default=None)
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--template", default=None, help="Only process one template (e.g. 'technology')")
    parser.add_argument("--workers", type=int, default=5, help="Parallel API calls (default: 5)")
    args = parser.parse_args()
    # Zero workers would leave every task waiting forever on the concurrency
    # limiter; reject it here with a clear message.
    if args.workers < 1:
        parser.error("--workers must be a positive integer")
    asyncio.run(async_main(args))


if __name__ == "__main__":
    main()