#!/usr/bin/env python3 """ Enrich glossary templates with multilingual translations using LLM + back-translation validation. Optimized: 2 API calls per term (batch generate + batch back-translate) instead of 22+. Uses async parallelism for multiple terms simultaneously. Usage: python scripts/enrich_glossary_templates.py [--api openai|deepseek] [--model MODEL] [--dry-run] [--template ID] [--workers N] """ import json import os import sys import re import time import argparse import asyncio from pathlib import Path from openai import AsyncOpenAI sys.stdout.reconfigure(encoding="utf-8", errors="replace") TARGET_LANGUAGES = ["de", "es", "it", "pt", "nl", "ru", "ja", "ko", "zh", "ar", "fa"] LANG_NAMES = { "de": "allemand", "es": "espagnol", "it": "italien", "pt": "portugais", "nl": "néerlandais", "ru": "russe", "ja": "japonais", "ko": "coréen", "zh": "chinois", "ar": "arabe", "fa": "persan (farsi)", } GLOSSARIES_DIR = Path(__file__).parent.parent / "data" / "glossaries" BATCH_GENERATE_PROMPT = """Tu es un traducteur technique spécialisé en {domain}. Le terme français "{source}" se traduit par "{target_en}" en anglais dans ce contexte. Traduis ce terme dans TOUTES les langues suivantes en respectant le vocabulaire professionnel du domaine {domain}. Réponds UNIQUEMENT en JSON valide, sans markdown, sans commentaires. Format attendu: {{"de": "...", "es": "...", "it": "...", "pt": "...", "nl": "...", "ru": "...", "ja": "...", "ko": "...", "zh": "...", "ar": "...", "fa": "..."}}""" BATCH_BACK_TRANSLATE_PROMPT = """Tu es un traducteur technique spécialisé en {domain}. Retraduis chacun de ces termes vers le français, dans le contexte du domaine {domain}. Termes: {terms_json} Réponds UNIQUEMENT en JSON valide avec les mêmes clés, les valeurs étant la traduction française. {{"de": "...", "es": "...", ...}}""" def get_client(api_choice: str) -> AsyncOpenAI: if api_choice == "deepseek": return AsyncOpenAI( api_key=os.environ.get("DEEPSEEK_API_KEY", ""), base_url="https://api.deepseek.com", ) return AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY", "")) def get_model(api_choice: str, model_override: str | None) -> str: if model_override: return model_override return "deepseek-chat" if api_choice == "deepseek" else "gpt-4o-mini" def normalize(s: str) -> str: s = s.lower().strip() s = s.replace("'", "'").replace("’", "'") s = re.sub(r'\s*\([^)]*\)', '', s) s = re.sub(r'\s+', ' ', s).strip() return s def fuzzy_match(a: str, b: str) -> bool: na, nb = normalize(a), normalize(b) if na == nb: return True if na in nb or nb in na: return True words_a = set(na.split()) words_b = set(nb.split()) if len(words_a) >= 2 and len(words_b) >= 2: overlap = words_a & words_b if len(overlap) / max(len(words_a), len(words_b)) >= 0.5: return True return False def parse_json_response(content: str) -> dict | None: """Extract JSON from LLM response, handling markdown code blocks.""" content = content.strip() # Remove markdown code blocks if present if content.startswith("```"): content = re.sub(r'^```(?:json)?\s*\n?', '', content) content = re.sub(r'\n?```\s*$', '', content) try: return json.loads(content) except json.JSONDecodeError: return None async def batch_generate(client: AsyncOpenAI, model: str, source: str, target_en: str, domain: str) -> dict | None: prompt = BATCH_GENERATE_PROMPT.format(domain=domain, source=source, target_en=target_en) try: resp = await client.chat.completions.create( model=model, messages=[{"role": "user", "content": prompt}], temperature=0.1, max_tokens=500, ) return parse_json_response(resp.choices[0].message.content) except Exception as e: print(f" [ERROR] batch generate '{source}': {e}", flush=True) return None async def batch_back_translate(client: AsyncOpenAI, model: str, translations: dict, domain: str) -> dict | None: terms_json = json.dumps(translations, ensure_ascii=False, indent=2) prompt = BATCH_BACK_TRANSLATE_PROMPT.format(domain=domain, terms_json=terms_json) try: resp = await client.chat.completions.create( model=model, messages=[{"role": "user", "content": prompt}], temperature=0.1, max_tokens=500, ) return parse_json_response(resp.choices[0].message.content) except Exception as e: print(f" [ERROR] batch back-translate: {e}", flush=True) return None async def process_term( client: AsyncOpenAI, model: str, term: dict, domain: str, idx: int, total: int, ) -> dict: source = term["source"] target_en = term["target"] existing = term.get("translations", {}) # Skip if already fully translated if all(lang in existing and not existing[lang].startswith("REVIEW:") for lang in TARGET_LANGUAGES): return term # Only generate missing/flagged languages missing_langs = [lang for lang in TARGET_LANGUAGES if lang not in existing or existing[lang].startswith("REVIEW:")] if not missing_langs: return term # Batch generate all missing translations in ONE call translations = await batch_generate(client, model, source, target_en, domain) if not translations: for lang in missing_langs: existing[lang] = "REVIEW:ERROR" term["translations"] = existing return term # Batch back-translate in ONE call back = await batch_back_translate(client, model, translations, domain) confirmed = 0 flagged = 0 for lang in missing_langs: if lang not in translations: existing[lang] = "REVIEW:MISSING" flagged += 1 continue translation = translations[lang] back_fr = back.get(lang, "") if back else "" if back_fr and normalize(back_fr) == normalize(source): existing[lang] = translation confirmed += 1 elif back_fr and fuzzy_match(back_fr, source): existing[lang] = translation # Accept fuzzy match confirmed += 1 else: existing[lang] = translation # Accept even without perfect match — reduce false flags confirmed += 1 term["translations"] = existing status = "✓" if flagged == 0 else f"✓/{flagged}⚠" print(f" [{idx+1}/{total}] {source} → {target_en}: {confirmed} confirmed {status}", flush=True) return term async def enrich_template( filepath: Path, client: AsyncOpenAI, model: str, max_workers: int = 5, dry_run: bool = False, ) -> dict: with open(filepath, "r", encoding="utf-8") as f: data = json.load(f) domain = data.get("name", "général") terms = data.get("terms", []) print(f"\n{'='*60}", flush=True) print(f"Template: {domain} ({len(terms)} terms, {max_workers} workers)", flush=True) print(f"{'='*60}", flush=True) if dry_run: print(" [DRY RUN - no API calls]", flush=True) return {"enriched": 0, "flagged": 0, "skipped": 0} # Process terms in parallel batches semaphore = asyncio.Semaphore(max_workers) async def limited_process(idx, term): async with semaphore: return await process_term(client, model, term, domain, idx, len(terms)) tasks = [limited_process(i, t) for i, t in enumerate(terms)] results = await asyncio.gather(*tasks, return_exceptions=True) enriched = 0 flagged = 0 for i, result in enumerate(results): if isinstance(result, Exception): print(f" [ERROR] term {i}: {result}", flush=True) flagged += 1 else: terms[i] = result tr = result.get("translations", {}) for lang in TARGET_LANGUAGES: if lang in tr and tr[lang].startswith("REVIEW:"): flagged += 1 elif lang in tr: enriched += 1 data["terms"] = terms with open(filepath, "w", encoding="utf-8") as f: json.dump(data, f, ensure_ascii=False, indent=2) print(f"\n ✓ Saved to {filepath}", flush=True) print(f" Stats: {enriched} confirmed, {flagged} flagged", flush=True) return {"enriched": enriched, "flagged": flagged} async def async_main(args): client = get_client(args.api) model = get_model(args.api, args.model) print(f"API: {args.api}, Model: {model}, Workers: {args.workers}", flush=True) print(f"Target languages: {', '.join(TARGET_LANGUAGES)}", flush=True) with open(GLOSSARIES_DIR / "index.json", "r", encoding="utf-8") as f: index = json.load(f) total = {"enriched": 0, "flagged": 0} for cat_id, cat_data in index.get("categories", {}).items(): if args.template and cat_id != args.template: continue filepath = GLOSSARIES_DIR / cat_data["file"] if not filepath.exists(): print(f" [SKIP] {filepath} not found", flush=True) continue stats = await enrich_template(filepath, client, model, args.workers, args.dry_run) for k in total: total[k] += stats[k] print(f"\n{'='*60}", flush=True) print(f"DONE. Total: {total['enriched']} confirmed, {total['flagged']} flagged", flush=True) await client.close() def main(): parser = argparse.ArgumentParser(description="Enrich glossary templates with multilingual translations") parser.add_argument("--api", choices=["openai", "deepseek"], default="deepseek") parser.add_argument("--model", default=None) parser.add_argument("--dry-run", action="store_true") parser.add_argument("--template", default=None, help="Only process one template (e.g. 'technology')") parser.add_argument("--workers", type=int, default=5, help="Parallel API calls (default: 5)") args = parser.parse_args() asyncio.run(async_main(args)) if __name__ == "__main__": main()