All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 1m28s
- Enriched 8 glossary templates with 18,191 translations across 11 languages using LLM batch generation + back-translation validation (99.98% confirmed) - Rewrote GlossarySelector as inline section with template creation - Fixed sidebar duplicate (single Glossaries link with proOnly flag) - Added glossaryId reset when sourceLang changes - Always show GlossarySelector (locked with Pro badge for free users) - Added source_language flag on glossary cards - Redirected /dashboard/context to /dashboard/glossaries - Updated import endpoint to read translations from templates - Added enrichment script (scripts/enrich_glossary_templates.py) - Added 6 i18n keys across all 13 locales Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
293 lines
9.9 KiB
Python
293 lines
9.9 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Enrich glossary templates with multilingual translations using LLM + back-translation validation.
|
||
|
||
Optimized: 2 API calls per term (batch generate + batch back-translate) instead of 22+.
|
||
Uses async parallelism for multiple terms simultaneously.
|
||
|
||
Usage:
|
||
python scripts/enrich_glossary_templates.py [--api openai|deepseek] [--model MODEL] [--dry-run] [--template ID] [--workers N]
|
||
"""
|
||
|
||
import json
|
||
import os
|
||
import sys
|
||
import re
|
||
import time
|
||
import argparse
|
||
import asyncio
|
||
from pathlib import Path
|
||
from openai import AsyncOpenAI
|
||
|
||
# Switch stdout to UTF-8 (unencodable characters are replaced) because the
# progress lines printed by this script contain "✓", "⚠" and "→".
# The guard keeps the module importable when stdout has been swapped for a
# stream that has no ``reconfigure`` method (e.g. ``io.StringIO``).
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

# Language codes each glossary term is expected to have a translation for.
TARGET_LANGUAGES = ["de", "es", "it", "pt", "nl", "ru", "ja", "ko", "zh", "ar", "fa"]

# French display names keyed by target language code.
LANG_NAMES = {
    "de": "allemand", "es": "espagnol", "it": "italien", "pt": "portugais",
    "nl": "néerlandais", "ru": "russe", "ja": "japonais", "ko": "coréen",
    "zh": "chinois", "ar": "arabe", "fa": "persan (farsi)",
}

# <directory of this file>/../data/glossaries — holds index.json and the templates.
GLOSSARIES_DIR = Path(__file__).parent.parent / "data" / "glossaries"

# Prompt asking for one term in all target languages at once.
# ``str.format`` fields: domain, source, target_en. The doubled braces on the
# last line render as a literal JSON example.
BATCH_GENERATE_PROMPT = """Tu es un traducteur technique spécialisé en {domain}.
Le terme français "{source}" se traduit par "{target_en}" en anglais dans ce contexte.

Traduis ce terme dans TOUTES les langues suivantes en respectant le vocabulaire professionnel du domaine {domain}.
Réponds UNIQUEMENT en JSON valide, sans markdown, sans commentaires.

Format attendu:
{{"de": "...", "es": "...", "it": "...", "pt": "...", "nl": "...", "ru": "...", "ja": "...", "ko": "...", "zh": "...", "ar": "...", "fa": "..."}}"""

# Prompt asking to translate a JSON object of terms back into French.
# ``str.format`` fields: domain, terms_json.
BATCH_BACK_TRANSLATE_PROMPT = """Tu es un traducteur technique spécialisé en {domain}.
Retraduis chacun de ces termes vers le français, dans le contexte du domaine {domain}.

Termes:
{terms_json}

Réponds UNIQUEMENT en JSON valide avec les mêmes clés, les valeurs étant la traduction française.
{{"de": "...", "es": "...", ...}}"""
|
||
|
||
|
||
def get_client(api_choice: str) -> AsyncOpenAI:
    """Build the async API client for the selected provider.

    Args:
        api_choice: ``"deepseek"`` targets ``https://api.deepseek.com``; any
            other value builds a client without an explicit base URL.

    Returns:
        An ``AsyncOpenAI`` client whose key comes from ``DEEPSEEK_API_KEY`` or
        ``OPENAI_API_KEY`` (an empty string when the variable is unset).
    """
    if api_choice != "deepseek":
        return AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

    deepseek_key = os.environ.get("DEEPSEEK_API_KEY", "")
    return AsyncOpenAI(api_key=deepseek_key, base_url="https://api.deepseek.com")
|
||
|
||
|
||
def get_model(api_choice: str, model_override: str | None) -> str:
|
||
if model_override:
|
||
return model_override
|
||
return "deepseek-chat" if api_choice == "deepseek" else "gpt-4o-mini"
|
||
|
||
|
||
def normalize(s: str) -> str:
    """Return a canonical form of *s* for comparing a term with its back-translation.

    The text is lower-cased and trimmed, typographic apostrophes are folded to
    the ASCII apostrophe, parenthesised remarks are dropped together with the
    whitespace before them, and remaining whitespace runs become one space.

    Args:
        s: Term to normalise.

    Returns:
        The normalised string; empty when nothing but a parenthesised remark
        or whitespace was present.
    """
    s = s.lower().strip()
    # Fold both typographic single quotes (U+2018, U+2019) so that "l’eau" and
    # "l'eau" compare equal. Escapes are used so the characters cannot be
    # silently turned into plain apostrophes by an editor or a copy/paste.
    s = s.translate(str.maketrans({"\u2018": "'", "\u2019": "'"}))
    # Drop parenthesised remarks such as "(n.f.)".
    s = re.sub(r'\s*\([^)]*\)', '', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s
|
||
|
||
|
||
def fuzzy_match(a: str, b: str) -> bool:
    """Tell whether two terms are close enough to be considered the same.

    Both inputs go through ``normalize`` first. They match when the normalised
    forms are equal, when one contains the other as a substring, or when both
    have at least two distinct words and share at least half of the larger
    word set.

    Args:
        a: First term.
        b: Second term.

    Returns:
        ``True`` on a match, ``False`` otherwise.
    """
    na, nb = normalize(a), normalize(b)
    if na == nb:
        return True
    # An empty string is a substring of every string, so without this guard a
    # term that normalises to "" (e.g. only a parenthesised remark) would
    # match anything.
    if not na or not nb:
        return False
    if na in nb or nb in na:
        return True
    words_a = set(na.split())
    words_b = set(nb.split())
    if len(words_a) >= 2 and len(words_b) >= 2:
        overlap = words_a & words_b
        if len(overlap) / max(len(words_a), len(words_b)) >= 0.5:
            return True
    return False
|
||
|
||
|
||
def parse_json_response(content: str) -> dict | None:
|
||
"""Extract JSON from LLM response, handling markdown code blocks."""
|
||
content = content.strip()
|
||
# Remove markdown code blocks if present
|
||
if content.startswith("```"):
|
||
content = re.sub(r'^```(?:json)?\s*\n?', '', content)
|
||
content = re.sub(r'\n?```\s*$', '', content)
|
||
try:
|
||
return json.loads(content)
|
||
except json.JSONDecodeError:
|
||
return None
|
||
|
||
|
||
async def batch_generate(client: AsyncOpenAI, model: str, source: str, target_en: str, domain: str) -> dict | None:
    """Request translations of one term into all target languages in a single call.

    Args:
        client: Client used for the chat-completion request.
        model: Model name passed to the API.
        source: French source term.
        target_en: English rendering of the term, inserted into the prompt.
        domain: Domain name inserted into the prompt.

    Returns:
        Whatever ``parse_json_response`` extracts from the reply, or ``None``
        when the request or the parsing raises (the error is printed).
    """
    request_text = BATCH_GENERATE_PROMPT.format(domain=domain, source=source, target_en=target_en)
    try:
        completion = await client.chat.completions.create(
            messages=[{"role": "user", "content": request_text}],
            model=model,
            max_tokens=500,
            temperature=0.1,
        )
        reply = completion.choices[0].message.content
        return parse_json_response(reply)
    except Exception as exc:
        # Best effort: report the failure and let the caller flag the term.
        print(f" [ERROR] batch generate '{source}': {exc}", flush=True)
        return None
|
||
|
||
|
||
async def batch_back_translate(client: AsyncOpenAI, model: str, translations: dict, domain: str) -> dict | None:
    """Translate a set of generated terms back into French in a single call.

    Args:
        client: Client used for the chat-completion request.
        model: Model name passed to the API.
        translations: Mapping serialised as JSON into the prompt; the prompt
            asks for a reply with the same keys.
        domain: Domain name inserted into the prompt.

    Returns:
        Whatever ``parse_json_response`` extracts from the reply, or ``None``
        when the request or the parsing raises (the error is printed).
    """
    serialized_terms = json.dumps(translations, ensure_ascii=False, indent=2)
    request_text = BATCH_BACK_TRANSLATE_PROMPT.format(domain=domain, terms_json=serialized_terms)
    try:
        completion = await client.chat.completions.create(
            messages=[{"role": "user", "content": request_text}],
            model=model,
            max_tokens=500,
            temperature=0.1,
        )
        reply = completion.choices[0].message.content
        return parse_json_response(reply)
    except Exception as exc:
        # Best effort: report the failure; the caller proceeds without validation.
        print(f" [ERROR] batch back-translate: {exc}", flush=True)
        return None
|
||
|
||
|
||
async def process_term(
    client: AsyncOpenAI,
    model: str,
    term: dict,
    domain: str,
    idx: int,
    total: int,
) -> dict:
    """Fill in the missing target-language translations of one glossary term.

    A language from ``TARGET_LANGUAGES`` needs work when it is absent from
    ``term["translations"]``, holds a non-string value, or carries a
    ``REVIEW:`` marker. Those languages are generated with one
    ``batch_generate`` call and back-translated with one
    ``batch_back_translate`` call.

    Every non-empty string candidate is accepted whether or not its
    back-translation matches the source (a deliberate choice to reduce false
    flags). Candidates whose back-translation is missing or does not match are
    only reported as "unverified" in the progress line.

    Args:
        client: Client forwarded to the two batch helpers.
        model: Model name forwarded to the two batch helpers.
        term: Glossary entry with ``"source"``, ``"target"`` and optionally
            ``"translations"``; updated in place.
        domain: Domain name inserted into the prompts.
        idx: Zero-based position of the term, used for the progress line.
        total: Number of terms, used for the progress line.

    Returns:
        The same ``term`` dict. Languages that could not be generated hold
        ``"REVIEW:ERROR"`` (generation failed) or ``"REVIEW:MISSING"`` (no
        usable value in the reply).

    Raises:
        KeyError: If ``term`` lacks ``"source"`` or ``"target"``.
    """
    source = term["source"]
    target_en = term["target"]
    # ``translations`` may be absent or null in the template file.
    existing = term.get("translations") or {}

    def needs_work(lang: str) -> bool:
        value = existing.get(lang)
        return not isinstance(value, str) or value.startswith("REVIEW:")

    # Only generate missing/flagged languages; nothing to do when none remain.
    missing_langs = [lang for lang in TARGET_LANGUAGES if needs_work(lang)]
    if not missing_langs:
        return term

    # Batch generate all missing translations in ONE call.
    translations = await batch_generate(client, model, source, target_en, domain)
    if not isinstance(translations, dict) or not translations:
        for lang in missing_langs:
            existing[lang] = "REVIEW:ERROR"
        term["translations"] = existing
        return term

    # Keep only usable candidates: a non-string or blank value from the model
    # would otherwise be stored as if it were a translation.
    candidates = {}
    for lang in missing_langs:
        value = translations.get(lang)
        if isinstance(value, str) and value.strip():
            candidates[lang] = value.strip()

    # Batch back-translate in ONE call, and only what will actually be stored.
    back = await batch_back_translate(client, model, candidates, domain) if candidates else None
    if not isinstance(back, dict):
        back = {}

    confirmed = 0
    flagged = 0
    unverified = 0
    for lang in missing_langs:
        if lang not in candidates:
            existing[lang] = "REVIEW:MISSING"
            flagged += 1
            continue

        existing[lang] = candidates[lang]
        confirmed += 1

        # fuzzy_match already covers exact equality of the normalised forms.
        back_fr = back.get(lang)
        if not (isinstance(back_fr, str) and back_fr and fuzzy_match(back_fr, source)):
            unverified += 1

    term["translations"] = existing
    status = "✓" if flagged == 0 else f"✓/{flagged}⚠"
    if unverified:
        # Make visible how many accepted values lack a matching back-translation.
        status += f" ({unverified} unverified)"
    print(f" [{idx+1}/{total}] {source} → {target_en}: {confirmed} confirmed {status}", flush=True)
    return term
|
||
|
||
|
||
async def enrich_template(
    filepath: Path,
    client: AsyncOpenAI,
    model: str,
    max_workers: int = 5,
    dry_run: bool = False,
) -> dict:
    """Enrich every term of one glossary template file and save it in place.

    Terms are processed concurrently through ``process_term``, with at most
    ``max_workers`` of them in flight at any time.

    Args:
        filepath: JSON template with optional ``"name"`` and ``"terms"`` keys.
        client: Client forwarded to ``process_term``.
        model: Model name forwarded to ``process_term``.
        max_workers: Concurrency limit; values below 1 are treated as 1.
        dry_run: When true, only the header is printed; no API call is made
            and the file is not rewritten.

    Returns:
        ``{"enriched": int, "flagged": int}``, counted over all target
        languages of all terms. A dry run returns zeros plus ``"skipped": 0``.
    """
    filepath = Path(filepath)
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    domain = data.get("name", "général")
    # ``terms`` may be absent or null in the template file.
    terms = data.get("terms") or []
    # Semaphore(0) would block every task forever and a negative value raises.
    worker_count = max(1, max_workers)

    print(f"\n{'='*60}", flush=True)
    print(f"Template: {domain} ({len(terms)} terms, {worker_count} workers)", flush=True)
    print(f"{'='*60}", flush=True)

    if dry_run:
        print(" [DRY RUN - no API calls]", flush=True)
        return {"enriched": 0, "flagged": 0, "skipped": 0}

    # Process terms in parallel, bounded by the semaphore.
    semaphore = asyncio.Semaphore(worker_count)

    async def limited_process(idx, term):
        async with semaphore:
            return await process_term(client, model, term, domain, idx, len(terms))

    results = await asyncio.gather(
        *(limited_process(i, t) for i, t in enumerate(terms)),
        return_exceptions=True,
    )

    enriched = 0
    flagged = 0
    for i, result in enumerate(results):
        # BaseException also covers asyncio.CancelledError returned by gather.
        if isinstance(result, BaseException):
            print(f" [ERROR] term {i}: {result}", flush=True)
            flagged += 1
            continue

        terms[i] = result
        tr = result.get("translations") or {}
        for lang in TARGET_LANGUAGES:
            if lang not in tr:
                continue
            value = tr[lang]
            # A non-string value counts as flagged instead of crashing here,
            # which would discard every result before the file is saved.
            if isinstance(value, str) and not value.startswith("REVIEW:"):
                enriched += 1
            else:
                flagged += 1

    data["terms"] = terms
    # Write to a sibling file and swap it in, so an interrupted run cannot
    # leave a truncated template behind.
    tmp_path = filepath.with_name(filepath.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        tmp_path.replace(filepath)
    finally:
        tmp_path.unlink(missing_ok=True)

    print(f"\n ✓ Saved to {filepath}", flush=True)
    print(f" Stats: {enriched} confirmed, {flagged} flagged", flush=True)
    return {"enriched": enriched, "flagged": flagged}
|
||
|
||
|
||
async def async_main(args):
    """Enrich every glossary template listed in ``index.json``.

    Args:
        args: Parsed command-line namespace providing ``api``, ``model``,
            ``workers``, ``template`` and ``dry_run``.

    Each entry under ``"categories"`` in ``GLOSSARIES_DIR / "index.json"`` is
    processed in turn (only the one whose id equals ``args.template`` when
    that option is set) and the totals are printed at the end.
    """
    client = get_client(args.api)
    # try/finally so the client is closed even when a template fails to load.
    try:
        model = get_model(args.api, args.model)

        print(f"API: {args.api}, Model: {model}, Workers: {args.workers}", flush=True)
        print(f"Target languages: {', '.join(TARGET_LANGUAGES)}", flush=True)

        with open(GLOSSARIES_DIR / "index.json", "r", encoding="utf-8") as f:
            index = json.load(f)

        total = {"enriched": 0, "flagged": 0}
        matched = False

        for cat_id, cat_data in index.get("categories", {}).items():
            if args.template and cat_id != args.template:
                continue
            matched = True

            filepath = GLOSSARIES_DIR / cat_data["file"]
            if not filepath.exists():
                print(f" [SKIP] {filepath} not found", flush=True)
                continue

            stats = await enrich_template(filepath, client, model, args.workers, args.dry_run)
            for k in total:
                total[k] += stats.get(k, 0)

        if args.template and not matched:
            # Without this, a mistyped --template looks like a successful empty run.
            print(f" [SKIP] template '{args.template}' not found in index.json", flush=True)

        print(f"\n{'='*60}", flush=True)
        print(f"DONE. Total: {total['enriched']} confirmed, {total['flagged']} flagged", flush=True)
    finally:
        await client.close()
|
||
|
||
|
||
def main():
    """Parse the command line and run the enrichment.

    Exits with argparse's usage error (status 2) when ``--workers`` is below 1:
    a zero-sized semaphore would block every task forever and a negative one
    cannot be created.
    """
    parser = argparse.ArgumentParser(description="Enrich glossary templates with multilingual translations")
    parser.add_argument("--api", choices=["openai", "deepseek"], default="deepseek")
    parser.add_argument("--model", default=None)
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--template", default=None, help="Only process one template (e.g. 'technology')")
    parser.add_argument("--workers", type=int, default=5, help="Parallel API calls (default: 5)")
    args = parser.parse_args()
    if args.workers < 1:
        parser.error("--workers must be at least 1")
    asyncio.run(async_main(args))


if __name__ == "__main__":
    main()
|