feat: add multilingual glossary support (backend + frontend types)

Backend:
- Add source_language column to glossaries table
- Add translations JSON column to glossary_terms table
- Alembic migration for schema changes
- get_glossary_terms now returns a dict with 'source_language' and 'terms' instead of a bare list of terms
- format_glossary_for_prompt is now language-aware: it extracts the correct translation per target language and falls back to the EN reference for templates with only FR→EN data
- CRUD routes accept/return source_language and translations
- Pydantic schemas updated

Frontend:
- Types updated: GlossaryTerm now has translations: Record<string, string>
- Glossary/GlossaryListItem now have source_language
- Added SUPPORTED_LANGUAGES constant (13 languages)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-16 15:25:28 +02:00
parent a76f7710e8
commit b2d918c832
8 changed files with 167 additions and 46 deletions
--- a/services/glossary_service.py
+++ b/services/glossary_service.py
@@ -15,17 +15,17 @@ from utils.exceptions import GlossaryNotFoundError
 logger = logging.getLogger(__name__)


-def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
+def get_glossary_terms(glossary_id: str, user_id: str) -> Dict[str, Any]:
    """
-    Retrieve glossary terms for a specific glossary owned by a user.
-    
+    Retrieve glossary terms and metadata for a specific glossary owned by a user.
+
    Args:
        glossary_id: UUID of the glossary
        user_id: UUID of the user (must own the glossary)
-    
+
    Returns:
-        List of dictionaries with 'source' and 'target' keys
-        
+        Dict with 'source_language' and 'terms' (list of dicts with source, target, translations)
+
    Raises:
        GlossaryNotFoundError: If glossary doesn't exist or doesn't belong to user
    """
@@ -36,28 +36,33 @@ def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
                .filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
                .first()
            )
-            
+
            if not glossary:
                raise GlossaryNotFoundError(
                    message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
                    details={"glossary_id": glossary_id}
                )
-            
-            # Get all terms for this glossary
+
            terms = (
                session.query(GlossaryTerm)
                .filter(GlossaryTerm.glossary_id == glossary_id)
                .all()
            )
-            
-            # Format as list of dicts
-            result = [{"source": term.source, "target": term.target} for term in terms]
-            
+
+            result = [{
+                "source": term.source,
+                "target": term.target,
+                "translations": term.translations or {}
+            } for term in terms]
+
            logger.info(
                f"Retrieved {len(result)} terms from glossary {glossary_id} for user {user_id}"
            )
-            
-            return result
+
+            return {
+                "source_language": glossary.source_language or "fr",
+                "terms": result,
+            }
            
    except GlossaryNotFoundError:
        raise
@@ -112,72 +117,101 @@ def validate_glossary_access(glossary_id: str, user_id: str) -> bool:
        )


-def format_glossary_for_prompt(terms: List[Dict[str, str]]) -> str:
+def format_glossary_for_prompt(
+    terms: List[Dict[str, str]],
+    source_lang: str = "fr",
+    target_lang: str = "en",
+) -> str:
    """
    Format glossary terms for injection into an LLM system prompt.
-    
-    The format is designed to be clear and unambiguous for LLMs:
-    - Clear header explaining the purpose
-    - Simple source → target format
-    - Explicit instruction to use these translations
-    
+
+    When a term has a translation for target_lang in its translations dict,
+    that specific translation is used. Otherwise, falls back to the default
+    target field (backward compat). For templates that only have EN translations,
+    the LLM is instructed to derive the correct target_lang equivalent.
+
    Args:
-        terms: List of dictionaries with 'source' and 'target' keys
-    
+        terms: List of dicts with 'source', 'target', and optional 'translations'
+        source_lang: ISO code of the source language
+        target_lang: ISO code of the target language
+
    Returns:
        Formatted string for LLM prompt
    """
    if not terms:
        return ""
-    
-    # Sort terms by length (longest first) to avoid substring conflicts
-    # e.g., "machine learning" should match before "machine"
+
    sorted_terms = sorted(terms, key=lambda t: len(t.get("source", "")), reverse=True)
-    
+
    lines = [
-        "TERMINOLOGY GLOSSARY (use these exact translations):",
+        f"TERMINOLOGY GLOSSARY (translate from {source_lang} to {target_lang}):",
        ""
    ]
-    
+
+    has_fallback = False
    for term in sorted_terms:
        source = term.get("source", "").strip()
-        target = term.get("target", "").strip()
-        if source and target:
-            # Escape single quotes in terms for clarity
+        if not source:
+            continue
+
+        translations = term.get("translations", {}) or {}
+        specific = translations.get(target_lang, "").strip()
+        default_target = term.get("target", "").strip()
+
+        if specific:
            source_escaped = source.replace("'", "\\'")
-            target_escaped = target.replace("'", "\\'")
+            target_escaped = specific.replace("'", "\\'")
            lines.append(f"- '{source_escaped}' → '{target_escaped}'")
-    
+        elif default_target:
+            source_escaped = source.replace("'", "\\'")
+            target_escaped = default_target.replace("'", "\\'")
+            lines.append(f"- '{source_escaped}' → '{target_escaped}' (EN reference, adapt to {target_lang})")
+            has_fallback = True
+        # If neither specific nor default, skip the term
+
+    if not any(line.startswith("- ") for line in lines):
+        return ""
+
    lines.extend([
        "",
        "IMPORTANT: Always use these translations when the terms appear in the text."
    ])
-    
+
+    if has_fallback:
+        lines.append(
+            "NOTE: Some entries show an English reference — translate to the correct "
+            f"{target_lang} equivalent while preserving the intended meaning."
+        )
+
    return "\n".join(lines)


 def build_full_prompt(
    custom_prompt: Optional[str],
-    glossary_terms: Optional[List[Dict[str, str]]]
+    glossary_terms: Optional[List[Dict[str, str]]],
+    source_lang: str = "fr",
+    target_lang: str = "en",
 ) -> str:
    """
    Build the complete prompt combining custom prompt and glossary.
-    
+
    Args:
        custom_prompt: Optional custom system prompt from user
        glossary_terms: Optional list of glossary terms
-    
+        source_lang: ISO code of the source language
+        target_lang: ISO code of the target language
+
    Returns:
        Combined prompt string
    """
    parts = []
-    
+
    if custom_prompt:
        parts.append(custom_prompt)
-    
+
    if glossary_terms:
-        glossary_prompt = format_glossary_for_prompt(glossary_terms)
+        glossary_prompt = format_glossary_for_prompt(glossary_terms, source_lang, target_lang)
        if glossary_prompt:
            parts.append(glossary_prompt)
-    
+
    return "\n\n".join(parts) if parts else ""