feat: add multilingual glossary support (backend + frontend types)

Backend:
- Add source_language column to glossaries table
- Add translations JSON column to glossary_terms table
- Alembic migration for schema changes
- format_glossary_for_prompt is now language-aware: it extracts the correct
  translation per target language and falls back to the EN reference for
  templates with only FR→EN data
- CRUD routes accept/return source_language and translations
- Pydantic schemas updated

Frontend:
- Types updated: GlossaryTerm now has translations: Record<string, string>
- Glossary/GlossaryListItem now have source_language
- Added SUPPORTED_LANGUAGES constant (13 languages)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-16 15:25:28 +02:00
parent a76f7710e8
commit b2d918c832
8 changed files with 167 additions and 46 deletions
--- a/alembic/versions/d4a1f8e2b3c7_add_glossary_multilingual.py
+++ b/alembic/versions/d4a1f8e2b3c7_add_glossary_multilingual.py
@@ -0,0 +1,35 @@
+"""Add multilingual support to glossaries
+
+Revision ID: d4a1f8e2b3c7
+Revises: cb71a958ad92
+Create Date: 2026-05-16
+
+Adds source_language to glossaries and translations JSON to glossary_terms.
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers
+revision = "d4a1f8e2b3c7"
+down_revision = "cb71a958ad92"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    op.add_column(
+        "glossaries",
+        sa.Column("source_language", sa.String(10), nullable=False, server_default="fr"),
+    )
+    op.add_column(
+        "glossary_terms",
+        sa.Column("translations", sa.JSON, nullable=True),
+    )
+
+
+def downgrade() -> None:
+    op.drop_column("glossary_terms", "translations")
+    op.drop_column("glossaries", "source_language")
--- a/database/models.py
+++ b/database/models.py
@@ -330,6 +330,7 @@ class Glossary(Base):
        String(36), ForeignKey("users.id", ondelete="CASCADE"), nullable=False
    )
    name = Column(String(255), nullable=False)
+    source_language = Column(String(10), nullable=False, default="fr")
    created_at = Column(DateTime, default=_utcnow)
    updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)

@@ -346,6 +347,7 @@ class Glossary(Base):
            "id": self.id,
            "user_id": self.user_id,
            "name": self.name,
+            "source_language": self.source_language,
            "terms": [term.to_dict() for term in self.terms] if self.terms else [],
            "created_at": self.created_at.isoformat() if self.created_at else None,
            "updated_at": self.updated_at.isoformat() if self.updated_at else None,
@@ -365,6 +367,7 @@ class GlossaryTerm(Base):
    )
    source = Column(String(500), nullable=False)
    target = Column(String(500), nullable=False)
+    translations = Column(JSON, nullable=True, default=dict)
    created_at = Column(DateTime, default=_utcnow)

    # Relationship
@@ -378,6 +381,7 @@ class GlossaryTerm(Base):
            "id": self.id,
            "source": self.source,
            "target": self.target,
+            "translations": self.translations or {},
            "created_at": self.created_at.isoformat() if self.created_at else None,
        }

--- a/frontend/src/app/dashboard/glossaries/EditGlossaryDialog.tsx
+++ b/frontend/src/app/dashboard/glossaries/EditGlossaryDialog.tsx
@@ -71,6 +71,7 @@ export function EditGlossaryDialog({
        id: `temp-${i}`,
        source: t.source,
        target: t.target,
+        translations: t.translations || {},
        created_at: null,
      })),
    };
--- a/frontend/src/app/dashboard/glossaries/types.ts
+++ b/frontend/src/app/dashboard/glossaries/types.ts
@@ -2,12 +2,14 @@ export interface GlossaryTerm {
  id: string;
  source: string;
  target: string;
+  translations: Record<string, string>;
  created_at: string | null;
 }

 export interface Glossary {
  id: string;
  name: string;
+  source_language: string;
  terms: GlossaryTerm[];
  created_at: string;
  updated_at: string;
@@ -16,6 +18,7 @@ export interface Glossary {
 export interface GlossaryListItem {
  id: string;
  name: string;
+  source_language: string;
  terms_count: number;
  created_at: string;
 }
@@ -48,6 +51,7 @@ export interface GlossaryUpdateResponse {
 export interface GlossaryTermInput {
  source: string;
  target: string;
+  translations?: Record<string, string>;
 }

 export interface GlossaryTermInputWithId extends GlossaryTermInput {
@@ -56,16 +60,34 @@ export interface GlossaryTermInputWithId extends GlossaryTermInput {

 export interface GlossaryCreateInput {
  name: string;
+  source_language?: string;
  terms?: GlossaryTermInput[];
 }

 export interface GlossaryUpdateInput {
  name?: string;
+  source_language?: string;
  terms?: GlossaryTermInput[];
 }

 export const MAX_TERMS_PER_GLOSSARY = 500;

+export const SUPPORTED_LANGUAGES: { code: string; label: string; flag: string }[] = [
+  { code: 'en', label: 'English', flag: '🇬🇧' },
+  { code: 'fr', label: 'Français', flag: '🇫🇷' },
+  { code: 'es', label: 'Español', flag: '🇪🇸' },
+  { code: 'de', label: 'Deutsch', flag: '🇩🇪' },
+  { code: 'pt', label: 'Português', flag: '🇧🇷' },
+  { code: 'it', label: 'Italiano', flag: '🇮🇹' },
+  { code: 'nl', label: 'Nederlands', flag: '🇳🇱' },
+  { code: 'ru', label: 'Русский', flag: '🇷🇺' },
+  { code: 'ja', label: '日本語', flag: '🇯🇵' },
+  { code: 'ko', label: '한국어', flag: '🇰🇷' },
+  { code: 'zh', label: '中文', flag: '🇨🇳' },
+  { code: 'ar', label: 'العربية', flag: '🇸🇦' },
+  { code: 'fa', label: 'فارسی', flag: '🇮🇷' },
+];
+
 // Generate unique IDs for React keys
 let idCounter = 0;
 export function generateTermId(): string {
--- a/routes/glossary_routes.py
+++ b/routes/glossary_routes.py
@@ -46,6 +46,7 @@ def _format_term(term: GlossaryTerm) -> dict:
        "id": term.id,
        "source": term.source,
        "target": term.target,
+        "translations": term.translations or {},
        "created_at": term.created_at.isoformat() if term.created_at else None,
    }

@@ -55,6 +56,7 @@ def _format_glossary(glossary: Glossary) -> dict:
    return {
        "id": glossary.id,
        "name": glossary.name,
+        "source_language": glossary.source_language,
        "terms": [_format_term(t) for t in glossary.terms] if glossary.terms else [],
        "created_at": glossary.created_at.isoformat() if glossary.created_at else None,
        "updated_at": glossary.updated_at.isoformat() if glossary.updated_at else None,
@@ -103,6 +105,7 @@ async def create_glossary(
            glossary = Glossary(
                user_id=user.id,
                name=body.name,
+                source_language=body.source_language,
                created_at=datetime.now(timezone.utc),
                updated_at=datetime.now(timezone.utc),
            )
@@ -112,6 +115,7 @@ async def create_glossary(
                    glossary=glossary,
                    source=term_data.source,
                    target=term_data.target,
+                    translations=term_data.translations or {},
                    created_at=datetime.now(timezone.utc),
                )
                session.add(term)
@@ -180,6 +184,7 @@ async def list_glossaries(
                GlossaryListItem(
                    id=g.id,
                    name=g.name,
+                    source_language=g.source_language or "fr",
                    terms_count=len(g.terms) if g.terms else 0,
                    created_at=g.created_at,
                )
@@ -331,6 +336,9 @@ async def update_glossary(
            if body.name is not None:
                glossary.name = body.name

+            if body.source_language is not None:
+                glossary.source_language = body.source_language
+
            if body.terms is not None:
                # Delete existing terms
                session.query(GlossaryTerm).filter(
@@ -343,6 +351,7 @@ async def update_glossary(
                        glossary_id=glossary.id,
                        source=term_data.source,
                        target=term_data.target,
+                        translations=term_data.translations or {},
                        created_at=datetime.now(timezone.utc),
                    )
                    session.add(term)
--- a/routes/translate_routes.py
+++ b/routes/translate_routes.py
@@ -915,10 +915,13 @@ async def _run_translation_job(

        # Story 3.10: Retrieve and format glossary terms for LLM prompt
        glossary_terms = None
+        glossary_source_lang = "fr"
        if glossary_id and user_id:
            try:
-                glossary_terms = get_glossary_terms(glossary_id, user_id)
-                logger.info(f"Job {job_id}: Loaded {len(glossary_terms)} glossary terms")
+                glossary_data = get_glossary_terms(glossary_id, user_id)
+                glossary_terms = glossary_data["terms"]
+                glossary_source_lang = glossary_data.get("source_language", "fr")
+                logger.info(f"Job {job_id}: Loaded {len(glossary_terms)} glossary terms (source: {glossary_source_lang})")
            except GlossaryNotFoundError as e:
                tracker.set_error(str(e))
                logger.error(f"Job {job_id}: Glossary error - {e}")
@@ -940,7 +943,10 @@ async def _run_translation_job(
            effective_prompt = custom_prompt

        # Build the full prompt combining effective prompt and glossary
-        full_prompt = build_full_prompt(effective_prompt, glossary_terms)
+        full_prompt = build_full_prompt(
+            effective_prompt, glossary_terms,
+            source_lang=glossary_source_lang, target_lang=target_lang,
+        )

        translation_provider = None
        _p = provider.lower()
--- a/schemas/glossary_schemas.py
+++ b/schemas/glossary_schemas.py
@@ -17,6 +17,9 @@ class GlossaryTermCreate(BaseModel):
    target: str = Field(
        ..., min_length=1, max_length=500, description="Traduction cible"
    )
+    translations: Optional[dict[str, str]] = Field(
+        None, description="Traductions multilingues: {\"en\": \"coil\", \"de\": \"Spule\", ...}"
+    )

    @field_validator("source", "target")
    @classmethod
@@ -30,6 +33,7 @@ class GlossaryTermResponse(BaseModel):
    id: str
    source: str
    target: str
+    translations: dict[str, str] = {}
    created_at: Optional[datetime] = None

    model_config = {"from_attributes": True}
@@ -39,6 +43,9 @@ class GlossaryCreate(BaseModel):
    """Schema for creating a glossary."""

    name: str = Field(..., min_length=1, max_length=255, description="Nom du glossaire")
+    source_language: str = Field(
+        default="fr", max_length=10, description="Langue source (ISO code)"
+    )
    terms: list[GlossaryTermCreate] = Field(
        default_factory=list, description="Liste des termes"
    )
@@ -53,6 +60,7 @@ class GlossaryUpdate(BaseModel):
    """Schema for updating a glossary (all fields optional)."""

    name: Optional[str] = Field(None, min_length=1, max_length=255)
+    source_language: Optional[str] = Field(None, max_length=10)
    terms: Optional[list[GlossaryTermCreate]] = Field(None)

    @field_validator("name")
@@ -66,6 +74,7 @@ class GlossaryResponse(BaseModel):

    id: str
    name: str
+    source_language: str = "fr"
    terms: list[GlossaryTermResponse] = []
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None
@@ -78,6 +87,7 @@ class GlossaryListItem(BaseModel):

    id: str
    name: str
+    source_language: str = "fr"
    terms_count: int = Field(
        default=0, description="Nombre de termes dans le glossaire"
    )
--- a/services/glossary_service.py
+++ b/services/glossary_service.py
@@ -15,17 +15,17 @@ from utils.exceptions import GlossaryNotFoundError
 logger = logging.getLogger(__name__)


-def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
+def get_glossary_terms(glossary_id: str, user_id: str) -> Dict[str, Any]:
    """
-    Retrieve glossary terms for a specific glossary owned by a user.
-    
+    Retrieve glossary terms and metadata for a specific glossary owned by a user.
+
    Args:
        glossary_id: UUID of the glossary
        user_id: UUID of the user (must own the glossary)
-    
+
    Returns:
-        List of dictionaries with 'source' and 'target' keys
-        
+        Dict with 'source_language' and 'terms' (list of dicts with source, target, translations)
+
    Raises:
        GlossaryNotFoundError: If glossary doesn't exist or doesn't belong to user
    """
@@ -36,28 +36,33 @@ def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
                .filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
                .first()
            )
-            
+
            if not glossary:
                raise GlossaryNotFoundError(
                    message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
                    details={"glossary_id": glossary_id}
                )
-            
-            # Get all terms for this glossary
+
            terms = (
                session.query(GlossaryTerm)
                .filter(GlossaryTerm.glossary_id == glossary_id)
                .all()
            )
-            
-            # Format as list of dicts
-            result = [{"source": term.source, "target": term.target} for term in terms]
-            
+
+            result = [{
+                "source": term.source,
+                "target": term.target,
+                "translations": term.translations or {}
+            } for term in terms]
+
            logger.info(
                f"Retrieved {len(result)} terms from glossary {glossary_id} for user {user_id}"
            )
-            
-            return result
+
+            return {
+                "source_language": glossary.source_language or "fr",
+                "terms": result,
+            }
            
    except GlossaryNotFoundError:
        raise
@@ -112,72 +117,101 @@ def validate_glossary_access(glossary_id: str, user_id: str) -> bool:
        )


-def format_glossary_for_prompt(terms: List[Dict[str, str]]) -> str:
+def format_glossary_for_prompt(
+    terms: List[Dict[str, str]],
+    source_lang: str = "fr",
+    target_lang: str = "en",
+) -> str:
    """
    Format glossary terms for injection into an LLM system prompt.
-    
-    The format is designed to be clear and unambiguous for LLMs:
-    - Clear header explaining the purpose
-    - Simple source → target format
-    - Explicit instruction to use these translations
-    
+
+    When a term has a translation for target_lang in its translations dict,
+    that specific translation is used. Otherwise, falls back to the default
+    target field (backward compat). For templates that only have EN translations,
+    the LLM is instructed to derive the correct target_lang equivalent.
+
    Args:
-        terms: List of dictionaries with 'source' and 'target' keys
-    
+        terms: List of dicts with 'source', 'target', and optional 'translations'
+        source_lang: ISO code of the source language
+        target_lang: ISO code of the target language
+
    Returns:
        Formatted string for LLM prompt
    """
    if not terms:
        return ""
-    
-    # Sort terms by length (longest first) to avoid substring conflicts
-    # e.g., "machine learning" should match before "machine"
+
    sorted_terms = sorted(terms, key=lambda t: len(t.get("source", "")), reverse=True)
-    
+
    lines = [
-        "TERMINOLOGY GLOSSARY (use these exact translations):",
+        f"TERMINOLOGY GLOSSARY (translate from {source_lang} to {target_lang}):",
        ""
    ]
-    
+
+    has_fallback = False
    for term in sorted_terms:
        source = term.get("source", "").strip()
-        target = term.get("target", "").strip()
-        if source and target:
-            # Escape single quotes in terms for clarity
+        if not source:
+            continue
+
+        translations = term.get("translations", {}) or {}
+        specific = translations.get(target_lang, "").strip()
+        default_target = term.get("target", "").strip()
+
+        if specific:
            source_escaped = source.replace("'", "\\'")
-            target_escaped = target.replace("'", "\\'")
+            target_escaped = specific.replace("'", "\\'")
            lines.append(f"- '{source_escaped}' → '{target_escaped}'")
-    
+        elif default_target:
+            source_escaped = source.replace("'", "\\'")
+            target_escaped = default_target.replace("'", "\\'")
+            lines.append(f"- '{source_escaped}' → '{target_escaped}' (EN reference, adapt to {target_lang})")
+            has_fallback = True
+        # If neither specific nor default, skip the term
+
+    if not any(line.startswith("- ") for line in lines):
+        return ""
+
    lines.extend([
        "",
        "IMPORTANT: Always use these translations when the terms appear in the text."
    ])
-    
+
+    if has_fallback:
+        lines.append(
+            "NOTE: Some entries show an English reference — translate to the correct "
+            f"{target_lang} equivalent while preserving the intended meaning."
+        )
+
    return "\n".join(lines)


 def build_full_prompt(
    custom_prompt: Optional[str],
-    glossary_terms: Optional[List[Dict[str, str]]]
+    glossary_terms: Optional[List[Dict[str, str]]],
+    source_lang: str = "fr",
+    target_lang: str = "en",
 ) -> str:
    """
    Build the complete prompt combining custom prompt and glossary.
-    
+
    Args:
        custom_prompt: Optional custom system prompt from user
        glossary_terms: Optional list of glossary terms
-    
+        source_lang: ISO code of the source language
+        target_lang: ISO code of the target language
+
    Returns:
        Combined prompt string
    """
    parts = []
-    
+
    if custom_prompt:
        parts.append(custom_prompt)
-    
+
    if glossary_terms:
-        glossary_prompt = format_glossary_for_prompt(glossary_terms)
+        glossary_prompt = format_glossary_for_prompt(glossary_terms, source_lang, target_lang)
        if glossary_prompt:
            parts.append(glossary_prompt)
-    
+
    return "\n\n".join(parts) if parts else ""