feat: add multilingual glossary support (backend + frontend types)

Backend:
- Add source_language column to glossaries table
- Add translations JSON column to glossary_terms table
- Alembic migration for schema changes
- format_glossary_for_prompt now language-aware: extracts correct translation per target language, falls back to EN reference for templates with only FR→EN data
- CRUD routes accept/return source_language and translations
- Pydantic schemas updated

Frontend:
- Types updated: GlossaryTerm now has translations: Record<string, string>
- Glossary/GlossaryListItem now have source_language
- Added SUPPORTED_LANGUAGES constant (13 languages)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-16 15:25:28 +02:00
parent a76f7710e8
commit b2d918c832
8 changed files with 167 additions and 46 deletions
--- a/alembic/versions/d4a1f8e2b3c7_add_glossary_multilingual.py
+++ b/alembic/versions/d4a1f8e2b3c7_add_glossary_multilingual.py
@@ -0,0 +1,35 @@
 """Add multilingual support to glossaries
 Revision ID: d4a1f8e2b3c7
 Revises: cb71a958ad92
 Create Date: 2026-05-16
 Adds source_language to glossaries and translations JSON to glossary_terms.
 """
 from typing import Sequence, Union
 from alembic import op
 import sqlalchemy as sa
 # revision identifiers
 revision = "d4a1f8e2b3c7"
 down_revision = "cb71a958ad92"
 branch_labels: Union[str, Sequence[str], None] = None
 depends_on: Union[str, Sequence[str], None] = None
 def upgrade() -> None:
     """Add ``glossaries.source_language`` and ``glossary_terms.translations``."""
     # The server default ("fr") lets the NOT NULL column be added to a table
     # that may already contain rows.
     source_language_column = sa.Column(
         "source_language",
         sa.String(10),
         nullable=False,
         server_default="fr",
     )
     # Per-term multilingual translations; nullable so existing terms stay valid.
     translations_column = sa.Column("translations", sa.JSON, nullable=True)

     op.add_column("glossaries", source_language_column)
     op.add_column("glossary_terms", translations_column)
 def downgrade() -> None:
     """Drop the columns added by ``upgrade``, in reverse order."""
     columns_to_drop = (
         ("glossary_terms", "translations"),
         ("glossaries", "source_language"),
     )
     for table_name, column_name in columns_to_drop:
         op.drop_column(table_name, column_name)
--- a/database/models.py
+++ b/database/models.py
@@ -330,6 +330,7 @@ class Glossary(Base):
        String(36), ForeignKey("users.id", ondelete="CASCADE"), nullable=False
    )
    name = Column(String(255), nullable=False)
    source_language = Column(String(10), nullable=False, default="fr")
    created_at = Column(DateTime, default=_utcnow)
    updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)
@@ -346,6 +347,7 @@ class Glossary(Base):
            "id": self.id,
            "user_id": self.user_id,
            "name": self.name,
            "source_language": self.source_language,
            "terms": [term.to_dict() for term in self.terms] if self.terms else [],
            "created_at": self.created_at.isoformat() if self.created_at else None,
            "updated_at": self.updated_at.isoformat() if self.updated_at else None,
@@ -365,6 +367,7 @@ class GlossaryTerm(Base):
    )
    source = Column(String(500), nullable=False)
    target = Column(String(500), nullable=False)
    translations = Column(JSON, nullable=True, default=dict)
    created_at = Column(DateTime, default=_utcnow)
    # Relationship
@@ -378,6 +381,7 @@ class GlossaryTerm(Base):
            "id": self.id,
            "source": self.source,
            "target": self.target,
            "translations": self.translations or {},
            "created_at": self.created_at.isoformat() if self.created_at else None,
        }
--- a/frontend/src/app/dashboard/glossaries/EditGlossaryDialog.tsx
+++ b/frontend/src/app/dashboard/glossaries/EditGlossaryDialog.tsx
@@ -71,6 +71,7 @@ export function EditGlossaryDialog({
        id: `temp-${i}`,
        source: t.source,
        target: t.target,
        translations: t.translations || {},
        created_at: null,
      })),
    };
--- a/frontend/src/app/dashboard/glossaries/types.ts
+++ b/frontend/src/app/dashboard/glossaries/types.ts
@@ -2,12 +2,14 @@ export interface GlossaryTerm {
  id: string;
  source: string;
  target: string;
  translations: Record<string, string>;
  created_at: string | null;
 }
 export interface Glossary {
  id: string;
  name: string;
  source_language: string;
  terms: GlossaryTerm[];
  created_at: string;
  updated_at: string;
@@ -16,6 +18,7 @@ export interface Glossary {
 /**
  * Summary entry for a glossary: carries a term count (`terms_count`)
  * instead of the term list itself.
  */
 export interface GlossaryListItem {
  id: string;
  name: string;
  // Language code of the glossary's source terms (e.g. "fr").
  source_language: string;
  terms_count: number;
  created_at: string;
 }
@@ -48,6 +51,7 @@ export interface GlossaryUpdateResponse {
 /**
  * One glossary term as supplied in the `terms` list of a create/update input.
  */
 export interface GlossaryTermInput {
  source: string;
  // Default translation of `source`.
  target: string;
  // Optional per-language translations keyed by language code,
  // e.g. { en: "coil", de: "Spule" }.
  translations?: Record<string, string>;
 }
 export interface GlossaryTermInputWithId extends GlossaryTermInput {
@@ -56,16 +60,34 @@ export interface GlossaryTermInputWithId extends GlossaryTermInput {
 /**
  * Input for creating a glossary. Only `name` is required.
  */
 export interface GlossaryCreateInput {
  name: string;
  // Language code of the source terms; the backend schema defaults it to "fr"
  // when omitted.
  source_language?: string;
  terms?: GlossaryTermInput[];
 }
 /**
  * Input for updating a glossary; every field is optional.
  */
 export interface GlossaryUpdateInput {
  name?: string;
  source_language?: string;
  // NOTE(review): the update route appears to delete the existing terms and
  // re-create them from this list when it is provided — confirm before sending
  // a partial list.
  terms?: GlossaryTermInput[];
 }
 export const MAX_TERMS_PER_GLOSSARY = 500;
 /**
  * Language options: short code, label written in the language itself, and a
  * flag emoji.
  * NOTE(review): the codes look like the same two-letter keys used in the
  * `translations` maps — confirm against the backend before adding entries.
  */
 export const SUPPORTED_LANGUAGES: { code: string; label: string; flag: string }[] = [
  { code: 'en', label: 'English', flag: '🇬🇧' },
  { code: 'fr', label: 'Français', flag: '🇫🇷' },
  { code: 'es', label: 'Español', flag: '🇪🇸' },
  { code: 'de', label: 'Deutsch', flag: '🇩🇪' },
  { code: 'pt', label: 'Português', flag: '🇧🇷' },
  { code: 'it', label: 'Italiano', flag: '🇮🇹' },
  { code: 'nl', label: 'Nederlands', flag: '🇳🇱' },
  { code: 'ru', label: 'Русский', flag: '🇷🇺' },
  { code: 'ja', label: '日本語', flag: '🇯🇵' },
  { code: 'ko', label: '한국어', flag: '🇰🇷' },
  { code: 'zh', label: '中文', flag: '🇨🇳' },
  { code: 'ar', label: 'العربية', flag: '🇸🇦' },
  { code: 'fa', label: 'فارسی', flag: '🇮🇷' },
 ];
 // Generate unique IDs for React keys
 let idCounter = 0;
 export function generateTermId(): string {
--- a/routes/glossary_routes.py
+++ b/routes/glossary_routes.py
@@ -46,6 +46,7 @@ def _format_term(term: GlossaryTerm) -> dict:
        "id": term.id,
        "source": term.source,
        "target": term.target,
        "translations": term.translations or {},
        "created_at": term.created_at.isoformat() if term.created_at else None,
    }
@@ -55,6 +56,7 @@ def _format_glossary(glossary: Glossary) -> dict:
    return {
        "id": glossary.id,
        "name": glossary.name,
        "source_language": glossary.source_language,
        "terms": [_format_term(t) for t in glossary.terms] if glossary.terms else [],
        "created_at": glossary.created_at.isoformat() if glossary.created_at else None,
        "updated_at": glossary.updated_at.isoformat() if glossary.updated_at else None,
@@ -103,6 +105,7 @@ async def create_glossary(
            glossary = Glossary(
                user_id=user.id,
                name=body.name,
                source_language=body.source_language,
                created_at=datetime.now(timezone.utc),
                updated_at=datetime.now(timezone.utc),
            )
@@ -112,6 +115,7 @@ async def create_glossary(
                    glossary=glossary,
                    source=term_data.source,
                    target=term_data.target,
                    translations=term_data.translations or {},
                    created_at=datetime.now(timezone.utc),
                )
                session.add(term)
@@ -180,6 +184,7 @@ async def list_glossaries(
                GlossaryListItem(
                    id=g.id,
                    name=g.name,
                    source_language=g.source_language or "fr",
                    terms_count=len(g.terms) if g.terms else 0,
                    created_at=g.created_at,
                )
@@ -331,6 +336,9 @@ async def update_glossary(
            if body.name is not None:
                glossary.name = body.name
            if body.source_language is not None:
                glossary.source_language = body.source_language
            if body.terms is not None:
                # Delete existing terms
                session.query(GlossaryTerm).filter(
@@ -343,6 +351,7 @@ async def update_glossary(
                        glossary_id=glossary.id,
                        source=term_data.source,
                        target=term_data.target,
                        translations=term_data.translations or {},
                        created_at=datetime.now(timezone.utc),
                    )
                    session.add(term)
--- a/routes/translate_routes.py
+++ b/routes/translate_routes.py
@@ -915,10 +915,13 @@ async def _run_translation_job(
        # Story 3.10: Retrieve and format glossary terms for LLM prompt
        glossary_terms = None
        glossary_source_lang = "fr"
        if glossary_id and user_id:
            try:
-                glossary_terms = get_glossary_terms(glossary_id, user_id)
+                glossary_data = get_glossary_terms(glossary_id, user_id)
-                logger.info(f"Job {job_id}: Loaded {len(glossary_terms)} glossary terms")
+                glossary_terms = glossary_data["terms"]
                glossary_source_lang = glossary_data.get("source_language", "fr")
                logger.info(f"Job {job_id}: Loaded {len(glossary_terms)} glossary terms (source: {glossary_source_lang})")
            except GlossaryNotFoundError as e:
                tracker.set_error(str(e))
                logger.error(f"Job {job_id}: Glossary error - {e}")
@@ -940,7 +943,10 @@ async def _run_translation_job(
            effective_prompt = custom_prompt
        # Build the full prompt combining effective prompt and glossary
-        full_prompt = build_full_prompt(effective_prompt, glossary_terms)
+        full_prompt = build_full_prompt(
            effective_prompt, glossary_terms,
            source_lang=glossary_source_lang, target_lang=target_lang,
        )
        translation_provider = None
        _p = provider.lower()
--- a/schemas/glossary_schemas.py
+++ b/schemas/glossary_schemas.py
@@ -17,6 +17,9 @@ class GlossaryTermCreate(BaseModel):
    target: str = Field(
        ..., min_length=1, max_length=500, description="Traduction cible"
    )
    translations: Optional[dict[str, str]] = Field(
        None, description="Traductions multilingues: {\"en\": \"coil\", \"de\": \"Spule\", ...}"
    )
    @field_validator("source", "target")
    @classmethod
@@ -30,6 +33,7 @@ class GlossaryTermResponse(BaseModel):
    id: str
    source: str
    target: str
    translations: dict[str, str] = {}
    created_at: Optional[datetime] = None
    model_config = {"from_attributes": True}
@@ -39,6 +43,9 @@ class GlossaryCreate(BaseModel):
    """Schema for creating a glossary."""
    name: str = Field(..., min_length=1, max_length=255, description="Nom du glossaire")
    source_language: str = Field(
        default="fr", max_length=10, description="Langue source (ISO code)"
    )
    terms: list[GlossaryTermCreate] = Field(
        default_factory=list, description="Liste des termes"
    )
@@ -53,6 +60,7 @@ class GlossaryUpdate(BaseModel):
    """Schema for updating a glossary (all fields optional)."""
    name: Optional[str] = Field(None, min_length=1, max_length=255)
    source_language: Optional[str] = Field(None, max_length=10)
    terms: Optional[list[GlossaryTermCreate]] = Field(None)
    @field_validator("name")
@@ -66,6 +74,7 @@ class GlossaryResponse(BaseModel):
    id: str
    name: str
    source_language: str = "fr"
    terms: list[GlossaryTermResponse] = []
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None
@@ -78,6 +87,7 @@ class GlossaryListItem(BaseModel):
    id: str
    name: str
    source_language: str = "fr"
    terms_count: int = Field(
        default=0, description="Nombre de termes dans le glossaire"
    )
--- a/services/glossary_service.py
+++ b/services/glossary_service.py
@@ -15,17 +15,17 @@ from utils.exceptions import GlossaryNotFoundError
 logger = logging.getLogger(__name__)
-def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
+def get_glossary_terms(glossary_id: str, user_id: str) -> Dict[str, Any]:
    """
-    Retrieve glossary terms for a specific glossary owned by a user.
+    Retrieve glossary terms and metadata for a specific glossary owned by a user.
-    
+
    Args:
        glossary_id: UUID of the glossary
        user_id: UUID of the user (must own the glossary)
-    
+
    Returns:
-        List of dictionaries with 'source' and 'target' keys
+        Dict with 'source_language' and 'terms' (list of dicts with source, target, translations)
-        
+
    Raises:
        GlossaryNotFoundError: If glossary doesn't exist or doesn't belong to user
    """
@@ -36,28 +36,33 @@ def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
                .filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
                .first()
            )
-            
+
            if not glossary:
                raise GlossaryNotFoundError(
                    message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
                    details={"glossary_id": glossary_id}
                )
-            
+
            # Get all terms for this glossary
            terms = (
                session.query(GlossaryTerm)
                .filter(GlossaryTerm.glossary_id == glossary_id)
                .all()
            )
-            
+
-            # Format as list of dicts
+            result = [{
-            result = [{"source": term.source, "target": term.target} for term in terms]
+                "source": term.source,
-            
+                "target": term.target,
                "translations": term.translations or {}
            } for term in terms]
            logger.info(
                f"Retrieved {len(result)} terms from glossary {glossary_id} for user {user_id}"
            )
-            
+
-            return result
+            return {
                "source_language": glossary.source_language or "fr",
                "terms": result,
            }
    except GlossaryNotFoundError:
        raise
@@ -112,72 +117,101 @@ def validate_glossary_access(glossary_id: str, user_id: str) -> bool:
        )
def format_glossary_for_prompt(
    terms: List[dict],
    source_lang: str = "fr",
    target_lang: str = "en",
) -> str:
    """
    Format glossary terms for injection into an LLM system prompt.

    For each term the translation is chosen in this order:

    1. ``translations[target_lang]`` - matched exactly first, then
       case-insensitively, then by primary subtag (``"de-AT"`` can use a
       ``"de"`` entry).
    2. The legacy ``target`` field, which holds the English reference
       translation. It is emitted as an exact translation when the target
       language is English; otherwise it is tagged as an EN reference so the
       LLM adapts it to ``target_lang``.

    Terms without a usable source or without any usable translation are
    skipped. Null or non-string values (possible in JSON columns) are treated
    as empty instead of raising.

    Args:
        terms: List of dicts with 'source', 'target', and optional 'translations'
        source_lang: ISO code of the source language
        target_lang: ISO code of the target language

    Returns:
        Formatted string for LLM prompt, or "" when no term is usable
    """
    if not terms:
        return ""

    def _text(value) -> str:
        # JSON columns may hold null or non-string values; treat them as empty.
        return value.strip() if isinstance(value, str) else ""

    def _quoted(text: str) -> str:
        # Escape single quotes so the quoted term stays unambiguous.
        return text.replace("'", "\\'")

    wanted = _text(target_lang).lower().replace("_", "-")
    primary = wanted.split("-", 1)[0]
    # The legacy `target` field is an English reference, so it is already the
    # exact translation whenever the job translates into English.
    target_is_english = primary == "en"

    def _specific(term: dict) -> str:
        translations = term.get("translations")
        if not isinstance(translations, dict):
            return ""
        exact = _text(translations.get(target_lang))
        if exact:
            return exact
        by_code = {
            str(code).strip().lower().replace("_", "-"): _text(value)
            for code, value in translations.items()
        }
        return by_code.get(wanted) or by_code.get(primary) or ""

    # Sort terms by length (longest first) to avoid substring conflicts
    # e.g., "machine learning" should match before "machine"
    sorted_terms = sorted(
        (term for term in terms if isinstance(term, dict)),
        key=lambda term: len(_text(term.get("source"))),
        reverse=True,
    )

    entries = []
    has_fallback = False
    for term in sorted_terms:
        source = _text(term.get("source"))
        if not source:
            continue

        specific = _specific(term)
        translation = specific or _text(term.get("target"))
        if not translation:
            # Neither a specific nor a default translation: skip the term.
            continue

        line = f"- '{_quoted(source)}' → '{_quoted(translation)}'"
        if not specific and not target_is_english:
            line += f" (EN reference, adapt to {target_lang})"
            has_fallback = True
        entries.append(line)

    if not entries:
        return ""

    lines = [
        f"TERMINOLOGY GLOSSARY (translate from {source_lang} to {target_lang}):",
        "",
        *entries,
        "",
        "IMPORTANT: Always use these translations when the terms appear in the text.",
    ]

    if has_fallback:
        lines.append(
            "NOTE: Some entries show an English reference — translate to the correct "
            f"{target_lang} equivalent while preserving the intended meaning."
        )

    return "\n".join(lines)
 def build_full_prompt(
     custom_prompt: Optional[str],
     glossary_terms: Optional[List[dict]],
     source_lang: str = "fr",
     target_lang: str = "en",
 ) -> str:
     """
     Build the complete prompt combining custom prompt and glossary.

     Args:
         custom_prompt: Optional custom system prompt from user
         glossary_terms: Optional list of glossary terms (dicts with 'source',
             'target', and optional 'translations')
         source_lang: ISO code of the source language
         target_lang: ISO code of the target language

     Returns:
         Combined prompt string; "" when there is neither a prompt nor a
         usable glossary
     """
     parts = []

     # A whitespace-only prompt carries no instruction; skip it so the result
     # is not a blank "prompt" or a glossary preceded by stray blank lines.
     if custom_prompt and custom_prompt.strip():
         parts.append(custom_prompt)

     if glossary_terms:
         # Keyword arguments keep the two language codes from being swapped.
         glossary_prompt = format_glossary_for_prompt(
             glossary_terms,
             source_lang=source_lang,
             target_lang=target_lang,
         )
         if glossary_prompt:
             parts.append(glossary_prompt)

     # Joining an empty list already yields "".
     return "\n\n".join(parts)