Review notes for this amended patch (text before the first "diff --git" header is ignored by git apply):
* The patch was re-flowed to one diff line per line. Indentation and blank context
  lines are reconstructed from the hunk line counts; if context does not match the
  working tree, apply with `git apply --ignore-whitespace`.
* frontend types.ts: `translations` is typed as Record<string, string> (a bare
  `Record` is not valid TypeScript).
* services/glossary_service.py: per-language and default targets are read None-safely,
  and the default target is emitted verbatim (no "EN reference" hint) when the job
  targets English. The last hunk's new-side line count is updated accordingly.
* routes/glossary_routes.py: `_format_glossary` falls back to "fr" like the other readers.
* schemas/glossary_schemas.py: `source_language` rejects empty/1-char values (min_length=2).
* The `index` blob hashes of the amended files are unchanged from the original patch
  and therefore no longer match the amended post-image.

diff --git a/alembic/versions/d4a1f8e2b3c7_add_glossary_multilingual.py b/alembic/versions/d4a1f8e2b3c7_add_glossary_multilingual.py
new file mode 100644
index 0000000..497a3e4
--- /dev/null
+++ b/alembic/versions/d4a1f8e2b3c7_add_glossary_multilingual.py
@@ -0,0 +1,35 @@
+"""Add multilingual support to glossaries
+
+Revision ID: d4a1f8e2b3c7
+Revises: cb71a958ad92
+Create Date: 2026-05-16
+
+Adds source_language to glossaries and translations JSON to glossary_terms.
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers
+revision = "d4a1f8e2b3c7"
+down_revision = "cb71a958ad92"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    op.add_column(
+        "glossaries",
+        sa.Column("source_language", sa.String(10), nullable=False, server_default="fr"),
+    )
+    op.add_column(
+        "glossary_terms",
+        sa.Column("translations", sa.JSON, nullable=True),
+    )
+
+
+def downgrade() -> None:
+    op.drop_column("glossary_terms", "translations")
+    op.drop_column("glossaries", "source_language")
diff --git a/database/models.py b/database/models.py
index dbdd839..2f94eee 100644
--- a/database/models.py
+++ b/database/models.py
@@ -330,6 +330,7 @@ class Glossary(Base):
         String(36), ForeignKey("users.id", ondelete="CASCADE"), nullable=False
     )
     name = Column(String(255), nullable=False)
+    source_language = Column(String(10), nullable=False, default="fr")
     created_at = Column(DateTime, default=_utcnow)
     updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)
 
@@ -346,6 +347,7 @@ class Glossary(Base):
             "id": self.id,
             "user_id": self.user_id,
             "name": self.name,
+            "source_language": self.source_language,
             "terms": [term.to_dict() for term in self.terms] if self.terms else [],
             "created_at": self.created_at.isoformat() if self.created_at else None,
             "updated_at": self.updated_at.isoformat() if self.updated_at else None,
@@ -365,6 +367,7 @@ class GlossaryTerm(Base):
     )
     source = Column(String(500), nullable=False)
     target = Column(String(500), nullable=False)
+    translations = Column(JSON, nullable=True, default=dict)
     created_at = Column(DateTime, default=_utcnow)
 
     # Relationship
@@ -378,6 +381,7 @@ class GlossaryTerm(Base):
             "id": self.id,
             "source": self.source,
             "target": self.target,
+            "translations": self.translations or {},
             "created_at": self.created_at.isoformat() if self.created_at else None,
         }
 
diff --git a/frontend/src/app/dashboard/glossaries/EditGlossaryDialog.tsx b/frontend/src/app/dashboard/glossaries/EditGlossaryDialog.tsx
index 6da048e..fe5e811 100644
--- a/frontend/src/app/dashboard/glossaries/EditGlossaryDialog.tsx
+++ b/frontend/src/app/dashboard/glossaries/EditGlossaryDialog.tsx
@@ -71,6 +71,7 @@ export function EditGlossaryDialog({
           id: `temp-${i}`,
           source: t.source,
           target: t.target,
+          translations: t.translations || {},
           created_at: null,
         })),
       };
diff --git a/frontend/src/app/dashboard/glossaries/types.ts b/frontend/src/app/dashboard/glossaries/types.ts
index 85c7f6f..4813ffe 100644
--- a/frontend/src/app/dashboard/glossaries/types.ts
+++ b/frontend/src/app/dashboard/glossaries/types.ts
@@ -2,12 +2,14 @@ export interface GlossaryTerm {
   id: string;
   source: string;
   target: string;
+  translations: Record<string, string>;
   created_at: string | null;
 }
 
 export interface Glossary {
   id: string;
   name: string;
+  source_language: string;
   terms: GlossaryTerm[];
   created_at: string;
   updated_at: string;
@@ -16,6 +18,7 @@ export interface Glossary {
 export interface GlossaryListItem {
   id: string;
   name: string;
+  source_language: string;
   terms_count: number;
   created_at: string;
 }
@@ -48,6 +51,7 @@ export interface GlossaryUpdateResponse {
 export interface GlossaryTermInput {
   source: string;
   target: string;
+  translations?: Record<string, string>;
 }
 
 export interface GlossaryTermInputWithId extends GlossaryTermInput {
@@ -56,16 +60,34 @@ export interface GlossaryTermInputWithId extends GlossaryTermInput {
 
 export interface GlossaryCreateInput {
   name: string;
+  source_language?: string;
   terms?: GlossaryTermInput[];
 }
 
 export interface GlossaryUpdateInput {
   name?: string;
+  source_language?: string;
   terms?: GlossaryTermInput[];
 }
 
 export const MAX_TERMS_PER_GLOSSARY = 500;
 
+export const SUPPORTED_LANGUAGES: { code: string; label: string; flag: string }[] = [
+  { code: 'en', label: 'English', flag: '🇬🇧' },
+  { code: 'fr', label: 'Français', flag: '🇫🇷' },
+  { code: 'es', label: 'Español', flag: '🇪🇸' },
+  { code: 'de', label: 'Deutsch', flag: '🇩🇪' },
+  { code: 'pt', label: 'Português', flag: '🇧🇷' },
+  { code: 'it', label: 'Italiano', flag: '🇮🇹' },
+  { code: 'nl', label: 'Nederlands', flag: '🇳🇱' },
+  { code: 'ru', label: 'Русский', flag: '🇷🇺' },
+  { code: 'ja', label: '日本語', flag: '🇯🇵' },
+  { code: 'ko', label: '한국어', flag: '🇰🇷' },
+  { code: 'zh', label: '中文', flag: '🇨🇳' },
+  { code: 'ar', label: 'العربية', flag: '🇸🇦' },
+  { code: 'fa', label: 'فارسی', flag: '🇮🇷' },
+];
+
 // Generate unique IDs for React keys
 let idCounter = 0;
 export function generateTermId(): string {
diff --git a/routes/glossary_routes.py b/routes/glossary_routes.py
index 3fc32da..b1086f7 100644
--- a/routes/glossary_routes.py
+++ b/routes/glossary_routes.py
@@ -46,6 +46,7 @@ def _format_term(term: GlossaryTerm) -> dict:
         "id": term.id,
         "source": term.source,
         "target": term.target,
+        "translations": term.translations or {},
         "created_at": term.created_at.isoformat() if term.created_at else None,
     }
 
@@ -55,6 +56,7 @@ def _format_glossary(glossary: Glossary) -> dict:
     return {
         "id": glossary.id,
         "name": glossary.name,
+        "source_language": glossary.source_language or "fr",
         "terms": [_format_term(t) for t in glossary.terms] if glossary.terms else [],
         "created_at": glossary.created_at.isoformat() if glossary.created_at else None,
         "updated_at": glossary.updated_at.isoformat() if glossary.updated_at else None,
@@ -103,6 +105,7 @@ async def create_glossary(
         glossary = Glossary(
             user_id=user.id,
             name=body.name,
+            source_language=body.source_language,
             created_at=datetime.now(timezone.utc),
             updated_at=datetime.now(timezone.utc),
         )
@@ -112,6 +115,7 @@ async def create_glossary(
                 glossary=glossary,
                 source=term_data.source,
                 target=term_data.target,
+                translations=term_data.translations or {},
                 created_at=datetime.now(timezone.utc),
             )
             session.add(term)
@@ -180,6 +184,7 @@ async def list_glossaries(
             GlossaryListItem(
                 id=g.id,
                 name=g.name,
+                source_language=g.source_language or "fr",
                 terms_count=len(g.terms) if g.terms else 0,
                 created_at=g.created_at,
             )
@@ -331,6 +336,9 @@ async def update_glossary(
         if body.name is not None:
             glossary.name = body.name
 
+        if body.source_language is not None:
+            glossary.source_language = body.source_language
+
         if body.terms is not None:
             # Delete existing terms
             session.query(GlossaryTerm).filter(
@@ -343,6 +351,7 @@ async def update_glossary(
                 glossary_id=glossary.id,
                 source=term_data.source,
                 target=term_data.target,
+                translations=term_data.translations or {},
                 created_at=datetime.now(timezone.utc),
             )
             session.add(term)
diff --git a/routes/translate_routes.py b/routes/translate_routes.py
index 79d1105..43d5611 100644
--- a/routes/translate_routes.py
+++ b/routes/translate_routes.py
@@ -915,10 +915,13 @@ async def _run_translation_job(
 
         # Story 3.10: Retrieve and format glossary terms for LLM prompt
         glossary_terms = None
+        glossary_source_lang = "fr"
         if glossary_id and user_id:
             try:
-                glossary_terms = get_glossary_terms(glossary_id, user_id)
-                logger.info(f"Job {job_id}: Loaded {len(glossary_terms)} glossary terms")
+                glossary_data = get_glossary_terms(glossary_id, user_id)
+                glossary_terms = glossary_data["terms"]
+                glossary_source_lang = glossary_data.get("source_language", "fr")
+                logger.info(f"Job {job_id}: Loaded {len(glossary_terms)} glossary terms (source: {glossary_source_lang})")
             except GlossaryNotFoundError as e:
                 tracker.set_error(str(e))
                 logger.error(f"Job {job_id}: Glossary error - {e}")
@@ -940,7 +943,10 @@ async def _run_translation_job(
             effective_prompt = custom_prompt
 
         # Build the full prompt combining effective prompt and glossary
-        full_prompt = build_full_prompt(effective_prompt, glossary_terms)
+        full_prompt = build_full_prompt(
+            effective_prompt, glossary_terms,
+            source_lang=glossary_source_lang, target_lang=target_lang,
+        )
 
         translation_provider = None
         _p = provider.lower()
diff --git a/schemas/glossary_schemas.py b/schemas/glossary_schemas.py
index eeb579a..760c74d 100644
--- a/schemas/glossary_schemas.py
+++ b/schemas/glossary_schemas.py
@@ -17,6 +17,9 @@ class GlossaryTermCreate(BaseModel):
     target: str = Field(
         ..., min_length=1, max_length=500, description="Traduction cible"
     )
+    translations: Optional[dict[str, str]] = Field(
+        None, description="Traductions multilingues: {\"en\": \"coil\", \"de\": \"Spule\", ...}"
+    )
 
     @field_validator("source", "target")
     @classmethod
@@ -30,6 +33,7 @@ class GlossaryTermResponse(BaseModel):
     id: str
     source: str
     target: str
+    translations: dict[str, str] = {}
     created_at: Optional[datetime] = None
 
     model_config = {"from_attributes": True}
@@ -39,6 +43,9 @@ class GlossaryCreate(BaseModel):
     """Schema for creating a glossary."""
 
     name: str = Field(..., min_length=1, max_length=255, description="Nom du glossaire")
+    source_language: str = Field(
+        default="fr", min_length=2, max_length=10, description="Langue source (ISO code)"
+    )
     terms: list[GlossaryTermCreate] = Field(
         default_factory=list, description="Liste des termes"
     )
@@ -53,6 +60,7 @@ class GlossaryUpdate(BaseModel):
     """Schema for updating a glossary (all fields optional)."""
 
     name: Optional[str] = Field(None, min_length=1, max_length=255)
+    source_language: Optional[str] = Field(None, min_length=2, max_length=10)
     terms: Optional[list[GlossaryTermCreate]] = Field(None)
 
     @field_validator("name")
@@ -66,6 +74,7 @@ class GlossaryResponse(BaseModel):
 
     id: str
     name: str
+    source_language: str = "fr"
     terms: list[GlossaryTermResponse] = []
     created_at: Optional[datetime] = None
     updated_at: Optional[datetime] = None
@@ -78,6 +87,7 @@ class GlossaryListItem(BaseModel):
 
     id: str
     name: str
+    source_language: str = "fr"
     terms_count: int = Field(
         default=0, description="Nombre de termes dans le glossaire"
     )
diff --git a/services/glossary_service.py b/services/glossary_service.py
index d8547cf..544cff5 100644
--- a/services/glossary_service.py
+++ b/services/glossary_service.py
@@ -15,17 +15,17 @@ from utils.exceptions import GlossaryNotFoundError
 logger = logging.getLogger(__name__)
 
 
-def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
+def get_glossary_terms(glossary_id: str, user_id: str) -> Dict[str, Any]:
     """
-    Retrieve glossary terms for a specific glossary owned by a user.
-    
+    Retrieve glossary terms and metadata for a specific glossary owned by a user.
+
     Args:
         glossary_id: UUID of the glossary
         user_id: UUID of the user (must own the glossary)
-    
+
     Returns:
-        List of dictionaries with 'source' and 'target' keys
-    
+        Dict with 'source_language' and 'terms' (list of dicts with source, target, translations)
+
     Raises:
         GlossaryNotFoundError: If glossary doesn't exist or doesn't belong to user
     """
@@ -36,28 +36,33 @@ def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
                 .filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
                 .first()
             )
-            
+
             if not glossary:
                 raise GlossaryNotFoundError(
                     message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
                     details={"glossary_id": glossary_id}
                 )
-            
-            # Get all terms for this glossary
+
             terms = (
                 session.query(GlossaryTerm)
                 .filter(GlossaryTerm.glossary_id == glossary_id)
                 .all()
             )
-            
-            # Format as list of dicts
-            result = [{"source": term.source, "target": term.target} for term in terms]
-            
+
+            result = [{
+                "source": term.source,
+                "target": term.target,
+                "translations": term.translations or {}
+            } for term in terms]
+
             logger.info(
                 f"Retrieved {len(result)} terms from glossary {glossary_id} for user {user_id}"
             )
-            
-            return result
+
+            return {
+                "source_language": glossary.source_language or "fr",
+                "terms": result,
+            }
 
     except GlossaryNotFoundError:
         raise
@@ -112,72 +117,110 @@ def validate_glossary_access(glossary_id: str, user_id: str) -> bool:
         )
 
 
-def format_glossary_for_prompt(terms: List[Dict[str, str]]) -> str:
+def format_glossary_for_prompt(
+    terms: List[Dict[str, str]],
+    source_lang: str = "fr",
+    target_lang: str = "en",
+) -> str:
     """
     Format glossary terms for injection into an LLM system prompt.
-    
-    The format is designed to be clear and unambiguous for LLMs:
-    - Clear header explaining the purpose
-    - Simple source → target format
-    - Explicit instruction to use these translations
-    
+
+    When a term has a translation for target_lang in its translations dict,
+    that specific translation is used. Otherwise, falls back to the default
+    target field (backward compat), which is treated as an English reference:
+    it is emitted verbatim when target_lang is "en", and otherwise annotated so
+    the LLM derives the correct target_lang equivalent.
+
     Args:
-        terms: List of dictionaries with 'source' and 'target' keys
-    
+        terms: List of dicts with 'source', 'target', and optional 'translations'
+        source_lang: ISO code of the source language
+        target_lang: ISO code of the target language
+
     Returns:
         Formatted string for LLM prompt
     """
     if not terms:
         return ""
-    
-    # Sort terms by length (longest first) to avoid substring conflicts
-    # e.g., "machine learning" should match before "machine"
+
     sorted_terms = sorted(terms, key=lambda t: len(t.get("source", "")), reverse=True)
-    
+
     lines = [
-        "TERMINOLOGY GLOSSARY (use these exact translations):",
+        f"TERMINOLOGY GLOSSARY (translate from {source_lang} to {target_lang}):",
         ""
     ]
-    
+
+    has_fallback = False
+    # The default `target` field is treated as the English reference, so it
+    # only needs the "adapt" hint when the job targets another language.
+    target_is_english = target_lang.strip().lower() == "en"
     for term in sorted_terms:
         source = term.get("source", "").strip()
-        target = term.get("target", "").strip()
-        if source and target:
-            # Escape single quotes in terms for clarity
+        if not source:
+            continue
+
+        translations = term.get("translations", {}) or {}
+        specific = (translations.get(target_lang) or "").strip()
+        default_target = (term.get("target") or "").strip()
+
+        if specific:
             source_escaped = source.replace("'", "\\'")
-            target_escaped = target.replace("'", "\\'")
+            target_escaped = specific.replace("'", "\\'")
             lines.append(f"- '{source_escaped}' → '{target_escaped}'")
-    
+        elif default_target:
+            source_escaped = source.replace("'", "\\'")
+            target_escaped = default_target.replace("'", "\\'")
+            if target_is_english:
+                # The English reference is already the exact translation for
+                # an English job; annotating it would only confuse the LLM.
+                lines.append(f"- '{source_escaped}' → '{target_escaped}'")
+            else:
+                lines.append(f"- '{source_escaped}' → '{target_escaped}' (EN reference, adapt to {target_lang})")
+                has_fallback = True
+        # If neither specific nor default, skip the term
+
+    if not any(line.startswith("- ") for line in lines):
+        return ""
+
     lines.extend([
         "",
         "IMPORTANT: Always use these translations when the terms appear in the text."
     ])
-    
+
+    if has_fallback:
+        lines.append(
+            "NOTE: Some entries show an English reference — translate to the correct "
+            f"{target_lang} equivalent while preserving the intended meaning."
+        )
+
     return "\n".join(lines)
 
 
 def build_full_prompt(
     custom_prompt: Optional[str],
-    glossary_terms: Optional[List[Dict[str, str]]]
+    glossary_terms: Optional[List[Dict[str, str]]],
+    source_lang: str = "fr",
+    target_lang: str = "en",
 ) -> str:
     """
     Build the complete prompt combining custom prompt and glossary.
-    
+
     Args:
         custom_prompt: Optional custom system prompt from user
         glossary_terms: Optional list of glossary terms
-    
+        source_lang: ISO code of the source language
+        target_lang: ISO code of the target language
+
     Returns:
         Combined prompt string
     """
     parts = []
-    
+
     if custom_prompt:
         parts.append(custom_prompt)
-    
+
     if glossary_terms:
-        glossary_prompt = format_glossary_for_prompt(glossary_terms)
+        glossary_prompt = format_glossary_for_prompt(glossary_terms, source_lang, target_lang)
         if glossary_prompt:
             parts.append(glossary_prompt)
-    
+
     return "\n\n".join(parts) if parts else ""
\ No newline at end of file