feat: add multilingual glossary support (backend + frontend types)
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 1m31s

Backend:
- Add source_language column to glossaries table
- Add translations JSON column to glossary_terms table
- Alembic migration for schema changes
- format_glossary_for_prompt now language-aware: extracts correct
  translation per target language, falls back to EN reference for
  templates with only FR→EN data
- CRUD routes accept/return source_language and translations
- Pydantic schemas updated

Frontend:
- Types updated: GlossaryTerm now has translations: Record<string, string>
- Glossary/GlossaryListItem now have source_language
- Added SUPPORTED_LANGUAGES constant (13 languages)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-16 15:25:28 +02:00
parent a76f7710e8
commit b2d918c832
8 changed files with 167 additions and 46 deletions

View File

@@ -0,0 +1,35 @@
"""Add multilingual support to glossaries
Revision ID: d4a1f8e2b3c7
Revises: cb71a958ad92
Create Date: 2026-05-16
Adds source_language to glossaries and translations JSON to glossary_terms.
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers
revision = "d4a1f8e2b3c7"
down_revision = "cb71a958ad92"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Add the multilingual columns to the glossary tables.

    ``glossaries`` gains a required ``source_language`` code and
    ``glossary_terms`` gains an optional ``translations`` JSON document.
    """
    # NOT NULL with a server-side default of "fr", so the column can be added
    # to a table that already contains rows.
    source_language_column = sa.Column(
        "source_language",
        sa.String(10),
        nullable=False,
        server_default="fr",
    )
    # Nullable: terms created before this revision carry no translations.
    translations_column = sa.Column("translations", sa.JSON, nullable=True)

    op.add_column("glossaries", source_language_column)
    op.add_column("glossary_terms", translations_column)
def downgrade() -> None:
    """Remove the columns added by ``upgrade``, in reverse order."""
    added_columns = (
        ("glossary_terms", "translations"),
        ("glossaries", "source_language"),
    )
    for table_name, column_name in added_columns:
        op.drop_column(table_name, column_name)

View File

@@ -330,6 +330,7 @@ class Glossary(Base):
String(36), ForeignKey("users.id", ondelete="CASCADE"), nullable=False String(36), ForeignKey("users.id", ondelete="CASCADE"), nullable=False
) )
name = Column(String(255), nullable=False) name = Column(String(255), nullable=False)
source_language = Column(String(10), nullable=False, default="fr")
created_at = Column(DateTime, default=_utcnow) created_at = Column(DateTime, default=_utcnow)
updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow) updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)
@@ -346,6 +347,7 @@ class Glossary(Base):
"id": self.id, "id": self.id,
"user_id": self.user_id, "user_id": self.user_id,
"name": self.name, "name": self.name,
"source_language": self.source_language,
"terms": [term.to_dict() for term in self.terms] if self.terms else [], "terms": [term.to_dict() for term in self.terms] if self.terms else [],
"created_at": self.created_at.isoformat() if self.created_at else None, "created_at": self.created_at.isoformat() if self.created_at else None,
"updated_at": self.updated_at.isoformat() if self.updated_at else None, "updated_at": self.updated_at.isoformat() if self.updated_at else None,
@@ -365,6 +367,7 @@ class GlossaryTerm(Base):
) )
source = Column(String(500), nullable=False) source = Column(String(500), nullable=False)
target = Column(String(500), nullable=False) target = Column(String(500), nullable=False)
translations = Column(JSON, nullable=True, default=dict)
created_at = Column(DateTime, default=_utcnow) created_at = Column(DateTime, default=_utcnow)
# Relationship # Relationship
@@ -378,6 +381,7 @@ class GlossaryTerm(Base):
"id": self.id, "id": self.id,
"source": self.source, "source": self.source,
"target": self.target, "target": self.target,
"translations": self.translations or {},
"created_at": self.created_at.isoformat() if self.created_at else None, "created_at": self.created_at.isoformat() if self.created_at else None,
} }

View File

@@ -71,6 +71,7 @@ export function EditGlossaryDialog({
id: `temp-${i}`, id: `temp-${i}`,
source: t.source, source: t.source,
target: t.target, target: t.target,
translations: t.translations || {},
created_at: null, created_at: null,
})), })),
}; };

View File

@@ -2,12 +2,14 @@ export interface GlossaryTerm {
id: string; id: string;
source: string; source: string;
target: string; target: string;
translations: Record<string, string>;
created_at: string | null; created_at: string | null;
} }
export interface Glossary { export interface Glossary {
id: string; id: string;
name: string; name: string;
source_language: string;
terms: GlossaryTerm[]; terms: GlossaryTerm[];
created_at: string; created_at: string;
updated_at: string; updated_at: string;
@@ -16,6 +18,7 @@ export interface Glossary {
export interface GlossaryListItem { export interface GlossaryListItem {
id: string; id: string;
name: string; name: string;
source_language: string;
terms_count: number; terms_count: number;
created_at: string; created_at: string;
} }
@@ -48,6 +51,7 @@ export interface GlossaryUpdateResponse {
export interface GlossaryTermInput { export interface GlossaryTermInput {
source: string; source: string;
target: string; target: string;
translations?: Record<string, string>;
} }
export interface GlossaryTermInputWithId extends GlossaryTermInput { export interface GlossaryTermInputWithId extends GlossaryTermInput {
@@ -56,16 +60,34 @@ export interface GlossaryTermInputWithId extends GlossaryTermInput {
export interface GlossaryCreateInput { export interface GlossaryCreateInput {
name: string; name: string;
source_language?: string;
terms?: GlossaryTermInput[]; terms?: GlossaryTermInput[];
} }
export interface GlossaryUpdateInput { export interface GlossaryUpdateInput {
name?: string; name?: string;
source_language?: string;
terms?: GlossaryTermInput[]; terms?: GlossaryTermInput[];
} }
export const MAX_TERMS_PER_GLOSSARY = 500; export const MAX_TERMS_PER_GLOSSARY = 500;
/**
 * Languages known to the glossary types (13 entries).
 *
 * - `code`:  short lowercase language code ('en', 'fr', ...).
 * - `label`: the language's name written in that language.
 * - `flag`:  a representative flag emoji; it stands for the language, not a
 *            specific locale (e.g. 'pt' is shown with the Brazilian flag).
 *
 * NOTE(review): presumably these codes are the keys used in
 * `GlossaryTerm.translations` and the values of `source_language` — confirm
 * against the backend before relying on it.
 */
export const SUPPORTED_LANGUAGES: { code: string; label: string; flag: string }[] = [
{ code: 'en', label: 'English', flag: '🇬🇧' },
{ code: 'fr', label: 'Français', flag: '🇫🇷' },
{ code: 'es', label: 'Español', flag: '🇪🇸' },
{ code: 'de', label: 'Deutsch', flag: '🇩🇪' },
{ code: 'pt', label: 'Português', flag: '🇧🇷' },
{ code: 'it', label: 'Italiano', flag: '🇮🇹' },
{ code: 'nl', label: 'Nederlands', flag: '🇳🇱' },
{ code: 'ru', label: 'Русский', flag: '🇷🇺' },
{ code: 'ja', label: '日本語', flag: '🇯🇵' },
{ code: 'ko', label: '한국어', flag: '🇰🇷' },
{ code: 'zh', label: '中文', flag: '🇨🇳' },
{ code: 'ar', label: 'العربية', flag: '🇸🇦' },
{ code: 'fa', label: 'فارسی', flag: '🇮🇷' },
];
// Generate unique IDs for React keys // Generate unique IDs for React keys
let idCounter = 0; let idCounter = 0;
export function generateTermId(): string { export function generateTermId(): string {

View File

@@ -46,6 +46,7 @@ def _format_term(term: GlossaryTerm) -> dict:
"id": term.id, "id": term.id,
"source": term.source, "source": term.source,
"target": term.target, "target": term.target,
"translations": term.translations or {},
"created_at": term.created_at.isoformat() if term.created_at else None, "created_at": term.created_at.isoformat() if term.created_at else None,
} }
@@ -55,6 +56,7 @@ def _format_glossary(glossary: Glossary) -> dict:
return { return {
"id": glossary.id, "id": glossary.id,
"name": glossary.name, "name": glossary.name,
"source_language": glossary.source_language,
"terms": [_format_term(t) for t in glossary.terms] if glossary.terms else [], "terms": [_format_term(t) for t in glossary.terms] if glossary.terms else [],
"created_at": glossary.created_at.isoformat() if glossary.created_at else None, "created_at": glossary.created_at.isoformat() if glossary.created_at else None,
"updated_at": glossary.updated_at.isoformat() if glossary.updated_at else None, "updated_at": glossary.updated_at.isoformat() if glossary.updated_at else None,
@@ -103,6 +105,7 @@ async def create_glossary(
glossary = Glossary( glossary = Glossary(
user_id=user.id, user_id=user.id,
name=body.name, name=body.name,
source_language=body.source_language,
created_at=datetime.now(timezone.utc), created_at=datetime.now(timezone.utc),
updated_at=datetime.now(timezone.utc), updated_at=datetime.now(timezone.utc),
) )
@@ -112,6 +115,7 @@ async def create_glossary(
glossary=glossary, glossary=glossary,
source=term_data.source, source=term_data.source,
target=term_data.target, target=term_data.target,
translations=term_data.translations or {},
created_at=datetime.now(timezone.utc), created_at=datetime.now(timezone.utc),
) )
session.add(term) session.add(term)
@@ -180,6 +184,7 @@ async def list_glossaries(
GlossaryListItem( GlossaryListItem(
id=g.id, id=g.id,
name=g.name, name=g.name,
source_language=g.source_language or "fr",
terms_count=len(g.terms) if g.terms else 0, terms_count=len(g.terms) if g.terms else 0,
created_at=g.created_at, created_at=g.created_at,
) )
@@ -331,6 +336,9 @@ async def update_glossary(
if body.name is not None: if body.name is not None:
glossary.name = body.name glossary.name = body.name
if body.source_language is not None:
glossary.source_language = body.source_language
if body.terms is not None: if body.terms is not None:
# Delete existing terms # Delete existing terms
session.query(GlossaryTerm).filter( session.query(GlossaryTerm).filter(
@@ -343,6 +351,7 @@ async def update_glossary(
glossary_id=glossary.id, glossary_id=glossary.id,
source=term_data.source, source=term_data.source,
target=term_data.target, target=term_data.target,
translations=term_data.translations or {},
created_at=datetime.now(timezone.utc), created_at=datetime.now(timezone.utc),
) )
session.add(term) session.add(term)

View File

@@ -915,10 +915,13 @@ async def _run_translation_job(
# Story 3.10: Retrieve and format glossary terms for LLM prompt # Story 3.10: Retrieve and format glossary terms for LLM prompt
glossary_terms = None glossary_terms = None
glossary_source_lang = "fr"
if glossary_id and user_id: if glossary_id and user_id:
try: try:
glossary_terms = get_glossary_terms(glossary_id, user_id) glossary_data = get_glossary_terms(glossary_id, user_id)
logger.info(f"Job {job_id}: Loaded {len(glossary_terms)} glossary terms") glossary_terms = glossary_data["terms"]
glossary_source_lang = glossary_data.get("source_language", "fr")
logger.info(f"Job {job_id}: Loaded {len(glossary_terms)} glossary terms (source: {glossary_source_lang})")
except GlossaryNotFoundError as e: except GlossaryNotFoundError as e:
tracker.set_error(str(e)) tracker.set_error(str(e))
logger.error(f"Job {job_id}: Glossary error - {e}") logger.error(f"Job {job_id}: Glossary error - {e}")
@@ -940,7 +943,10 @@ async def _run_translation_job(
effective_prompt = custom_prompt effective_prompt = custom_prompt
# Build the full prompt combining effective prompt and glossary # Build the full prompt combining effective prompt and glossary
full_prompt = build_full_prompt(effective_prompt, glossary_terms) full_prompt = build_full_prompt(
effective_prompt, glossary_terms,
source_lang=glossary_source_lang, target_lang=target_lang,
)
translation_provider = None translation_provider = None
_p = provider.lower() _p = provider.lower()

View File

@@ -17,6 +17,9 @@ class GlossaryTermCreate(BaseModel):
target: str = Field( target: str = Field(
..., min_length=1, max_length=500, description="Traduction cible" ..., min_length=1, max_length=500, description="Traduction cible"
) )
translations: Optional[dict[str, str]] = Field(
None, description="Traductions multilingues: {\"en\": \"coil\", \"de\": \"Spule\", ...}"
)
@field_validator("source", "target") @field_validator("source", "target")
@classmethod @classmethod
@@ -30,6 +33,7 @@ class GlossaryTermResponse(BaseModel):
id: str id: str
source: str source: str
target: str target: str
translations: dict[str, str] = {}
created_at: Optional[datetime] = None created_at: Optional[datetime] = None
model_config = {"from_attributes": True} model_config = {"from_attributes": True}
@@ -39,6 +43,9 @@ class GlossaryCreate(BaseModel):
"""Schema for creating a glossary.""" """Schema for creating a glossary."""
name: str = Field(..., min_length=1, max_length=255, description="Nom du glossaire") name: str = Field(..., min_length=1, max_length=255, description="Nom du glossaire")
source_language: str = Field(
default="fr", max_length=10, description="Langue source (ISO code)"
)
terms: list[GlossaryTermCreate] = Field( terms: list[GlossaryTermCreate] = Field(
default_factory=list, description="Liste des termes" default_factory=list, description="Liste des termes"
) )
@@ -53,6 +60,7 @@ class GlossaryUpdate(BaseModel):
"""Schema for updating a glossary (all fields optional).""" """Schema for updating a glossary (all fields optional)."""
name: Optional[str] = Field(None, min_length=1, max_length=255) name: Optional[str] = Field(None, min_length=1, max_length=255)
source_language: Optional[str] = Field(None, max_length=10)
terms: Optional[list[GlossaryTermCreate]] = Field(None) terms: Optional[list[GlossaryTermCreate]] = Field(None)
@field_validator("name") @field_validator("name")
@@ -66,6 +74,7 @@ class GlossaryResponse(BaseModel):
id: str id: str
name: str name: str
source_language: str = "fr"
terms: list[GlossaryTermResponse] = [] terms: list[GlossaryTermResponse] = []
created_at: Optional[datetime] = None created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None updated_at: Optional[datetime] = None
@@ -78,6 +87,7 @@ class GlossaryListItem(BaseModel):
id: str id: str
name: str name: str
source_language: str = "fr"
terms_count: int = Field( terms_count: int = Field(
default=0, description="Nombre de termes dans le glossaire" default=0, description="Nombre de termes dans le glossaire"
) )

View File

@@ -15,17 +15,17 @@ from utils.exceptions import GlossaryNotFoundError
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]: def get_glossary_terms(glossary_id: str, user_id: str) -> Dict[str, Any]:
""" """
Retrieve glossary terms for a specific glossary owned by a user. Retrieve glossary terms and metadata for a specific glossary owned by a user.
Args: Args:
glossary_id: UUID of the glossary glossary_id: UUID of the glossary
user_id: UUID of the user (must own the glossary) user_id: UUID of the user (must own the glossary)
Returns: Returns:
List of dictionaries with 'source' and 'target' keys Dict with 'source_language' and 'terms' (list of dicts with source, target, translations)
Raises: Raises:
GlossaryNotFoundError: If glossary doesn't exist or doesn't belong to user GlossaryNotFoundError: If glossary doesn't exist or doesn't belong to user
""" """
@@ -36,28 +36,33 @@ def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
.filter(Glossary.id == glossary_id, Glossary.user_id == user_id) .filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
.first() .first()
) )
if not glossary: if not glossary:
raise GlossaryNotFoundError( raise GlossaryNotFoundError(
message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.", message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
details={"glossary_id": glossary_id} details={"glossary_id": glossary_id}
) )
# Get all terms for this glossary
terms = ( terms = (
session.query(GlossaryTerm) session.query(GlossaryTerm)
.filter(GlossaryTerm.glossary_id == glossary_id) .filter(GlossaryTerm.glossary_id == glossary_id)
.all() .all()
) )
# Format as list of dicts result = [{
result = [{"source": term.source, "target": term.target} for term in terms] "source": term.source,
"target": term.target,
"translations": term.translations or {}
} for term in terms]
logger.info( logger.info(
f"Retrieved {len(result)} terms from glossary {glossary_id} for user {user_id}" f"Retrieved {len(result)} terms from glossary {glossary_id} for user {user_id}"
) )
return result return {
"source_language": glossary.source_language or "fr",
"terms": result,
}
except GlossaryNotFoundError: except GlossaryNotFoundError:
raise raise
@@ -112,72 +117,101 @@ def validate_glossary_access(glossary_id: str, user_id: str) -> bool:
) )
def _glossary_text(value: Any) -> str:
    """Return ``value`` stripped if it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ""


def _normalize_lang(code: Any) -> str:
    """Normalize a language code for comparison ('pt_BR' -> 'pt-br')."""
    return _glossary_text(code).lower().replace("_", "-")


def _translation_for(translations: Any, target_lang: str) -> str:
    """Return the translation stored for ``target_lang``, or '' if there is none.

    An exact key match wins; otherwise keys are compared case-insensitively
    with '_' and '-' treated alike. Non-dict containers and non-string values
    are treated as "no translation" instead of raising.
    """
    if not isinstance(translations, dict):
        return ""

    exact = _glossary_text(translations.get(target_lang))
    if exact:
        return exact

    wanted = _normalize_lang(target_lang)
    if not wanted:
        return ""
    for code, text in translations.items():
        if _normalize_lang(code) == wanted:
            candidate = _glossary_text(text)
            if candidate:
                return candidate
    return ""


def format_glossary_for_prompt(
    terms: List[Dict[str, Any]],
    source_lang: str = "fr",
    target_lang: str = "en",
) -> str:
    """
    Format glossary terms for injection into an LLM system prompt.

    For each term, the entry of its ``translations`` dict matching
    ``target_lang`` is used when present. Otherwise the legacy ``target``
    field is used: as an exact translation when the target language is
    English, and as an annotated English reference that the LLM must adapt
    for any other target language.

    Args:
        terms: List of dicts with 'source', 'target', and optional 'translations'
        source_lang: ISO code of the source language
        target_lang: ISO code of the target language

    Returns:
        Formatted string for LLM prompt, or "" when no usable term exists
    """
    if not terms:
        return ""

    # The legacy 'target' field is treated as the English reference (this is
    # what the fallback annotation below states). When English is what was
    # requested it is an exact translation, not something to adapt.
    target_is_english = _normalize_lang(target_lang).split("-")[0] == "en"

    # Longest sources first so that e.g. "machine learning" is listed before
    # "machine" and substring terms do not shadow the longer expression.
    ordered_terms = sorted(
        terms,
        key=lambda t: len(_glossary_text(t.get("source"))),
        reverse=True,
    )

    entries: List[str] = []
    has_fallback = False
    for term in ordered_terms:
        source = _glossary_text(term.get("source"))
        if not source:
            continue

        specific = _translation_for(term.get("translations"), target_lang)
        chosen = specific or _glossary_text(term.get("target"))
        if not chosen:
            # Neither a specific translation nor a default target: skip.
            continue

        # Escape single quotes so the quoted terms stay unambiguous.
        source_escaped = source.replace("'", "\\'")
        target_escaped = chosen.replace("'", "\\'")
        entry = f"- '{source_escaped}' → '{target_escaped}'"

        if not specific and not target_is_english:
            entry += f" (EN reference, adapt to {target_lang})"
            has_fallback = True
        entries.append(entry)

    if not entries:
        return ""

    lines = [
        f"TERMINOLOGY GLOSSARY (translate from {source_lang} to {target_lang}):",
        "",
        *entries,
        "",
        "IMPORTANT: Always use these translations when the terms appear in the text.",
    ]

    if has_fallback:
        lines.append(
            "NOTE: Some entries show an English reference — translate to the correct "
            f"{target_lang} equivalent while preserving the intended meaning."
        )

    return "\n".join(lines)
def build_full_prompt(
    custom_prompt: Optional[str],
    glossary_terms: Optional[List[Dict[str, str]]],
    source_lang: str = "fr",
    target_lang: str = "en",
) -> str:
    """
    Assemble the final prompt from the custom prompt and the glossary section.

    Args:
        custom_prompt: Optional custom system prompt from user
        glossary_terms: Optional list of glossary terms
        source_lang: ISO code of the source language
        target_lang: ISO code of the target language

    Returns:
        The non-empty sections separated by a blank line, or "" if there are none
    """
    # The glossary section is only rendered when there are terms to render.
    glossary_section = (
        format_glossary_for_prompt(glossary_terms, source_lang, target_lang)
        if glossary_terms
        else ""
    )

    sections = [section for section in (custom_prompt, glossary_section) if section]
    return "\n\n".join(sections)