feat: add multilingual glossary support (backend + frontend types)
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 1m31s
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 1m31s
Backend: - Add source_language column to glossaries table - Add translations JSON column to glossary_terms table - Alembic migration for schema changes - format_glossary_for_prompt now language-aware: extracts correct translation per target language, falls back to EN reference for templates with only FR→EN data - CRUD routes accept/return source_language and translations - Pydantic schemas updated Frontend: - Types updated: GlossaryTerm now has translations: Record<string, string> - Glossary/GlossaryListItem now have source_language - Added SUPPORTED_LANGUAGES constant (13 languages) Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
35
alembic/versions/d4a1f8e2b3c7_add_glossary_multilingual.py
Normal file
35
alembic/versions/d4a1f8e2b3c7_add_glossary_multilingual.py
Normal file
@@ -0,0 +1,35 @@
|
||||
"""Add multilingual support to glossaries
|
||||
|
||||
Revision ID: d4a1f8e2b3c7
|
||||
Revises: cb71a958ad92
|
||||
Create Date: 2026-05-16
|
||||
|
||||
Adds source_language to glossaries and translations JSON to glossary_terms.
|
||||
"""
|
||||
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers
|
||||
revision = "d4a1f8e2b3c7"
|
||||
down_revision = "cb71a958ad92"
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Apply the multilingual glossary schema changes.

    Adds ``glossaries.source_language`` (NOT NULL, server default ``"fr"``)
    and ``glossary_terms.translations`` (nullable JSON).
    """
    # The column is NOT NULL, so a server-side default is declared alongside it;
    # "fr" is the value the database will supply when none is given.
    source_language_column = sa.Column(
        "source_language",
        sa.String(10),
        nullable=False,
        server_default="fr",
    )
    op.add_column("glossaries", source_language_column)

    # Nullable: terms are allowed to carry no per-language translations.
    translations_column = sa.Column("translations", sa.JSON, nullable=True)
    op.add_column("glossary_terms", translations_column)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Revert the migration by dropping both columns it added.

    Columns are removed in the reverse order of their creation in ``upgrade``.
    """
    added_columns = (
        ("glossary_terms", "translations"),
        ("glossaries", "source_language"),
    )
    for table_name, column_name in added_columns:
        op.drop_column(table_name, column_name)
|
||||
@@ -330,6 +330,7 @@ class Glossary(Base):
|
||||
String(36), ForeignKey("users.id", ondelete="CASCADE"), nullable=False
|
||||
)
|
||||
name = Column(String(255), nullable=False)
|
||||
source_language = Column(String(10), nullable=False, default="fr")
|
||||
created_at = Column(DateTime, default=_utcnow)
|
||||
updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)
|
||||
|
||||
@@ -346,6 +347,7 @@ class Glossary(Base):
|
||||
"id": self.id,
|
||||
"user_id": self.user_id,
|
||||
"name": self.name,
|
||||
"source_language": self.source_language,
|
||||
"terms": [term.to_dict() for term in self.terms] if self.terms else [],
|
||||
"created_at": self.created_at.isoformat() if self.created_at else None,
|
||||
"updated_at": self.updated_at.isoformat() if self.updated_at else None,
|
||||
@@ -365,6 +367,7 @@ class GlossaryTerm(Base):
|
||||
)
|
||||
source = Column(String(500), nullable=False)
|
||||
target = Column(String(500), nullable=False)
|
||||
translations = Column(JSON, nullable=True, default=dict)
|
||||
created_at = Column(DateTime, default=_utcnow)
|
||||
|
||||
# Relationship
|
||||
@@ -378,6 +381,7 @@ class GlossaryTerm(Base):
|
||||
"id": self.id,
|
||||
"source": self.source,
|
||||
"target": self.target,
|
||||
"translations": self.translations or {},
|
||||
"created_at": self.created_at.isoformat() if self.created_at else None,
|
||||
}
|
||||
|
||||
|
||||
@@ -71,6 +71,7 @@ export function EditGlossaryDialog({
|
||||
id: `temp-${i}`,
|
||||
source: t.source,
|
||||
target: t.target,
|
||||
translations: t.translations || {},
|
||||
created_at: null,
|
||||
})),
|
||||
};
|
||||
|
||||
@@ -2,12 +2,14 @@ export interface GlossaryTerm {
|
||||
id: string;
|
||||
source: string;
|
||||
target: string;
|
||||
translations: Record<string, string>;
|
||||
created_at: string | null;
|
||||
}
|
||||
|
||||
export interface Glossary {
|
||||
id: string;
|
||||
name: string;
|
||||
source_language: string;
|
||||
terms: GlossaryTerm[];
|
||||
created_at: string;
|
||||
updated_at: string;
|
||||
@@ -16,6 +18,7 @@ export interface Glossary {
|
||||
/** Summary of a glossary as returned by the glossary list endpoint. */
export interface GlossaryListItem {
  /** Glossary identifier. */
  id: string;
  /** Display name of the glossary. */
  name: string;
  /** Language code of the glossary's source terms (e.g. "fr"). */
  source_language: string;
  /** Number of terms contained in the glossary. */
  terms_count: number;
  /** Creation timestamp, serialized as a string. */
  created_at: string;
}
|
||||
@@ -48,6 +51,7 @@ export interface GlossaryUpdateResponse {
|
||||
/** Payload describing one glossary term when creating or updating a glossary. */
export interface GlossaryTermInput {
  /** Term in the glossary's source language. */
  source: string;
  /** Default target translation of the term. */
  target: string;
  /** Optional per-language translations keyed by language code, e.g. { en: "coil", de: "Spule" }. */
  translations?: Record<string, string>;
}
|
||||
|
||||
export interface GlossaryTermInputWithId extends GlossaryTermInput {
|
||||
@@ -56,16 +60,34 @@ export interface GlossaryTermInputWithId extends GlossaryTermInput {
|
||||
|
||||
/** Request body for creating a glossary. */
export interface GlossaryCreateInput {
  /** Display name of the new glossary. */
  name: string;
  /** Source language code; optional on the client. */
  source_language?: string;
  /** Initial terms; may be omitted to create an empty glossary. */
  terms?: GlossaryTermInput[];
}
|
||||
|
||||
/** Request body for updating a glossary; every field is optional. */
export interface GlossaryUpdateInput {
  /** New display name, when it should change. */
  name?: string;
  /** New source language code, when it should change. */
  source_language?: string;
  /** Replacement list of terms, when the terms should change. */
  terms?: GlossaryTermInput[];
}
|
||||
|
||||
export const MAX_TERMS_PER_GLOSSARY = 500;
|
||||
|
||||
/**
 * Languages offered by the glossary UI.
 *
 * Each entry carries the language code, the language's own name as the
 * display label, and an emoji flag.
 */
export const SUPPORTED_LANGUAGES: Array<{ code: string; label: string; flag: string }> = [
  { code: 'en', label: 'English', flag: '🇬🇧' },
  { code: 'fr', label: 'Français', flag: '🇫🇷' },
  { code: 'es', label: 'Español', flag: '🇪🇸' },
  { code: 'de', label: 'Deutsch', flag: '🇩🇪' },
  { code: 'pt', label: 'Português', flag: '🇧🇷' },
  { code: 'it', label: 'Italiano', flag: '🇮🇹' },
  { code: 'nl', label: 'Nederlands', flag: '🇳🇱' },
  { code: 'ru', label: 'Русский', flag: '🇷🇺' },
  { code: 'ja', label: '日本語', flag: '🇯🇵' },
  { code: 'ko', label: '한국어', flag: '🇰🇷' },
  { code: 'zh', label: '中文', flag: '🇨🇳' },
  { code: 'ar', label: 'العربية', flag: '🇸🇦' },
  { code: 'fa', label: 'فارسی', flag: '🇮🇷' },
];
|
||||
|
||||
// Generate unique IDs for React keys
|
||||
let idCounter = 0;
|
||||
export function generateTermId(): string {
|
||||
|
||||
@@ -46,6 +46,7 @@ def _format_term(term: GlossaryTerm) -> dict:
|
||||
"id": term.id,
|
||||
"source": term.source,
|
||||
"target": term.target,
|
||||
"translations": term.translations or {},
|
||||
"created_at": term.created_at.isoformat() if term.created_at else None,
|
||||
}
|
||||
|
||||
@@ -55,6 +56,7 @@ def _format_glossary(glossary: Glossary) -> dict:
|
||||
return {
|
||||
"id": glossary.id,
|
||||
"name": glossary.name,
|
||||
"source_language": glossary.source_language,
|
||||
"terms": [_format_term(t) for t in glossary.terms] if glossary.terms else [],
|
||||
"created_at": glossary.created_at.isoformat() if glossary.created_at else None,
|
||||
"updated_at": glossary.updated_at.isoformat() if glossary.updated_at else None,
|
||||
@@ -103,6 +105,7 @@ async def create_glossary(
|
||||
glossary = Glossary(
|
||||
user_id=user.id,
|
||||
name=body.name,
|
||||
source_language=body.source_language,
|
||||
created_at=datetime.now(timezone.utc),
|
||||
updated_at=datetime.now(timezone.utc),
|
||||
)
|
||||
@@ -112,6 +115,7 @@ async def create_glossary(
|
||||
glossary=glossary,
|
||||
source=term_data.source,
|
||||
target=term_data.target,
|
||||
translations=term_data.translations or {},
|
||||
created_at=datetime.now(timezone.utc),
|
||||
)
|
||||
session.add(term)
|
||||
@@ -180,6 +184,7 @@ async def list_glossaries(
|
||||
GlossaryListItem(
|
||||
id=g.id,
|
||||
name=g.name,
|
||||
source_language=g.source_language or "fr",
|
||||
terms_count=len(g.terms) if g.terms else 0,
|
||||
created_at=g.created_at,
|
||||
)
|
||||
@@ -331,6 +336,9 @@ async def update_glossary(
|
||||
if body.name is not None:
|
||||
glossary.name = body.name
|
||||
|
||||
if body.source_language is not None:
|
||||
glossary.source_language = body.source_language
|
||||
|
||||
if body.terms is not None:
|
||||
# Delete existing terms
|
||||
session.query(GlossaryTerm).filter(
|
||||
@@ -343,6 +351,7 @@ async def update_glossary(
|
||||
glossary_id=glossary.id,
|
||||
source=term_data.source,
|
||||
target=term_data.target,
|
||||
translations=term_data.translations or {},
|
||||
created_at=datetime.now(timezone.utc),
|
||||
)
|
||||
session.add(term)
|
||||
|
||||
@@ -915,10 +915,13 @@ async def _run_translation_job(
|
||||
|
||||
# Story 3.10: Retrieve and format glossary terms for LLM prompt
|
||||
glossary_terms = None
|
||||
glossary_source_lang = "fr"
|
||||
if glossary_id and user_id:
|
||||
try:
|
||||
glossary_terms = get_glossary_terms(glossary_id, user_id)
|
||||
logger.info(f"Job {job_id}: Loaded {len(glossary_terms)} glossary terms")
|
||||
glossary_data = get_glossary_terms(glossary_id, user_id)
|
||||
glossary_terms = glossary_data["terms"]
|
||||
glossary_source_lang = glossary_data.get("source_language", "fr")
|
||||
logger.info(f"Job {job_id}: Loaded {len(glossary_terms)} glossary terms (source: {glossary_source_lang})")
|
||||
except GlossaryNotFoundError as e:
|
||||
tracker.set_error(str(e))
|
||||
logger.error(f"Job {job_id}: Glossary error - {e}")
|
||||
@@ -940,7 +943,10 @@ async def _run_translation_job(
|
||||
effective_prompt = custom_prompt
|
||||
|
||||
# Build the full prompt combining effective prompt and glossary
|
||||
full_prompt = build_full_prompt(effective_prompt, glossary_terms)
|
||||
full_prompt = build_full_prompt(
|
||||
effective_prompt, glossary_terms,
|
||||
source_lang=glossary_source_lang, target_lang=target_lang,
|
||||
)
|
||||
|
||||
translation_provider = None
|
||||
_p = provider.lower()
|
||||
|
||||
@@ -17,6 +17,9 @@ class GlossaryTermCreate(BaseModel):
|
||||
target: str = Field(
|
||||
..., min_length=1, max_length=500, description="Traduction cible"
|
||||
)
|
||||
translations: Optional[dict[str, str]] = Field(
|
||||
None, description="Traductions multilingues: {\"en\": \"coil\", \"de\": \"Spule\", ...}"
|
||||
)
|
||||
|
||||
@field_validator("source", "target")
|
||||
@classmethod
|
||||
@@ -30,6 +33,7 @@ class GlossaryTermResponse(BaseModel):
|
||||
id: str
|
||||
source: str
|
||||
target: str
|
||||
translations: dict[str, str] = {}
|
||||
created_at: Optional[datetime] = None
|
||||
|
||||
model_config = {"from_attributes": True}
|
||||
@@ -39,6 +43,9 @@ class GlossaryCreate(BaseModel):
|
||||
"""Schema for creating a glossary."""
|
||||
|
||||
name: str = Field(..., min_length=1, max_length=255, description="Nom du glossaire")
|
||||
source_language: str = Field(
|
||||
default="fr", max_length=10, description="Langue source (ISO code)"
|
||||
)
|
||||
terms: list[GlossaryTermCreate] = Field(
|
||||
default_factory=list, description="Liste des termes"
|
||||
)
|
||||
@@ -53,6 +60,7 @@ class GlossaryUpdate(BaseModel):
|
||||
"""Schema for updating a glossary (all fields optional)."""
|
||||
|
||||
name: Optional[str] = Field(None, min_length=1, max_length=255)
|
||||
source_language: Optional[str] = Field(None, max_length=10)
|
||||
terms: Optional[list[GlossaryTermCreate]] = Field(None)
|
||||
|
||||
@field_validator("name")
|
||||
@@ -66,6 +74,7 @@ class GlossaryResponse(BaseModel):
|
||||
|
||||
id: str
|
||||
name: str
|
||||
source_language: str = "fr"
|
||||
terms: list[GlossaryTermResponse] = []
|
||||
created_at: Optional[datetime] = None
|
||||
updated_at: Optional[datetime] = None
|
||||
@@ -78,6 +87,7 @@ class GlossaryListItem(BaseModel):
|
||||
|
||||
id: str
|
||||
name: str
|
||||
source_language: str = "fr"
|
||||
terms_count: int = Field(
|
||||
default=0, description="Nombre de termes dans le glossaire"
|
||||
)
|
||||
|
||||
@@ -15,17 +15,17 @@ from utils.exceptions import GlossaryNotFoundError
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
|
||||
def get_glossary_terms(glossary_id: str, user_id: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Retrieve glossary terms for a specific glossary owned by a user.
|
||||
|
||||
Retrieve glossary terms and metadata for a specific glossary owned by a user.
|
||||
|
||||
Args:
|
||||
glossary_id: UUID of the glossary
|
||||
user_id: UUID of the user (must own the glossary)
|
||||
|
||||
|
||||
Returns:
|
||||
List of dictionaries with 'source' and 'target' keys
|
||||
|
||||
Dict with 'source_language' and 'terms' (list of dicts with source, target, translations)
|
||||
|
||||
Raises:
|
||||
GlossaryNotFoundError: If glossary doesn't exist or doesn't belong to user
|
||||
"""
|
||||
@@ -36,28 +36,33 @@ def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
|
||||
.filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
|
||||
.first()
|
||||
)
|
||||
|
||||
|
||||
if not glossary:
|
||||
raise GlossaryNotFoundError(
|
||||
message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
|
||||
details={"glossary_id": glossary_id}
|
||||
)
|
||||
|
||||
# Get all terms for this glossary
|
||||
|
||||
terms = (
|
||||
session.query(GlossaryTerm)
|
||||
.filter(GlossaryTerm.glossary_id == glossary_id)
|
||||
.all()
|
||||
)
|
||||
|
||||
# Format as list of dicts
|
||||
result = [{"source": term.source, "target": term.target} for term in terms]
|
||||
|
||||
|
||||
result = [{
|
||||
"source": term.source,
|
||||
"target": term.target,
|
||||
"translations": term.translations or {}
|
||||
} for term in terms]
|
||||
|
||||
logger.info(
|
||||
f"Retrieved {len(result)} terms from glossary {glossary_id} for user {user_id}"
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
return {
|
||||
"source_language": glossary.source_language or "fr",
|
||||
"terms": result,
|
||||
}
|
||||
|
||||
except GlossaryNotFoundError:
|
||||
raise
|
||||
@@ -112,72 +117,101 @@ def validate_glossary_access(glossary_id: str, user_id: str) -> bool:
|
||||
)
|
||||
|
||||
|
||||
def _glossary_text(value: Any) -> str:
    """Return ``value`` stripped when it is a string, otherwise ``""``."""
    return value.strip() if isinstance(value, str) else ""


def _escape_glossary_quotes(text: str) -> str:
    """Backslash-escape single quotes so quoted glossary entries stay unambiguous."""
    return text.replace("'", "\\'")


def format_glossary_for_prompt(
    terms: List[Dict[str, Any]],
    source_lang: str = "fr",
    target_lang: str = "en",
) -> str:
    """
    Format glossary terms for injection into an LLM system prompt.

    For each term, the translation stored under ``target_lang`` in its
    ``translations`` dict is used when present. Otherwise the default
    ``target`` field is used:

    - when ``target_lang`` is English, ``target`` is emitted as the exact
      translation (this keeps the behaviour of glossaries that only carry
      source/target pairs);
    - for any other ``target_lang``, ``target`` is emitted as an English
      reference and the LLM is instructed to adapt it to ``target_lang``.

    Terms with an empty source, or with neither a specific translation nor a
    default target, are skipped. Missing, ``None`` or non-string values are
    treated as empty instead of raising.

    Args:
        terms: List of dicts with 'source', 'target', and optional 'translations'
        source_lang: ISO code of the source language
        target_lang: ISO code of the target language

    Returns:
        Formatted string for LLM prompt, or "" when no usable term exists
    """
    if not terms:
        return ""

    # The fallback wording below treats the default 'target' as an English
    # reference, so an English target language can use it verbatim.
    # Region variants such as "en-GB" / "en_US" count as English too.
    primary_subtag = (target_lang or "").strip().lower().replace("_", "-").split("-", 1)[0]
    target_is_english = primary_subtag == "en"

    # Sort terms by length (longest first) to avoid substring conflicts
    # e.g., "machine learning" should match before "machine"
    sorted_terms = sorted(
        terms,
        key=lambda t: len(_glossary_text(t.get("source"))),
        reverse=True,
    )

    entries: List[str] = []
    has_fallback = False
    for term in sorted_terms:
        source = _glossary_text(term.get("source"))
        if not source:
            continue

        translations = term.get("translations")
        if not isinstance(translations, dict):
            # Tolerate a missing / null / malformed translations payload.
            translations = {}

        specific = _glossary_text(translations.get(target_lang))
        default_target = _glossary_text(term.get("target"))
        source_escaped = _escape_glossary_quotes(source)

        if specific:
            entries.append(
                f"- '{source_escaped}' → '{_escape_glossary_quotes(specific)}'"
            )
        elif default_target and target_is_english:
            entries.append(
                f"- '{source_escaped}' → '{_escape_glossary_quotes(default_target)}'"
            )
        elif default_target:
            entries.append(
                f"- '{source_escaped}' → '{_escape_glossary_quotes(default_target)}'"
                f" (EN reference, adapt to {target_lang})"
            )
            has_fallback = True
        # If neither specific nor default, skip the term

    if not entries:
        return ""

    lines = [
        f"TERMINOLOGY GLOSSARY (translate from {source_lang} to {target_lang}):",
        "",
        *entries,
        "",
        "IMPORTANT: Always use these translations when the terms appear in the text.",
    ]

    if has_fallback:
        lines.append(
            "NOTE: Some entries show an English reference — translate to the correct "
            f"{target_lang} equivalent while preserving the intended meaning."
        )

    return "\n".join(lines)
|
||||
|
||||
|
||||
def build_full_prompt(
    custom_prompt: Optional[str],
    glossary_terms: Optional[List[Dict[str, str]]],
    source_lang: str = "fr",
    target_lang: str = "en",
) -> str:
    """
    Assemble the final prompt from the custom prompt and the glossary section.

    Args:
        custom_prompt: Optional custom system prompt from user
        glossary_terms: Optional list of glossary terms
        source_lang: ISO code of the source language
        target_lang: ISO code of the target language

    Returns:
        The non-empty sections joined by a blank line, or "" when there are none
    """
    sections = [custom_prompt] if custom_prompt else []

    if glossary_terms:
        glossary_section = format_glossary_for_prompt(
            glossary_terms, source_lang, target_lang
        )
        # The formatter returns "" when no term is usable; skip it in that case.
        if glossary_section:
            sections.append(glossary_section)

    # Joining an empty list already yields "".
    return "\n\n".join(sections)
|
||||
Reference in New Issue
Block a user