feat: add multilingual glossary support (backend + frontend types)
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 1m31s

Backend:
- Add source_language column to glossaries table
- Add translations JSON column to glossary_terms table
- Alembic migration for schema changes
- format_glossary_for_prompt now language-aware: extracts correct
  translation per target language, falls back to EN reference for
  templates with only FR→EN data
- CRUD routes accept/return source_language and translations
- Pydantic schemas updated

Frontend:
- Types updated: GlossaryTerm now has translations: Record<string, string>
- Glossary/GlossaryListItem now have source_language
- Added SUPPORTED_LANGUAGES constant (13 languages)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-16 15:25:28 +02:00
parent a76f7710e8
commit b2d918c832
8 changed files with 167 additions and 46 deletions

View File

@@ -15,17 +15,17 @@ from utils.exceptions import GlossaryNotFoundError
logger = logging.getLogger(__name__)
def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
def get_glossary_terms(glossary_id: str, user_id: str) -> Dict[str, Any]:
"""
Retrieve glossary terms for a specific glossary owned by a user.
Retrieve glossary terms and metadata for a specific glossary owned by a user.
Args:
glossary_id: UUID of the glossary
user_id: UUID of the user (must own the glossary)
Returns:
List of dictionaries with 'source' and 'target' keys
Dict with 'source_language' and 'terms' (list of dicts with source, target, translations)
Raises:
GlossaryNotFoundError: If glossary doesn't exist or doesn't belong to user
"""
@@ -36,28 +36,33 @@ def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
.filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
.first()
)
if not glossary:
raise GlossaryNotFoundError(
message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
details={"glossary_id": glossary_id}
)
# Get all terms for this glossary
terms = (
session.query(GlossaryTerm)
.filter(GlossaryTerm.glossary_id == glossary_id)
.all()
)
# Format as list of dicts
result = [{"source": term.source, "target": term.target} for term in terms]
result = [{
"source": term.source,
"target": term.target,
"translations": term.translations or {}
} for term in terms]
logger.info(
f"Retrieved {len(result)} terms from glossary {glossary_id} for user {user_id}"
)
return result
return {
"source_language": glossary.source_language or "fr",
"terms": result,
}
except GlossaryNotFoundError:
raise
@@ -112,72 +117,101 @@ def validate_glossary_access(glossary_id: str, user_id: str) -> bool:
)
def format_glossary_for_prompt(
    terms: List[Dict[str, str]],
    source_lang: str = "fr",
    target_lang: str = "en",
) -> str:
    """
    Format glossary terms for injection into an LLM system prompt.

    Each usable term becomes one bullet of the form ``- 'source' → 'target'``.
    The translation is chosen per term in this order:

    1. ``term["translations"][target_lang]`` when present and non-blank.
    2. ``term["target"]`` used as-is when ``target_lang`` is English. The
       default target is treated as the English translation (the fallback
       label below calls it an "EN reference"), so legacy glossaries without
       a ``translations`` dict keep producing exact-translation entries.
    3. ``term["target"]`` annotated as an English reference for any other
       target language, plus a closing NOTE asking the LLM to adapt it.

    Terms with a blank source, or with no usable translation, are skipped.

    Args:
        terms: List of dicts with 'source', 'target', and optional
            'translations' (mapping of language code to translation).
        source_lang: ISO code of the source language (header text only).
        target_lang: ISO code of the target language; also the lookup key
            into each term's 'translations' mapping.

    Returns:
        Formatted string for the LLM prompt, or "" when no term is usable.
    """
    if not terms:
        return ""

    def _clean(value: object) -> str:
        # Values may come from a JSON column and be null or non-string;
        # treat anything that is not a string as missing instead of crashing.
        return value.strip() if isinstance(value, str) else ""

    def _quote(text: str) -> str:
        # Escape single quotes so the quoted term stays unambiguous.
        return "'" + text.replace("'", "\\'") + "'"

    # "en", "EN", "en-US", "en_GB" all count as English.
    normalized_target = (target_lang or "").strip().lower().replace("_", "-")
    target_is_english = normalized_target.split("-")[0] == "en"

    # Sort terms by source length (longest first) to avoid substring
    # conflicts, e.g. "machine learning" should be listed before "machine".
    sorted_terms = sorted(
        terms,
        key=lambda t: len(_clean(t.get("source"))),
        reverse=True,
    )

    entries = []
    has_fallback = False

    for term in sorted_terms:
        source = _clean(term.get("source"))
        if not source:
            continue

        translations = term.get("translations")
        if not isinstance(translations, dict):
            translations = {}

        specific = _clean(translations.get(target_lang))
        default_target = _clean(term.get("target"))

        if specific:
            entries.append(f"- {_quote(source)} → {_quote(specific)}")
        elif default_target and target_is_english:
            # The default target already is the requested translation.
            entries.append(f"- {_quote(source)} → {_quote(default_target)}")
        elif default_target:
            entries.append(
                f"- {_quote(source)} → {_quote(default_target)} "
                f"(EN reference, adapt to {target_lang})"
            )
            has_fallback = True
        # Neither a specific nor a default translation: skip the term.

    if not entries:
        return ""

    lines = [
        f"TERMINOLOGY GLOSSARY (translate from {source_lang} to {target_lang}):",
        "",
        *entries,
        "",
        "IMPORTANT: Always use these translations when the terms appear in the text.",
    ]

    if has_fallback:
        lines.append(
            "NOTE: Some entries show an English reference — translate to the correct "
            f"{target_lang} equivalent while preserving the intended meaning."
        )

    return "\n".join(lines)
def build_full_prompt(
    custom_prompt: Optional[str],
    glossary_terms: Optional[List[Dict[str, str]]],
    source_lang: str = "fr",
    target_lang: str = "en",
) -> str:
    """
    Assemble the final prompt from the user's custom prompt and the glossary.

    The custom prompt (when non-empty) comes first, followed by the glossary
    section rendered by format_glossary_for_prompt (when it yields any text).
    The two sections are separated by a blank line.

    Args:
        custom_prompt: Optional custom system prompt from user
        glossary_terms: Optional list of glossary terms
        source_lang: ISO code of the source language
        target_lang: ISO code of the target language

    Returns:
        Combined prompt string, or "" when both sections are empty
    """
    # Only render the glossary when there are terms to render.
    glossary_section = (
        format_glossary_for_prompt(glossary_terms, source_lang, target_lang)
        if glossary_terms
        else ""
    )

    # Drop empty/None sections; joining an empty list yields "".
    sections = [section for section in (custom_prompt, glossary_section) if section]
    return "\n\n".join(sections)