feat: add multilingual glossary support (backend + frontend types)
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 1m31s
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 1m31s
Backend:
- Add source_language column to glossaries table
- Add translations JSON column to glossary_terms table
- Add Alembic migration for the schema changes
- Make format_glossary_for_prompt language-aware: it extracts the correct translation per target language and falls back to the EN reference for templates with only FR→EN data
- CRUD routes accept and return source_language and translations
- Update Pydantic schemas

Frontend:
- Update types: GlossaryTerm now has translations: Record<string, string>
- Glossary and GlossaryListItem now have source_language
- Add SUPPORTED_LANGUAGES constant (13 languages)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -15,17 +15,17 @@ from utils.exceptions import GlossaryNotFoundError
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
|
||||
def get_glossary_terms(glossary_id: str, user_id: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Retrieve glossary terms for a specific glossary owned by a user.
|
||||
|
||||
Retrieve glossary terms and metadata for a specific glossary owned by a user.
|
||||
|
||||
Args:
|
||||
glossary_id: UUID of the glossary
|
||||
user_id: UUID of the user (must own the glossary)
|
||||
|
||||
|
||||
Returns:
|
||||
List of dictionaries with 'source' and 'target' keys
|
||||
|
||||
Dict with 'source_language' and 'terms' (list of dicts with source, target, translations)
|
||||
|
||||
Raises:
|
||||
GlossaryNotFoundError: If glossary doesn't exist or doesn't belong to user
|
||||
"""
|
||||
@@ -36,28 +36,33 @@ def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
|
||||
.filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
|
||||
.first()
|
||||
)
|
||||
|
||||
|
||||
if not glossary:
|
||||
raise GlossaryNotFoundError(
|
||||
message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
|
||||
details={"glossary_id": glossary_id}
|
||||
)
|
||||
|
||||
# Get all terms for this glossary
|
||||
|
||||
terms = (
|
||||
session.query(GlossaryTerm)
|
||||
.filter(GlossaryTerm.glossary_id == glossary_id)
|
||||
.all()
|
||||
)
|
||||
|
||||
# Format as list of dicts
|
||||
result = [{"source": term.source, "target": term.target} for term in terms]
|
||||
|
||||
|
||||
result = [{
|
||||
"source": term.source,
|
||||
"target": term.target,
|
||||
"translations": term.translations or {}
|
||||
} for term in terms]
|
||||
|
||||
logger.info(
|
||||
f"Retrieved {len(result)} terms from glossary {glossary_id} for user {user_id}"
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
return {
|
||||
"source_language": glossary.source_language or "fr",
|
||||
"terms": result,
|
||||
}
|
||||
|
||||
except GlossaryNotFoundError:
|
||||
raise
|
||||
@@ -112,72 +117,101 @@ def validate_glossary_access(glossary_id: str, user_id: str) -> bool:
|
||||
)
|
||||
|
||||
|
||||
def format_glossary_for_prompt(
    terms: List[Dict[str, Any]],
    source_lang: str = "fr",
    target_lang: str = "en",
) -> str:
    """
    Format glossary terms for injection into an LLM system prompt.

    Layout of the returned text:
    - a header naming the translation direction
    - one "- 'source' → 'target'" line per usable term, longest source first
    - a closing instruction to always apply the listed translations

    Target selection per term:
    1. If the term's 'translations' mapping has a non-blank entry for
       target_lang, that entry is used as-is.
    2. Otherwise the default 'target' field is used. That field is treated
       as an English reference: when target_lang is English it is emitted as
       an exact translation; for any other language the line is tagged
       "(EN reference, adapt to <target_lang>)" and a trailing NOTE asks the
       LLM to derive the proper equivalent.
    3. Terms with a blank source, or with neither a specific nor a default
       target, are skipped.

    Args:
        terms: List of dicts with 'source', 'target', and optional
            'translations' (mapping of language code to translated term).
            Missing, None, or non-string values are treated as blank.
        source_lang: ISO code of the source language
        target_lang: ISO code of the target language

    Returns:
        Formatted string for the LLM prompt, or "" when no term is usable
    """
    if not terms:
        return ""

    def _text(value: Any) -> str:
        # Nullable / JSON columns can yield None (or other non-string values);
        # treat anything that is not a string as blank instead of crashing.
        return value.strip() if isinstance(value, str) else ""

    def _quote(value: str) -> str:
        # Escape single quotes so the quoted term stays unambiguous.
        return "'" + value.replace("'", "\\'") + "'"

    # The default 'target' field is an English reference, so it only needs the
    # "adapt to ..." hint when the requested language is not English.
    primary_subtag = target_lang.strip().lower().replace("_", "-").split("-")[0]
    target_is_english = primary_subtag == "en"

    # Sort terms by length (longest first) to avoid substring conflicts
    # e.g., "machine learning" should match before "machine"
    ordered = sorted(
        terms,
        key=lambda t: len(_text(t.get("source"))),
        reverse=True,
    )

    entries: List[str] = []
    has_fallback = False
    for term in ordered:
        source = _text(term.get("source"))
        if not source:
            continue

        translations = term.get("translations")
        if not isinstance(translations, dict):
            translations = {}
        specific = _text(translations.get(target_lang))
        default_target = _text(term.get("target"))

        if specific:
            entries.append(f"- {_quote(source)} → {_quote(specific)}")
        elif default_target and target_is_english:
            # English reference requested in English: already the exact translation.
            entries.append(f"- {_quote(source)} → {_quote(default_target)}")
        elif default_target:
            entries.append(
                f"- {_quote(source)} → {_quote(default_target)} "
                f"(EN reference, adapt to {target_lang})"
            )
            has_fallback = True
        # If neither specific nor default, skip the term

    if not entries:
        return ""

    lines = [
        f"TERMINOLOGY GLOSSARY (translate from {source_lang} to {target_lang}):",
        "",
        *entries,
        "",
        "IMPORTANT: Always use these translations when the terms appear in the text.",
    ]

    if has_fallback:
        lines.append(
            "NOTE: Some entries show an English reference — translate to the correct "
            f"{target_lang} equivalent while preserving the intended meaning."
        )

    return "\n".join(lines)
|
||||
|
||||
|
||||
def build_full_prompt(
    custom_prompt: Optional[str],
    glossary_terms: Optional[List[Dict[str, Any]]],
    source_lang: str = "fr",
    target_lang: str = "en",
) -> str:
    """
    Build the complete prompt combining custom prompt and glossary.

    The custom prompt (if any) comes first, followed by the glossary section
    produced by format_glossary_for_prompt; the two are separated by a blank
    line. Sections that are empty are left out entirely.

    Args:
        custom_prompt: Optional custom system prompt from user. None, empty,
            or whitespace-only values are ignored.
        glossary_terms: Optional list of glossary terms (dicts with 'source',
            'target', and optional 'translations')
        source_lang: ISO code of the source language
        target_lang: ISO code of the target language

    Returns:
        Combined prompt string, or "" when there is nothing to include
    """
    parts: List[str] = []

    # A whitespace-only prompt carries no instruction; skipping it avoids a
    # blank leading section in the final prompt. Non-blank prompts are kept
    # verbatim (not stripped).
    if custom_prompt and custom_prompt.strip():
        parts.append(custom_prompt)

    if glossary_terms:
        glossary_prompt = format_glossary_for_prompt(glossary_terms, source_lang, target_lang)
        if glossary_prompt:
            parts.append(glossary_prompt)

    # Joining an empty list already yields "".
    return "\n\n".join(parts)
|
||||
Reference in New Issue
Block a user