All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 2m25s
227 lines
7.4 KiB
Python
227 lines
7.4 KiB
Python
"""
|
|
Glossary Service for Translation
|
|
Story 3.10: Glossaires - Application lors Traduction LLM
|
|
|
|
Provides functions to retrieve glossary terms and format them for LLM prompts.
|
|
"""
|
|
|
|
import logging
|
|
from typing import List, Dict, Any, Optional
|
|
|
|
from database.connection import get_sync_session
|
|
from database.models import Glossary, GlossaryTerm
|
|
from utils.exceptions import GlossaryNotFoundError
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def get_glossary_terms(glossary_id: str, user_id: str) -> Dict[str, Any]:
    """
    Retrieve glossary terms and metadata for a specific glossary owned by a user.

    Args:
        glossary_id: UUID of the glossary
        user_id: UUID of the user (must own the glossary)

    Returns:
        Dict with:
            - 'source_language': the glossary's source language ("fr" when unset)
            - 'target_language': the glossary's target language ("multi" when the
              attribute is missing or empty)
            - 'terms': list of dicts with 'source', 'target' and 'translations'
              ('translations' is an empty dict when the term has none)

    Raises:
        GlossaryNotFoundError: If glossary doesn't exist or doesn't belong to user.
            Any other failure while reading the glossary is also reported as
            GlossaryNotFoundError, with the original exception attached as its
            cause.
    """
    try:
        with get_sync_session() as session:
            # Filtering on both id and owner means "not found" and "not yours"
            # are indistinguishable to the caller.
            glossary = (
                session.query(Glossary)
                .filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
                .first()
            )

            if not glossary:
                raise GlossaryNotFoundError(
                    message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
                    details={"glossary_id": glossary_id}
                )

            terms = (
                session.query(GlossaryTerm)
                .filter(GlossaryTerm.glossary_id == glossary_id)
                .all()
            )

            # Copy the values into plain dicts while the session is still open.
            result = [{
                "source": term.source,
                "target": term.target,
                "translations": term.translations or {}
            } for term in terms]

            logger.info(
                "Retrieved %d terms from glossary %s for user %s",
                len(result), glossary_id, user_id,
            )

            return {
                "source_language": glossary.source_language or "fr",
                # getattr: the model may not define target_language at all.
                "target_language": getattr(glossary, "target_language", None) or "multi",
                "terms": result,
            }

    except GlossaryNotFoundError:
        # Already the domain error callers expect; do not re-wrap it.
        raise
    except Exception as e:
        # logger.exception keeps the traceback in the logs; "from e" keeps the
        # root cause attached to the domain error raised to the caller.
        logger.exception("Error retrieving glossary %s: %s", glossary_id, e)
        raise GlossaryNotFoundError(
            message="Erreur lors de la récupération du glossaire.",
            details={"glossary_id": glossary_id, "error": str(e)}
        ) from e
|
|
|
|
|
|
def validate_glossary_access(glossary_id: str, user_id: str) -> bool:
    """
    Validate that a glossary exists and belongs to the user.

    This is a lightweight check that doesn't return the terms,
    useful for early validation before starting a translation job.

    Args:
        glossary_id: UUID of the glossary
        user_id: UUID of the user (must own the glossary)

    Returns:
        True if glossary exists and belongs to user (the function never
        returns False; failure is signalled by the exception below)

    Raises:
        GlossaryNotFoundError: If glossary doesn't exist or doesn't belong to user.
            Any other failure during the lookup is also reported as
            GlossaryNotFoundError, with the original exception attached as its
            cause.
    """
    try:
        with get_sync_session() as session:
            glossary = (
                session.query(Glossary)
                .filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
                .first()
            )

            if not glossary:
                raise GlossaryNotFoundError(
                    message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
                    details={"glossary_id": glossary_id}
                )

            return True

    except GlossaryNotFoundError:
        # Already the domain error callers expect; do not re-wrap it.
        raise
    except Exception as e:
        # logger.exception keeps the traceback in the logs; "from e" keeps the
        # root cause attached to the domain error raised to the caller.
        logger.exception("Error validating glossary access %s: %s", glossary_id, e)
        raise GlossaryNotFoundError(
            message="Erreur lors de la validation du glossaire.",
            details={"glossary_id": glossary_id, "error": str(e)}
        ) from e
|
|
|
|
|
|
def format_glossary_for_prompt(
    terms: List[Dict[str, Any]],
    source_lang: str = "fr",
    target_lang: str = "en",
    glossary_target_lang: str = "multi",
) -> str:
    """
    Format glossary terms for injection into an LLM system prompt.

    When a term has a translation for target_lang in its translations dict,
    that specific translation is used. Otherwise, falls back to the default
    target field (backward compat). For templates that only have EN translations,
    the LLM is instructed to derive the correct target_lang equivalent.

    Terms are listed longest source first. Terms with an empty source, or with
    neither a specific translation nor a default target, are skipped. Missing
    or None values for 'source', 'target', 'translations' or an individual
    translation are treated as empty rather than raising.

    Args:
        terms: List of dicts with 'source', 'target', and optional 'translations'
            (a dict mapping language code to translated term)
        source_lang: ISO code of the source language
        target_lang: ISO code of the target language
        glossary_target_lang: ISO code of the glossary's target language configuration

    Returns:
        Formatted string for LLM prompt, or "" when no term yields an entry
    """
    if not terms:
        return ""

    def _text(value: Any) -> str:
        # Stored values can be None (nullable column / JSON null); treat
        # anything that is not a string as "no value" instead of crashing.
        return value if isinstance(value, str) else ""

    def _escape(text: str) -> str:
        # Entries are wrapped in single quotes, so escape embedded ones.
        return text.replace("'", "\\'")

    # Longest source first (sort is stable, so ties keep their input order).
    sorted_terms = sorted(
        terms, key=lambda t: len(_text(t.get("source"))), reverse=True
    )

    entries = []
    has_fallback = False
    for term in sorted_terms:
        source = _text(term.get("source")).strip()
        if not source:
            continue

        translations = term.get("translations")
        if not isinstance(translations, dict):
            translations = {}
        specific = _text(translations.get(target_lang)).strip()
        default_target = _text(term.get("target")).strip()

        chosen = specific or default_target
        if not chosen:
            # Neither a specific nor a default translation: skip the term.
            continue

        entry = f"- '{_escape(source)}' → '{_escape(chosen)}'"
        if not specific and glossary_target_lang != target_lang:
            # The default target is not in the requested language; flag it so
            # the LLM adapts it rather than copying it verbatim.
            # NOTE(review): the label always says "EN" regardless of
            # glossary_target_lang — confirm default targets are always English.
            entry += f" (EN reference, adapt to {target_lang})"
            has_fallback = True
        entries.append(entry)

    if not entries:
        return ""

    lines = [
        f"TERMINOLOGY GLOSSARY (translate from {source_lang} to {target_lang}):",
        "",
        *entries,
        "",
        "IMPORTANT: Always use these translations when the terms appear in the text.",
    ]

    if has_fallback:
        lines.append(
            "NOTE: Some entries show an English reference — translate to the correct "
            f"{target_lang} equivalent while preserving the intended meaning."
        )

    return "\n".join(lines)
|
|
|
|
|
|
def build_full_prompt(
    custom_prompt: Optional[str],
    glossary_terms: Optional[List[Dict[str, str]]],
    source_lang: str = "fr",
    target_lang: str = "en",
    glossary_target_lang: str = "multi",
) -> str:
    """
    Assemble the final prompt from the user's custom prompt and the glossary.

    The custom prompt (if any) comes first, followed by the formatted glossary
    section (if any), separated by a blank line. Empty or missing pieces are
    left out entirely.

    Args:
        custom_prompt: Optional custom system prompt from user
        glossary_terms: Optional list of glossary terms
        source_lang: ISO code of the source language
        target_lang: ISO code of the target language
        glossary_target_lang: ISO code of the glossary's target language configuration

    Returns:
        Combined prompt string, or "" when there is nothing to include
    """
    # Only format the glossary when there are terms to format.
    glossary_section = ""
    if glossary_terms:
        glossary_section = format_glossary_for_prompt(
            glossary_terms, source_lang, target_lang, glossary_target_lang
        )

    # Keep the non-empty sections, custom prompt first.
    sections = [
        section for section in (custom_prompt, glossary_section) if section
    ]
    return "\n\n".join(sections)