183 lines
5.7 KiB
Python
183 lines
5.7 KiB
Python
"""
Glossary Service for Translation
Story 3.10: Glossaires - Application lors Traduction LLM
(glossaries applied during LLM translation)

Provides functions to retrieve glossary terms and format them for LLM prompts.
"""
|
|
|
|
import logging
|
|
from typing import List, Dict, Any, Optional
|
|
|
|
from database.connection import get_sync_session
|
|
from database.models import Glossary, GlossaryTerm
|
|
from utils.exceptions import GlossaryNotFoundError
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
    """
    Retrieve the terms of a glossary that belongs to a given user.

    The glossary is first looked up by both its id and the user id; only when
    a matching row exists are the associated ``GlossaryTerm`` rows loaded.

    Args:
        glossary_id: Identifier of the glossary.
        user_id: Identifier of the user; the glossary's ``user_id`` must match.

    Returns:
        List of dictionaries with 'source' and 'target' keys, one per term.

    Raises:
        GlossaryNotFoundError: If no glossary matches both identifiers, or if
            any other exception occurs during retrieval. In the latter case
            the original exception is chained as ``__cause__``.
    """
    try:
        with get_sync_session() as session:
            glossary = (
                session.query(Glossary)
                .filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
                .first()
            )

            if not glossary:
                raise GlossaryNotFoundError(
                    message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
                    details={"glossary_id": glossary_id}
                )

            # Get all terms for this glossary
            terms = (
                session.query(GlossaryTerm)
                .filter(GlossaryTerm.glossary_id == glossary_id)
                .all()
            )

            # Convert ORM rows to plain dicts before leaving the session block
            result = [{"source": term.source, "target": term.target} for term in terms]

            logger.info(
                "Retrieved %d terms from glossary %s for user %s",
                len(result), glossary_id, user_id
            )

            return result

    except GlossaryNotFoundError:
        # Already the domain error callers expect; propagate untouched.
        raise
    except Exception as e:
        # NOTE(review): every unexpected failure is reported with the same
        # exception type as "not found"; kept as-is because callers may rely
        # on catching GlossaryNotFoundError only.
        # logger.exception records the traceback, which logger.error did not.
        logger.exception("Error retrieving glossary %s: %s", glossary_id, e)
        raise GlossaryNotFoundError(
            message="Erreur lors de la récupération du glossaire.",
            details={"glossary_id": glossary_id, "error": str(e)}
        ) from e
|
|
|
|
|
|
def validate_glossary_access(glossary_id: str, user_id: str) -> bool:
    """
    Check that a glossary exists and belongs to the given user.

    Unlike a full term retrieval, this only queries the ``Glossary`` row, so
    it can be used as an early check before starting a translation job.

    Args:
        glossary_id: Identifier of the glossary.
        user_id: Identifier of the user; the glossary's ``user_id`` must match.

    Returns:
        True if a glossary matching both identifiers exists. The function
        never returns False; failure is signalled by raising.

    Raises:
        GlossaryNotFoundError: If no glossary matches both identifiers, or if
            any other exception occurs during the check. In the latter case
            the original exception is chained as ``__cause__``.
    """
    try:
        with get_sync_session() as session:
            glossary = (
                session.query(Glossary)
                .filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
                .first()
            )

            if not glossary:
                raise GlossaryNotFoundError(
                    message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
                    details={"glossary_id": glossary_id}
                )

            return True

    except GlossaryNotFoundError:
        # Already the domain error callers expect; propagate untouched.
        raise
    except Exception as e:
        # logger.exception records the traceback, which logger.error did not.
        logger.exception("Error validating glossary access %s: %s", glossary_id, e)
        raise GlossaryNotFoundError(
            message="Erreur lors de la validation du glossaire.",
            details={"glossary_id": glossary_id, "error": str(e)}
        ) from e
|
|
|
|
|
|
def format_glossary_for_prompt(terms: List[Dict[str, str]]) -> str:
    """
    Format glossary terms for injection into an LLM system prompt.

    The output consists of a header line, one ``- 'source' → 'target'`` line
    per usable term (longest source first), and a closing instruction.

    Entries whose 'source' or 'target' is missing, ``None``, or blank after
    stripping whitespace are skipped. If no usable entry remains, an empty
    string is returned so callers never inject a glossary header with no
    terms under it.

    Args:
        terms: List of dictionaries with 'source' and 'target' keys.

    Returns:
        Formatted string for the LLM prompt, or "" when there is nothing to
        format.
    """
    if not terms:
        return ""

    # Normalise before sorting: `or ""` tolerates explicit None values, and
    # stripping first makes the length used for ordering the real term length.
    pairs = []
    for term in terms:
        source = (term.get("source") or "").strip()
        target = (term.get("target") or "").strip()
        if source and target:
            pairs.append((source, target))

    if not pairs:
        return ""

    # Sort terms by length (longest first) to avoid substring conflicts,
    # e.g. "machine learning" should be listed before "machine".
    pairs.sort(key=lambda pair: len(pair[0]), reverse=True)

    lines = [
        "TERMINOLOGY GLOSSARY (use these exact translations):",
        "",
    ]

    for source, target in pairs:
        # Escape single quotes because each term is wrapped in single quotes
        source_escaped = source.replace("'", "\\'")
        target_escaped = target.replace("'", "\\'")
        lines.append(f"- '{source_escaped}' → '{target_escaped}'")

    lines.extend([
        "",
        "IMPORTANT: Always use these translations when the terms appear in the text.",
    ])

    return "\n".join(lines)
|
|
|
|
|
|
def build_full_prompt(
    custom_prompt: Optional[str],
    glossary_terms: Optional[List[Dict[str, str]]]
) -> str:
    """
    Combine the user's custom prompt and the glossary section into one prompt.

    Each section is included only when it is non-empty; included sections are
    separated by a blank line, custom prompt first.

    Args:
        custom_prompt: Optional custom system prompt from the user.
        glossary_terms: Optional list of glossary terms; when truthy it is
            rendered with ``format_glossary_for_prompt``.

    Returns:
        The combined prompt string, or "" when neither section has content.
    """
    # Only render the glossary when there is something to render.
    glossary_section = (
        format_glossary_for_prompt(glossary_terms) if glossary_terms else ""
    )

    # Drop empty/None sections; joining an empty list yields "".
    sections = [
        section for section in (custom_prompt, glossary_section) if section
    ]
    return "\n\n".join(sections)