feat: revue de code, doc CODE_REVIEW, forfaits 2026, traduction LLM, providers avec modèle
Made-with: Cursor
This commit is contained in:
183
services/glossary_service.py
Normal file
183
services/glossary_service.py
Normal file
@@ -0,0 +1,183 @@
|
||||
"""
|
||||
Glossary Service for Translation
|
||||
Story 3.10: Glossaires - Application lors Traduction LLM
|
||||
|
||||
Provides functions to retrieve glossary terms and format them for LLM prompts.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
from database.connection import get_sync_session
|
||||
from database.models import Glossary, GlossaryTerm
|
||||
from utils.exceptions import GlossaryNotFoundError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
    """
    Retrieve the terms of a glossary owned by a given user.

    The glossary is looked up by id AND owner in a single filter, so a
    glossary that exists but belongs to someone else is reported exactly
    like a missing one.

    Args:
        glossary_id: UUID of the glossary
        user_id: UUID of the user (must own the glossary)

    Returns:
        List of dictionaries with 'source' and 'target' keys, one per term.

    Raises:
        GlossaryNotFoundError: If no glossary matches both ids, or if any
            other error occurs during the lookup (the original exception is
            chained as ``__cause__`` and its text is placed in ``details``).
    """
    try:
        with get_sync_session() as session:
            glossary = (
                session.query(Glossary)
                .filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
                .first()
            )

            if not glossary:
                raise GlossaryNotFoundError(
                    message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
                    details={"glossary_id": glossary_id}
                )

            # Ownership was verified above, so filtering on the glossary id is enough.
            terms = (
                session.query(GlossaryTerm)
                .filter(GlossaryTerm.glossary_id == glossary_id)
                .all()
            )

            # Copy the values into plain dicts while still inside the session block.
            result = [{"source": term.source, "target": term.target} for term in terms]

            logger.info(
                "Retrieved %d terms from glossary %s for user %s",
                len(result),
                glossary_id,
                user_id,
            )

            return result

    except GlossaryNotFoundError:
        # Already the domain error callers expect; do not re-wrap it.
        raise
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception("Error retrieving glossary %s: %s", glossary_id, e)
        raise GlossaryNotFoundError(
            message="Erreur lors de la récupération du glossaire.",
            details={"glossary_id": glossary_id, "error": str(e)}
        ) from e
|
||||
|
||||
|
||||
def validate_glossary_access(glossary_id: str, user_id: str) -> bool:
    """
    Validate that a glossary exists and belongs to the user.

    This check does not load the glossary terms, which makes it suitable
    for early validation before starting a translation job.

    Args:
        glossary_id: UUID of the glossary
        user_id: UUID of the user (must own the glossary)

    Returns:
        True if a glossary matching both ids exists. The function never
        returns False; failure is signalled by raising.

    Raises:
        GlossaryNotFoundError: If no glossary matches both ids, or if any
            other error occurs during the lookup (the original exception is
            chained as ``__cause__`` and its text is placed in ``details``).
    """
    try:
        with get_sync_session() as session:
            glossary = (
                session.query(Glossary)
                .filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
                .first()
            )

            if not glossary:
                raise GlossaryNotFoundError(
                    message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
                    details={"glossary_id": glossary_id}
                )

            return True

    except GlossaryNotFoundError:
        # Already the domain error callers expect; do not re-wrap it.
        raise
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception("Error validating glossary access %s: %s", glossary_id, e)
        raise GlossaryNotFoundError(
            message="Erreur lors de la validation du glossaire.",
            details={"glossary_id": glossary_id, "error": str(e)}
        ) from e
|
||||
|
||||
|
||||
def format_glossary_for_prompt(terms: List[Dict[str, str]]) -> str:
    """
    Format glossary terms for injection into an LLM system prompt.

    The output consists of:
    - a header line explaining the purpose,
    - one ``- 'source' → 'target'`` line per usable term,
    - a closing instruction to always apply these translations.

    Entries whose source or target is missing, None, or blank after
    stripping are skipped. If no usable entry remains, an empty string is
    returned so that callers do not inject a glossary section with no terms.

    Args:
        terms: List of dictionaries with 'source' and 'target' keys

    Returns:
        Formatted string for the LLM prompt, or "" when there is nothing
        to format.
    """
    if not terms:
        return ""

    # Normalise before sorting: `or ""` covers keys that are present but None,
    # and stripping first makes the ordering depend on the text actually emitted.
    pairs = []
    for term in terms:
        source = (term.get("source") or "").strip()
        target = (term.get("target") or "").strip()
        if source and target:
            pairs.append((source, target))

    if not pairs:
        return ""

    # Longest source first to avoid substring conflicts,
    # e.g. "machine learning" is listed before "machine".
    # The sort is stable, so equal-length sources keep their input order.
    pairs.sort(key=lambda pair: len(pair[0]), reverse=True)

    lines = [
        "TERMINOLOGY GLOSSARY (use these exact translations):",
        "",
    ]

    for source, target in pairs:
        # Entries are wrapped in single quotes, so embedded ones are backslash-escaped.
        source_escaped = source.replace("'", "\\'")
        target_escaped = target.replace("'", "\\'")
        lines.append(f"- '{source_escaped}' → '{target_escaped}'")

    lines.extend([
        "",
        "IMPORTANT: Always use these translations when the terms appear in the text.",
    ])

    return "\n".join(lines)
|
||||
|
||||
|
||||
def build_full_prompt(
    custom_prompt: Optional[str],
    glossary_terms: Optional[List[Dict[str, str]]]
) -> str:
    """
    Combine the user's custom prompt and the glossary section into one prompt.

    Each part is included only when it is non-empty; the custom prompt comes
    first, and the parts are separated by a blank line.

    Args:
        custom_prompt: Optional custom system prompt from user
        glossary_terms: Optional list of glossary terms

    Returns:
        Combined prompt string, or "" when neither part is present.
    """
    # Only invoke the formatter when there are terms to format.
    glossary_section = (
        format_glossary_for_prompt(glossary_terms) if glossary_terms else ""
    )

    # Keep the non-empty sections, in order: custom prompt, then glossary.
    sections = [
        section for section in (custom_prompt, glossary_section) if section
    ]

    # Joining an empty list already yields "".
    return "\n\n".join(sections)
|
||||
Reference in New Issue
Block a user