# office_translator/services/glossary_service.py

"""
Glossary Service for Translation
Story 3.10: Glossaires - Application lors Traduction LLM

Provides functions to retrieve glossary terms and format them for LLM prompts.
"""

import logging
from typing import List, Dict, Any, Optional

from database.connection import get_sync_session
from database.models import Glossary, GlossaryTerm
from utils.exceptions import GlossaryNotFoundError

# Module-level logger named after this module; used by the functions below.
logger = logging.getLogger(__name__)


def get_glossary_terms(glossary_id: str, user_id: str) -> List[Dict[str, str]]:
    """
    Retrieve the terms of a glossary owned by the given user.

    Ownership is checked first: a glossary row must match both
    ``glossary_id`` and ``user_id`` before any term is read.

    Args:
        glossary_id: UUID of the glossary
        user_id: UUID of the user (must own the glossary)

    Returns:
        One ``{"source": ..., "target": ...}`` dictionary per term of the
        glossary (an empty list when the glossary has no terms).

    Raises:
        GlossaryNotFoundError: If no glossary matches both identifiers, or
            if any other exception is raised while querying. In the latter
            case the original exception is chained as ``__cause__``.
    """
    try:
        with get_sync_session() as session:
            # Filtering on both columns means "missing" and "owned by someone
            # else" are indistinguishable to the caller.
            glossary = (
                session.query(Glossary)
                .filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
                .first()
            )

            if not glossary:
                raise GlossaryNotFoundError(
                    message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
                    details={"glossary_id": glossary_id}
                )

            terms = (
                session.query(GlossaryTerm)
                .filter(GlossaryTerm.glossary_id == glossary_id)
                .all()
            )

            # Build plain dicts inside the session so the result does not
            # depend on ORM objects after the session is closed.
            result = [{"source": term.source, "target": term.target} for term in terms]

            logger.info(
                "Retrieved %d terms from glossary %s for user %s",
                len(result), glossary_id, user_id
            )

            return result

    except GlossaryNotFoundError:
        # Already the right type for callers: propagate untouched.
        raise
    except Exception as e:
        # logger.exception records the traceback of the underlying failure,
        # and "from e" keeps it reachable from the wrapped error.
        logger.exception("Error retrieving glossary %s: %s", glossary_id, e)
        raise GlossaryNotFoundError(
            message="Erreur lors de la récupération du glossaire.",
            details={"glossary_id": glossary_id, "error": str(e)}
        ) from e


def validate_glossary_access(glossary_id: str, user_id: str) -> bool:
    """
    Validate that a glossary exists and belongs to the user.

    Only the glossary row is looked up; its terms are not queried, which
    makes this suitable as an early check before starting a translation job.

    Args:
        glossary_id: UUID of the glossary
        user_id: UUID of the user (must own the glossary)

    Returns:
        True if a glossary matches both identifiers. The function never
        returns False: a failed check is reported by raising.

    Raises:
        GlossaryNotFoundError: If no glossary matches both identifiers, or
            if any other exception is raised while querying. In the latter
            case the original exception is chained as ``__cause__``.
    """
    try:
        with get_sync_session() as session:
            glossary = (
                session.query(Glossary)
                .filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
                .first()
            )

            if not glossary:
                raise GlossaryNotFoundError(
                    message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
                    details={"glossary_id": glossary_id}
                )

            return True

    except GlossaryNotFoundError:
        # Already the right type for callers: propagate untouched.
        raise
    except Exception as e:
        # logger.exception records the traceback of the underlying failure,
        # and "from e" keeps it reachable from the wrapped error.
        logger.exception("Error validating glossary access %s: %s", glossary_id, e)
        raise GlossaryNotFoundError(
            message="Erreur lors de la validation du glossaire.",
            details={"glossary_id": glossary_id, "error": str(e)}
        ) from e


def format_glossary_for_prompt(terms: List[Dict[str, str]]) -> str:
    """
    Format glossary terms for injection into an LLM system prompt.

    The output is a header line, one ``- 'source' → 'target'`` line per
    usable term, and a closing instruction. Terms are listed longest source
    first so that, e.g., "machine learning" appears before "machine".

    A term is usable when both its source and target are non-empty after
    stripping surrounding whitespace; missing or ``None`` values count as
    empty.

    Args:
        terms: List of dictionaries with 'source' and 'target' keys

    Returns:
        Formatted string for LLM prompt, or an empty string when ``terms``
        is empty or contains no usable term.
    """
    if not terms:
        return ""

    # Normalise first so that filtering and ordering both work on the text
    # that is actually emitted. "or ''" also covers keys present with None.
    pairs = []
    for term in terms:
        source = (term.get("source") or "").strip()
        target = (term.get("target") or "").strip()
        if source and target:
            pairs.append((source, target))

    # Without this guard a glossary of blank entries would still produce a
    # header and an instruction referring to a list with nothing in it.
    if not pairs:
        return ""

    # Stable sort: terms with equally long sources keep their input order.
    pairs.sort(key=lambda pair: len(pair[0]), reverse=True)

    lines = [
        "TERMINOLOGY GLOSSARY (use these exact translations):",
        ""
    ]

    for source, target in pairs:
        # Terms are wrapped in single quotes, so escape embedded ones.
        source_escaped = source.replace("'", "\\'")
        target_escaped = target.replace("'", "\\'")
        lines.append(f"- '{source_escaped}' → '{target_escaped}'")

    lines.extend([
        "",
        "IMPORTANT: Always use these translations when the terms appear in the text."
    ])

    return "\n".join(lines)


def build_full_prompt(
    custom_prompt: Optional[str],
    glossary_terms: Optional[List[Dict[str, str]]]
) -> str:
    """
    Assemble the final prompt from the user's custom prompt and the glossary.

    The custom prompt comes first, followed by the formatted glossary; the
    two are separated by a blank line. Either piece is left out when it is
    missing or empty.

    Args:
        custom_prompt: Optional custom system prompt from user
        glossary_terms: Optional list of glossary terms

    Returns:
        The combined prompt, or an empty string when neither piece
        contributes any text.
    """
    # Only format the glossary when there is at least one term to format.
    glossary_section = (
        format_glossary_for_prompt(glossary_terms) if glossary_terms else ""
    )

    # Keep the non-empty sections, preserving the custom-then-glossary order.
    sections = [
        section for section in (custom_prompt, glossary_section) if section
    ]

    return "\n\n".join(sections)