# office_translator/services/glossary_service.py

"""
Glossary Service for Translation
Story 3.10: Glossaires - Application lors Traduction LLM

Provides functions to retrieve glossary terms and format them for LLM prompts.
"""

import logging
from typing import List, Dict, Any, Optional

from database.connection import get_sync_session
from database.models import Glossary, GlossaryTerm
from utils.exceptions import GlossaryNotFoundError

logger = logging.getLogger(__name__)


def get_glossary_terms(glossary_id: str, user_id: str) -> Dict[str, Any]:
    """
    Retrieve the terms and source language of a glossary owned by a user.

    The glossary is looked up by both its id and the owning user id, so a
    glossary that exists but belongs to another user is reported exactly like
    a missing one.

    Args:
        glossary_id: UUID of the glossary
        user_id: UUID of the user (must own the glossary)

    Returns:
        Dict with:
            'source_language': the glossary's source language, or "fr" when
                the stored value is empty/None
            'terms': list of dicts with 'source', 'target' and 'translations'
                ('translations' is an empty dict when the stored value is
                empty/None; 'source' and 'target' are passed through as stored)

    Raises:
        GlossaryNotFoundError: If the glossary doesn't exist or doesn't belong
            to the user. Any other error raised during retrieval is also
            re-raised as GlossaryNotFoundError, with the original exception
            attached as its cause.
    """
    try:
        with get_sync_session() as session:
            glossary = (
                session.query(Glossary)
                .filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
                .first()
            )

            if not glossary:
                raise GlossaryNotFoundError(
                    message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
                    details={"glossary_id": glossary_id}
                )

            terms = (
                session.query(GlossaryTerm)
                .filter(GlossaryTerm.glossary_id == glossary_id)
                .all()
            )

            # Convert ORM rows to plain dicts while the session is still open.
            result = [
                {
                    "source": term.source,
                    "target": term.target,
                    "translations": term.translations or {},
                }
                for term in terms
            ]

            logger.info(
                "Retrieved %d terms from glossary %s for user %s",
                len(result),
                glossary_id,
                user_id,
            )

            return {
                "source_language": glossary.source_language or "fr",
                "terms": result,
            }

    except GlossaryNotFoundError:
        # Already the domain error callers expect: propagate untouched.
        raise
    except Exception as e:
        # logger.exception records the traceback, and "from e" keeps the
        # original error reachable via __cause__ for debugging.
        logger.exception("Error retrieving glossary %s: %s", glossary_id, e)
        raise GlossaryNotFoundError(
            message="Erreur lors de la récupération du glossaire.",
            details={"glossary_id": glossary_id, "error": str(e)}
        ) from e


def validate_glossary_access(glossary_id: str, user_id: str) -> bool:
    """
    Validate that a glossary exists and belongs to the user.

    This check does not load or return the glossary terms, which makes it
    useful for early validation before starting a translation job.

    Args:
        glossary_id: UUID of the glossary
        user_id: UUID of the user (must own the glossary)

    Returns:
        True if the glossary exists and belongs to the user. The function
        never returns False: failure is always signalled by an exception.

    Raises:
        GlossaryNotFoundError: If the glossary doesn't exist or doesn't belong
            to the user. Any other error raised during the lookup is also
            re-raised as GlossaryNotFoundError, with the original exception
            attached as its cause.
    """
    try:
        with get_sync_session() as session:
            glossary = (
                session.query(Glossary)
                .filter(Glossary.id == glossary_id, Glossary.user_id == user_id)
                .first()
            )

            if not glossary:
                raise GlossaryNotFoundError(
                    message="Glossaire introuvable ou vous n'avez pas accès à cette ressource.",
                    details={"glossary_id": glossary_id}
                )

            return True

    except GlossaryNotFoundError:
        # Already the domain error callers expect: propagate untouched.
        raise
    except Exception as e:
        # logger.exception records the traceback, and "from e" keeps the
        # original error reachable via __cause__ for debugging.
        logger.exception("Error validating glossary access %s: %s", glossary_id, e)
        raise GlossaryNotFoundError(
            message="Erreur lors de la validation du glossaire.",
            details={"glossary_id": glossary_id, "error": str(e)}
        ) from e


def format_glossary_for_prompt(
    terms: List[Dict[str, Any]],
    source_lang: str = "fr",
    target_lang: str = "en",
) -> str:
    """
    Format glossary terms for injection into an LLM system prompt.

    When a term has a non-empty translation for target_lang in its
    'translations' dict, that specific translation is used. Otherwise the
    default 'target' field is used and the entry is annotated as an English
    reference that the LLM must adapt to target_lang. Terms with an empty
    source, or with neither a specific translation nor a default target, are
    skipped.

    Values that are missing, None, or not strings are treated as empty, so a
    term dict such as {"source": "x", "target": None} does not raise.

    Args:
        terms: List of dicts with 'source', 'target', and optional
            'translations' (a dict mapping language code to translation)
        source_lang: ISO code of the source language
        target_lang: ISO code of the target language

    Returns:
        Formatted string for the LLM prompt, or "" when no term produces an
        entry.
    """
    if not terms:
        return ""

    def _clean(value: Any) -> str:
        # A key can be present with a None value, in which case dict.get's
        # default does not apply; treat any non-string as "no value".
        return value.strip() if isinstance(value, str) else ""

    def _quote(text: str) -> str:
        # Entries are wrapped in single quotes, so embedded ones are escaped.
        return "'" + text.replace("'", "\\'") + "'"

    # Longest source first (stable for equal lengths) — presumably so longer,
    # more specific phrases are listed before shorter terms they contain.
    sorted_terms = sorted(
        terms,
        key=lambda t: len(t["source"]) if isinstance(t.get("source"), str) else 0,
        reverse=True,
    )

    entries = []
    has_fallback = False
    for term in sorted_terms:
        source = _clean(term.get("source"))
        if not source:
            continue

        translations = term.get("translations")
        if not isinstance(translations, dict):
            translations = {}
        specific = _clean(translations.get(target_lang))
        default_target = _clean(term.get("target"))

        if specific:
            entries.append(f"- {_quote(source)} → {_quote(specific)}")
        elif default_target:
            entries.append(
                f"- {_quote(source)} → {_quote(default_target)} "
                f"(EN reference, adapt to {target_lang})"
            )
            has_fallback = True
        # If neither specific nor default, skip the term

    if not entries:
        return ""

    lines = [
        f"TERMINOLOGY GLOSSARY (translate from {source_lang} to {target_lang}):",
        "",
        *entries,
        "",
        "IMPORTANT: Always use these translations when the terms appear in the text.",
    ]

    if has_fallback:
        lines.append(
            "NOTE: Some entries show an English reference — translate to the correct "
            f"{target_lang} equivalent while preserving the intended meaning."
        )

    return "\n".join(lines)


def build_full_prompt(
    custom_prompt: Optional[str],
    glossary_terms: Optional[List[Dict[str, str]]],
    source_lang: str = "fr",
    target_lang: str = "en",
) -> str:
    """
    Assemble the final prompt from the user's custom prompt and the glossary.

    The custom prompt (if any) comes first, followed by the formatted glossary
    section (if it is non-empty); the two are separated by a blank line.

    Args:
        custom_prompt: Optional custom system prompt from user
        glossary_terms: Optional list of glossary terms
        source_lang: ISO code of the source language
        target_lang: ISO code of the target language

    Returns:
        Combined prompt string, or "" when neither part contributes text
    """
    # Only format the glossary when there are terms to format.
    glossary_section = (
        format_glossary_for_prompt(glossary_terms, source_lang, target_lang)
        if glossary_terms
        else ""
    )

    # Drop empty/None sections so no stray separators are emitted.
    sections = [section for section in (custom_prompt, glossary_section) if section]
    return "\n\n".join(sections)