# office_translator/translators/word_translator.py

"""
Word Document Translation Module
Translates Word files while preserving all formatting, styles, tables, and images
OPTIMIZED: Uses batch translation for 5-10x faster processing

Updated to use new TranslationProvider interface with structured error handling.
"""

import time
import concurrent.futures
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Callable, Any

from docx import Document
from docx.text.paragraph import Paragraph
from docx.table import Table, _Cell
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.section import Section

from services.providers.base import TranslationProvider

# Lower-case codes of languages written right-to-left. The target language of
# a translation is checked against this set to decide whether RTL layout is
# applied to the output document.
RTL_LANGUAGES: frozenset = frozenset(
    {"ar", "he", "fa", "ur", "ku", "ps", "ug", "sd", "yi", "dv", "ckb"}
)


# Prefer structlog when it is installed; otherwise fall back to the standard
# library logger. _HAS_STRUCTLOG records which one is active so the logging
# helpers know whether keyword fields can be passed through as-is.
try:
    import structlog

    logger = structlog.get_logger(__name__)
    _HAS_STRUCTLOG = True
except ImportError:
    import logging

    logger = logging.getLogger(__name__)
    _HAS_STRUCTLOG = False


def _log_info(event: str, **kwargs):
    """Emit an info-level record through whichever logger is active.

    With structlog the keyword arguments are forwarded as structured fields.
    With the standard library logger they are flattened into space-separated
    ``key=value`` pairs appended to the event name.

    Args:
        event: Event name / message.
        **kwargs: Extra fields to attach to the record.
    """
    if not _HAS_STRUCTLOG:
        rendered_fields = " ".join(f"{key}={value}" for key, value in kwargs.items())
        logger.info(f"{event} {rendered_fields}")
        return
    logger.info(event, **kwargs)


def _log_error(event: str, **kwargs):
    """Emit an error-level record through whichever logger is active.

    With structlog the keyword arguments are forwarded as structured fields.
    With the standard library logger they are flattened into space-separated
    ``key=value`` pairs appended to the event name.

    Args:
        event: Event name / message.
        **kwargs: Extra fields to attach to the record.
    """
    if not _HAS_STRUCTLOG:
        rendered_fields = " ".join(f"{key}={value}" for key, value in kwargs.items())
        logger.error(f"{event} {rendered_fields}")
        return
    logger.error(event, **kwargs)


def _set_paragraph_rtl(paragraph: Paragraph) -> None:
    """
    Enable RTL mode on a paragraph and all its runs.

    Sets:
      - w:pPr/w:bidi  → paragraph text direction = RTL
      - w:pPr/w:jc    → alignment = right
      - w:rPr/w:rtl   → run-level RTL marker for each run

    The children of ``w:pPr`` and ``w:rPr`` form an ordered sequence in the
    WordprocessingML schema, so new elements are inserted in front of the
    siblings that must follow them instead of being appended at the end.
    An element that already exists but is explicitly switched off
    (``w:val="0"``/``"false"``) is switched on by dropping its ``w:val``
    attribute (presence without ``w:val`` means "on").

    Args:
        paragraph: The python-docx paragraph to modify in place.
    """
    # Elements that the schema requires to come AFTER the one being inserted.
    ppr_after_bidi = (
        "w:adjustRightInd", "w:snapToGrid", "w:spacing", "w:ind",
        "w:contextualSpacing", "w:mirrorIndents", "w:suppressOverlap", "w:jc",
        "w:textDirection", "w:textAlignment", "w:textboxTightWrap",
        "w:outlineLvl", "w:divId", "w:cnfStyle", "w:rPr", "w:sectPr",
        "w:pPrChange",
    )
    ppr_after_jc = (
        "w:textDirection", "w:textAlignment", "w:textboxTightWrap",
        "w:outlineLvl", "w:divId", "w:cnfStyle", "w:rPr", "w:sectPr",
        "w:pPrChange",
    )
    rpr_after_rtl = (
        "w:cs", "w:em", "w:lang", "w:eastAsianLayout", "w:specVanish",
        "w:oMath", "w:rPrChange",
    )

    pPr = paragraph._p.get_or_add_pPr()

    bidi = pPr.find(qn("w:bidi"))
    if bidi is None:
        pPr.insert_element_before(OxmlElement("w:bidi"), *ppr_after_bidi)
    else:
        # Turn an explicit "off" into "on".
        bidi.attrib.pop(qn("w:val"), None)

    jc = pPr.find(qn("w:jc"))
    if jc is None:
        jc = OxmlElement("w:jc")
        pPr.insert_element_before(jc, *ppr_after_jc)
    # NOTE(review): Word may interpret left/right relative to the bidi
    # direction for RTL paragraphs — confirm that "right" renders as intended.
    jc.set(qn("w:val"), "right")

    for run in paragraph.runs:
        rPr = run._r.get_or_add_rPr()
        rtl = rPr.find(qn("w:rtl"))
        if rtl is None:
            rPr.insert_element_before(OxmlElement("w:rtl"), *rpr_after_rtl)
        else:
            rtl.attrib.pop(qn("w:val"), None)


def _apply_rtl_to_document(document: Document) -> None:
    """Apply RTL direction to every paragraph and section in the document.

    Covers body paragraphs, tables (including tables nested inside cells) and
    every header/footer variant of each section (default, first page, even
    page). Headers/footers that are linked to the previous section are
    skipped: their content belongs to an earlier section and touching them
    would make python-docx create a new, empty definition.

    Args:
        document: The python-docx document to modify in place.
    """
    # Elements the schema requires to come after w:bidi inside w:sectPr.
    sectpr_after_bidi = (
        "w:rtlGutter", "w:docGrid", "w:printerSettings", "w:sectPrChange",
    )

    def mark_container(container) -> None:
        """Mark all paragraphs of *container*, descending into its tables."""
        for para in container.paragraphs:
            _set_paragraph_rtl(para)
        for table in container.tables:
            for row in table.rows:
                for cell in row.cells:
                    # A cell is itself a container (paragraphs + nested tables).
                    mark_container(cell)

    # Body paragraphs and tables
    mark_container(document)

    # Headers, footers, and section-level RTL (page layout direction)
    for section in document.sections:
        # Set the section (page) direction to RTL so Word renders margins,
        # columns and page numbering from right to left.
        sectPr = section._sectPr
        bidi = sectPr.find(qn("w:bidi"))
        if bidi is None:
            sectPr.insert_element_before(OxmlElement("w:bidi"), *sectpr_after_bidi)
        else:
            # Turn an explicit "off" (w:val="0") into "on".
            bidi.attrib.pop(qn("w:val"), None)

        headers_footers = (
            section.header,
            section.footer,
            section.first_page_header,
            section.first_page_footer,
            section.even_page_header,
            section.even_page_footer,
        )
        for hf in headers_footers:
            if hf.is_linked_to_previous:
                continue
            mark_container(hf)


class WordProcessorError(Exception):
    """Word-processing failure carrying a machine-readable error code.

    Attributes set on each instance:
        code: One of the class-level code constants (or any caller string).
        message: Human-readable text; defaults to the entry for ``code`` in
            ``ERROR_MESSAGES``, or a generic message for unknown codes.
        details: Optional extra context (empty dict when none is given).
    """

    INVALID_FORMAT = "INVALID_FORMAT"
    DOCX_CORRUPTED = "DOCX_CORRUPTED"
    DOCX_READ_ERROR = "DOCX_READ_ERROR"
    DOCX_WRITE_ERROR = "DOCX_WRITE_ERROR"
    DOCX_TOO_LARGE = "DOCX_TOO_LARGE"

    # Default user-facing message for each known code.
    ERROR_MESSAGES = {
        INVALID_FORMAT: "Format de fichier non supporte. Utilisez .docx.",
        DOCX_CORRUPTED: "Le document Word est corrompu ou illisible.",
        DOCX_READ_ERROR: "Erreur lors de la lecture du document Word.",
        DOCX_WRITE_ERROR: "Erreur lors de la creation du document traduit.",
        DOCX_TOO_LARGE: "Le fichier est trop volumineux (max 50 Mo).",
    }

    def __init__(
        self,
        code: str,
        message: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None,
    ):
        """Build the error, filling in a default message when none is given.

        Args:
            code: Error code identifying the failure.
            message: Explicit message; when empty or None the default for
                ``code`` is used.
            details: Extra context to expose alongside the error.
        """
        if not message:
            message = self.ERROR_MESSAGES.get(code, "Erreur inconnue")
        if not details:
            details = {}

        self.code = code
        self.message = message
        self.details = details
        super().__init__(message)

    def to_dict(self) -> Dict[str, Any]:
        """Return the error as a dict suitable for API responses.

        The ``details`` key is only present when details are non-empty.
        """
        payload: Dict[str, Any] = {"error": self.code, "message": self.message}
        if not self.details:
            return payload
        payload["details"] = self.details
        return payload


class WordTranslator:
    """
    Handles translation of Word documents with strict formatting preservation.

    Uses the new TranslationProvider interface for improved error handling
    and fallback chain support.
    """

    MAX_FILE_SIZE_MB = 50
    DOCX_MAGIC_BYTES = b"PK"  # .docx files are ZIP archives

    def __init__(self, provider: Optional[TranslationProvider] = None):
        """
        Initialize WordTranslator.

        Args:
            provider: TranslationProvider instance for translations.
                     If None, will use fallback to legacy translation_service.
        """
        self._provider = provider
        self._custom_prompt: Optional[str] = None

    def set_provider(self, provider: TranslationProvider) -> None:
        """Replace the translation provider used for subsequent translations.

        Args:
            provider: Object whose ``translate_batch`` method will be called
                for each batch of texts.
        """
        self._provider = provider

    def set_custom_prompt(self, prompt: Optional[str]) -> None:
        """Store a custom system prompt intended for LLM providers.

        Args:
            prompt: Prompt text, or None to clear it.
        """
        # NOTE(review): nothing in the visible part of this module reads
        # `_custom_prompt` or passes it to the provider — confirm it is
        # consumed elsewhere.
        self._custom_prompt = prompt

    def translate_file(
        self,
        input_path: Path,
        output_path: Path,
        target_language: str,
        source_language: str = "auto",
        progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
    ) -> Path:
        """
        Translate a Word document while preserving all formatting and structure.
        Uses batch translation for improved performance.

        Args:
            input_path: Path to input Word file
            output_path: Path for translated output file
            target_language: Target language code (e.g., 'fr', 'en')
            source_language: Source language code (default: auto-detect)
            progress_callback: Optional callback for progress updates
                             Receives dict with: element, total_elements, runs_translated

        Returns:
            Path to translated file

        Raises:
            WordProcessorError: If file is invalid, corrupted, or processing fails
        """
        start_time = time.time()

        input_path = Path(input_path)
        output_path = Path(output_path)

        self._validate_file(input_path)

        try:
            document = Document(input_path)
        except Exception as e:
            raise WordProcessorError(
                code=WordProcessorError.DOCX_CORRUPTED,
                details={"file_name": input_path.name, "error": str(e)},
            )

        try:
            runs_translated = 0

            text_elements: List[Tuple[str, Callable[[str], None]]] = []

            self._collect_from_body(document, text_elements)

            total_sections = len(document.sections)
            total_elements = 0
            for section_idx, section in enumerate(document.sections):
                self._collect_from_section(section, text_elements)
                total_elements = len(text_elements)

                if progress_callback:
                    progress_callback(
                        {
                            "current": section_idx + 1,
                            "total": total_sections,
                            "paragraph": section_idx + 1,
                            "total_paragraphs": total_sections,
                            "runs_translated": runs_translated,
                            "phase": "collecting",
                        }
                    )

            if text_elements:
                texts = [elem[0] for elem in text_elements]
                total_elements = len(text_elements)
                _log_info(
                    "word_batch_translation_start",
                    file_name=input_path.name,
                    text_count=len(texts),
                    target_lang=target_language,
                )

                # Split into chunks and translate them IN PARALLEL using a thread
                # pool.  Each worker handles one chunk independently, making
                # full use of available CPU/network concurrency.  Progress is
                # reported as chunks complete (out-of-order completions are
                # fine — the tracker only moves forward).
                CHUNK_SIZE = 15
                MAX_WORKERS = 6
                chunks = [
                    (i, texts[i : i + CHUNK_SIZE])
                    for i in range(0, total_elements, CHUNK_SIZE)
                ]
                translated_texts: List[str] = [""] * total_elements
                completed_items = [0]  # mutable counter shared across threads

                def _translate_chunk(
                    chunk_idx: int, chunk: List[str]
                ) -> Tuple[int, List[str]]:
                    result = self._batch_translate(chunk, target_language, source_language)
                    return chunk_idx, result

                with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
                    future_map = {
                        pool.submit(_translate_chunk, idx, chunk): (idx, chunk)
                        for idx, chunk in chunks
                    }
                    for future in concurrent.futures.as_completed(future_map):
                        chunk_idx, translated_chunk = future.result()
                        start = chunk_idx
                        for j, t in enumerate(translated_chunk):
                            translated_texts[start + j] = t
                        completed_items[0] += len(translated_chunk)
                        if progress_callback:
                            done = min(completed_items[0], total_elements)
                            progress_callback(
                                {
                                    "current": done,
                                    "total": total_elements,
                                    "paragraph": done,
                                    "total_paragraphs": total_elements,
                                    "runs_translated": runs_translated,
                                    "phase": "translating",
                                }
                            )

                # Apply translations (fast — just text assignment)
                for i, ((original_text, setter), translated) in enumerate(
                    zip(text_elements, translated_texts)
                ):
                    if translated is not None and setter is not None:
                        try:
                            setter(translated)
                            runs_translated += 1
                        except Exception as e:
                            _log_error(
                                "word_setter_error",
                                error=str(e),
                                index=i,
                            )

            # Apply RTL layout when the target language is written right-to-left.
            if target_language.lower() in RTL_LANGUAGES:
                _apply_rtl_to_document(document)

            if progress_callback:
                progress_callback(
                    {
                        "current": total_elements if text_elements else total_sections,
                        "total": total_elements if text_elements else total_sections,
                        "paragraph": total_sections,
                        "total_paragraphs": total_sections,
                        "runs_translated": runs_translated,
                        "phase": "complete",
                    }
                )

            try:
                document.save(output_path)
            except Exception as e:
                raise WordProcessorError(
                    code=WordProcessorError.DOCX_WRITE_ERROR,
                    details={"file_name": output_path.name, "error": str(e)},
                )

            processing_time_ms = round((time.time() - start_time) * 1000, 2)

            _log_info(
                "word_translation_success",
                file_name=input_path.name,
                runs_translated=runs_translated,
                source_lang=source_language,
                target_lang=target_language,
                processing_time_ms=processing_time_ms,
            )

            return output_path

        except WordProcessorError:
            raise
        except Exception as e:
            raise WordProcessorError(
                code=WordProcessorError.DOCX_READ_ERROR,
                details={"file_name": input_path.name, "error": str(e)},
            )

    def _validate_file(self, file_path: Path) -> None:
        """Validate file format and size."""
        if not file_path.exists():
            raise WordProcessorError(
                code=WordProcessorError.DOCX_READ_ERROR,
                message=f"Fichier introuvable: {file_path.name}",
                details={"file_name": file_path.name},
            )

        if file_path.suffix.lower() != ".docx":
            raise WordProcessorError(
                code=WordProcessorError.INVALID_FORMAT,
                details={
                    "file_name": file_path.name,
                    "extension": file_path.suffix,
                    "expected": ".docx",
                },
            )

        with open(file_path, "rb") as f:
            header = f.read(4)
        if header[:2] != self.DOCX_MAGIC_BYTES:
            raise WordProcessorError(
                code=WordProcessorError.INVALID_FORMAT,
                details={"file_name": file_path.name, "reason": "Invalid file header"},
            )

        file_size_mb = file_path.stat().st_size / (1024 * 1024)
        if file_size_mb > self.MAX_FILE_SIZE_MB:
            raise WordProcessorError(
                code=WordProcessorError.DOCX_TOO_LARGE,
                details={
                    "file_name": file_path.name,
                    "size_mb": round(file_size_mb, 2),
                    "max_mb": self.MAX_FILE_SIZE_MB,
                },
            )

    def _batch_translate(
        self, texts: List[str], target_language: str, source_language: str = "auto"
    ) -> List[str]:
        """
        Batch translate using new provider interface.

        Args:
            texts: List of texts to translate
            target_language: Target language code
            source_language: Source language code

        Returns:
            List of translated texts (same order as input)
        """
        if not texts:
            return []

        if self._provider is not None:
            return self._translate_with_provider(
                texts, target_language, source_language
            )

        return self._translate_with_legacy(texts, target_language, source_language)

    def _translate_with_provider(
        self, texts: List[str], target_language: str, source_language: str
    ) -> List[str]:
        """Translate using the TranslationProvider.translate_batch() interface."""
        translated = self._provider.translate_batch(texts, target_language, source_language)
        # Fallback: keep original text for any empty/failed result
        return [
            t if (t and t.strip()) else orig
            for t, orig in zip(translated, texts)
        ]

    def _translate_with_legacy(
        self, texts: List[str], target_language: str, source_language: str
    ) -> List[str]:
        """Fallback to legacy translation_service for backward compatibility."""
        from services.translation_service import translation_service

        _log_info(
            "word_using_legacy_service",
            text_count=len(texts),
            target_lang=target_language,
        )

        return translation_service.translate_batch(
            texts, target_language, source_language
        )

    def _collect_from_body(
        self, document: Document, text_elements: List[Tuple[str, Callable[[str], None]]]
    ) -> None:
        """Collect all text elements from document body."""
        for element in document.element.body:
            if isinstance(element, CT_P):
                paragraph = Paragraph(element, document)
                self._collect_from_paragraph(paragraph, text_elements)
            elif isinstance(element, CT_Tbl):
                table = Table(element, document)
                self._collect_from_table(table, text_elements)

    def _collect_from_paragraph(
        self,
        paragraph: Paragraph,
        text_elements: List[Tuple[str, Callable[[str], None]]],
    ) -> None:
        """Collect text from paragraph runs, preserving inter-run whitespace.

        Each run is sent for translation WITHOUT its surrounding whitespace.
        The whitespace is captured and reapplied after translation so that words
        at formatting boundaries (e.g. bold/normal) do not get concatenated.
        """
        if not paragraph.text.strip():
            return

        for run in paragraph.runs:
            if run.text and run.text.strip():
                original = run.text
                # Capture leading/trailing whitespace that must survive translation.
                leading = original[: len(original) - len(original.lstrip())]
                trailing = original[len(original.rstrip()) :]
                stripped = original.strip()

                def make_setter(r, lead: str, trail: str):
                    def setter(text: str) -> None:
                        # Strip any whitespace the translator may have added/removed
                        # and reapply the original boundary whitespace.
                        r.text = lead + text.strip() + trail

                    return setter

                text_elements.append((stripped, make_setter(run, leading, trailing)))

    def _collect_from_table(
        self, table: Table, text_elements: List[Tuple[str, Callable[[str], None]]]
    ) -> None:
        """Collect text from table cells."""
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    self._collect_from_paragraph(paragraph, text_elements)
                for nested_table in cell.tables:
                    self._collect_from_table(nested_table, text_elements)

    def _collect_from_section(
        self, section: Section, text_elements: List[Tuple[str, Callable[[str], None]]]
    ) -> None:
        """Collect text from headers and footers."""
        headers_footers = [
            section.header,
            section.footer,
            section.first_page_header,
            section.first_page_footer,
            section.even_page_header,
            section.even_page_footer,
        ]

        for hf in headers_footers:
            if hf:
                for paragraph in hf.paragraphs:
                    self._collect_from_paragraph(paragraph, text_elements)
                for table in hf.tables:
                    self._collect_from_table(table, text_elements)


# Module-level shared instance, created without a provider: until
# set_provider() is called on it, batches go through the legacy service.
word_translator = WordTranslator()