# office_translator/translators/word_translator.py

"""
Word Document Translation Module
Translates Word files while preserving all formatting, styles, tables, and images
OPTIMIZED: Uses batch translation for 5-10x faster processing
"""
import os
import tempfile
from pathlib import Path
from typing import Any, Callable, List, Tuple

from docx import Document
from docx.oxml.ns import qn
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.section import Section
from docx.shared import Inches, Pt
from docx.table import Table, _Cell
from docx.text.paragraph import Paragraph

from services.translation_service import translation_service


class WordTranslator:
    """Translate Word documents while leaving formatting and structure in place.

    Text is gathered run by run from the document body, tables (including
    nested tables) and each section's headers/footers, sent to the translation
    service as one batch, and written back into the run it came from so that
    run-level formatting is not touched.
    """

    def __init__(self):
        self.translation_service = translation_service

    def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
        """
        Translate a Word document and save the result to ``output_path``.

        Args:
            input_path: Location of the source ``.docx`` file.
            output_path: Location the translated document is written to.
            target_language: Language identifier handed to the translation service.

        Returns:
            ``output_path``, once the document has been saved.
        """
        document = Document(input_path)

        # Each entry pairs the original text with a callable that writes the
        # translated text back into the run it was read from.
        text_elements: List[Tuple[str, Callable[[str], None]]] = []

        self._collect_from_body(document, text_elements)
        for section in document.sections:
            self._collect_from_section(section, text_elements)

        # One batch call for the whole document instead of one call per run.
        if text_elements:
            texts = [original for original, _ in text_elements]
            print(f"Batch translating {len(texts)} text segments...")
            translated_texts = list(
                self.translation_service.translate_batch(texts, target_language)
            )
            self._apply_translations(text_elements, translated_texts)

        # Image translation is optional and runs as a separate pass.
        if getattr(self.translation_service, 'translate_images', False):
            self._translate_images(document, target_language, input_path)

        document.save(output_path)

        return output_path

    @staticmethod
    def _apply_translations(
        text_elements: List[Tuple[str, Callable[[str], None]]],
        translated_texts: List[Any],
    ) -> None:
        """Write each translation back through its setter, skipping no-op results.

        A count mismatch between the two lists is reported rather than silently
        ignored; pairing then stops at the shorter list, as ``zip`` does.
        """
        if len(translated_texts) != len(text_elements):
            print(
                f"Warning: expected {len(text_elements)} translations but received "
                f"{len(translated_texts)}; unmatched segments are left untranslated"
            )

        for (original_text, setter), translated in zip(text_elements, translated_texts):
            if translated is None or translated == original_text:
                continue
            try:
                setter(translated)
            except Exception as e:
                # Best effort: one run that cannot be updated must not abort
                # the rest of the document.
                print(f"Error applying translation: {e}")

    @staticmethod
    def _make_run_setter(run: Any) -> Callable[[str], None]:
        """Return a callable that assigns its argument to ``run.text``."""
        def setter(text: str) -> None:
            run.text = text
        return setter

    def _collect_from_body(
        self,
        document: Document,
        text_elements: List[Tuple[str, Callable[[str], None]]],
    ):
        """Collect text from the top-level paragraphs and tables of the body, in order."""
        for element in document.element.body:
            if isinstance(element, CT_P):
                self._collect_from_paragraph(Paragraph(element, document), text_elements)
            elif isinstance(element, CT_Tbl):
                self._collect_from_table(Table(element, document), text_elements)

    def _collect_from_paragraph(
        self,
        paragraph: Paragraph,
        text_elements: List[Tuple[str, Callable[[str], None]]],
    ):
        """Append ``(text, setter)`` for every non-blank run of ``paragraph``.

        Paragraphs whose text is empty or whitespace-only are skipped entirely,
        as are individual runs that contain only whitespace.
        """
        if not paragraph.text.strip():
            return

        for run in paragraph.runs:
            run_text = run.text
            if run_text and run_text.strip():
                text_elements.append((run_text, self._make_run_setter(run)))

    def _collect_from_table(
        self,
        table: Table,
        text_elements: List[Tuple[str, Callable[[str], None]]],
    ):
        """Collect text from every distinct cell of ``table``, recursing into nested tables.

        python-docx reports a merged cell once for each grid position it
        spans, so cells are de-duplicated by their underlying XML element;
        otherwise the same runs would be sent for translation several times.
        """
        # Maps id(element) -> element. Holding the element keeps it alive, so
        # its id cannot be reused by a different object while we iterate.
        seen_cells = {}

        for row in table.rows:
            for cell in row.cells:
                cell_key = getattr(cell, "_tc", cell)
                if id(cell_key) in seen_cells:
                    continue
                seen_cells[id(cell_key)] = cell_key

                for paragraph in cell.paragraphs:
                    self._collect_from_paragraph(paragraph, text_elements)
                # Handle nested tables
                for nested_table in cell.tables:
                    self._collect_from_table(nested_table, text_elements)

    def _collect_from_section(
        self,
        section: Section,
        text_elements: List[Tuple[str, Callable[[str], None]]],
    ):
        """Collect text from the headers and footers that ``section`` itself defines.

        A header/footer that is linked to the previous section has no content
        of its own. Per the python-docx documentation, reading its content
        resolves to the earlier section's definition (already collected there)
        or, for the first section, adds a brand-new empty definition to the
        document. Linked headers/footers are therefore skipped.
        """
        headers_footers = (
            section.header, section.footer,
            section.first_page_header, section.first_page_footer,
            section.even_page_header, section.even_page_footer,
        )

        for hf in headers_footers:
            if hf is None or getattr(hf, "is_linked_to_previous", False):
                continue
            for paragraph in hf.paragraphs:
                self._collect_from_paragraph(paragraph, text_elements)
            for table in hf.tables:
                self._collect_from_table(table, text_elements)

    def _translate_images(self, document: Document, target_language: str, input_path: Path):
        """Translate text found in embedded images and add it as caption paragraphs.

        Does nothing unless the active provider is an ``OllamaTranslationProvider``.
        Every file under ``word/media/`` in the source package is written to a
        temporary file and passed to the provider's ``translate_image``; each
        non-blank result is added with ``document.add_paragraph()``. Failures
        are printed and skipped so that a bad image never aborts the document.
        """
        from services.translation_service import OllamaTranslationProvider
        import zipfile

        provider = self.translation_service.provider
        if not isinstance(provider, OllamaTranslationProvider):
            return

        try:
            with zipfile.ZipFile(input_path, 'r') as zip_ref:
                # Directory entries end with "/" and carry no image data.
                image_files = [
                    name for name in zip_ref.namelist()
                    if name.startswith('word/media/') and not name.endswith('/')
                ]

                for number, image_file in enumerate(image_files, start=1):
                    try:
                        image_data = zip_ref.read(image_file)
                        ext = os.path.splitext(image_file)[1]

                        # delete=False because the provider is given a path to
                        # reopen; the finally block guarantees the file is
                        # removed even when writing or translating fails.
                        tmp = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
                        tmp_path = tmp.name
                        try:
                            with tmp:
                                tmp.write(image_data)
                            translated_text = provider.translate_image(tmp_path, target_language)
                        finally:
                            os.unlink(tmp_path)

                        if translated_text and translated_text.strip():
                            p = document.add_paragraph()
                            p.add_run(f"[Image {number} translation: ").bold = True
                            p.add_run(translated_text)
                            p.add_run("]").bold = True
                            print(f"Translated image {number}: {translated_text[:50]}...")
                    except Exception as e:
                        print(f"Error translating image {image_file}: {e}")

        except Exception as e:
            print(f"Error processing images: {e}")


# Global translator instance, created once when this module is imported
word_translator = WordTranslator()