""" Word Document Translation Module Translates Word files while preserving all formatting, styles, tables, and images OPTIMIZED: Uses batch translation for 5-10x faster processing """ from pathlib import Path from docx import Document from docx.text.paragraph import Paragraph from docx.table import Table, _Cell from docx.oxml.text.paragraph import CT_P from docx.oxml.table import CT_Tbl from docx.section import Section from docx.shared import Inches, Pt from docx.oxml.ns import qn from services.translation_service import translation_service from typing import List, Tuple, Any import tempfile import os class WordTranslator: """Handles translation of Word documents with strict formatting preservation""" def __init__(self): self.translation_service = translation_service def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path: """ Translate a Word document while preserving all formatting and structure. Uses batch translation for improved performance. """ document = Document(input_path) # Collect all translatable text elements text_elements = [] # Collect from document body self._collect_from_body(document, text_elements) # Collect from headers and footers for section in document.sections: self._collect_from_section(section, text_elements) # Batch translate all texts at once if text_elements: texts = [elem[0] for elem in text_elements] print(f"Batch translating {len(texts)} text segments...") translated_texts = self.translation_service.translate_batch(texts, target_language) # Apply translations for (original_text, setter), translated in zip(text_elements, translated_texts): if translated is not None and translated != original_text: try: setter(translated) except Exception as e: print(f"Error applying translation: {e}") # Translate images if enabled (separate process) if getattr(self.translation_service, 'translate_images', False): self._translate_images(document, target_language, input_path) # Save the translated document document.save(output_path) return output_path def _collect_from_body(self, document: Document, text_elements: List[Tuple[str, callable]]): """Collect all text elements from document body""" for element in document.element.body: if isinstance(element, CT_P): paragraph = Paragraph(element, document) self._collect_from_paragraph(paragraph, text_elements) elif isinstance(element, CT_Tbl): table = Table(element, document) self._collect_from_table(table, text_elements) def _collect_from_paragraph(self, paragraph: Paragraph, text_elements: List[Tuple[str, callable]]): """Collect text from paragraph runs""" if not paragraph.text.strip(): return for run in paragraph.runs: if run.text and run.text.strip(): # Create a setter function for this run def make_setter(r): def setter(text): r.text = text return setter text_elements.append((run.text, make_setter(run))) def _collect_from_table(self, table: Table, text_elements: List[Tuple[str, callable]]): """Collect text from table cells""" for row in table.rows: for cell in row.cells: for paragraph in cell.paragraphs: self._collect_from_paragraph(paragraph, text_elements) # Handle nested tables for nested_table in cell.tables: self._collect_from_table(nested_table, text_elements) def _collect_from_section(self, section: Section, text_elements: List[Tuple[str, callable]]): """Collect text from headers and footers""" headers_footers = [ section.header, section.footer, section.first_page_header, section.first_page_footer, section.even_page_header, section.even_page_footer ] for hf in headers_footers: if hf: for paragraph in hf.paragraphs: self._collect_from_paragraph(paragraph, text_elements) for table in hf.tables: self._collect_from_table(table, text_elements) def _translate_images(self, document: Document, target_language: str, input_path: Path): """Extract text from images and add translations as captions""" from services.translation_service import OllamaTranslationProvider if not isinstance(self.translation_service.provider, OllamaTranslationProvider): return try: import zipfile import base64 with zipfile.ZipFile(input_path, 'r') as zip_ref: image_files = [f for f in zip_ref.namelist() if f.startswith('word/media/')] for idx, image_file in enumerate(image_files): try: image_data = zip_ref.read(image_file) ext = os.path.splitext(image_file)[1] with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: tmp.write(image_data) tmp_path = tmp.name translated_text = self.translation_service.provider.translate_image(tmp_path, target_language) os.unlink(tmp_path) if translated_text and translated_text.strip(): p = document.add_paragraph() p.add_run(f"[Image {idx + 1} translation: ").bold = True p.add_run(translated_text) p.add_run("]").bold = True print(f"Translated image {idx + 1}: {translated_text[:50]}...") except Exception as e: print(f"Error translating image {image_file}: {e}") except Exception as e: print(f"Error processing images: {e}") # Global translator instance word_translator = WordTranslator()