office_translator/translators/word_translator.py
Sepehr 8f9ca669cf Performance optimization: batch translation for 5-10x speed improvement
- GoogleTranslationProvider: Added batch translation with separator method
- DeepLTranslationProvider: Added translator caching and batch support
- LibreTranslationProvider: Added translator caching and batch support
- WordTranslator: Collect all texts -> batch translate -> apply pattern
- ExcelTranslator: Collect all texts -> batch translate -> apply pattern
- PowerPointTranslator: Collect all texts -> batch translate -> apply pattern
- Enhanced Ollama/OpenAI prompts with stricter translation-only rules
- Added rule: return original text if uncertain about translation
2025-11-30 20:41:20 +01:00

157 lines
6.5 KiB
Python

"""
Word Document Translation Module
Translates Word files while preserving all formatting, styles, tables, and images
OPTIMIZED: Uses batch translation for 5-10x faster processing
"""
import os
import tempfile
from pathlib import Path
from typing import Any, Callable, List, Tuple

from docx import Document
from docx.oxml.ns import qn
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.section import Section
from docx.shared import Inches, Pt
from docx.table import Table, _Cell
from docx.text.paragraph import Paragraph

from services.translation_service import translation_service
class WordTranslator:
    """Translate Word documents run by run, in one batch call.

    Every non-blank run in the body, tables, headers and footers is collected
    together with a setter that writes new text back into that same run.
    All collected texts are sent to the translation service in a single
    ``translate_batch`` call and the results are applied through the setters.
    """

    def __init__(self):
        # Module-level translation service used for every translation call.
        self.translation_service = translation_service

    def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
        """Translate a Word document and save the result.

        Args:
            input_path: Path of the source document.
            output_path: Path the translated document is saved to.
            target_language: Target language, passed through unchanged to the
                translation service.

        Returns:
            ``output_path``.
        """
        document = Document(input_path)

        # (original text, setter) pairs for every translatable run.
        text_elements: List[Tuple[str, Callable[[str], None]]] = []
        self._collect_from_body(document, text_elements)
        for section in document.sections:
            self._collect_from_section(section, text_elements)

        if text_elements:
            texts = [text for text, _ in text_elements]
            print(f"Batch translating {len(texts)} text segments...")
            translated_texts = list(
                self.translation_service.translate_batch(texts, target_language)
            )

            # zip() below stops at the shorter sequence, so a short result
            # would otherwise leave trailing segments untranslated silently.
            if len(translated_texts) != len(texts):
                print(
                    f"Warning: expected {len(texts)} translations but received "
                    f"{len(translated_texts)}; unmatched segments are left untranslated"
                )

            for (original_text, setter), translated in zip(text_elements, translated_texts):
                # Only non-blank runs are collected, so an empty or
                # whitespace-only result would erase content; keep the
                # original text in that case.
                if not isinstance(translated, str) or not translated.strip():
                    continue
                if translated == original_text:
                    continue
                try:
                    setter(translated)
                except Exception as e:
                    # Best effort: one failing run must not abort the document.
                    print(f"Error applying translation: {e}")

        # Image translation is optional and runs after the text pass.
        if getattr(self.translation_service, 'translate_images', False):
            self._translate_images(document, target_language, input_path)

        document.save(output_path)
        return output_path

    @staticmethod
    def _make_run_setter(run) -> Callable[[str], None]:
        """Return a callable that writes its argument into ``run.text``."""
        def setter(text):
            run.text = text
        return setter

    def _collect_from_body(self, document: Document, text_elements: List[Tuple[str, Callable[[str], None]]]):
        """Collect runs from the top-level paragraphs and tables of the body.

        Body children are visited in document order; children that are
        neither paragraphs nor tables are ignored.
        """
        for element in document.element.body:
            if isinstance(element, CT_P):
                self._collect_from_paragraph(Paragraph(element, document), text_elements)
            elif isinstance(element, CT_Tbl):
                self._collect_from_table(Table(element, document), text_elements)

    def _collect_from_paragraph(self, paragraph: Paragraph, text_elements: List[Tuple[str, Callable[[str], None]]]):
        """Append ``(text, setter)`` for every non-blank run of ``paragraph``.

        Paragraphs whose text is empty or whitespace-only are skipped, as
        are individual runs that are empty or whitespace-only.
        """
        if not paragraph.text.strip():
            return
        for run in paragraph.runs:
            if run.text and run.text.strip():
                text_elements.append((run.text, self._make_run_setter(run)))

    def _collect_from_table(self, table: Table, text_elements: List[Tuple[str, Callable[[str], None]]]):
        """Collect runs from every cell of ``table``, including nested tables.

        python-docx reports a merged cell once per grid position it covers,
        so cells are de-duplicated by their underlying ``_tc`` element to
        avoid collecting (and translating) the same runs more than once.
        """
        seen_cells = set()
        for row in table.rows:
            for cell in row.cells:
                cell_element = cell._tc
                if cell_element in seen_cells:
                    continue
                seen_cells.add(cell_element)

                for paragraph in cell.paragraphs:
                    self._collect_from_paragraph(paragraph, text_elements)
                # Handle nested tables
                for nested_table in cell.tables:
                    self._collect_from_table(nested_table, text_elements)

    def _collect_from_section(self, section: Section, text_elements: List[Tuple[str, Callable[[str], None]]]):
        """Collect runs from the headers and footers of ``section``.

        The default, first-page and even-page variants are all examined.
        A header or footer that is linked to the previous section has no
        content of its own, so it is skipped: its text is collected from the
        section that defines it, and reading it through python-docx could
        add an empty definition to the document.
        """
        headers_footers = [
            section.header, section.footer,
            section.first_page_header, section.first_page_footer,
            section.even_page_header, section.even_page_footer,
        ]
        for hf in headers_footers:
            if not hf or hf.is_linked_to_previous:
                continue
            for paragraph in hf.paragraphs:
                self._collect_from_paragraph(paragraph, text_elements)
            for table in hf.tables:
                self._collect_from_table(table, text_elements)

    def _translate_images(self, document: Document, target_language: str, input_path: Path):
        """Append a translation paragraph for each image stored in the file.

        Does nothing unless the active provider is an
        ``OllamaTranslationProvider``. Each entry under ``word/media/`` in
        the source archive is written to a temporary file and handed to the
        provider's ``translate_image``; a non-blank result is appended to the
        end of ``document`` as a bracketed paragraph. Errors are printed and
        never propagate to the caller.
        """
        from services.translation_service import OllamaTranslationProvider

        provider = self.translation_service.provider
        if not isinstance(provider, OllamaTranslationProvider):
            return

        try:
            import zipfile

            with zipfile.ZipFile(input_path, 'r') as zip_ref:
                image_files = [f for f in zip_ref.namelist() if f.startswith('word/media/')]
                for idx, image_file in enumerate(image_files):
                    try:
                        image_data = zip_ref.read(image_file)
                        ext = os.path.splitext(image_file)[1]
                        # delete=False so the file can be closed before the
                        # provider opens it by path.
                        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
                            tmp.write(image_data)
                            tmp_path = tmp.name
                        try:
                            translated_text = provider.translate_image(tmp_path, target_language)
                        finally:
                            # Remove the temp file even when the provider fails.
                            os.unlink(tmp_path)

                        if translated_text and translated_text.strip():
                            p = document.add_paragraph()
                            p.add_run(f"[Image {idx + 1} translation: ").bold = True
                            p.add_run(translated_text)
                            p.add_run("]").bold = True
                            print(f"Translated image {idx + 1}: {translated_text[:50]}...")
                    except Exception as e:
                        # Best effort: keep going with the remaining images.
                        print(f"Error translating image {image_file}: {e}")
        except Exception as e:
            print(f"Error processing images: {e}")
# Module-level WordTranslator instance, created once when this module is imported.
word_translator = WordTranslator()