- GoogleTranslationProvider: Added batch translation with separator method - DeepLTranslationProvider: Added translator caching and batch support - LibreTranslationProvider: Added translator caching and batch support - WordTranslator: Collect all texts -> batch translate -> apply pattern - ExcelTranslator: Collect all texts -> batch translate -> apply pattern - PowerPointTranslator: Collect all texts -> batch translate -> apply pattern - Enhanced Ollama/OpenAI prompts with stricter translation-only rules - Added rule: return original text if uncertain about translation
157 lines
6.5 KiB
Python
157 lines
6.5 KiB
Python
"""
|
|
Word Document Translation Module
|
|
Translates Word files while preserving all formatting, styles, tables, and images
|
|
OPTIMIZED: Uses batch translation for 5-10x faster processing
|
|
"""
|
|
import os
import tempfile
from pathlib import Path
from typing import Any, Callable, List, Tuple

from docx import Document
from docx.oxml.ns import qn
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.section import Section
from docx.shared import Inches, Pt
from docx.table import Table, _Cell
from docx.text.paragraph import Paragraph

from services.translation_service import translation_service
|
|
|
|
|
|
class WordTranslator:
    """Translate Word (.docx) documents run by run, keeping their formatting.

    Text is gathered from the document body, tables (including nested tables)
    and section headers/footers, sent to the translation service as a single
    batch, and then written back into the runs it was read from.
    """

    def __init__(self):
        # Use the module-level translation service instance.
        self.translation_service = translation_service

    def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
        """Translate a Word document and save the result.

        Args:
            input_path: Path of the .docx file to read.
            output_path: Path the translated document is saved to.
            target_language: Language identifier handed to the translation
                service unchanged.

        Returns:
            ``output_path``, after the document has been saved.
        """
        document = Document(input_path)

        # Each entry pairs the original text with a callable that writes the
        # translated text back to the run it came from.
        text_elements: List[Tuple[str, Callable[[str], None]]] = []
        self._collect_from_body(document, text_elements)
        for section in document.sections:
            self._collect_from_section(section, text_elements)

        # One batch request for every collected segment.
        if text_elements:
            texts = [original for original, _ in text_elements]
            print(f"Batch translating {len(texts)} text segments...")
            translated_texts = self.translation_service.translate_batch(texts, target_language)
            self._apply_translations(text_elements, translated_texts)

        # Image translation is a separate, optional pass.
        if getattr(self.translation_service, 'translate_images', False):
            self._translate_images(document, target_language, input_path)

        document.save(output_path)
        return output_path

    def _apply_translations(
        self,
        text_elements: List[Tuple[str, Callable[[str], None]]],
        translated_texts,
    ) -> None:
        """Write each translation back through its setter, best effort.

        Segments whose translation is ``None`` or identical to the original
        are left untouched. A failing setter is reported and skipped so one
        bad run does not abort the whole document.
        """
        translated_texts = list(translated_texts)
        if len(translated_texts) != len(text_elements):
            # zip() below stops at the shorter sequence; make the otherwise
            # silent truncation visible instead of hiding it.
            print(
                f"Warning: expected {len(text_elements)} translations, "
                f"got {len(translated_texts)}; unmatched segments are left untranslated"
            )

        for (original_text, setter), translated in zip(text_elements, translated_texts):
            if translated is None or translated == original_text:
                continue
            try:
                setter(translated)
            except Exception as e:
                print(f"Error applying translation: {e}")

    @staticmethod
    def _make_run_setter(run) -> Callable[[str], None]:
        """Return a callable that assigns its argument to ``run.text``."""
        def setter(text: str) -> None:
            run.text = text
        return setter

    def _collect_from_body(
        self,
        document: Document,
        text_elements: List[Tuple[str, Callable[[str], None]]],
    ) -> None:
        """Collect text from the top-level paragraphs and tables of the body.

        Body children that are neither paragraphs nor tables are ignored.
        """
        for element in document.element.body:
            if isinstance(element, CT_P):
                self._collect_from_paragraph(Paragraph(element, document), text_elements)
            elif isinstance(element, CT_Tbl):
                self._collect_from_table(Table(element, document), text_elements)

    def _collect_from_paragraph(
        self,
        paragraph: Paragraph,
        text_elements: List[Tuple[str, Callable[[str], None]]],
    ) -> None:
        """Append one ``(text, setter)`` entry per non-blank run of ``paragraph``.

        Paragraphs whose text is empty or whitespace-only are skipped entirely.
        """
        if not paragraph.text.strip():
            return

        for run in paragraph.runs:
            run_text = run.text  # read once; reused for the check and the entry
            if run_text and run_text.strip():
                text_elements.append((run_text, self._make_run_setter(run)))

    def _collect_from_table(
        self,
        table: Table,
        text_elements: List[Tuple[str, Callable[[str], None]]],
    ) -> None:
        """Collect text from every cell of ``table``, recursing into nested tables.

        python-docx reports a merged cell once per grid position it covers, so
        cells are de-duplicated on their underlying element to avoid sending
        the same text to the translator several times.
        """
        # id -> element; holding the element keeps its identity stable for
        # the duration of the walk.
        seen_cells = {}
        for row in table.rows:
            for cell in row.cells:
                marker = getattr(cell, "_tc", cell)
                if id(marker) in seen_cells:
                    continue
                seen_cells[id(marker)] = marker

                for paragraph in cell.paragraphs:
                    self._collect_from_paragraph(paragraph, text_elements)
                for nested_table in cell.tables:
                    self._collect_from_table(nested_table, text_elements)

    def _collect_from_section(
        self,
        section: Section,
        text_elements: List[Tuple[str, Callable[[str], None]]],
    ) -> None:
        """Collect text from the headers and footers defined by ``section``.

        Covers the default, first-page and even-page header and footer.
        """
        headers_footers = (
            section.header, section.footer,
            section.first_page_header, section.first_page_footer,
            section.even_page_header, section.even_page_footer,
        )

        for hf in headers_footers:
            # A linked header/footer has no content of its own: python-docx
            # resolves it to an earlier section's definition (already collected
            # there) and may create an empty definition if none exists. Skip it
            # so text is not translated twice and the document is not altered.
            if not hf or hf.is_linked_to_previous:
                continue
            for paragraph in hf.paragraphs:
                self._collect_from_paragraph(paragraph, text_elements)
            for table in hf.tables:
                self._collect_from_table(table, text_elements)

    def _translate_image_data(self, image_data: bytes, suffix: str, target_language: str):
        """Run the provider's image translation on ``image_data`` via a temp file.

        The temporary file is removed even when writing or translating fails.
        """
        tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        tmp_path = tmp.name
        try:
            # Close the file before handing its path to the provider.
            with tmp:
                tmp.write(image_data)
            return self.translation_service.provider.translate_image(tmp_path, target_language)
        finally:
            os.unlink(tmp_path)

    def _translate_images(self, document: Document, target_language: str, input_path: Path) -> None:
        """Append a translation paragraph for each embedded image with text.

        Does nothing unless the active provider is an
        ``OllamaTranslationProvider``. Images are read from ``word/media/``
        inside the .docx archive; failures are reported and skipped.
        """
        import zipfile

        from services.translation_service import OllamaTranslationProvider

        if not isinstance(self.translation_service.provider, OllamaTranslationProvider):
            return

        try:
            with zipfile.ZipFile(input_path, 'r') as zip_ref:
                image_files = [f for f in zip_ref.namelist() if f.startswith('word/media/')]

                for idx, image_file in enumerate(image_files):
                    try:
                        image_data = zip_ref.read(image_file)
                        ext = os.path.splitext(image_file)[1]
                        translated_text = self._translate_image_data(image_data, ext, target_language)

                        if translated_text and translated_text.strip():
                            p = document.add_paragraph()
                            p.add_run(f"[Image {idx + 1} translation: ").bold = True
                            p.add_run(translated_text)
                            p.add_run("]").bold = True
                            print(f"Translated image {idx + 1}: {translated_text[:50]}...")
                    except Exception as e:
                        # Best effort: one unreadable image must not stop the rest.
                        print(f"Error translating image {image_file}: {e}")

        except Exception as e:
            print(f"Error processing images: {e}")
|
|
|
|
|
|
# Global translator instance
|
|
word_translator = WordTranslator()  # instantiated once, when this module is first imported
|