# office_translator/translators/word_translator.py
#
# 231 lines
# 9.0 KiB
# Python

"""
Word Document Translation Module
Translates Word files while preserving all formatting, styles, tables, and images
"""
from pathlib import Path
from docx import Document
from docx.text.paragraph import Paragraph
from docx.table import Table, _Cell
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.section import Section
from docx.shared import Inches, Pt
from docx.oxml.ns import qn
from services.translation_service import translation_service
import tempfile
import os
class WordTranslator:
    """Handles translation of Word documents with strict formatting preservation.

    Text is translated in place, run by run, so the character formatting
    attached to each run is left untouched. The body, tables (including
    nested tables) and the headers/footers of every section are visited.
    """

    # Section attributes that expose header/footer content.
    _HEADER_FOOTER_ATTRS = (
        "header",
        "footer",
        "first_page_header",
        "first_page_footer",
        "even_page_header",
        "even_page_footer",
    )

    def __init__(self):
        self.translation_service = translation_service

    def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
        """
        Translate a Word document while preserving all formatting and structure

        Args:
            input_path: Path to input Word file
            output_path: Path to save translated Word file
            target_language: Target language code

        Returns:
            Path to the translated file
        """
        document = Document(input_path)

        # Translate main document body
        self._translate_document_body(document, target_language)

        # Translate headers and footers in all sections
        for section in document.sections:
            self._translate_section(section, target_language)

        # Translate images if enabled. This runs after the body pass so the
        # paragraphs appended for images are not translated a second time.
        if getattr(self.translation_service, 'translate_images', False):
            self._translate_images(document, target_language, input_path)

        # Save the translated document
        document.save(output_path)
        return output_path

    def _translate_images(self, document: "Document", target_language: str, input_path: Path):
        """
        Extract text from images and add translations as captions

        Every entry under ``word/media/`` in the .docx archive is handed to the
        provider's ``translate_image``; non-empty results are appended to the end
        of the document as "[Image N translation: ...]" paragraphs. Failures are
        printed and skipped (best effort), never raised.

        Args:
            document: Document that receives the caption paragraphs
            target_language: Target language code
            input_path: Path of the original .docx, read as a zip archive
        """
        from services.translation_service import OllamaTranslationProvider

        # Only works with Ollama vision
        provider = self.translation_service.provider
        if not isinstance(provider, OllamaTranslationProvider):
            return

        try:
            import zipfile

            # Extract images from docx (it's a zip file)
            with zipfile.ZipFile(input_path, 'r') as zip_ref:
                image_files = [
                    name
                    for name in zip_ref.namelist()
                    # Skip directory entries; only real files can be images.
                    if name.startswith('word/media/') and not name.endswith('/')
                ]
                for idx, image_file in enumerate(image_files):
                    try:
                        translated_text = self._translate_image_entry(
                            zip_ref, image_file, target_language, provider
                        )
                        if translated_text and translated_text.strip():
                            # Add translated text as a new paragraph at the
                            # end of the document, with a note
                            p = document.add_paragraph()
                            p.add_run(f"[Image {idx + 1} translation: ").bold = True
                            p.add_run(translated_text)
                            p.add_run("]").bold = True
                            print(f"Translated image {idx + 1}: {translated_text[:50]}...")
                    except Exception as e:
                        print(f"Error translating image {image_file}: {e}")
                        continue
        except Exception as e:
            print(f"Error processing images: {e}")

    def _translate_image_entry(self, zip_ref, image_file: str, target_language: str, provider):
        """Write one archive entry to a temp file and return the provider's translation of it."""
        image_data = zip_ref.read(image_file)
        ext = os.path.splitext(image_file)[1]
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp.write(image_data)
            tmp_path = tmp.name
        try:
            return provider.translate_image(tmp_path, target_language)
        finally:
            # Always remove the temp file, even when the provider raises.
            os.unlink(tmp_path)

    def _translate_document_body(self, document: "Document", target_language: str):
        """
        Translate all elements in the document body

        Only direct children of the body that are paragraphs or tables are
        handled; any other element type is left as is.

        Args:
            document: Document to translate
            target_language: Target language code
        """
        for element in document.element.body:
            if isinstance(element, CT_P):
                # It's a paragraph
                self._translate_paragraph(Paragraph(element, document), target_language)
            elif isinstance(element, CT_Tbl):
                # It's a table
                self._translate_table(Table(element, document), target_language)

    def _translate_preserving_whitespace(self, text: str, target_language: str) -> str:
        """Translate the stripped core of *text* and re-attach its original edge whitespace."""
        core = text.strip()
        if not core:
            return text
        leading = text[:len(text) - len(text.lstrip())]
        trailing = text[len(text.rstrip()):]
        translated = self.translation_service.translate_text(core, target_language)
        return leading + translated + trailing

    def _translate_paragraph(self, paragraph: "Paragraph", target_language: str):
        """
        Translate a paragraph while preserving all formatting

        Each non-blank run is translated on its own so its formatting stays
        attached to it. Leading/trailing whitespace of a run is kept verbatim so
        adjacent runs do not get glued together when the service trims its output.

        Args:
            paragraph: Paragraph to translate
            target_language: Target language code
        """
        if not paragraph.text.strip():
            return

        runs = paragraph.runs
        if runs:
            # Translate run by run to keep per-run formatting
            for run in runs:
                original = run.text
                if original.strip():
                    run.text = self._translate_preserving_whitespace(original, target_language)
        else:
            # Paragraph has text but no runs of its own: replace the text as a whole.
            # NOTE(review): assigning paragraph.text presumably discards whatever
            # non-run content carried that text - confirm against python-docx.
            paragraph.text = self._translate_preserving_whitespace(
                paragraph.text, target_language
            )

    def _translate_table(self, table: "Table", target_language: str):
        """
        Translate all cells in a table while preserving structure

        Per the python-docx documentation, a merged cell is reported once for
        every grid position it spans; each underlying cell is translated only
        once so its text is not fed through the translator repeatedly.

        Args:
            table: Table to translate
            target_language: Target language code
        """
        # id -> element; keeping the element referenced keeps its id stable.
        seen = {}
        for row in table.rows:
            for cell in row.cells:
                marker = getattr(cell, "_tc", cell)
                if id(marker) in seen:
                    continue
                seen[id(marker)] = marker
                self._translate_cell(cell, target_language)

    def _translate_container(self, container, target_language: str):
        """Translate the paragraphs, then the tables, directly held by *container*."""
        for paragraph in container.paragraphs:
            self._translate_paragraph(paragraph, target_language)
        for table in container.tables:
            self._translate_table(table, target_language)

    def _translate_cell(self, cell: "_Cell", target_language: str):
        """
        Translate content within a table cell

        Args:
            cell: Cell to translate (its nested tables are translated as well)
            target_language: Target language code
        """
        self._translate_container(cell, target_language)

    def _translate_section(self, section: "Section", target_language: str):
        """
        Translate headers and footers in a section

        Covers the primary, first-page and even-page header and footer. A
        header/footer that is linked to the previous section is skipped: per
        the python-docx documentation it has no definition of its own, so its
        content is the earlier section's (already translated there), and reading
        it when no definition exists at all would create an empty one.

        Args:
            section: Section to translate
            target_language: Target language code
        """
        for attr_name in self._HEADER_FOOTER_ATTRS:
            part = getattr(section, attr_name)
            if not part or part.is_linked_to_previous:
                continue
            self._translate_container(part, target_language)
# Module-level WordTranslator instance, created once when this module is imported
word_translator = WordTranslator()