Files
office_translator/translators/word_translator.py
2026-03-07 11:42:58 +01:00

528 lines
19 KiB
Python

"""
Word Document Translation Module
Translates Word files while preserving all formatting, styles, tables, and images
OPTIMIZED: Uses batch translation for 5-10x faster processing
Updated to use new TranslationProvider interface with structured error handling.
"""
import time
import concurrent.futures
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Callable, Any
from docx import Document
from docx.text.paragraph import Paragraph
from docx.table import Table, _Cell
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.section import Section
from services.providers.base import TranslationProvider
# Target-language codes that are treated as right-to-left scripts.
RTL_LANGUAGES: frozenset = frozenset(
    (
        "ar",
        "he",
        "fa",
        "ur",
        "ku",
        "ps",
        "ug",
        "sd",
        "yi",
        "dv",
        "ckb",
    )
)
# Prefer structlog when it is installed; otherwise fall back to the standard
# logging module. _HAS_STRUCTLOG records which backend `logger` refers to so
# the logging helpers know whether keyword arguments can be passed through.
try:
    import structlog
except ImportError:
    import logging

    _HAS_STRUCTLOG = False
    logger = logging.getLogger(__name__)
else:
    _HAS_STRUCTLOG = True
    logger = structlog.get_logger(__name__)
def _log_info(event: str, **kwargs):
    """Emit an info-level record through whichever logging backend is active.

    With structlog the keyword arguments are forwarded as structured fields;
    with standard logging they are flattened into ``key=value`` pairs appended
    to the event name.
    """
    if not _HAS_STRUCTLOG:
        pairs = [f"{key}={value}" for key, value in kwargs.items()]
        logger.info(f"{event} " + " ".join(pairs))
        return
    logger.info(event, **kwargs)
def _log_error(event: str, **kwargs):
    """Emit an error-level record through whichever logging backend is active.

    With structlog the keyword arguments are forwarded as structured fields;
    with standard logging they are flattened into ``key=value`` pairs appended
    to the event name.
    """
    if not _HAS_STRUCTLOG:
        pairs = [f"{key}={value}" for key, value in kwargs.items()]
        logger.error(f"{event} " + " ".join(pairs))
        return
    logger.error(event, **kwargs)
def _set_paragraph_rtl(paragraph: Paragraph) -> None:
    """
    Enable RTL mode on a paragraph and all its runs.

    Sets:
    - w:pPr/w:bidi  -> paragraph text direction = RTL
    - w:pPr/w:jc    -> w:val = "right"
    - w:rPr/w:rtl   -> run-level RTL marker for each run

    Missing elements are inserted in front of their schema successors instead
    of being appended: the children of w:pPr form an ordered sequence, so an
    element added after e.g. an existing w:jc, w:rPr or w:sectPr would yield
    out-of-order markup. Elements that already exist are left where they are,
    which makes the function safe to call repeatedly on the same paragraph.

    Args:
        paragraph: python-docx paragraph to modify in place.
    """
    # Children of w:pPr that must follow w:bidi, in schema order.
    after_bidi = (
        "w:adjustRightInd",
        "w:snapToGrid",
        "w:spacing",
        "w:ind",
        "w:contextualSpacing",
        "w:mirrorIndents",
        "w:suppressOverlap",
        "w:jc",
        "w:textDirection",
        "w:textAlignment",
        "w:textboxTightWrap",
        "w:outlineLvl",
        "w:divId",
        "w:cnfStyle",
        "w:rPr",
        "w:sectPr",
        "w:pPrChange",
    )
    # Children of w:pPr that must follow w:jc (everything listed after it).
    after_jc = after_bidi[after_bidi.index("w:jc") + 1:]
    # Children of w:rPr that follow w:rtl in the canonical order.
    after_rtl = (
        "w:cs",
        "w:em",
        "w:lang",
        "w:eastAsianLayout",
        "w:specVanish",
        "w:oMath",
        "w:rPrChange",
    )

    pPr = paragraph._p.get_or_add_pPr()
    if pPr.find(qn("w:bidi")) is None:
        pPr.insert_element_before(OxmlElement("w:bidi"), *after_bidi)

    jc = pPr.find(qn("w:jc"))
    if jc is None:
        jc = OxmlElement("w:jc")
        pPr.insert_element_before(jc, *after_jc)
    # NOTE(review): some consumers interpret left/right relative to the
    # paragraph direction once w:bidi is set - confirm the rendered alignment.
    jc.set(qn("w:val"), "right")

    for run in paragraph.runs:
        rPr = run._r.get_or_add_rPr()
        if rPr.find(qn("w:rtl")) is None:
            rPr.insert_element_before(OxmlElement("w:rtl"), *after_rtl)
def _apply_rtl_to_document(document: Document) -> None:
    """Apply RTL direction to every paragraph and section in the document.

    Covers body paragraphs, body tables (including tables nested inside
    cells), the section-level w:bidi flag and every header/footer variant
    that carries its own definition.

    Args:
        document: python-docx document to modify in place.
    """

    def _rtl_table(table) -> None:
        # Recurse so paragraphs in nested tables get the same treatment as
        # top-level ones. Merged cells may be visited more than once, which is
        # harmless because _set_paragraph_rtl only adds missing elements.
        for row in table.rows:
            for cell in row.cells:
                for para in cell.paragraphs:
                    _set_paragraph_rtl(para)
                for nested in cell.tables:
                    _rtl_table(nested)

    # Body paragraphs
    for para in document.paragraphs:
        _set_paragraph_rtl(para)

    # Body tables
    for table in document.tables:
        _rtl_table(table)

    # Children of w:sectPr that must follow w:bidi, in schema order.
    after_sect_bidi = (
        "w:rtlGutter",
        "w:docGrid",
        "w:printerSettings",
        "w:sectPrChange",
    )

    # Headers, footers, and section-level RTL (page layout direction)
    for section in document.sections:
        # Set the section (page) direction to RTL so Word renders margins,
        # columns and page numbering from right to left. Insert before the
        # schema successors rather than appending after e.g. w:docGrid.
        sectPr = section._sectPr
        if sectPr.find(qn("w:bidi")) is None:
            sectPr.insert_element_before(OxmlElement("w:bidi"), *after_sect_bidi)

        header_footers = (
            section.header,
            section.footer,
            section.first_page_header,
            section.first_page_footer,
            section.even_page_header,
            section.even_page_footer,
        )
        for hf in header_footers:
            # A linked header/footer has no content of its own: reading its
            # paragraphs would resolve to an earlier section's definition (or
            # create an empty one), so only own definitions are processed.
            if hf.is_linked_to_previous:
                continue
            for para in hf.paragraphs:
                _set_paragraph_rtl(para)
            for table in hf.tables:
                _rtl_table(table)
class WordProcessorError(Exception):
    """Word-processing failure carrying a machine-readable error code.

    Attributes set on each instance:
        code: one of the class-level code constants (or any caller string).
        message: explicit message, or the default text registered for `code`.
        details: free-form context dictionary (empty when none was given).
    """

    INVALID_FORMAT = "INVALID_FORMAT"
    DOCX_CORRUPTED = "DOCX_CORRUPTED"
    DOCX_READ_ERROR = "DOCX_READ_ERROR"
    DOCX_WRITE_ERROR = "DOCX_WRITE_ERROR"
    DOCX_TOO_LARGE = "DOCX_TOO_LARGE"

    # Default user-facing message for each known code.
    ERROR_MESSAGES = {
        INVALID_FORMAT: "Format de fichier non supporte. Utilisez .docx.",
        DOCX_CORRUPTED: "Le document Word est corrompu ou illisible.",
        DOCX_READ_ERROR: "Erreur lors de la lecture du document Word.",
        DOCX_WRITE_ERROR: "Erreur lors de la creation du document traduit.",
        DOCX_TOO_LARGE: "Le fichier est trop volumineux (max 50 Mo).",
    }

    def __init__(
        self,
        code: str,
        message: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None,
    ):
        # An absent or empty message falls back to the default for the code,
        # and to a generic text when the code is unknown.
        if not message:
            message = self.ERROR_MESSAGES.get(code, "Erreur inconnue")
        super().__init__(message)
        self.code = code
        self.message = message
        self.details = details if details else {}

    def to_dict(self) -> Dict[str, Any]:
        """Return the error as a dict suitable for API responses.

        The "details" key is present only when details are non-empty.
        """
        payload: Dict[str, Any] = {"error": self.code, "message": self.message}
        if not self.details:
            return payload
        payload["details"] = self.details
        return payload
class WordTranslator:
    """
    Handles translation of Word documents with strict formatting preservation.

    Text is collected run by run from the body, tables, headers and footers,
    translated in batches through the configured TranslationProvider (or the
    legacy translation_service when no provider is set) and written back into
    the same runs, so run-level formatting is untouched.
    """

    MAX_FILE_SIZE_MB = 50
    DOCX_MAGIC_BYTES = b"PK"  # .docx files are ZIP archives
    CHUNK_SIZE = 15  # texts sent per translate_batch() call
    MAX_WORKERS = 6  # translate_batch() calls running concurrently

    def __init__(self, provider: Optional[TranslationProvider] = None):
        """
        Initialize WordTranslator.

        Args:
            provider: TranslationProvider instance for translations.
                If None, will use fallback to legacy translation_service.
        """
        self._provider = provider
        self._custom_prompt: Optional[str] = None

    def set_provider(self, provider: TranslationProvider) -> None:
        """Set the translation provider."""
        self._provider = provider

    def set_custom_prompt(self, prompt: Optional[str]) -> None:
        """Set custom system prompt for LLM providers."""
        self._custom_prompt = prompt

    def translate_file(
        self,
        input_path: Path,
        output_path: Path,
        target_language: str,
        source_language: str = "auto",
        progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
    ) -> Path:
        """
        Translate a Word document while preserving all formatting and structure.

        Uses batch translation for improved performance.

        Args:
            input_path: Path to input Word file
            output_path: Path for translated output file
            target_language: Target language code (e.g., 'fr', 'en')
            source_language: Source language code (default: auto-detect)
            progress_callback: Optional callback for progress updates.
                Receives a dict with: current, total, paragraph,
                total_paragraphs, runs_translated, phase
                ("collecting", "translating" or "complete").

        Returns:
            Path to translated file

        Raises:
            WordProcessorError: If file is invalid, corrupted, or processing fails
        """
        start_time = time.time()
        input_path = Path(input_path)
        output_path = Path(output_path)
        self._validate_file(input_path)

        try:
            document = Document(input_path)
        except Exception as e:
            raise WordProcessorError(
                code=WordProcessorError.DOCX_CORRUPTED,
                details={"file_name": input_path.name, "error": str(e)},
            ) from e

        try:
            runs_translated = 0
            text_elements: List[Tuple[str, Callable[[str], None]]] = []
            self._collect_from_body(document, text_elements)

            total_sections = len(document.sections)
            for section_idx, section in enumerate(document.sections):
                self._collect_from_section(section, text_elements)
                if progress_callback:
                    progress_callback(
                        {
                            "current": section_idx + 1,
                            "total": total_sections,
                            "paragraph": section_idx + 1,
                            "total_paragraphs": total_sections,
                            "runs_translated": runs_translated,
                            "phase": "collecting",
                        }
                    )
            total_elements = len(text_elements)

            if text_elements:
                texts = [text for text, _ in text_elements]
                _log_info(
                    "word_batch_translation_start",
                    file_name=input_path.name,
                    text_count=len(texts),
                    target_lang=target_language,
                )
                translated_texts = self._translate_in_chunks(
                    texts, target_language, source_language, progress_callback
                )

                # Apply translations (fast - just text assignment). A failing
                # setter is logged and skipped so one bad run does not abort
                # the whole document.
                for i, ((_, setter), translated) in enumerate(
                    zip(text_elements, translated_texts)
                ):
                    if translated is None or setter is None:
                        continue
                    try:
                        setter(translated)
                        runs_translated += 1
                    except Exception as e:
                        _log_error(
                            "word_setter_error",
                            error=str(e),
                            index=i,
                        )

            # Apply RTL layout when the target language is written right-to-left.
            if target_language.lower() in RTL_LANGUAGES:
                _apply_rtl_to_document(document)

            if progress_callback:
                progress_callback(
                    {
                        "current": total_elements if text_elements else total_sections,
                        "total": total_elements if text_elements else total_sections,
                        "paragraph": total_sections,
                        "total_paragraphs": total_sections,
                        "runs_translated": runs_translated,
                        "phase": "complete",
                    }
                )

            try:
                document.save(output_path)
            except Exception as e:
                raise WordProcessorError(
                    code=WordProcessorError.DOCX_WRITE_ERROR,
                    details={"file_name": output_path.name, "error": str(e)},
                ) from e

            processing_time_ms = round((time.time() - start_time) * 1000, 2)
            _log_info(
                "word_translation_success",
                file_name=input_path.name,
                runs_translated=runs_translated,
                source_lang=source_language,
                target_lang=target_language,
                processing_time_ms=processing_time_ms,
            )
            return output_path
        except WordProcessorError:
            raise
        except Exception as e:
            # Any other failure (including a translation error raised by a
            # worker) is reported under the read-error code, chained to the
            # original exception so the root cause stays visible.
            raise WordProcessorError(
                code=WordProcessorError.DOCX_READ_ERROR,
                details={"file_name": input_path.name, "error": str(e)},
            ) from e

    def _translate_in_chunks(
        self,
        texts: List[str],
        target_language: str,
        source_language: str,
        progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
    ) -> List[str]:
        """
        Translate `texts` in CHUNK_SIZE batches on a thread pool.

        Args:
            texts: Texts to translate.
            target_language: Target language code.
            source_language: Source language code.
            progress_callback: Optional callback invoked (from the calling
                thread) after each chunk completes, with phase "translating".

        Returns:
            List with the same length and order as `texts`.

        Raises:
            Exception: whatever a chunk's translation raised.
        """
        total = len(texts)
        # Start from the originals so a slot that never receives a result
        # keeps its source text instead of being blanked.
        results: List[str] = list(texts)
        done = 0
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.MAX_WORKERS
        ) as pool:
            pending = {}
            for start in range(0, total, self.CHUNK_SIZE):
                chunk = texts[start : start + self.CHUNK_SIZE]
                future = pool.submit(
                    self._batch_translate, chunk, target_language, source_language
                )
                pending[future] = (start, len(chunk))

            # Chunks may finish out of order; each one is written back at its
            # own offset, and the progress counter only ever grows.
            for future in concurrent.futures.as_completed(pending):
                start, size = pending[future]
                translated_chunk = future.result()
                # Never write past the chunk's own slots, even if more items
                # than requested came back.
                for offset, text in enumerate(translated_chunk[:size]):
                    results[start + offset] = text
                done += size
                if progress_callback:
                    progress_callback(
                        {
                            "current": done,
                            "total": total,
                            "paragraph": done,
                            "total_paragraphs": total,
                            "runs_translated": 0,
                            "phase": "translating",
                        }
                    )
        return results

    def _validate_file(self, file_path: Path) -> None:
        """
        Validate file format and size.

        Raises:
            WordProcessorError: DOCX_READ_ERROR when the path is not an
                existing file, INVALID_FORMAT when the extension is not .docx
                or the content does not start with the ZIP signature,
                DOCX_TOO_LARGE when the file exceeds MAX_FILE_SIZE_MB.
        """
        # is_file() also rejects directories, which would otherwise fail in
        # open() below with an error that is not a WordProcessorError.
        if not file_path.is_file():
            raise WordProcessorError(
                code=WordProcessorError.DOCX_READ_ERROR,
                message=f"Fichier introuvable: {file_path.name}",
                details={"file_name": file_path.name},
            )
        if file_path.suffix.lower() != ".docx":
            raise WordProcessorError(
                code=WordProcessorError.INVALID_FORMAT,
                details={
                    "file_name": file_path.name,
                    "extension": file_path.suffix,
                    "expected": ".docx",
                },
            )
        with open(file_path, "rb") as f:
            header = f.read(len(self.DOCX_MAGIC_BYTES))
        if header != self.DOCX_MAGIC_BYTES:
            raise WordProcessorError(
                code=WordProcessorError.INVALID_FORMAT,
                details={"file_name": file_path.name, "reason": "Invalid file header"},
            )
        file_size_mb = file_path.stat().st_size / (1024 * 1024)
        if file_size_mb > self.MAX_FILE_SIZE_MB:
            raise WordProcessorError(
                code=WordProcessorError.DOCX_TOO_LARGE,
                details={
                    "file_name": file_path.name,
                    "size_mb": round(file_size_mb, 2),
                    "max_mb": self.MAX_FILE_SIZE_MB,
                },
            )

    def _batch_translate(
        self, texts: List[str], target_language: str, source_language: str = "auto"
    ) -> List[str]:
        """
        Batch translate using the provider, or the legacy service without one.

        Args:
            texts: List of texts to translate
            target_language: Target language code
            source_language: Source language code

        Returns:
            List of translated texts, same order and same length as the
            input; entries with no usable translation keep the original text.
        """
        if not texts:
            return []
        if self._provider is not None:
            return self._translate_with_provider(
                texts, target_language, source_language
            )
        return self._with_fallback(
            texts,
            self._translate_with_legacy(texts, target_language, source_language),
        )

    @staticmethod
    def _with_fallback(originals: List[str], translated: Any) -> List[str]:
        """
        Align `translated` with `originals`, one output per original.

        A missing, non-string, empty or whitespace-only translation is
        replaced by the original text; surplus translations are dropped.
        """
        candidates = list(translated) if translated is not None else []
        aligned: List[str] = []
        for index, original in enumerate(originals):
            candidate = candidates[index] if index < len(candidates) else None
            if isinstance(candidate, str) and candidate.strip():
                aligned.append(candidate)
            else:
                aligned.append(original)
        return aligned

    def _translate_with_provider(
        self, texts: List[str], target_language: str, source_language: str
    ) -> List[str]:
        """Translate using the TranslationProvider.translate_batch() interface."""
        translated = self._provider.translate_batch(
            texts, target_language, source_language
        )
        # Fallback: keep original text for any empty/failed/missing result
        return self._with_fallback(texts, translated)

    def _translate_with_legacy(
        self, texts: List[str], target_language: str, source_language: str
    ) -> List[str]:
        """Fallback to legacy translation_service for backward compatibility."""
        # Imported lazily so the legacy module is only loaded when no
        # provider has been configured.
        from services.translation_service import translation_service

        _log_info(
            "word_using_legacy_service",
            text_count=len(texts),
            target_lang=target_language,
        )
        return translation_service.translate_batch(
            texts, target_language, source_language
        )

    def _collect_from_body(
        self, document: Document, text_elements: List[Tuple[str, Callable[[str], None]]]
    ) -> None:
        """Collect all text elements from document body, in document order."""
        for element in document.element.body:
            if isinstance(element, CT_P):
                self._collect_from_paragraph(
                    Paragraph(element, document), text_elements
                )
            elif isinstance(element, CT_Tbl):
                self._collect_from_table(Table(element, document), text_elements)

    @staticmethod
    def _make_run_setter(run: Any, lead: str, trail: str) -> Callable[[str], None]:
        """Return a callback that writes a translation into `run`.

        The callback strips any whitespace the translator added or removed
        and reapplies the run's original boundary whitespace.
        """

        def setter(text: str) -> None:
            run.text = lead + text.strip() + trail

        return setter

    def _collect_from_paragraph(
        self,
        paragraph: Paragraph,
        text_elements: List[Tuple[str, Callable[[str], None]]],
    ) -> None:
        """Collect text from paragraph runs, preserving inter-run whitespace.

        Each run is sent for translation WITHOUT its surrounding whitespace.
        The whitespace is captured and reapplied after translation so that words
        at formatting boundaries (e.g. bold/normal) do not get concatenated.
        """
        if not paragraph.text.strip():
            return
        for run in paragraph.runs:
            original = run.text
            if not original or not original.strip():
                continue
            # Capture leading/trailing whitespace that must survive translation.
            leading = original[: len(original) - len(original.lstrip())]
            trailing = original[len(original.rstrip()) :]
            text_elements.append(
                (original.strip(), self._make_run_setter(run, leading, trailing))
            )

    def _collect_from_table(
        self, table: Table, text_elements: List[Tuple[str, Callable[[str], None]]]
    ) -> None:
        """Collect text from table cells, visiting each physical cell once."""
        # A merged cell is reported once per grid position it spans; tracking
        # the underlying w:tc element avoids collecting (and translating) the
        # same runs several times.
        seen_cells = set()
        for row in table.rows:
            for cell in row.cells:
                tc = cell._tc
                if tc in seen_cells:
                    continue
                seen_cells.add(tc)
                for paragraph in cell.paragraphs:
                    self._collect_from_paragraph(paragraph, text_elements)
                for nested_table in cell.tables:
                    self._collect_from_table(nested_table, text_elements)

    def _collect_from_section(
        self, section: Section, text_elements: List[Tuple[str, Callable[[str], None]]]
    ) -> None:
        """Collect text from the headers and footers defined by this section."""
        headers_footers = [
            section.header,
            section.footer,
            section.first_page_header,
            section.first_page_footer,
            section.even_page_header,
            section.even_page_footer,
        ]
        for hf in headers_footers:
            # A header/footer linked to the previous section has no content of
            # its own: reading it would return the earlier section's content
            # again (duplicating the translation work) or create an empty
            # definition, so only own definitions are collected.
            if hf.is_linked_to_previous:
                continue
            for paragraph in hf.paragraphs:
                self._collect_from_paragraph(paragraph, text_elements)
            for table in hf.tables:
                self._collect_from_table(table, text_elements)
# Module-level default instance. It is created without a provider, so
# translations go through the legacy translation_service until
# set_provider() is called on it.
word_translator = WordTranslator()