528 lines
19 KiB
Python
528 lines
19 KiB
Python
"""
Word Document Translation Module
Translates Word files while preserving all formatting, styles, tables, and images
OPTIMIZED: Uses batch translation for 5-10x faster processing

Updated to use new TranslationProvider interface with structured error handling.
"""
|
|
|
|
import time
|
|
import concurrent.futures
|
|
from pathlib import Path
|
|
from typing import Dict, List, Tuple, Optional, Callable, Any
|
|
|
|
from docx import Document
|
|
from docx.text.paragraph import Paragraph
|
|
from docx.table import Table, _Cell
|
|
from docx.oxml.text.paragraph import CT_P
|
|
from docx.oxml.table import CT_Tbl
|
|
from docx.oxml import OxmlElement
|
|
from docx.oxml.ns import qn
|
|
from docx.section import Section
|
|
|
|
from services.providers.base import TranslationProvider
|
|
|
|
# Language codes whose script is written right-to-left. translate_file()
# checks the lower-cased target language against this set to decide whether
# the RTL layout pass must run.
RTL_LANGUAGES: frozenset = frozenset(
    (
        "ar",
        "he",
        "fa",
        "ur",
        "ku",
        "ps",
        "ug",
        "sd",
        "yi",
        "dv",
        "ckb",
    )
)
|
|
|
|
|
|
# Prefer structlog when it is installed; otherwise fall back to the standard
# library logger. _HAS_STRUCTLOG records which backend `logger` is bound to
# so the logging helpers know whether keyword arguments can be passed through.
try:
    import structlog
except ImportError:
    import logging

    _HAS_STRUCTLOG = False
    logger = logging.getLogger(__name__)
else:
    _HAS_STRUCTLOG = True
    logger = structlog.get_logger(__name__)
|
|
|
|
|
|
def _log_info(event: str, **kwargs):
    """Emit an info-level event through whichever logging backend is active.

    With structlog the keyword arguments are forwarded as structured fields.
    With the standard library logger they are flattened into ``key=value``
    pairs appended to the event name.
    """
    if not _HAS_STRUCTLOG:
        rendered_fields = " ".join(f"{key}={value}" for key, value in kwargs.items())
        logger.info(f"{event} " + rendered_fields)
        return

    logger.info(event, **kwargs)
|
|
|
|
|
|
def _log_error(event: str, **kwargs):
    """Emit an error-level event through whichever logging backend is active.

    With structlog the keyword arguments are forwarded as structured fields.
    With the standard library logger they are flattened into ``key=value``
    pairs appended to the event name.
    """
    if not _HAS_STRUCTLOG:
        rendered_fields = " ".join(f"{key}={value}" for key, value in kwargs.items())
        logger.error(f"{event} " + rendered_fields)
        return

    logger.error(event, **kwargs)
|
|
|
|
|
|
def _set_paragraph_rtl(paragraph: Paragraph) -> None:
    """
    Switch a paragraph, and every run it contains, to right-to-left mode.

    The following elements are ensured to exist:
      - w:pPr/w:bidi : marks the paragraph direction as RTL
      - w:pPr/w:jc   : its w:val attribute is always overwritten with "right"
      - w:rPr/w:rtl  : run-level RTL marker, on each run of the paragraph

    Existing elements are reused; missing ones are appended as the last child
    of their parent.

    NOTE(review): appended elements land at the end of w:pPr / w:rPr; whether
    that position satisfies the OOXML child ordering is not checked here.
    """

    def _find_or_append(parent, tag: str):
        # Return the existing child with this tag, or append a fresh one.
        found = parent.find(qn(tag))
        if found is not None:
            return found
        created = OxmlElement(tag)
        parent.append(created)
        return created

    paragraph_props = paragraph._p.get_or_add_pPr()
    _find_or_append(paragraph_props, "w:bidi")
    _find_or_append(paragraph_props, "w:jc").set(qn("w:val"), "right")

    for run in paragraph.runs:
        _find_or_append(run._r.get_or_add_rPr(), "w:rtl")
|
|
|
|
|
|
def _apply_rtl_to_document(document: Document) -> None:
    """
    Apply RTL direction to every paragraph and section in the document.

    Covers body paragraphs, body tables (including tables nested inside
    cells), and for each section: the section-level w:bidi flag plus the
    primary, first-page and even-page headers and footers.

    Headers/footers that are linked to the previous section are skipped:
    they have no definition of their own, and reading their content would
    make python-docx resolve (or create) one.

    Args:
        document: The python-docx document to modify in place.
    """

    def _rtl_container(container) -> None:
        # Works for anything exposing .paragraphs and .tables (document body,
        # header/footer, table cell); recursion reaches nested tables.
        for para in container.paragraphs:
            _set_paragraph_rtl(para)
        for table in container.tables:
            for row in table.rows:
                # Merged cells may be yielded more than once; that is harmless
                # because _set_paragraph_rtl only adds what is missing.
                for cell in row.cells:
                    _rtl_container(cell)

    # Body paragraphs and tables
    _rtl_container(document)

    for section in document.sections:
        # Set the section (page) direction to RTL so Word renders margins,
        # columns and page numbering from right to left.
        # NOTE(review): w:bidi is appended as the last child of w:sectPr;
        # its position relative to existing children is not adjusted.
        sectPr = section._sectPr
        if sectPr.find(qn("w:bidi")) is None:
            sectPr.append(OxmlElement("w:bidi"))

        header_footers = (
            section.header,
            section.footer,
            section.first_page_header,
            section.first_page_footer,
            section.even_page_header,
            section.even_page_footer,
        )
        for hf in header_footers:
            if hf.is_linked_to_previous:
                continue
            _rtl_container(hf)
|
|
|
|
|
|
class WordProcessorError(Exception):
    """Word-processing failure carrying a machine-readable error code.

    Attributes set on each instance:
        code: One of the class-level code constants (or any caller string).
        message: Explicit message, or the default text for ``code``.
        details: Extra context for the failure; empty dict when none given.
    """

    # Error codes
    INVALID_FORMAT = "INVALID_FORMAT"
    DOCX_CORRUPTED = "DOCX_CORRUPTED"
    DOCX_READ_ERROR = "DOCX_READ_ERROR"
    DOCX_WRITE_ERROR = "DOCX_WRITE_ERROR"
    DOCX_TOO_LARGE = "DOCX_TOO_LARGE"

    # Default user-facing message for each code
    ERROR_MESSAGES = {
        INVALID_FORMAT: "Format de fichier non supporte. Utilisez .docx.",
        DOCX_CORRUPTED: "Le document Word est corrompu ou illisible.",
        DOCX_READ_ERROR: "Erreur lors de la lecture du document Word.",
        DOCX_WRITE_ERROR: "Erreur lors de la creation du document traduit.",
        DOCX_TOO_LARGE: "Le fichier est trop volumineux (max 50 Mo).",
    }

    def __init__(
        self,
        code: str,
        message: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None,
    ):
        """Build the error.

        Args:
            code: Error code identifying the failure category.
            message: Text overriding the default; a falsy value selects the
                default message for ``code`` ("Erreur inconnue" if unknown).
            details: Optional context; a falsy value becomes a new empty dict.
        """
        if not message:
            message = self.ERROR_MESSAGES.get(code, "Erreur inconnue")
        if not details:
            details = {}

        self.code = code
        self.message = message
        self.details = details
        super().__init__(message)

    def to_dict(self) -> Dict[str, Any]:
        """Return the error as a dict suitable for API responses.

        The "details" key is present only when details are non-empty.
        """
        payload = {"error": self.code, "message": self.message}
        if not self.details:
            return payload
        payload["details"] = self.details
        return payload
|
|
|
|
|
|
class WordTranslator:
    """
    Handles translation of Word documents with strict formatting preservation.

    Text is collected run by run from the document body, tables, headers and
    footers, translated in batches through a TranslationProvider (or the
    legacy ``translation_service`` when no provider is set), and written back
    into the same runs.
    """

    MAX_FILE_SIZE_MB = 50
    DOCX_MAGIC_BYTES = b"PK"  # .docx files are ZIP archives
    CHUNK_SIZE = 15  # number of texts sent per translate_batch() call
    MAX_WORKERS = 6  # upper bound on chunks translated concurrently

    def __init__(self, provider: Optional["TranslationProvider"] = None):
        """
        Initialize WordTranslator.

        Args:
            provider: TranslationProvider instance for translations.
                      If None, the legacy translation_service is used.
        """
        self._provider = provider
        # Stored by set_custom_prompt(); nothing in this class reads it.
        self._custom_prompt: Optional[str] = None

    def set_provider(self, provider: "TranslationProvider") -> None:
        """Set the translation provider."""
        self._provider = provider

    def set_custom_prompt(self, prompt: Optional[str]) -> None:
        """Set custom system prompt for LLM providers."""
        self._custom_prompt = prompt

    def translate_file(
        self,
        input_path: Path,
        output_path: Path,
        target_language: str,
        source_language: str = "auto",
        progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
    ) -> Path:
        """
        Translate a Word document while preserving all formatting and structure.
        Uses batch translation for improved performance.

        Args:
            input_path: Path to input Word file
            output_path: Path for translated output file
            target_language: Target language code (e.g., 'fr', 'en')
            source_language: Source language code (default: auto-detect)
            progress_callback: Optional callback for progress updates.
                Receives a dict with the keys: current, total, paragraph,
                total_paragraphs, runs_translated, phase. ``phase`` is one of
                "collecting", "translating" or "complete".

        Returns:
            Path to translated file

        Raises:
            WordProcessorError: If file is invalid, corrupted, or processing fails
        """
        start_time = time.time()

        input_path = Path(input_path)
        output_path = Path(output_path)

        self._validate_file(input_path)

        try:
            document = Document(input_path)
        except Exception as e:
            raise WordProcessorError(
                code=WordProcessorError.DOCX_CORRUPTED,
                details={"file_name": input_path.name, "error": str(e)},
            ) from e

        try:
            runs_translated = 0
            text_elements: List[Tuple[str, Callable[[str], None]]] = []

            # Phase 1: collect (text, setter) pairs from body, then from the
            # headers/footers of each section.
            self._collect_from_body(document, text_elements)

            sections = list(document.sections)
            total_sections = len(sections)
            for section_idx, section in enumerate(sections):
                self._collect_from_section(section, text_elements)

                if progress_callback:
                    progress_callback(
                        {
                            "current": section_idx + 1,
                            "total": total_sections,
                            "paragraph": section_idx + 1,
                            "total_paragraphs": total_sections,
                            "runs_translated": runs_translated,
                            "phase": "collecting",
                        }
                    )

            total_elements = len(text_elements)

            if text_elements:
                texts = [elem[0] for elem in text_elements]
                _log_info(
                    "word_batch_translation_start",
                    file_name=input_path.name,
                    text_count=len(texts),
                    target_lang=target_language,
                )

                # Phase 2: translate all texts, chunked and in parallel.
                translated_texts = self._translate_in_chunks(
                    texts, target_language, source_language, progress_callback
                )

                # Phase 3: write translations back (fast, just text assignment).
                runs_translated = self._apply_translations(
                    text_elements, translated_texts
                )

            # Apply RTL layout when the target language is written right-to-left.
            if target_language.lower() in RTL_LANGUAGES:
                _apply_rtl_to_document(document)

            if progress_callback:
                progress_callback(
                    {
                        "current": total_elements if text_elements else total_sections,
                        "total": total_elements if text_elements else total_sections,
                        "paragraph": total_sections,
                        "total_paragraphs": total_sections,
                        "runs_translated": runs_translated,
                        "phase": "complete",
                    }
                )

            try:
                document.save(output_path)
            except Exception as e:
                raise WordProcessorError(
                    code=WordProcessorError.DOCX_WRITE_ERROR,
                    details={"file_name": output_path.name, "error": str(e)},
                ) from e

            processing_time_ms = round((time.time() - start_time) * 1000, 2)

            _log_info(
                "word_translation_success",
                file_name=input_path.name,
                runs_translated=runs_translated,
                source_lang=source_language,
                target_lang=target_language,
                processing_time_ms=processing_time_ms,
            )

            return output_path

        except WordProcessorError:
            raise
        except Exception as e:
            # Any other failure (collection, translation, callback) is
            # reported under the read-error code, as before.
            raise WordProcessorError(
                code=WordProcessorError.DOCX_READ_ERROR,
                details={"file_name": input_path.name, "error": str(e)},
            ) from e

    def _translate_in_chunks(
        self,
        texts: List[str],
        target_language: str,
        source_language: str,
        progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
    ) -> List[str]:
        """
        Translate ``texts`` in chunks of CHUNK_SIZE using a thread pool.

        Chunks complete in arbitrary order; each result is written to the
        slots matching its chunk's start offset, so output order equals input
        order. Progress is reported from the calling thread as chunks finish.

        Args:
            texts: Texts to translate.
            target_language: Target language code.
            source_language: Source language code.
            progress_callback: Optional callback receiving "translating"
                phase updates.

        Returns:
            List with exactly ``len(texts)`` entries.

        Raises:
            Exception: Whatever a chunk's translation raised.
        """
        total_elements = len(texts)
        # Start from the originals so a slot that is never filled keeps its
        # source text instead of being blanked.
        translated_texts: List[str] = list(texts)
        if not texts:
            return translated_texts

        chunk_size = max(1, int(self.CHUNK_SIZE))
        max_workers = max(1, int(self.MAX_WORKERS))
        chunks = [
            (start, texts[start : start + chunk_size])
            for start in range(0, total_elements, chunk_size)
        ]

        # Only touched by this thread (the as_completed consumer loop).
        completed_items = 0

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
            future_map = {
                pool.submit(
                    self._batch_translate, chunk, target_language, source_language
                ): (start, chunk)
                for start, chunk in chunks
            }
            for future in concurrent.futures.as_completed(future_map):
                start, chunk = future_map[future]
                translated_chunk = future.result()
                # Never write past this chunk's own slots.
                for offset, value in enumerate(translated_chunk[: len(chunk)]):
                    translated_texts[start + offset] = value
                completed_items += len(chunk)

                if progress_callback:
                    done = min(completed_items, total_elements)
                    progress_callback(
                        {
                            "current": done,
                            "total": total_elements,
                            "paragraph": done,
                            "total_paragraphs": total_elements,
                            "runs_translated": 0,
                            "phase": "translating",
                        }
                    )

        return translated_texts

    @staticmethod
    def _apply_translations(
        text_elements: List[Tuple[str, Callable[[str], None]]],
        translated_texts: List[str],
    ) -> int:
        """
        Call each element's setter with its translation.

        A setter failure is logged and does not stop the remaining elements.

        Returns:
            Number of setters that completed without raising.
        """
        applied = 0
        for index, ((_original, setter), translated) in enumerate(
            zip(text_elements, translated_texts)
        ):
            if translated is None or setter is None:
                continue
            try:
                setter(translated)
            except Exception as e:
                _log_error(
                    "word_setter_error",
                    error=str(e),
                    index=index,
                )
            else:
                applied += 1
        return applied

    def _validate_file(self, file_path: Path) -> None:
        """
        Validate file format and size.

        Checks, in order: the path is an existing regular file, the extension
        is .docx, the file starts with the ZIP magic bytes, and the size does
        not exceed MAX_FILE_SIZE_MB.

        Raises:
            WordProcessorError: With DOCX_READ_ERROR, INVALID_FORMAT or
                DOCX_TOO_LARGE depending on the failed check.
        """
        # is_file() also rejects directories, which open() could not read.
        if not file_path.is_file():
            raise WordProcessorError(
                code=WordProcessorError.DOCX_READ_ERROR,
                message=f"Fichier introuvable: {file_path.name}",
                details={"file_name": file_path.name},
            )

        if file_path.suffix.lower() != ".docx":
            raise WordProcessorError(
                code=WordProcessorError.INVALID_FORMAT,
                details={
                    "file_name": file_path.name,
                    "extension": file_path.suffix,
                    "expected": ".docx",
                },
            )

        with open(file_path, "rb") as f:
            header = f.read(4)
        if header[:2] != self.DOCX_MAGIC_BYTES:
            raise WordProcessorError(
                code=WordProcessorError.INVALID_FORMAT,
                details={"file_name": file_path.name, "reason": "Invalid file header"},
            )

        file_size_mb = file_path.stat().st_size / (1024 * 1024)
        if file_size_mb > self.MAX_FILE_SIZE_MB:
            raise WordProcessorError(
                code=WordProcessorError.DOCX_TOO_LARGE,
                details={
                    "file_name": file_path.name,
                    "size_mb": round(file_size_mb, 2),
                    "max_mb": self.MAX_FILE_SIZE_MB,
                },
            )

    def _batch_translate(
        self, texts: List[str], target_language: str, source_language: str = "auto"
    ) -> List[str]:
        """
        Batch translate using the provider, or the legacy service without one.

        Args:
            texts: List of texts to translate
            target_language: Target language code
            source_language: Source language code

        Returns:
            List of translated texts, same order and same length as input.
            Entries the backend left empty or did not return hold the
            original text.
        """
        if not texts:
            return []

        if self._provider is not None:
            return self._translate_with_provider(
                texts, target_language, source_language
            )

        return self._translate_with_legacy(texts, target_language, source_language)

    @staticmethod
    def _align_with_originals(originals: List[str], translated: Any) -> List[str]:
        """
        Return one result per original, falling back to the original text.

        A result is replaced by its original when it is missing (backend
        returned too few items or None), is not a string, or is blank.
        Surplus results are dropped.
        """
        candidates = list(translated) if translated is not None else []
        aligned: List[str] = []
        for index, original in enumerate(originals):
            candidate = candidates[index] if index < len(candidates) else None
            if isinstance(candidate, str) and candidate.strip():
                aligned.append(candidate)
            else:
                aligned.append(original)
        return aligned

    def _translate_with_provider(
        self, texts: List[str], target_language: str, source_language: str
    ) -> List[str]:
        """Translate using the TranslationProvider.translate_batch() interface."""
        translated = self._provider.translate_batch(
            texts, target_language, source_language
        )
        # Fallback: keep original text for any empty/failed/missing result.
        return self._align_with_originals(texts, translated)

    def _translate_with_legacy(
        self, texts: List[str], target_language: str, source_language: str
    ) -> List[str]:
        """Fallback to legacy translation_service for backward compatibility."""
        # Imported lazily so the legacy service is only loaded when used.
        from services.translation_service import translation_service

        _log_info(
            "word_using_legacy_service",
            text_count=len(texts),
            target_lang=target_language,
        )

        translated = translation_service.translate_batch(
            texts, target_language, source_language
        )
        # Same safety net as the provider path.
        return self._align_with_originals(texts, translated)

    def _collect_from_body(
        self,
        document: "Document",
        text_elements: List[Tuple[str, Callable[[str], None]]],
    ) -> None:
        """Collect all text elements from document body, in document order."""
        for element in document.element.body:
            if isinstance(element, CT_P):
                self._collect_from_paragraph(Paragraph(element, document), text_elements)
            elif isinstance(element, CT_Tbl):
                self._collect_from_table(Table(element, document), text_elements)

    @staticmethod
    def _make_run_setter(run: Any, lead: str, trail: str) -> Callable[[str], None]:
        """Build a setter that writes a translation into ``run``.

        The setter strips whatever whitespace the translator added or removed
        and reapplies the run's original boundary whitespace.
        """

        def setter(text: str) -> None:
            run.text = lead + text.strip() + trail

        return setter

    def _collect_from_paragraph(
        self,
        paragraph: "Paragraph",
        text_elements: List[Tuple[str, Callable[[str], None]]],
    ) -> None:
        """Collect text from paragraph runs, preserving inter-run whitespace.

        Each run is sent for translation WITHOUT its surrounding whitespace.
        The whitespace is captured and reapplied after translation so that words
        at formatting boundaries (e.g. bold/normal) do not get concatenated.
        Paragraphs and runs containing only whitespace are skipped.
        """
        if not paragraph.text.strip():
            return

        for run in paragraph.runs:
            original = run.text
            if not (original and original.strip()):
                continue

            # Capture leading/trailing whitespace that must survive translation.
            leading = original[: len(original) - len(original.lstrip())]
            trailing = original[len(original.rstrip()) :]

            text_elements.append(
                (original.strip(), self._make_run_setter(run, leading, trailing))
            )

    def _collect_from_table(
        self,
        table: "Table",
        text_elements: List[Tuple[str, Callable[[str], None]]],
    ) -> None:
        """Collect text from table cells, recursing into nested tables."""
        # row.cells yields the same cell once per grid position it spans, so
        # merged cells are tracked by their underlying element and visited
        # only once.
        seen_cells = set()
        for row in table.rows:
            for cell in row.cells:
                cell_element = cell._tc
                if cell_element in seen_cells:
                    continue
                seen_cells.add(cell_element)

                for paragraph in cell.paragraphs:
                    self._collect_from_paragraph(paragraph, text_elements)
                for nested_table in cell.tables:
                    self._collect_from_table(nested_table, text_elements)

    def _collect_from_section(
        self,
        section: "Section",
        text_elements: List[Tuple[str, Callable[[str], None]]],
    ) -> None:
        """Collect text from the section's own headers and footers.

        Covers the primary, first-page and even-page variants. A header or
        footer linked to the previous section has no definition of its own;
        it is skipped so inherited content is not collected again and no
        definition is created just by reading it.
        """
        headers_footers = (
            section.header,
            section.footer,
            section.first_page_header,
            section.first_page_footer,
            section.even_page_header,
            section.even_page_footer,
        )

        for hf in headers_footers:
            if hf is None or hf.is_linked_to_previous:
                continue
            for paragraph in hf.paragraphs:
                self._collect_from_paragraph(paragraph, text_elements)
            for table in hf.tables:
                self._collect_from_table(table, text_elements)
|
|
|
|
|
|
# Shared module-level instance. It is created without a provider, so
# translations go through the legacy translation_service until
# set_provider() is called on it.
word_translator = WordTranslator()
|