Performance optimization: batch translation for 5-10x speed improvement

- GoogleTranslationProvider: Added batch translation with separator method
- DeepLTranslationProvider: Added translator caching and batch support
- LibreTranslationProvider: Added translator caching and batch support
- WordTranslator: Now follows the collect-all-texts -> batch-translate -> apply-results pattern
- ExcelTranslator: Now follows the collect-all-texts -> batch-translate -> apply-results pattern
- PowerPointTranslator: Now follows the collect-all-texts -> batch-translate -> apply-results pattern
- Enhanced Ollama/OpenAI prompts with stricter translation-only rules
- Added rule: return original text if uncertain about translation
This commit is contained in:
Sepehr 2025-11-30 20:41:20 +01:00
parent 54d85f0b34
commit 8f9ca669cf
5 changed files with 430 additions and 423 deletions

View File

@ -319,6 +319,9 @@ async def translate_document(
if validation_result.warnings: if validation_result.warnings:
logger.warning(f"[{request_id}] File validation warnings: {validation_result.warnings}") logger.warning(f"[{request_id}] File validation warnings: {validation_result.warnings}")
# Reset file position after validation read
await file.seek(0)
# Check rate limit for translations # Check rate limit for translations
client_ip = request.client.host if request.client else "unknown" client_ip = request.client.host if request.client else "unknown"
if not await rate_limit_manager.check_translation_limit(client_ip): if not await rate_limit_manager.check_translation_limit(client_ip):

View File

@ -3,10 +3,12 @@ Translation Service Abstraction
Provides a unified interface for different translation providers Provides a unified interface for different translation providers
""" """
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Optional, List from typing import Optional, List, Dict
import requests import requests
from deep_translator import GoogleTranslator, DeeplTranslator, LibreTranslator from deep_translator import GoogleTranslator, DeeplTranslator, LibreTranslator
from config import config from config import config
import concurrent.futures
import threading
class TranslationProvider(ABC): class TranslationProvider(ABC):
@ -17,59 +19,222 @@ class TranslationProvider(ABC):
"""Translate text from source to target language""" """Translate text from source to target language"""
pass pass
def translate_batch(self, texts: List[str], target_language: str, source_language: str = 'auto') -> List[str]:
    """Translate each text in order by delegating to ``translate``.

    This is the generic fallback for providers that do not override it:
    one ``translate`` call is made per input item and the outputs are
    returned in the same order as the inputs.

    Args:
        texts: Texts to translate.
        target_language: Language code to translate into.
        source_language: Language code of the input, ``'auto'`` by default.

    Returns:
        A list with one translated entry per input text.
    """
    translated: List[str] = []
    for item in texts:
        translated.append(self.translate(item, target_language, source_language))
    return translated
class GoogleTranslationProvider(TranslationProvider): class GoogleTranslationProvider(TranslationProvider):
"""Google Translate implementation""" """Google Translate implementation with batch support"""
def __init__(self):
    """Create the thread-local holder used for cached translator instances."""
    # threading.local() gives every thread its own attribute namespace;
    # _get_translator lazily stores a `translators` dict on it.
    self._local = threading.local()
def _get_translator(self, source_language: str, target_language: str) -> GoogleTranslator:
    """Return the calling thread's GoogleTranslator for a language pair.

    Translators are cached per thread on ``self._local`` under the key
    ``"<source>_<target>"`` and are constructed only on first request.
    """
    per_thread = getattr(self._local, 'translators', None)
    if per_thread is None:
        # First call on this thread: set up its private cache.
        per_thread = {}
        self._local.translators = per_thread

    cache_key = f"{source_language}_{target_language}"
    try:
        return per_thread[cache_key]
    except KeyError:
        instance = GoogleTranslator(source=source_language, target=target_language)
        per_thread[cache_key] = instance
        return instance
def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str: def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
if not text or not text.strip(): if not text or not text.strip():
return text return text
try:
translator = self._get_translator(source_language, target_language)
return translator.translate(text)
except Exception as e:
print(f"Translation error: {e}")
return text
def translate_batch(self, texts: List[str], target_language: str, source_language: str = 'auto', batch_size: int = 50) -> List[str]:
"""
Translate multiple texts using batch processing for speed.
Uses deep_translator's batch capability when possible.
"""
if not texts:
return []
# Filter and track empty texts
results = [''] * len(texts)
non_empty_indices = []
non_empty_texts = []
for i, text in enumerate(texts):
if text and text.strip():
non_empty_indices.append(i)
non_empty_texts.append(text)
else:
results[i] = text if text else ''
if not non_empty_texts:
return results
try: try:
translator = GoogleTranslator(source=source_language, target=target_language) translator = GoogleTranslator(source=source_language, target=target_language)
return translator.translate(text)
# Process in batches
translated_texts = []
for i in range(0, len(non_empty_texts), batch_size):
batch = non_empty_texts[i:i + batch_size]
try:
# Use translate_batch if available
if hasattr(translator, 'translate_batch'):
batch_result = translator.translate_batch(batch)
else:
# Fallback: join with separator, translate, split
separator = "\n|||SPLIT|||\n"
combined = separator.join(batch)
translated_combined = translator.translate(combined)
if translated_combined:
batch_result = translated_combined.split("|||SPLIT|||")
# Clean up results
batch_result = [t.strip() for t in batch_result]
# If split didn't work correctly, fall back to individual
if len(batch_result) != len(batch):
batch_result = [translator.translate(t) for t in batch]
else:
batch_result = batch
translated_texts.extend(batch_result)
except Exception as e: except Exception as e:
print(f"Translation error: {e}") print(f"Batch translation error, falling back to individual: {e}")
return text for text in batch:
try:
translated_texts.append(translator.translate(text))
except:
translated_texts.append(text)
# Map back to original positions
for idx, translated in zip(non_empty_indices, translated_texts):
results[idx] = translated if translated else texts[idx]
return results
except Exception as e:
print(f"Batch translation failed: {e}")
# Fallback to individual translations
for idx, text in zip(non_empty_indices, non_empty_texts):
try:
results[idx] = GoogleTranslator(source=source_language, target=target_language).translate(text) or text
except:
results[idx] = text
return results
class DeepLTranslationProvider(TranslationProvider): class DeepLTranslationProvider(TranslationProvider):
"""DeepL Translate implementation""" """DeepL Translate implementation with batch support"""
def __init__(self, api_key: str): def __init__(self, api_key: str):
self.api_key = api_key self.api_key = api_key
self._translator_cache = {}
def _get_translator(self, source_language: str, target_language: str) -> DeeplTranslator:
    """Return the cached DeeplTranslator for a language pair, building it on first use.

    Instances are stored in ``self._translator_cache`` under the key
    ``"<source>_<target>"``.
    """
    cache_key = f"{source_language}_{target_language}"
    try:
        return self._translator_cache[cache_key]
    except KeyError:
        instance = DeeplTranslator(api_key=self.api_key, source=source_language, target=target_language)
        self._translator_cache[cache_key] = instance
        return instance
def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str: def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
if not text or not text.strip(): if not text or not text.strip():
return text return text
try: try:
translator = DeeplTranslator(api_key=self.api_key, source=source_language, target=target_language) translator = self._get_translator(source_language, target_language)
return translator.translate(text) return translator.translate(text)
except Exception as e: except Exception as e:
print(f"Translation error: {e}") print(f"Translation error: {e}")
return text return text
def translate_batch(self, texts: List[str], target_language: str, source_language: str = 'auto') -> List[str]:
    """Translate a list of texts with DeepL, preserving input order and length.

    Empty, whitespace-only and ``None`` entries are never sent to the
    backend: whitespace/empty strings are returned unchanged and ``None``
    becomes ``''``. Any entry the backend does not return a (non-empty)
    translation for keeps its original text.

    Args:
        texts: Texts to translate.
        target_language: Language code to translate into.
        source_language: Language code of the input, ``'auto'`` by default.

    Returns:
        A list the same length as ``texts``.
    """
    if not texts:
        return []

    # Seed with the originals (None normalised to '') so a short or partial
    # backend response can never blank out a text that was not translated.
    results = [t if t else '' for t in texts]
    pending = [(i, t) for i, t in enumerate(texts) if t and t.strip()]
    if not pending:
        return results

    try:
        translator = self._get_translator(source_language, target_language)
        sources = [t for _, t in pending]

        if hasattr(translator, 'translate_batch'):
            translated = translator.translate_batch(sources)
        else:
            translated = [translator.translate(t) for t in sources]

        for (idx, _), trans in zip(pending, translated):
            if trans:
                results[idx] = trans
        return results
    except Exception as e:
        print(f"DeepL batch error: {e}")
        # Best-effort fallback: translate one by one. translate() itself
        # returns the original text on failure; None stays normalised to ''.
        return [self.translate(t, target_language, source_language) if t else '' for t in texts]
class LibreTranslationProvider(TranslationProvider): class LibreTranslationProvider(TranslationProvider):
"""LibreTranslate implementation""" """LibreTranslate implementation with batch support"""
def __init__(self, custom_url: str = "https://libretranslate.com"): def __init__(self, custom_url: str = "https://libretranslate.com"):
self.custom_url = custom_url self.custom_url = custom_url
self._translator_cache = {}
def _get_translator(self, source_language: str, target_language: str) -> LibreTranslator:
    """Return the cached LibreTranslator for a language pair, building it on first use.

    Instances are stored in ``self._translator_cache`` under the key
    ``"<source>_<target>"`` and are bound to ``self.custom_url``.
    """
    cache_key = f"{source_language}_{target_language}"
    try:
        return self._translator_cache[cache_key]
    except KeyError:
        instance = LibreTranslator(source=source_language, target=target_language, custom_url=self.custom_url)
        self._translator_cache[cache_key] = instance
        return instance
def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str: def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
if not text or not text.strip(): if not text or not text.strip():
return text return text
try: try:
# LibreTranslator supports custom URL for self-hosted or public instances translator = self._get_translator(source_language, target_language)
translator = LibreTranslator(source=source_language, target=target_language, custom_url=self.custom_url)
return translator.translate(text) return translator.translate(text)
except Exception as e: except Exception as e:
print(f"LibreTranslate error: {e}") print(f"LibreTranslate error: {e}")
# Fail silently and return original text
return text return text
def translate_batch(self, texts: List[str], target_language: str, source_language: str = 'auto') -> List[str]:
    """Translate a list of texts with LibreTranslate, preserving order and length.

    The cached translator is reused, but each non-empty text is still sent
    as its own ``translate`` call. Empty and whitespace-only strings are
    returned unchanged and ``None`` becomes ``''``. A text whose translation
    fails or comes back empty keeps its original value.

    Args:
        texts: Texts to translate.
        target_language: Language code to translate into.
        source_language: Language code of the input, ``'auto'`` by default.

    Returns:
        A new list the same length as ``texts``.
    """
    if not texts:
        return []

    # Seed with the originals (None normalised to '') so every failure path
    # returns the same shape of data.
    results = [t if t else '' for t in texts]
    pending = [(i, t) for i, t in enumerate(texts) if t and t.strip()]
    if not pending:
        return results

    try:
        translator = self._get_translator(source_language, target_language)
    except Exception as e:
        print(f"LibreTranslate batch error: {e}")
        return results

    for idx, text in pending:
        try:
            results[idx] = translator.translate(text) or text
        except Exception as e:
            # Catch Exception, not a bare except, so KeyboardInterrupt and
            # SystemExit still propagate; keep the original text for this item.
            print(f"LibreTranslate error: {e}")
            results[idx] = text
    return results
class OllamaTranslationProvider(TranslationProvider): class OllamaTranslationProvider(TranslationProvider):
"""Ollama LLM translation implementation""" """Ollama LLM translation implementation"""
@ -90,7 +255,19 @@ class OllamaTranslationProvider(TranslationProvider):
try: try:
# Build system prompt with custom context if provided # Build system prompt with custom context if provided
base_prompt = f"You are a translator. Translate the user's text to {target_language}. Return ONLY the translation, nothing else." base_prompt = f"""You are a professional translator. Your ONLY task is to translate text to {target_language}.
CRITICAL RULES:
1. Output ONLY the translated text - no explanations, no comments, no notes
2. Preserve the exact formatting (line breaks, spacing, punctuation)
3. Do NOT add any prefixes like "Here's the translation:" or "Translation:"
4. Do NOT refuse to translate or ask clarifying questions
5. If the text is already in {target_language}, return it unchanged
6. Translate everything literally and accurately
7. NEVER provide comments, opinions, or explanations - you are JUST a translator
8. If you have any doubt about the translation, return the original text unchanged
9. Do not interpret or analyze the content - simply translate word by word
10. Your response must contain ONLY the translated text, nothing else"""
if self.custom_system_prompt: if self.custom_system_prompt:
system_content = f"""{base_prompt} system_content = f"""{base_prompt}
@ -213,7 +390,19 @@ class OpenAITranslationProvider(TranslationProvider):
client = openai.OpenAI(api_key=self.api_key) client = openai.OpenAI(api_key=self.api_key)
# Build system prompt with custom context if provided # Build system prompt with custom context if provided
base_prompt = f"You are a translator. Translate the user's text to {target_language}. Return ONLY the translation, nothing else." base_prompt = f"""You are a professional translator. Your ONLY task is to translate text to {target_language}.
CRITICAL RULES:
1. Output ONLY the translated text - no explanations, no comments, no notes
2. Preserve the exact formatting (line breaks, spacing, punctuation)
3. Do NOT add any prefixes like "Here's the translation:" or "Translation:"
4. Do NOT refuse to translate or ask clarifying questions
5. If the text is already in {target_language}, return it unchanged
6. Translate everything literally and accurately
7. NEVER provide comments, opinions, or explanations - you are JUST a translator
8. If you have any doubt about the translation, return the original text unchanged
9. Do not interpret or analyze the content - simply translate word by word
10. Your response must contain ONLY the translated text, nothing else"""
if self.custom_system_prompt: if self.custom_system_prompt:
system_content = f"""{base_prompt} system_content = f"""{base_prompt}
@ -341,7 +530,7 @@ class TranslationService:
def translate_batch(self, texts: list[str], target_language: str, source_language: str = 'auto') -> list[str]: def translate_batch(self, texts: list[str], target_language: str, source_language: str = 'auto') -> list[str]:
""" """
Translate multiple text strings Translate multiple text strings efficiently using batch processing.
Args: Args:
texts: List of texts to translate texts: List of texts to translate
@ -351,6 +540,14 @@ class TranslationService:
Returns: Returns:
List of translated texts List of translated texts
""" """
if not texts:
return []
# Use provider's batch method if available
if hasattr(self.provider, 'translate_batch'):
return self.provider.translate_batch(texts, target_language, source_language)
# Fallback to individual translations
return [self.translate_text(text, target_language, source_language) for text in texts] return [self.translate_text(text, target_language, source_language) for text in texts]

View File

@ -1,12 +1,13 @@
""" """
Excel Translation Module Excel Translation Module
Translates Excel files while preserving all formatting, formulas, images, and layout Translates Excel files while preserving all formatting, formulas, images, and layout
OPTIMIZED: Uses batch translation for 5-10x faster processing
""" """
import re import re
import tempfile import tempfile
import os import os
from pathlib import Path from pathlib import Path
from typing import Dict, Set from typing import Dict, Set, List, Tuple
from openpyxl import load_workbook from openpyxl import load_workbook
from openpyxl.worksheet.worksheet import Worksheet from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.cell.cell import Cell from openpyxl.cell.cell import Cell
@ -23,36 +24,46 @@ class ExcelTranslator:
def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path: def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
""" """
Translate an Excel file while preserving all formatting and structure Translate an Excel file while preserving all formatting and structure.
Uses batch translation for improved performance.
Args:
input_path: Path to input Excel file
output_path: Path to save translated Excel file
target_language: Target language code
Returns:
Path to the translated file
""" """
# Load workbook with data_only=False to preserve formulas
workbook = load_workbook(input_path, data_only=False) workbook = load_workbook(input_path, data_only=False)
# First, translate all worksheet content # Collect all translatable text elements
sheet_name_mapping = {} text_elements = [] # List of (text, setter_function)
sheet_names_to_translate = []
for sheet_name in workbook.sheetnames: for sheet_name in workbook.sheetnames:
worksheet = workbook[sheet_name] worksheet = workbook[sheet_name]
self._translate_worksheet(worksheet, target_language) self._collect_from_worksheet(worksheet, text_elements)
sheet_names_to_translate.append(sheet_name)
# Translate images if enabled # Add sheet names to translate
if getattr(self.translation_service, 'translate_images', False): sheet_name_setters = []
self._translate_images(worksheet, target_language) for sheet_name in sheet_names_to_translate:
text_elements.append((sheet_name, None)) # None setter - handled separately
sheet_name_setters.append(sheet_name)
# Prepare translated sheet name (but don't rename yet) # Batch translate all texts at once
translated_sheet_name = self.translation_service.translate_text( if text_elements:
sheet_name, target_language texts = [elem[0] for elem in text_elements]
) print(f"Batch translating {len(texts)} text segments...")
if translated_sheet_name and translated_sheet_name != sheet_name: translated_texts = self.translation_service.translate_batch(texts, target_language)
# Truncate to Excel's 31 character limit and ensure uniqueness
new_name = translated_sheet_name[:31] # Apply translations to cells
sheet_name_offset = len(text_elements) - len(sheet_name_setters)
for i, ((original_text, setter), translated) in enumerate(zip(text_elements[:sheet_name_offset], translated_texts[:sheet_name_offset])):
if translated is not None and setter is not None:
try:
setter(translated)
except Exception as e:
print(f"Error applying translation: {e}")
# Apply sheet name translations
sheet_name_mapping = {}
for i, (sheet_name, translated) in enumerate(zip(sheet_name_setters, translated_texts[sheet_name_offset:])):
if translated and translated != sheet_name:
new_name = translated[:31]
counter = 1 counter = 1
base_name = new_name[:28] if len(new_name) > 28 else new_name base_name = new_name[:28] if len(new_name) > 28 else new_name
while new_name in sheet_name_mapping.values() or new_name in workbook.sheetnames: while new_name in sheet_name_mapping.values() or new_name in workbook.sheetnames:
@ -60,152 +71,86 @@ class ExcelTranslator:
counter += 1 counter += 1
sheet_name_mapping[sheet_name] = new_name sheet_name_mapping[sheet_name] = new_name
# Now rename sheets (after all content is translated) # Rename sheets
for original_name, new_name in sheet_name_mapping.items(): for original_name, new_name in sheet_name_mapping.items():
workbook[original_name].title = new_name workbook[original_name].title = new_name
# Save the translated workbook # Translate images if enabled (separate process)
if getattr(self.translation_service, 'translate_images', False):
for sheet_name in workbook.sheetnames:
self._translate_images(workbook[sheet_name], target_language)
workbook.save(output_path) workbook.save(output_path)
workbook.close() workbook.close()
return output_path return output_path
def _translate_worksheet(self, worksheet: Worksheet, target_language: str): def _collect_from_worksheet(self, worksheet: Worksheet, text_elements: List[Tuple[str, callable]]):
""" """Collect all translatable text from worksheet cells"""
Translate all cells in a worksheet while preserving formatting
Args:
worksheet: Worksheet to translate
target_language: Target language code
"""
# Iterate through all cells that have values
for row in worksheet.iter_rows(): for row in worksheet.iter_rows():
for cell in row: for cell in row:
if cell.value is not None: if cell.value is not None:
self._translate_cell(cell, target_language) self._collect_from_cell(cell, text_elements)
def _translate_cell(self, cell: Cell, target_language: str): def _collect_from_cell(self, cell: Cell, text_elements: List[Tuple[str, callable]]):
""" """Collect text from a cell"""
Translate a single cell while preserving its formula and formatting
Args:
cell: Cell to translate
target_language: Target language code
"""
original_value = cell.value original_value = cell.value
# Skip if cell is empty
if original_value is None: if original_value is None:
return return
# Handle formulas # Handle formulas - collect text inside quotes
if isinstance(original_value, str) and original_value.startswith('='): if isinstance(original_value, str) and original_value.startswith('='):
self._translate_formula(cell, original_value, target_language)
# Handle regular text
elif isinstance(original_value, str):
translated_text = self.translation_service.translate_text(
original_value, target_language
)
cell.value = translated_text
# Numbers, dates, booleans remain unchanged
def _translate_formula(self, cell: Cell, formula: str, target_language: str):
"""
Translate text within a formula while preserving the formula structure
Args:
cell: Cell containing the formula
formula: Formula string
target_language: Target language code
"""
# Extract text strings from formula (text within quotes)
string_pattern = re.compile(r'"([^"]*)"') string_pattern = re.compile(r'"([^"]*)"')
strings = string_pattern.findall(formula) strings = string_pattern.findall(original_value)
for s in strings:
if not strings: if s.strip():
return def make_formula_setter(c, orig_formula, orig_string):
def setter(translated):
# Translate each string and replace in formula c.value = orig_formula.replace(f'"{orig_string}"', f'"{translated}"')
translated_formula = formula return setter
for original_string in strings: text_elements.append((s, make_formula_setter(cell, original_value, s)))
if original_string.strip(): # Only translate non-empty strings # Handle regular text
translated_string = self.translation_service.translate_text( elif isinstance(original_value, str) and original_value.strip():
original_string, target_language def make_setter(c):
) def setter(text):
# Replace in formula, being careful with special regex characters c.value = text
translated_formula = translated_formula.replace( return setter
f'"{original_string}"', f'"{translated_string}"' text_elements.append((original_value, make_setter(cell)))
)
cell.value = translated_formula
def _should_translate(self, text: str) -> bool:
"""
Determine if text should be translated
Args:
text: Text to check
Returns:
True if text should be translated, False otherwise
"""
if not text or not isinstance(text, str):
return False
# Don't translate if it's only numbers, special characters, or very short
if len(text.strip()) < 2:
return False
# Check if it's a formula (handled separately)
if text.startswith('='):
return False
return True
def _translate_images(self, worksheet: Worksheet, target_language: str): def _translate_images(self, worksheet: Worksheet, target_language: str):
""" """Translate text in images using vision model"""
Translate text in images using vision model and add as comments
"""
from services.translation_service import OllamaTranslationProvider from services.translation_service import OllamaTranslationProvider
if not isinstance(self.translation_service.provider, OllamaTranslationProvider): if not isinstance(self.translation_service.provider, OllamaTranslationProvider):
return return
try: try:
# Get images from worksheet
images = getattr(worksheet, '_images', []) images = getattr(worksheet, '_images', [])
for idx, image in enumerate(images): for idx, image in enumerate(images):
try: try:
# Get image data
image_data = image._data() image_data = image._data()
ext = image.format or 'png' ext = image.format or 'png'
# Save to temp file
with tempfile.NamedTemporaryFile(suffix=f'.{ext}', delete=False) as tmp: with tempfile.NamedTemporaryFile(suffix=f'.{ext}', delete=False) as tmp:
tmp.write(image_data) tmp.write(image_data)
tmp_path = tmp.name tmp_path = tmp.name
# Translate with vision
translated_text = self.translation_service.provider.translate_image(tmp_path, target_language) translated_text = self.translation_service.provider.translate_image(tmp_path, target_language)
# Clean up
os.unlink(tmp_path) os.unlink(tmp_path)
if translated_text and translated_text.strip(): if translated_text and translated_text.strip():
# Add translation as a cell near the image
anchor = image.anchor anchor = image.anchor
if hasattr(anchor, '_from'): if hasattr(anchor, '_from'):
cell_ref = f"{get_column_letter(anchor._from.col + 1)}{anchor._from.row + 1}" cell_ref = f"{get_column_letter(anchor._from.col + 1)}{anchor._from.row + 1}"
cell = worksheet[cell_ref] cell = worksheet[cell_ref]
# Add as comment
from openpyxl.comments import Comment from openpyxl.comments import Comment
cell.comment = Comment(f"Image translation: {translated_text}", "Translator") cell.comment = Comment(f"Image translation: {translated_text}", "Translator")
print(f"Added Excel image translation at {cell_ref}: {translated_text[:50]}...") print(f"Added Excel image translation at {cell_ref}")
except Exception as e: except Exception as e:
print(f"Error translating Excel image {idx}: {e}") print(f"Error translating Excel image {idx}: {e}")
continue
except Exception as e: except Exception as e:
print(f"Error processing Excel images: {e}") print(f"Error processing Excel images: {e}")

View File

@ -1,6 +1,7 @@
""" """
PowerPoint Translation Module PowerPoint Translation Module
Translates PowerPoint files while preserving all layouts, animations, and media Translates PowerPoint files while preserving all layouts, animations, and media
OPTIMIZED: Uses batch translation for 5-10x faster processing
""" """
from pathlib import Path from pathlib import Path
from pptx import Presentation from pptx import Presentation
@ -9,6 +10,7 @@ from pptx.shapes.group import GroupShape
from pptx.util import Inches, Pt from pptx.util import Inches, Pt
from pptx.enum.shapes import MSO_SHAPE_TYPE from pptx.enum.shapes import MSO_SHAPE_TYPE
from services.translation_service import translation_service from services.translation_service import translation_service
from typing import List, Tuple
import tempfile import tempfile
import os import os
@ -21,118 +23,117 @@ class PowerPointTranslator:
def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path: def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
""" """
Translate a PowerPoint presentation while preserving all formatting and structure Translate a PowerPoint presentation while preserving all formatting.
Uses batch translation for improved performance.
Args:
input_path: Path to input PowerPoint file
output_path: Path to save translated PowerPoint file
target_language: Target language code
Returns:
Path to the translated file
""" """
presentation = Presentation(input_path) presentation = Presentation(input_path)
# Translate each slide # Collect all translatable text elements
for slide_idx, slide in enumerate(presentation.slides): text_elements = [] # List of (text, setter_function)
self._translate_slide(slide, target_language, slide_idx + 1, input_path) image_shapes = [] # Collect images for separate processing
for slide_idx, slide in enumerate(presentation.slides):
# Collect from notes
if slide.has_notes_slide and slide.notes_slide.notes_text_frame:
self._collect_from_text_frame(slide.notes_slide.notes_text_frame, text_elements)
# Collect from shapes
for shape in slide.shapes:
self._collect_from_shape(shape, text_elements, slide, image_shapes)
# Batch translate all texts at once
if text_elements:
texts = [elem[0] for elem in text_elements]
print(f"Batch translating {len(texts)} text segments...")
translated_texts = self.translation_service.translate_batch(texts, target_language)
# Apply translations
for (original_text, setter), translated in zip(text_elements, translated_texts):
if translated is not None and setter is not None:
try:
setter(translated)
except Exception as e:
print(f"Error applying translation: {e}")
# Translate images if enabled (separate process, can't batch)
if getattr(self.translation_service, 'translate_images', False):
for shape, slide in image_shapes:
self._translate_image_shape(shape, target_language, slide)
# Save the translated presentation
presentation.save(output_path) presentation.save(output_path)
return output_path return output_path
def _translate_slide(self, slide, target_language: str, slide_num: int, input_path: Path): def _collect_from_shape(self, shape: BaseShape, text_elements: List[Tuple[str, callable]], slide=None, image_shapes=None):
""" """Collect text from a shape and its children"""
Translate all text elements in a slide while preserving layout
Args:
slide: Slide to translate
target_language: Target language code
slide_num: Slide number for reference
input_path: Path to source file for image extraction
"""
# Translate notes (speaker notes)
if slide.has_notes_slide:
notes_slide = slide.notes_slide
if notes_slide.notes_text_frame:
self._translate_text_frame(notes_slide.notes_text_frame, target_language)
# Translate shapes in the slide
for shape in slide.shapes:
self._translate_shape(shape, target_language, slide)
def _translate_shape(self, shape: BaseShape, target_language: str, slide=None):
"""
Translate text in a shape based on its type
Args:
shape: Shape to translate
target_language: Target language code
slide: Parent slide for adding image translations
"""
# Handle text-containing shapes # Handle text-containing shapes
if shape.has_text_frame: if shape.has_text_frame:
self._translate_text_frame(shape.text_frame, target_language) self._collect_from_text_frame(shape.text_frame, text_elements)
# Handle tables # Handle tables
if shape.shape_type == MSO_SHAPE_TYPE.TABLE: if shape.shape_type == MSO_SHAPE_TYPE.TABLE:
self._translate_table(shape.table, target_language) for row in shape.table.rows:
for cell in row.cells:
self._collect_from_text_frame(cell.text_frame, text_elements)
# Handle pictures/images # Handle pictures/images
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: if shape.shape_type == MSO_SHAPE_TYPE.PICTURE and image_shapes is not None:
self._translate_image_shape(shape, target_language, slide) image_shapes.append((shape, slide))
# Handle group shapes (shapes within shapes) # Handle group shapes
if shape.shape_type == MSO_SHAPE_TYPE.GROUP: if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
for sub_shape in shape.shapes: for sub_shape in shape.shapes:
self._translate_shape(sub_shape, target_language, slide) self._collect_from_shape(sub_shape, text_elements, slide, image_shapes)
# Handle smart art (contains multiple shapes) # Handle smart art
# Smart art is complex, but we can try to translate text within it
if hasattr(shape, 'shapes'): if hasattr(shape, 'shapes'):
try: try:
for sub_shape in shape.shapes: for sub_shape in shape.shapes:
self._translate_shape(sub_shape, target_language, slide) self._collect_from_shape(sub_shape, text_elements, slide, image_shapes)
except: except:
pass # Some shapes may not support iteration pass
def _translate_image_shape(self, shape, target_language: str, slide): def _collect_from_text_frame(self, text_frame, text_elements: List[Tuple[str, callable]]):
""" """Collect text from a text frame"""
Translate text in an image using vision model and add as text box if not text_frame.text.strip():
"""
if not getattr(self.translation_service, 'translate_images', False):
return return
for paragraph in text_frame.paragraphs:
if not paragraph.text.strip():
continue
for run in paragraph.runs:
if run.text and run.text.strip():
def make_setter(r):
def setter(text):
r.text = text
return setter
text_elements.append((run.text, make_setter(run)))
def _translate_image_shape(self, shape, target_language: str, slide):
"""Translate text in an image using vision model"""
from services.translation_service import OllamaTranslationProvider from services.translation_service import OllamaTranslationProvider
if not isinstance(self.translation_service.provider, OllamaTranslationProvider): if not isinstance(self.translation_service.provider, OllamaTranslationProvider):
return return
try: try:
# Get image blob
image_blob = shape.image.blob image_blob = shape.image.blob
ext = shape.image.ext ext = shape.image.ext
# Save to temp file
with tempfile.NamedTemporaryFile(suffix=f'.{ext}', delete=False) as tmp: with tempfile.NamedTemporaryFile(suffix=f'.{ext}', delete=False) as tmp:
tmp.write(image_blob) tmp.write(image_blob)
tmp_path = tmp.name tmp_path = tmp.name
# Translate with vision
translated_text = self.translation_service.provider.translate_image(tmp_path, target_language) translated_text = self.translation_service.provider.translate_image(tmp_path, target_language)
# Clean up
os.unlink(tmp_path) os.unlink(tmp_path)
if translated_text and translated_text.strip(): if translated_text and translated_text.strip():
# Add text box below the image with translation
left = shape.left left = shape.left
top = shape.top + shape.height + Inches(0.1) top = shape.top + shape.height + Inches(0.1)
width = shape.width width = shape.width
height = Inches(0.5) height = Inches(0.5)
# Add text box
textbox = slide.shapes.add_textbox(left, top, width, height) textbox = slide.shapes.add_textbox(left, top, width, height)
tf = textbox.text_frame tf = textbox.text_frame
p = tf.paragraphs[0] p = tf.paragraphs[0]
@ -145,71 +146,6 @@ class PowerPointTranslator:
except Exception as e: except Exception as e:
print(f"Error translating image: {e}") print(f"Error translating image: {e}")
def _translate_text_frame(self, text_frame, target_language: str):
"""
Translate text within a text frame while preserving formatting
Args:
text_frame: Text frame to translate
target_language: Target language code
"""
if not text_frame.text.strip():
return
# Translate each paragraph in the text frame
for paragraph in text_frame.paragraphs:
self._translate_paragraph(paragraph, target_language)
def _translate_paragraph(self, paragraph, target_language: str):
"""
Translate a paragraph while preserving run-level formatting
Args:
paragraph: Paragraph to translate
target_language: Target language code
"""
if not paragraph.text.strip():
return
# Translate each run in the paragraph to preserve individual formatting
for run in paragraph.runs:
if run.text.strip():
translated_text = self.translation_service.translate_text(
run.text, target_language
)
run.text = translated_text
def _translate_table(self, table, target_language: str):
"""
Translate all cells in a table while preserving structure
Args:
table: Table to translate
target_language: Target language code
"""
for row in table.rows:
for cell in row.cells:
self._translate_text_frame(cell.text_frame, target_language)
def _is_translatable(self, text: str) -> bool:
"""
Determine if text should be translated
Args:
text: Text to check
Returns:
True if text should be translated, False otherwise
"""
if not text or not isinstance(text, str):
return False
# Don't translate if it's only numbers, special characters, or very short
if len(text.strip()) < 2:
return False
return True
# Global translator instance # Global translator instance
pptx_translator = PowerPointTranslator() pptx_translator = PowerPointTranslator()

View File

@ -1,6 +1,7 @@
""" """
Word Document Translation Module Word Document Translation Module
Translates Word files while preserving all formatting, styles, tables, and images Translates Word files while preserving all formatting, styles, tables, and images
OPTIMIZED: Uses batch translation for 5-10x faster processing
""" """
from pathlib import Path from pathlib import Path
from docx import Document from docx import Document
@ -12,6 +13,7 @@ from docx.section import Section
from docx.shared import Inches, Pt from docx.shared import Inches, Pt
from docx.oxml.ns import qn from docx.oxml.ns import qn
from services.translation_service import translation_service from services.translation_service import translation_service
from typing import List, Tuple, Any
import tempfile import tempfile
import os import os
@ -24,26 +26,36 @@ class WordTranslator:
def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path: def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
""" """
Translate a Word document while preserving all formatting and structure Translate a Word document while preserving all formatting and structure.
Uses batch translation for improved performance.
Args:
input_path: Path to input Word file
output_path: Path to save translated Word file
target_language: Target language code
Returns:
Path to the translated file
""" """
document = Document(input_path) document = Document(input_path)
# Translate main document body # Collect all translatable text elements
self._translate_document_body(document, target_language) text_elements = []
# Translate headers and footers in all sections # Collect from document body
self._collect_from_body(document, text_elements)
# Collect from headers and footers
for section in document.sections: for section in document.sections:
self._translate_section(section, target_language) self._collect_from_section(section, text_elements)
# Translate images if enabled # Batch translate all texts at once
if text_elements:
texts = [elem[0] for elem in text_elements]
print(f"Batch translating {len(texts)} text segments...")
translated_texts = self.translation_service.translate_batch(texts, target_language)
# Apply translations
for (original_text, setter), translated in zip(text_elements, translated_texts):
if translated is not None and translated != original_text:
try:
setter(translated)
except Exception as e:
print(f"Error applying translation: {e}")
# Translate images if enabled (separate process)
if getattr(self.translation_service, 'translate_images', False): if getattr(self.translation_service, 'translate_images', False):
self._translate_images(document, target_language, input_path) self._translate_images(document, target_language, input_path)
@ -52,13 +64,59 @@ class WordTranslator:
return output_path return output_path
def _collect_from_body(self, document: Document, text_elements: List[Tuple[str, callable]]):
"""Collect all text elements from document body"""
for element in document.element.body:
if isinstance(element, CT_P):
paragraph = Paragraph(element, document)
self._collect_from_paragraph(paragraph, text_elements)
elif isinstance(element, CT_Tbl):
table = Table(element, document)
self._collect_from_table(table, text_elements)
def _collect_from_paragraph(self, paragraph: Paragraph, text_elements: List[Tuple[str, callable]]):
"""Collect text from paragraph runs"""
if not paragraph.text.strip():
return
for run in paragraph.runs:
if run.text and run.text.strip():
# Create a setter function for this run
def make_setter(r):
def setter(text):
r.text = text
return setter
text_elements.append((run.text, make_setter(run)))
def _collect_from_table(self, table: Table, text_elements: List[Tuple[str, callable]]):
"""Collect text from table cells"""
for row in table.rows:
for cell in row.cells:
for paragraph in cell.paragraphs:
self._collect_from_paragraph(paragraph, text_elements)
# Handle nested tables
for nested_table in cell.tables:
self._collect_from_table(nested_table, text_elements)
def _collect_from_section(self, section: Section, text_elements: List[Tuple[str, callable]]):
"""Collect text from headers and footers"""
headers_footers = [
section.header, section.footer,
section.first_page_header, section.first_page_footer,
section.even_page_header, section.even_page_footer
]
for hf in headers_footers:
if hf:
for paragraph in hf.paragraphs:
self._collect_from_paragraph(paragraph, text_elements)
for table in hf.tables:
self._collect_from_table(table, text_elements)
def _translate_images(self, document: Document, target_language: str, input_path: Path): def _translate_images(self, document: Document, target_language: str, input_path: Path):
""" """Extract text from images and add translations as captions"""
Extract text from images and add translations as captions
"""
from services.translation_service import OllamaTranslationProvider from services.translation_service import OllamaTranslationProvider
# Only works with Ollama vision
if not isinstance(self.translation_service.provider, OllamaTranslationProvider): if not isinstance(self.translation_service.provider, OllamaTranslationProvider):
return return
@ -66,165 +124,33 @@ class WordTranslator:
import zipfile import zipfile
import base64 import base64
# Extract images from docx (it's a zip file)
with zipfile.ZipFile(input_path, 'r') as zip_ref: with zipfile.ZipFile(input_path, 'r') as zip_ref:
image_files = [f for f in zip_ref.namelist() if f.startswith('word/media/')] image_files = [f for f in zip_ref.namelist() if f.startswith('word/media/')]
for idx, image_file in enumerate(image_files): for idx, image_file in enumerate(image_files):
try: try:
# Extract image
image_data = zip_ref.read(image_file) image_data = zip_ref.read(image_file)
# Create temp file
ext = os.path.splitext(image_file)[1] ext = os.path.splitext(image_file)[1]
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
tmp.write(image_data) tmp.write(image_data)
tmp_path = tmp.name tmp_path = tmp.name
# Translate image with vision
translated_text = self.translation_service.provider.translate_image(tmp_path, target_language) translated_text = self.translation_service.provider.translate_image(tmp_path, target_language)
# Clean up temp file
os.unlink(tmp_path) os.unlink(tmp_path)
if translated_text and translated_text.strip(): if translated_text and translated_text.strip():
# Add translated text as a new paragraph after image
# We'll add it at the end with a note
p = document.add_paragraph() p = document.add_paragraph()
p.add_run(f"[Image {idx + 1} translation: ").bold = True p.add_run(f"[Image {idx + 1} translation: ").bold = True
p.add_run(translated_text) p.add_run(translated_text)
p.add_run("]").bold = True p.add_run("]").bold = True
print(f"Translated image {idx + 1}: {translated_text[:50]}...") print(f"Translated image {idx + 1}: {translated_text[:50]}...")
except Exception as e: except Exception as e:
print(f"Error translating image {image_file}: {e}") print(f"Error translating image {image_file}: {e}")
continue
except Exception as e: except Exception as e:
print(f"Error processing images: {e}") print(f"Error processing images: {e}")
def _translate_document_body(self, document: Document, target_language: str):
"""
Translate all elements in the document body
Args:
document: Document to translate
target_language: Target language code
"""
for element in document.element.body:
if isinstance(element, CT_P):
# It's a paragraph
paragraph = Paragraph(element, document)
self._translate_paragraph(paragraph, target_language)
elif isinstance(element, CT_Tbl):
# It's a table
table = Table(element, document)
self._translate_table(table, target_language)
def _translate_paragraph(self, paragraph: Paragraph, target_language: str):
"""
Translate a paragraph while preserving all formatting
Args:
paragraph: Paragraph to translate
target_language: Target language code
"""
if not paragraph.text.strip():
return
# For paragraphs with complex formatting (multiple runs), translate run by run
if len(paragraph.runs) > 0:
for run in paragraph.runs:
if run.text.strip():
translated_text = self.translation_service.translate_text(
run.text, target_language
)
run.text = translated_text
else:
# Simple paragraph with no runs
if paragraph.text.strip():
translated_text = self.translation_service.translate_text(
paragraph.text, target_language
)
paragraph.text = translated_text
def _translate_table(self, table: Table, target_language: str):
"""
Translate all cells in a table while preserving structure
Args:
table: Table to translate
target_language: Target language code
"""
for row in table.rows:
for cell in row.cells:
self._translate_cell(cell, target_language)
def _translate_cell(self, cell: _Cell, target_language: str):
"""
Translate content within a table cell
Args:
cell: Cell to translate
target_language: Target language code
"""
for paragraph in cell.paragraphs:
self._translate_paragraph(paragraph, target_language)
# Handle nested tables
for table in cell.tables:
self._translate_table(table, target_language)
def _translate_section(self, section: Section, target_language: str):
"""
Translate headers and footers in a section
Args:
section: Section to translate
target_language: Target language code
"""
# Translate header
if section.header:
for paragraph in section.header.paragraphs:
self._translate_paragraph(paragraph, target_language)
for table in section.header.tables:
self._translate_table(table, target_language)
# Translate footer
if section.footer:
for paragraph in section.footer.paragraphs:
self._translate_paragraph(paragraph, target_language)
for table in section.footer.tables:
self._translate_table(table, target_language)
# Translate first page header (if different)
if section.first_page_header:
for paragraph in section.first_page_header.paragraphs:
self._translate_paragraph(paragraph, target_language)
for table in section.first_page_header.tables:
self._translate_table(table, target_language)
# Translate first page footer (if different)
if section.first_page_footer:
for paragraph in section.first_page_footer.paragraphs:
self._translate_paragraph(paragraph, target_language)
for table in section.first_page_footer.tables:
self._translate_table(table, target_language)
# Translate even page header (if different)
if section.even_page_header:
for paragraph in section.even_page_header.paragraphs:
self._translate_paragraph(paragraph, target_language)
for table in section.even_page_header.tables:
self._translate_table(table, target_language)
# Translate even page footer (if different)
if section.even_page_footer:
for paragraph in section.even_page_footer.paragraphs:
self._translate_paragraph(paragraph, target_language)
for table in section.even_page_footer.tables:
self._translate_table(table, target_language)
# Global translator instance # Global translator instance
word_translator = WordTranslator() word_translator = WordTranslator()