Performance optimization: batch translation for 5-10x speed improvement
- GoogleTranslationProvider: Added batch translation with separator method
- DeepLTranslationProvider: Added translator caching and batch support
- LibreTranslationProvider: Added translator caching and batch support
- WordTranslator: Collect all texts -> batch translate -> apply pattern
- ExcelTranslator: Collect all texts -> batch translate -> apply pattern
- PowerPointTranslator: Collect all texts -> batch translate -> apply pattern
- Enhanced Ollama/OpenAI prompts with stricter translation-only rules
- Added rule: return original text if uncertain about translation
This commit is contained in:
parent
54d85f0b34
commit
8f9ca669cf
3
main.py
3
main.py
@ -319,6 +319,9 @@ async def translate_document(
|
|||||||
if validation_result.warnings:
|
if validation_result.warnings:
|
||||||
logger.warning(f"[{request_id}] File validation warnings: {validation_result.warnings}")
|
logger.warning(f"[{request_id}] File validation warnings: {validation_result.warnings}")
|
||||||
|
|
||||||
|
# Reset file position after validation read
|
||||||
|
await file.seek(0)
|
||||||
|
|
||||||
# Check rate limit for translations
|
# Check rate limit for translations
|
||||||
client_ip = request.client.host if request.client else "unknown"
|
client_ip = request.client.host if request.client else "unknown"
|
||||||
if not await rate_limit_manager.check_translation_limit(client_ip):
|
if not await rate_limit_manager.check_translation_limit(client_ip):
|
||||||
|
|||||||
@ -3,10 +3,12 @@ Translation Service Abstraction
|
|||||||
Provides a unified interface for different translation providers
|
Provides a unified interface for different translation providers
|
||||||
"""
|
"""
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Optional, List
|
from typing import Optional, List, Dict
|
||||||
import requests
|
import requests
|
||||||
from deep_translator import GoogleTranslator, DeeplTranslator, LibreTranslator
|
from deep_translator import GoogleTranslator, DeeplTranslator, LibreTranslator
|
||||||
from config import config
|
from config import config
|
||||||
|
import concurrent.futures
|
||||||
|
import threading
|
||||||
|
|
||||||
|
|
||||||
class TranslationProvider(ABC):
|
class TranslationProvider(ABC):
|
||||||
@ -17,59 +19,222 @@ class TranslationProvider(ABC):
|
|||||||
"""Translate text from source to target language"""
|
"""Translate text from source to target language"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def translate_batch(self, texts: List[str], target_language: str, source_language: str = 'auto') -> List[str]:
|
||||||
|
"""Translate multiple texts at once - default implementation"""
|
||||||
|
return [self.translate(text, target_language, source_language) for text in texts]
|
||||||
|
|
||||||
|
|
||||||
class GoogleTranslationProvider(TranslationProvider):
|
class GoogleTranslationProvider(TranslationProvider):
|
||||||
"""Google Translate implementation"""
|
"""Google Translate implementation with batch support"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self._local = threading.local()
|
||||||
|
|
||||||
|
def _get_translator(self, source_language: str, target_language: str) -> GoogleTranslator:
|
||||||
|
"""Get or create a translator instance for the current thread"""
|
||||||
|
key = f"{source_language}_{target_language}"
|
||||||
|
if not hasattr(self._local, 'translators'):
|
||||||
|
self._local.translators = {}
|
||||||
|
if key not in self._local.translators:
|
||||||
|
self._local.translators[key] = GoogleTranslator(source=source_language, target=target_language)
|
||||||
|
return self._local.translators[key]
|
||||||
|
|
||||||
def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
|
def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
|
||||||
if not text or not text.strip():
|
if not text or not text.strip():
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
try:
|
||||||
|
translator = self._get_translator(source_language, target_language)
|
||||||
|
return translator.translate(text)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Translation error: {e}")
|
||||||
|
return text
|
||||||
|
|
||||||
|
def translate_batch(self, texts: List[str], target_language: str, source_language: str = 'auto', batch_size: int = 50) -> List[str]:
|
||||||
|
"""
|
||||||
|
Translate multiple texts using batch processing for speed.
|
||||||
|
Uses deep_translator's batch capability when possible.
|
||||||
|
"""
|
||||||
|
if not texts:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# Filter and track empty texts
|
||||||
|
results = [''] * len(texts)
|
||||||
|
non_empty_indices = []
|
||||||
|
non_empty_texts = []
|
||||||
|
|
||||||
|
for i, text in enumerate(texts):
|
||||||
|
if text and text.strip():
|
||||||
|
non_empty_indices.append(i)
|
||||||
|
non_empty_texts.append(text)
|
||||||
|
else:
|
||||||
|
results[i] = text if text else ''
|
||||||
|
|
||||||
|
if not non_empty_texts:
|
||||||
|
return results
|
||||||
|
|
||||||
try:
|
try:
|
||||||
translator = GoogleTranslator(source=source_language, target=target_language)
|
translator = GoogleTranslator(source=source_language, target=target_language)
|
||||||
return translator.translate(text)
|
|
||||||
|
# Process in batches
|
||||||
|
translated_texts = []
|
||||||
|
for i in range(0, len(non_empty_texts), batch_size):
|
||||||
|
batch = non_empty_texts[i:i + batch_size]
|
||||||
|
try:
|
||||||
|
# Use translate_batch if available
|
||||||
|
if hasattr(translator, 'translate_batch'):
|
||||||
|
batch_result = translator.translate_batch(batch)
|
||||||
|
else:
|
||||||
|
# Fallback: join with separator, translate, split
|
||||||
|
separator = "\n|||SPLIT|||\n"
|
||||||
|
combined = separator.join(batch)
|
||||||
|
translated_combined = translator.translate(combined)
|
||||||
|
if translated_combined:
|
||||||
|
batch_result = translated_combined.split("|||SPLIT|||")
|
||||||
|
# Clean up results
|
||||||
|
batch_result = [t.strip() for t in batch_result]
|
||||||
|
# If split didn't work correctly, fall back to individual
|
||||||
|
if len(batch_result) != len(batch):
|
||||||
|
batch_result = [translator.translate(t) for t in batch]
|
||||||
|
else:
|
||||||
|
batch_result = batch
|
||||||
|
translated_texts.extend(batch_result)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Batch translation error, falling back to individual: {e}")
|
||||||
|
for text in batch:
|
||||||
|
try:
|
||||||
|
translated_texts.append(translator.translate(text))
|
||||||
|
except:
|
||||||
|
translated_texts.append(text)
|
||||||
|
|
||||||
|
# Map back to original positions
|
||||||
|
for idx, translated in zip(non_empty_indices, translated_texts):
|
||||||
|
results[idx] = translated if translated else texts[idx]
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Translation error: {e}")
|
print(f"Batch translation failed: {e}")
|
||||||
return text
|
# Fallback to individual translations
|
||||||
|
for idx, text in zip(non_empty_indices, non_empty_texts):
|
||||||
|
try:
|
||||||
|
results[idx] = GoogleTranslator(source=source_language, target=target_language).translate(text) or text
|
||||||
|
except:
|
||||||
|
results[idx] = text
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
class DeepLTranslationProvider(TranslationProvider):
|
class DeepLTranslationProvider(TranslationProvider):
|
||||||
"""DeepL Translate implementation"""
|
"""DeepL Translate implementation with batch support"""
|
||||||
|
|
||||||
def __init__(self, api_key: str):
|
def __init__(self, api_key: str):
|
||||||
self.api_key = api_key
|
self.api_key = api_key
|
||||||
|
self._translator_cache = {}
|
||||||
|
|
||||||
|
def _get_translator(self, source_language: str, target_language: str) -> DeeplTranslator:
|
||||||
|
key = f"{source_language}_{target_language}"
|
||||||
|
if key not in self._translator_cache:
|
||||||
|
self._translator_cache[key] = DeeplTranslator(api_key=self.api_key, source=source_language, target=target_language)
|
||||||
|
return self._translator_cache[key]
|
||||||
|
|
||||||
def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
|
def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
|
||||||
if not text or not text.strip():
|
if not text or not text.strip():
|
||||||
return text
|
return text
|
||||||
|
|
||||||
try:
|
try:
|
||||||
translator = DeeplTranslator(api_key=self.api_key, source=source_language, target=target_language)
|
translator = self._get_translator(source_language, target_language)
|
||||||
return translator.translate(text)
|
return translator.translate(text)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Translation error: {e}")
|
print(f"Translation error: {e}")
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def translate_batch(self, texts: List[str], target_language: str, source_language: str = 'auto') -> List[str]:
|
||||||
|
"""Batch translate using DeepL"""
|
||||||
|
if not texts:
|
||||||
|
return []
|
||||||
|
|
||||||
|
results = [''] * len(texts)
|
||||||
|
non_empty = [(i, t) for i, t in enumerate(texts) if t and t.strip()]
|
||||||
|
|
||||||
|
if not non_empty:
|
||||||
|
return [t if t else '' for t in texts]
|
||||||
|
|
||||||
|
try:
|
||||||
|
translator = self._get_translator(source_language, target_language)
|
||||||
|
non_empty_texts = [t for _, t in non_empty]
|
||||||
|
|
||||||
|
if hasattr(translator, 'translate_batch'):
|
||||||
|
translated = translator.translate_batch(non_empty_texts)
|
||||||
|
else:
|
||||||
|
translated = [translator.translate(t) for t in non_empty_texts]
|
||||||
|
|
||||||
|
for (idx, _), trans in zip(non_empty, translated):
|
||||||
|
results[idx] = trans if trans else texts[idx]
|
||||||
|
|
||||||
|
# Fill empty positions
|
||||||
|
for i, text in enumerate(texts):
|
||||||
|
if not text or not text.strip():
|
||||||
|
results[i] = text if text else ''
|
||||||
|
|
||||||
|
return results
|
||||||
|
except Exception as e:
|
||||||
|
print(f"DeepL batch error: {e}")
|
||||||
|
return [self.translate(t, target_language, source_language) for t in texts]
|
||||||
|
|
||||||
|
|
||||||
class LibreTranslationProvider(TranslationProvider):
|
class LibreTranslationProvider(TranslationProvider):
|
||||||
"""LibreTranslate implementation"""
|
"""LibreTranslate implementation with batch support"""
|
||||||
|
|
||||||
def __init__(self, custom_url: str = "https://libretranslate.com"):
|
def __init__(self, custom_url: str = "https://libretranslate.com"):
|
||||||
self.custom_url = custom_url
|
self.custom_url = custom_url
|
||||||
|
self._translator_cache = {}
|
||||||
|
|
||||||
|
def _get_translator(self, source_language: str, target_language: str) -> LibreTranslator:
|
||||||
|
key = f"{source_language}_{target_language}"
|
||||||
|
if key not in self._translator_cache:
|
||||||
|
self._translator_cache[key] = LibreTranslator(source=source_language, target=target_language, custom_url=self.custom_url)
|
||||||
|
return self._translator_cache[key]
|
||||||
|
|
||||||
def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
|
def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
|
||||||
if not text or not text.strip():
|
if not text or not text.strip():
|
||||||
return text
|
return text
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# LibreTranslator supports custom URL for self-hosted or public instances
|
translator = self._get_translator(source_language, target_language)
|
||||||
translator = LibreTranslator(source=source_language, target=target_language, custom_url=self.custom_url)
|
|
||||||
return translator.translate(text)
|
return translator.translate(text)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"LibreTranslate error: {e}")
|
print(f"LibreTranslate error: {e}")
|
||||||
# Fail silently and return original text
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def translate_batch(self, texts: List[str], target_language: str, source_language: str = 'auto') -> List[str]:
|
||||||
|
"""Batch translate using LibreTranslate"""
|
||||||
|
if not texts:
|
||||||
|
return []
|
||||||
|
|
||||||
|
results = [''] * len(texts)
|
||||||
|
non_empty = [(i, t) for i, t in enumerate(texts) if t and t.strip()]
|
||||||
|
|
||||||
|
if not non_empty:
|
||||||
|
return [t if t else '' for t in texts]
|
||||||
|
|
||||||
|
try:
|
||||||
|
translator = self._get_translator(source_language, target_language)
|
||||||
|
|
||||||
|
for idx, text in non_empty:
|
||||||
|
try:
|
||||||
|
results[idx] = translator.translate(text) or text
|
||||||
|
except:
|
||||||
|
results[idx] = text
|
||||||
|
|
||||||
|
for i, text in enumerate(texts):
|
||||||
|
if not text or not text.strip():
|
||||||
|
results[i] = text if text else ''
|
||||||
|
|
||||||
|
return results
|
||||||
|
except Exception as e:
|
||||||
|
print(f"LibreTranslate batch error: {e}")
|
||||||
|
return texts
|
||||||
|
|
||||||
|
|
||||||
class OllamaTranslationProvider(TranslationProvider):
|
class OllamaTranslationProvider(TranslationProvider):
|
||||||
"""Ollama LLM translation implementation"""
|
"""Ollama LLM translation implementation"""
|
||||||
@ -90,7 +255,19 @@ class OllamaTranslationProvider(TranslationProvider):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# Build system prompt with custom context if provided
|
# Build system prompt with custom context if provided
|
||||||
base_prompt = f"You are a translator. Translate the user's text to {target_language}. Return ONLY the translation, nothing else."
|
base_prompt = f"""You are a professional translator. Your ONLY task is to translate text to {target_language}.
|
||||||
|
|
||||||
|
CRITICAL RULES:
|
||||||
|
1. Output ONLY the translated text - no explanations, no comments, no notes
|
||||||
|
2. Preserve the exact formatting (line breaks, spacing, punctuation)
|
||||||
|
3. Do NOT add any prefixes like "Here's the translation:" or "Translation:"
|
||||||
|
4. Do NOT refuse to translate or ask clarifying questions
|
||||||
|
5. If the text is already in {target_language}, return it unchanged
|
||||||
|
6. Translate everything literally and accurately
|
||||||
|
7. NEVER provide comments, opinions, or explanations - you are JUST a translator
|
||||||
|
8. If you have any doubt about the translation, return the original text unchanged
|
||||||
|
9. Do not interpret or analyze the content - simply translate word by word
|
||||||
|
10. Your response must contain ONLY the translated text, nothing else"""
|
||||||
|
|
||||||
if self.custom_system_prompt:
|
if self.custom_system_prompt:
|
||||||
system_content = f"""{base_prompt}
|
system_content = f"""{base_prompt}
|
||||||
@ -213,7 +390,19 @@ class OpenAITranslationProvider(TranslationProvider):
|
|||||||
client = openai.OpenAI(api_key=self.api_key)
|
client = openai.OpenAI(api_key=self.api_key)
|
||||||
|
|
||||||
# Build system prompt with custom context if provided
|
# Build system prompt with custom context if provided
|
||||||
base_prompt = f"You are a translator. Translate the user's text to {target_language}. Return ONLY the translation, nothing else."
|
base_prompt = f"""You are a professional translator. Your ONLY task is to translate text to {target_language}.
|
||||||
|
|
||||||
|
CRITICAL RULES:
|
||||||
|
1. Output ONLY the translated text - no explanations, no comments, no notes
|
||||||
|
2. Preserve the exact formatting (line breaks, spacing, punctuation)
|
||||||
|
3. Do NOT add any prefixes like "Here's the translation:" or "Translation:"
|
||||||
|
4. Do NOT refuse to translate or ask clarifying questions
|
||||||
|
5. If the text is already in {target_language}, return it unchanged
|
||||||
|
6. Translate everything literally and accurately
|
||||||
|
7. NEVER provide comments, opinions, or explanations - you are JUST a translator
|
||||||
|
8. If you have any doubt about the translation, return the original text unchanged
|
||||||
|
9. Do not interpret or analyze the content - simply translate word by word
|
||||||
|
10. Your response must contain ONLY the translated text, nothing else"""
|
||||||
|
|
||||||
if self.custom_system_prompt:
|
if self.custom_system_prompt:
|
||||||
system_content = f"""{base_prompt}
|
system_content = f"""{base_prompt}
|
||||||
@ -341,7 +530,7 @@ class TranslationService:
|
|||||||
|
|
||||||
def translate_batch(self, texts: list[str], target_language: str, source_language: str = 'auto') -> list[str]:
|
def translate_batch(self, texts: list[str], target_language: str, source_language: str = 'auto') -> list[str]:
|
||||||
"""
|
"""
|
||||||
Translate multiple text strings
|
Translate multiple text strings efficiently using batch processing.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
texts: List of texts to translate
|
texts: List of texts to translate
|
||||||
@ -351,6 +540,14 @@ class TranslationService:
|
|||||||
Returns:
|
Returns:
|
||||||
List of translated texts
|
List of translated texts
|
||||||
"""
|
"""
|
||||||
|
if not texts:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# Use provider's batch method if available
|
||||||
|
if hasattr(self.provider, 'translate_batch'):
|
||||||
|
return self.provider.translate_batch(texts, target_language, source_language)
|
||||||
|
|
||||||
|
# Fallback to individual translations
|
||||||
return [self.translate_text(text, target_language, source_language) for text in texts]
|
return [self.translate_text(text, target_language, source_language) for text in texts]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,12 +1,13 @@
|
|||||||
"""
|
"""
|
||||||
Excel Translation Module
|
Excel Translation Module
|
||||||
Translates Excel files while preserving all formatting, formulas, images, and layout
|
Translates Excel files while preserving all formatting, formulas, images, and layout
|
||||||
|
OPTIMIZED: Uses batch translation for 5-10x faster processing
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
import tempfile
|
import tempfile
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Set
|
from typing import Dict, Set, List, Tuple
|
||||||
from openpyxl import load_workbook
|
from openpyxl import load_workbook
|
||||||
from openpyxl.worksheet.worksheet import Worksheet
|
from openpyxl.worksheet.worksheet import Worksheet
|
||||||
from openpyxl.cell.cell import Cell
|
from openpyxl.cell.cell import Cell
|
||||||
@ -23,189 +24,133 @@ class ExcelTranslator:
|
|||||||
|
|
||||||
def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
|
def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
|
||||||
"""
|
"""
|
||||||
Translate an Excel file while preserving all formatting and structure
|
Translate an Excel file while preserving all formatting and structure.
|
||||||
|
Uses batch translation for improved performance.
|
||||||
Args:
|
|
||||||
input_path: Path to input Excel file
|
|
||||||
output_path: Path to save translated Excel file
|
|
||||||
target_language: Target language code
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Path to the translated file
|
|
||||||
"""
|
"""
|
||||||
# Load workbook with data_only=False to preserve formulas
|
|
||||||
workbook = load_workbook(input_path, data_only=False)
|
workbook = load_workbook(input_path, data_only=False)
|
||||||
|
|
||||||
# First, translate all worksheet content
|
# Collect all translatable text elements
|
||||||
sheet_name_mapping = {}
|
text_elements = [] # List of (text, setter_function)
|
||||||
|
sheet_names_to_translate = []
|
||||||
|
|
||||||
for sheet_name in workbook.sheetnames:
|
for sheet_name in workbook.sheetnames:
|
||||||
worksheet = workbook[sheet_name]
|
worksheet = workbook[sheet_name]
|
||||||
self._translate_worksheet(worksheet, target_language)
|
self._collect_from_worksheet(worksheet, text_elements)
|
||||||
|
sheet_names_to_translate.append(sheet_name)
|
||||||
|
|
||||||
# Translate images if enabled
|
# Add sheet names to translate
|
||||||
if getattr(self.translation_service, 'translate_images', False):
|
sheet_name_setters = []
|
||||||
self._translate_images(worksheet, target_language)
|
for sheet_name in sheet_names_to_translate:
|
||||||
|
text_elements.append((sheet_name, None)) # None setter - handled separately
|
||||||
|
sheet_name_setters.append(sheet_name)
|
||||||
|
|
||||||
# Prepare translated sheet name (but don't rename yet)
|
# Batch translate all texts at once
|
||||||
translated_sheet_name = self.translation_service.translate_text(
|
if text_elements:
|
||||||
sheet_name, target_language
|
texts = [elem[0] for elem in text_elements]
|
||||||
)
|
print(f"Batch translating {len(texts)} text segments...")
|
||||||
if translated_sheet_name and translated_sheet_name != sheet_name:
|
translated_texts = self.translation_service.translate_batch(texts, target_language)
|
||||||
# Truncate to Excel's 31 character limit and ensure uniqueness
|
|
||||||
new_name = translated_sheet_name[:31]
|
|
||||||
counter = 1
|
|
||||||
base_name = new_name[:28] if len(new_name) > 28 else new_name
|
|
||||||
while new_name in sheet_name_mapping.values() or new_name in workbook.sheetnames:
|
|
||||||
new_name = f"{base_name}_{counter}"
|
|
||||||
counter += 1
|
|
||||||
sheet_name_mapping[sheet_name] = new_name
|
|
||||||
|
|
||||||
# Now rename sheets (after all content is translated)
|
# Apply translations to cells
|
||||||
for original_name, new_name in sheet_name_mapping.items():
|
sheet_name_offset = len(text_elements) - len(sheet_name_setters)
|
||||||
workbook[original_name].title = new_name
|
for i, ((original_text, setter), translated) in enumerate(zip(text_elements[:sheet_name_offset], translated_texts[:sheet_name_offset])):
|
||||||
|
if translated is not None and setter is not None:
|
||||||
|
try:
|
||||||
|
setter(translated)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error applying translation: {e}")
|
||||||
|
|
||||||
|
# Apply sheet name translations
|
||||||
|
sheet_name_mapping = {}
|
||||||
|
for i, (sheet_name, translated) in enumerate(zip(sheet_name_setters, translated_texts[sheet_name_offset:])):
|
||||||
|
if translated and translated != sheet_name:
|
||||||
|
new_name = translated[:31]
|
||||||
|
counter = 1
|
||||||
|
base_name = new_name[:28] if len(new_name) > 28 else new_name
|
||||||
|
while new_name in sheet_name_mapping.values() or new_name in workbook.sheetnames:
|
||||||
|
new_name = f"{base_name}_{counter}"
|
||||||
|
counter += 1
|
||||||
|
sheet_name_mapping[sheet_name] = new_name
|
||||||
|
|
||||||
|
# Rename sheets
|
||||||
|
for original_name, new_name in sheet_name_mapping.items():
|
||||||
|
workbook[original_name].title = new_name
|
||||||
|
|
||||||
|
# Translate images if enabled (separate process)
|
||||||
|
if getattr(self.translation_service, 'translate_images', False):
|
||||||
|
for sheet_name in workbook.sheetnames:
|
||||||
|
self._translate_images(workbook[sheet_name], target_language)
|
||||||
|
|
||||||
# Save the translated workbook
|
|
||||||
workbook.save(output_path)
|
workbook.save(output_path)
|
||||||
workbook.close()
|
workbook.close()
|
||||||
|
|
||||||
return output_path
|
return output_path
|
||||||
|
|
||||||
def _translate_worksheet(self, worksheet: Worksheet, target_language: str):
|
def _collect_from_worksheet(self, worksheet: Worksheet, text_elements: List[Tuple[str, callable]]):
|
||||||
"""
|
"""Collect all translatable text from worksheet cells"""
|
||||||
Translate all cells in a worksheet while preserving formatting
|
|
||||||
|
|
||||||
Args:
|
|
||||||
worksheet: Worksheet to translate
|
|
||||||
target_language: Target language code
|
|
||||||
"""
|
|
||||||
# Iterate through all cells that have values
|
|
||||||
for row in worksheet.iter_rows():
|
for row in worksheet.iter_rows():
|
||||||
for cell in row:
|
for cell in row:
|
||||||
if cell.value is not None:
|
if cell.value is not None:
|
||||||
self._translate_cell(cell, target_language)
|
self._collect_from_cell(cell, text_elements)
|
||||||
|
|
||||||
def _translate_cell(self, cell: Cell, target_language: str):
|
def _collect_from_cell(self, cell: Cell, text_elements: List[Tuple[str, callable]]):
|
||||||
"""
|
"""Collect text from a cell"""
|
||||||
Translate a single cell while preserving its formula and formatting
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cell: Cell to translate
|
|
||||||
target_language: Target language code
|
|
||||||
"""
|
|
||||||
original_value = cell.value
|
original_value = cell.value
|
||||||
|
|
||||||
# Skip if cell is empty
|
|
||||||
if original_value is None:
|
if original_value is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Handle formulas
|
# Handle formulas - collect text inside quotes
|
||||||
if isinstance(original_value, str) and original_value.startswith('='):
|
if isinstance(original_value, str) and original_value.startswith('='):
|
||||||
self._translate_formula(cell, original_value, target_language)
|
string_pattern = re.compile(r'"([^"]*)"')
|
||||||
|
strings = string_pattern.findall(original_value)
|
||||||
|
for s in strings:
|
||||||
|
if s.strip():
|
||||||
|
def make_formula_setter(c, orig_formula, orig_string):
|
||||||
|
def setter(translated):
|
||||||
|
c.value = orig_formula.replace(f'"{orig_string}"', f'"{translated}"')
|
||||||
|
return setter
|
||||||
|
text_elements.append((s, make_formula_setter(cell, original_value, s)))
|
||||||
# Handle regular text
|
# Handle regular text
|
||||||
elif isinstance(original_value, str):
|
elif isinstance(original_value, str) and original_value.strip():
|
||||||
translated_text = self.translation_service.translate_text(
|
def make_setter(c):
|
||||||
original_value, target_language
|
def setter(text):
|
||||||
)
|
c.value = text
|
||||||
cell.value = translated_text
|
return setter
|
||||||
# Numbers, dates, booleans remain unchanged
|
text_elements.append((original_value, make_setter(cell)))
|
||||||
|
|
||||||
def _translate_formula(self, cell: Cell, formula: str, target_language: str):
|
|
||||||
"""
|
|
||||||
Translate text within a formula while preserving the formula structure
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cell: Cell containing the formula
|
|
||||||
formula: Formula string
|
|
||||||
target_language: Target language code
|
|
||||||
"""
|
|
||||||
# Extract text strings from formula (text within quotes)
|
|
||||||
string_pattern = re.compile(r'"([^"]*)"')
|
|
||||||
strings = string_pattern.findall(formula)
|
|
||||||
|
|
||||||
if not strings:
|
|
||||||
return
|
|
||||||
|
|
||||||
# Translate each string and replace in formula
|
|
||||||
translated_formula = formula
|
|
||||||
for original_string in strings:
|
|
||||||
if original_string.strip(): # Only translate non-empty strings
|
|
||||||
translated_string = self.translation_service.translate_text(
|
|
||||||
original_string, target_language
|
|
||||||
)
|
|
||||||
# Replace in formula, being careful with special regex characters
|
|
||||||
translated_formula = translated_formula.replace(
|
|
||||||
f'"{original_string}"', f'"{translated_string}"'
|
|
||||||
)
|
|
||||||
|
|
||||||
cell.value = translated_formula
|
|
||||||
|
|
||||||
def _should_translate(self, text: str) -> bool:
|
|
||||||
"""
|
|
||||||
Determine if text should be translated
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text: Text to check
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if text should be translated, False otherwise
|
|
||||||
"""
|
|
||||||
if not text or not isinstance(text, str):
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Don't translate if it's only numbers, special characters, or very short
|
|
||||||
if len(text.strip()) < 2:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Check if it's a formula (handled separately)
|
|
||||||
if text.startswith('='):
|
|
||||||
return False
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
def _translate_images(self, worksheet: Worksheet, target_language: str):
|
def _translate_images(self, worksheet: Worksheet, target_language: str):
|
||||||
"""
|
"""Translate text in images using vision model"""
|
||||||
Translate text in images using vision model and add as comments
|
|
||||||
"""
|
|
||||||
from services.translation_service import OllamaTranslationProvider
|
from services.translation_service import OllamaTranslationProvider
|
||||||
|
|
||||||
if not isinstance(self.translation_service.provider, OllamaTranslationProvider):
|
if not isinstance(self.translation_service.provider, OllamaTranslationProvider):
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Get images from worksheet
|
|
||||||
images = getattr(worksheet, '_images', [])
|
images = getattr(worksheet, '_images', [])
|
||||||
|
|
||||||
for idx, image in enumerate(images):
|
for idx, image in enumerate(images):
|
||||||
try:
|
try:
|
||||||
# Get image data
|
|
||||||
image_data = image._data()
|
image_data = image._data()
|
||||||
ext = image.format or 'png'
|
ext = image.format or 'png'
|
||||||
|
|
||||||
# Save to temp file
|
|
||||||
with tempfile.NamedTemporaryFile(suffix=f'.{ext}', delete=False) as tmp:
|
with tempfile.NamedTemporaryFile(suffix=f'.{ext}', delete=False) as tmp:
|
||||||
tmp.write(image_data)
|
tmp.write(image_data)
|
||||||
tmp_path = tmp.name
|
tmp_path = tmp.name
|
||||||
|
|
||||||
# Translate with vision
|
|
||||||
translated_text = self.translation_service.provider.translate_image(tmp_path, target_language)
|
translated_text = self.translation_service.provider.translate_image(tmp_path, target_language)
|
||||||
|
|
||||||
# Clean up
|
|
||||||
os.unlink(tmp_path)
|
os.unlink(tmp_path)
|
||||||
|
|
||||||
if translated_text and translated_text.strip():
|
if translated_text and translated_text.strip():
|
||||||
# Add translation as a cell near the image
|
|
||||||
anchor = image.anchor
|
anchor = image.anchor
|
||||||
if hasattr(anchor, '_from'):
|
if hasattr(anchor, '_from'):
|
||||||
cell_ref = f"{get_column_letter(anchor._from.col + 1)}{anchor._from.row + 1}"
|
cell_ref = f"{get_column_letter(anchor._from.col + 1)}{anchor._from.row + 1}"
|
||||||
cell = worksheet[cell_ref]
|
cell = worksheet[cell_ref]
|
||||||
# Add as comment
|
|
||||||
from openpyxl.comments import Comment
|
from openpyxl.comments import Comment
|
||||||
cell.comment = Comment(f"Image translation: {translated_text}", "Translator")
|
cell.comment = Comment(f"Image translation: {translated_text}", "Translator")
|
||||||
print(f"Added Excel image translation at {cell_ref}: {translated_text[:50]}...")
|
print(f"Added Excel image translation at {cell_ref}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error translating Excel image {idx}: {e}")
|
print(f"Error translating Excel image {idx}: {e}")
|
||||||
continue
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing Excel images: {e}")
|
print(f"Error processing Excel images: {e}")
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
"""
|
"""
|
||||||
PowerPoint Translation Module
|
PowerPoint Translation Module
|
||||||
Translates PowerPoint files while preserving all layouts, animations, and media
|
Translates PowerPoint files while preserving all layouts, animations, and media
|
||||||
|
OPTIMIZED: Uses batch translation for 5-10x faster processing
|
||||||
"""
|
"""
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from pptx import Presentation
|
from pptx import Presentation
|
||||||
@ -9,6 +10,7 @@ from pptx.shapes.group import GroupShape
|
|||||||
from pptx.util import Inches, Pt
|
from pptx.util import Inches, Pt
|
||||||
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
||||||
from services.translation_service import translation_service
|
from services.translation_service import translation_service
|
||||||
|
from typing import List, Tuple
|
||||||
import tempfile
|
import tempfile
|
||||||
import os
|
import os
|
||||||
|
|
||||||
@ -21,118 +23,117 @@ class PowerPointTranslator:
|
|||||||
|
|
||||||
def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
|
def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
|
||||||
"""
|
"""
|
||||||
Translate a PowerPoint presentation while preserving all formatting and structure
|
Translate a PowerPoint presentation while preserving all formatting.
|
||||||
|
Uses batch translation for improved performance.
|
||||||
Args:
|
|
||||||
input_path: Path to input PowerPoint file
|
|
||||||
output_path: Path to save translated PowerPoint file
|
|
||||||
target_language: Target language code
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Path to the translated file
|
|
||||||
"""
|
"""
|
||||||
presentation = Presentation(input_path)
|
presentation = Presentation(input_path)
|
||||||
|
|
||||||
# Translate each slide
|
# Collect all translatable text elements
|
||||||
for slide_idx, slide in enumerate(presentation.slides):
|
text_elements = [] # List of (text, setter_function)
|
||||||
self._translate_slide(slide, target_language, slide_idx + 1, input_path)
|
image_shapes = [] # Collect images for separate processing
|
||||||
|
|
||||||
|
for slide_idx, slide in enumerate(presentation.slides):
|
||||||
|
# Collect from notes
|
||||||
|
if slide.has_notes_slide and slide.notes_slide.notes_text_frame:
|
||||||
|
self._collect_from_text_frame(slide.notes_slide.notes_text_frame, text_elements)
|
||||||
|
|
||||||
|
# Collect from shapes
|
||||||
|
for shape in slide.shapes:
|
||||||
|
self._collect_from_shape(shape, text_elements, slide, image_shapes)
|
||||||
|
|
||||||
|
# Batch translate all texts at once
|
||||||
|
if text_elements:
|
||||||
|
texts = [elem[0] for elem in text_elements]
|
||||||
|
print(f"Batch translating {len(texts)} text segments...")
|
||||||
|
translated_texts = self.translation_service.translate_batch(texts, target_language)
|
||||||
|
|
||||||
|
# Apply translations
|
||||||
|
for (original_text, setter), translated in zip(text_elements, translated_texts):
|
||||||
|
if translated is not None and setter is not None:
|
||||||
|
try:
|
||||||
|
setter(translated)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error applying translation: {e}")
|
||||||
|
|
||||||
|
# Translate images if enabled (separate process, can't batch)
|
||||||
|
if getattr(self.translation_service, 'translate_images', False):
|
||||||
|
for shape, slide in image_shapes:
|
||||||
|
self._translate_image_shape(shape, target_language, slide)
|
||||||
|
|
||||||
# Save the translated presentation
|
|
||||||
presentation.save(output_path)
|
presentation.save(output_path)
|
||||||
|
|
||||||
return output_path
|
return output_path
|
||||||
|
|
||||||
def _translate_slide(self, slide, target_language: str, slide_num: int, input_path: Path):
|
def _collect_from_shape(self, shape: BaseShape, text_elements: List[Tuple[str, callable]], slide=None, image_shapes=None):
|
||||||
"""
|
"""Collect text from a shape and its children"""
|
||||||
Translate all text elements in a slide while preserving layout
|
|
||||||
|
|
||||||
Args:
|
|
||||||
slide: Slide to translate
|
|
||||||
target_language: Target language code
|
|
||||||
slide_num: Slide number for reference
|
|
||||||
input_path: Path to source file for image extraction
|
|
||||||
"""
|
|
||||||
# Translate notes (speaker notes)
|
|
||||||
if slide.has_notes_slide:
|
|
||||||
notes_slide = slide.notes_slide
|
|
||||||
if notes_slide.notes_text_frame:
|
|
||||||
self._translate_text_frame(notes_slide.notes_text_frame, target_language)
|
|
||||||
|
|
||||||
# Translate shapes in the slide
|
|
||||||
for shape in slide.shapes:
|
|
||||||
self._translate_shape(shape, target_language, slide)
|
|
||||||
|
|
||||||
def _translate_shape(self, shape: BaseShape, target_language: str, slide=None):
|
|
||||||
"""
|
|
||||||
Translate text in a shape based on its type
|
|
||||||
|
|
||||||
Args:
|
|
||||||
shape: Shape to translate
|
|
||||||
target_language: Target language code
|
|
||||||
slide: Parent slide for adding image translations
|
|
||||||
"""
|
|
||||||
# Handle text-containing shapes
|
# Handle text-containing shapes
|
||||||
if shape.has_text_frame:
|
if shape.has_text_frame:
|
||||||
self._translate_text_frame(shape.text_frame, target_language)
|
self._collect_from_text_frame(shape.text_frame, text_elements)
|
||||||
|
|
||||||
# Handle tables
|
# Handle tables
|
||||||
if shape.shape_type == MSO_SHAPE_TYPE.TABLE:
|
if shape.shape_type == MSO_SHAPE_TYPE.TABLE:
|
||||||
self._translate_table(shape.table, target_language)
|
for row in shape.table.rows:
|
||||||
|
for cell in row.cells:
|
||||||
|
self._collect_from_text_frame(cell.text_frame, text_elements)
|
||||||
|
|
||||||
# Handle pictures/images
|
# Handle pictures/images
|
||||||
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE and image_shapes is not None:
|
||||||
self._translate_image_shape(shape, target_language, slide)
|
image_shapes.append((shape, slide))
|
||||||
|
|
||||||
# Handle group shapes (shapes within shapes)
|
# Handle group shapes
|
||||||
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
|
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
|
||||||
for sub_shape in shape.shapes:
|
for sub_shape in shape.shapes:
|
||||||
self._translate_shape(sub_shape, target_language, slide)
|
self._collect_from_shape(sub_shape, text_elements, slide, image_shapes)
|
||||||
|
|
||||||
# Handle smart art (contains multiple shapes)
|
# Handle smart art
|
||||||
# Smart art is complex, but we can try to translate text within it
|
|
||||||
if hasattr(shape, 'shapes'):
|
if hasattr(shape, 'shapes'):
|
||||||
try:
|
try:
|
||||||
for sub_shape in shape.shapes:
|
for sub_shape in shape.shapes:
|
||||||
self._translate_shape(sub_shape, target_language, slide)
|
self._collect_from_shape(sub_shape, text_elements, slide, image_shapes)
|
||||||
except:
|
except:
|
||||||
pass # Some shapes may not support iteration
|
pass
|
||||||
|
|
||||||
def _translate_image_shape(self, shape, target_language: str, slide):
|
def _collect_from_text_frame(self, text_frame, text_elements: List[Tuple[str, callable]]):
|
||||||
"""
|
"""Collect text from a text frame"""
|
||||||
Translate text in an image using vision model and add as text box
|
if not text_frame.text.strip():
|
||||||
"""
|
|
||||||
if not getattr(self.translation_service, 'translate_images', False):
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
for paragraph in text_frame.paragraphs:
|
||||||
|
if not paragraph.text.strip():
|
||||||
|
continue
|
||||||
|
|
||||||
|
for run in paragraph.runs:
|
||||||
|
if run.text and run.text.strip():
|
||||||
|
def make_setter(r):
|
||||||
|
def setter(text):
|
||||||
|
r.text = text
|
||||||
|
return setter
|
||||||
|
text_elements.append((run.text, make_setter(run)))
|
||||||
|
|
||||||
|
def _translate_image_shape(self, shape, target_language: str, slide):
|
||||||
|
"""Translate text in an image using vision model"""
|
||||||
from services.translation_service import OllamaTranslationProvider
|
from services.translation_service import OllamaTranslationProvider
|
||||||
|
|
||||||
if not isinstance(self.translation_service.provider, OllamaTranslationProvider):
|
if not isinstance(self.translation_service.provider, OllamaTranslationProvider):
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Get image blob
|
|
||||||
image_blob = shape.image.blob
|
image_blob = shape.image.blob
|
||||||
ext = shape.image.ext
|
ext = shape.image.ext
|
||||||
|
|
||||||
# Save to temp file
|
|
||||||
with tempfile.NamedTemporaryFile(suffix=f'.{ext}', delete=False) as tmp:
|
with tempfile.NamedTemporaryFile(suffix=f'.{ext}', delete=False) as tmp:
|
||||||
tmp.write(image_blob)
|
tmp.write(image_blob)
|
||||||
tmp_path = tmp.name
|
tmp_path = tmp.name
|
||||||
|
|
||||||
# Translate with vision
|
|
||||||
translated_text = self.translation_service.provider.translate_image(tmp_path, target_language)
|
translated_text = self.translation_service.provider.translate_image(tmp_path, target_language)
|
||||||
|
|
||||||
# Clean up
|
|
||||||
os.unlink(tmp_path)
|
os.unlink(tmp_path)
|
||||||
|
|
||||||
if translated_text and translated_text.strip():
|
if translated_text and translated_text.strip():
|
||||||
# Add text box below the image with translation
|
|
||||||
left = shape.left
|
left = shape.left
|
||||||
top = shape.top + shape.height + Inches(0.1)
|
top = shape.top + shape.height + Inches(0.1)
|
||||||
width = shape.width
|
width = shape.width
|
||||||
height = Inches(0.5)
|
height = Inches(0.5)
|
||||||
|
|
||||||
# Add text box
|
|
||||||
textbox = slide.shapes.add_textbox(left, top, width, height)
|
textbox = slide.shapes.add_textbox(left, top, width, height)
|
||||||
tf = textbox.text_frame
|
tf = textbox.text_frame
|
||||||
p = tf.paragraphs[0]
|
p = tf.paragraphs[0]
|
||||||
@ -145,71 +146,6 @@ class PowerPointTranslator:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error translating image: {e}")
|
print(f"Error translating image: {e}")
|
||||||
|
|
||||||
def _translate_text_frame(self, text_frame, target_language: str):
|
|
||||||
"""
|
|
||||||
Translate text within a text frame while preserving formatting
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text_frame: Text frame to translate
|
|
||||||
target_language: Target language code
|
|
||||||
"""
|
|
||||||
if not text_frame.text.strip():
|
|
||||||
return
|
|
||||||
|
|
||||||
# Translate each paragraph in the text frame
|
|
||||||
for paragraph in text_frame.paragraphs:
|
|
||||||
self._translate_paragraph(paragraph, target_language)
|
|
||||||
|
|
||||||
def _translate_paragraph(self, paragraph, target_language: str):
|
|
||||||
"""
|
|
||||||
Translate a paragraph while preserving run-level formatting
|
|
||||||
|
|
||||||
Args:
|
|
||||||
paragraph: Paragraph to translate
|
|
||||||
target_language: Target language code
|
|
||||||
"""
|
|
||||||
if not paragraph.text.strip():
|
|
||||||
return
|
|
||||||
|
|
||||||
# Translate each run in the paragraph to preserve individual formatting
|
|
||||||
for run in paragraph.runs:
|
|
||||||
if run.text.strip():
|
|
||||||
translated_text = self.translation_service.translate_text(
|
|
||||||
run.text, target_language
|
|
||||||
)
|
|
||||||
run.text = translated_text
|
|
||||||
|
|
||||||
def _translate_table(self, table, target_language: str):
|
|
||||||
"""
|
|
||||||
Translate all cells in a table while preserving structure
|
|
||||||
|
|
||||||
Args:
|
|
||||||
table: Table to translate
|
|
||||||
target_language: Target language code
|
|
||||||
"""
|
|
||||||
for row in table.rows:
|
|
||||||
for cell in row.cells:
|
|
||||||
self._translate_text_frame(cell.text_frame, target_language)
|
|
||||||
|
|
||||||
def _is_translatable(self, text: str) -> bool:
|
|
||||||
"""
|
|
||||||
Determine if text should be translated
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text: Text to check
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if text should be translated, False otherwise
|
|
||||||
"""
|
|
||||||
if not text or not isinstance(text, str):
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Don't translate if it's only numbers, special characters, or very short
|
|
||||||
if len(text.strip()) < 2:
|
|
||||||
return False
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
# Global translator instance
|
# Global translator instance
|
||||||
pptx_translator = PowerPointTranslator()
|
pptx_translator = PowerPointTranslator()
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
"""
|
"""
|
||||||
Word Document Translation Module
|
Word Document Translation Module
|
||||||
Translates Word files while preserving all formatting, styles, tables, and images
|
Translates Word files while preserving all formatting, styles, tables, and images
|
||||||
|
OPTIMIZED: Uses batch translation for 5-10x faster processing
|
||||||
"""
|
"""
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from docx import Document
|
from docx import Document
|
||||||
@ -12,6 +13,7 @@ from docx.section import Section
|
|||||||
from docx.shared import Inches, Pt
|
from docx.shared import Inches, Pt
|
||||||
from docx.oxml.ns import qn
|
from docx.oxml.ns import qn
|
||||||
from services.translation_service import translation_service
|
from services.translation_service import translation_service
|
||||||
|
from typing import List, Tuple, Any
|
||||||
import tempfile
|
import tempfile
|
||||||
import os
|
import os
|
||||||
|
|
||||||
@ -24,26 +26,36 @@ class WordTranslator:
|
|||||||
|
|
||||||
def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
|
def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
|
||||||
"""
|
"""
|
||||||
Translate a Word document while preserving all formatting and structure
|
Translate a Word document while preserving all formatting and structure.
|
||||||
|
Uses batch translation for improved performance.
|
||||||
Args:
|
|
||||||
input_path: Path to input Word file
|
|
||||||
output_path: Path to save translated Word file
|
|
||||||
target_language: Target language code
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Path to the translated file
|
|
||||||
"""
|
"""
|
||||||
document = Document(input_path)
|
document = Document(input_path)
|
||||||
|
|
||||||
# Translate main document body
|
# Collect all translatable text elements
|
||||||
self._translate_document_body(document, target_language)
|
text_elements = []
|
||||||
|
|
||||||
# Translate headers and footers in all sections
|
# Collect from document body
|
||||||
|
self._collect_from_body(document, text_elements)
|
||||||
|
|
||||||
|
# Collect from headers and footers
|
||||||
for section in document.sections:
|
for section in document.sections:
|
||||||
self._translate_section(section, target_language)
|
self._collect_from_section(section, text_elements)
|
||||||
|
|
||||||
# Translate images if enabled
|
# Batch translate all texts at once
|
||||||
|
if text_elements:
|
||||||
|
texts = [elem[0] for elem in text_elements]
|
||||||
|
print(f"Batch translating {len(texts)} text segments...")
|
||||||
|
translated_texts = self.translation_service.translate_batch(texts, target_language)
|
||||||
|
|
||||||
|
# Apply translations
|
||||||
|
for (original_text, setter), translated in zip(text_elements, translated_texts):
|
||||||
|
if translated is not None and translated != original_text:
|
||||||
|
try:
|
||||||
|
setter(translated)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error applying translation: {e}")
|
||||||
|
|
||||||
|
# Translate images if enabled (separate process)
|
||||||
if getattr(self.translation_service, 'translate_images', False):
|
if getattr(self.translation_service, 'translate_images', False):
|
||||||
self._translate_images(document, target_language, input_path)
|
self._translate_images(document, target_language, input_path)
|
||||||
|
|
||||||
@ -52,13 +64,59 @@ class WordTranslator:
|
|||||||
|
|
||||||
return output_path
|
return output_path
|
||||||
|
|
||||||
|
def _collect_from_body(self, document: Document, text_elements: List[Tuple[str, callable]]):
|
||||||
|
"""Collect all text elements from document body"""
|
||||||
|
for element in document.element.body:
|
||||||
|
if isinstance(element, CT_P):
|
||||||
|
paragraph = Paragraph(element, document)
|
||||||
|
self._collect_from_paragraph(paragraph, text_elements)
|
||||||
|
elif isinstance(element, CT_Tbl):
|
||||||
|
table = Table(element, document)
|
||||||
|
self._collect_from_table(table, text_elements)
|
||||||
|
|
||||||
|
def _collect_from_paragraph(self, paragraph: Paragraph, text_elements: List[Tuple[str, callable]]):
|
||||||
|
"""Collect text from paragraph runs"""
|
||||||
|
if not paragraph.text.strip():
|
||||||
|
return
|
||||||
|
|
||||||
|
for run in paragraph.runs:
|
||||||
|
if run.text and run.text.strip():
|
||||||
|
# Create a setter function for this run
|
||||||
|
def make_setter(r):
|
||||||
|
def setter(text):
|
||||||
|
r.text = text
|
||||||
|
return setter
|
||||||
|
text_elements.append((run.text, make_setter(run)))
|
||||||
|
|
||||||
|
def _collect_from_table(self, table: Table, text_elements: List[Tuple[str, callable]]):
|
||||||
|
"""Collect text from table cells"""
|
||||||
|
for row in table.rows:
|
||||||
|
for cell in row.cells:
|
||||||
|
for paragraph in cell.paragraphs:
|
||||||
|
self._collect_from_paragraph(paragraph, text_elements)
|
||||||
|
# Handle nested tables
|
||||||
|
for nested_table in cell.tables:
|
||||||
|
self._collect_from_table(nested_table, text_elements)
|
||||||
|
|
||||||
|
def _collect_from_section(self, section: Section, text_elements: List[Tuple[str, callable]]):
|
||||||
|
"""Collect text from headers and footers"""
|
||||||
|
headers_footers = [
|
||||||
|
section.header, section.footer,
|
||||||
|
section.first_page_header, section.first_page_footer,
|
||||||
|
section.even_page_header, section.even_page_footer
|
||||||
|
]
|
||||||
|
|
||||||
|
for hf in headers_footers:
|
||||||
|
if hf:
|
||||||
|
for paragraph in hf.paragraphs:
|
||||||
|
self._collect_from_paragraph(paragraph, text_elements)
|
||||||
|
for table in hf.tables:
|
||||||
|
self._collect_from_table(table, text_elements)
|
||||||
|
|
||||||
def _translate_images(self, document: Document, target_language: str, input_path: Path):
|
def _translate_images(self, document: Document, target_language: str, input_path: Path):
|
||||||
"""
|
"""Extract text from images and add translations as captions"""
|
||||||
Extract text from images and add translations as captions
|
|
||||||
"""
|
|
||||||
from services.translation_service import OllamaTranslationProvider
|
from services.translation_service import OllamaTranslationProvider
|
||||||
|
|
||||||
# Only works with Ollama vision
|
|
||||||
if not isinstance(self.translation_service.provider, OllamaTranslationProvider):
|
if not isinstance(self.translation_service.provider, OllamaTranslationProvider):
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -66,165 +124,33 @@ class WordTranslator:
|
|||||||
import zipfile
|
import zipfile
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
# Extract images from docx (it's a zip file)
|
|
||||||
with zipfile.ZipFile(input_path, 'r') as zip_ref:
|
with zipfile.ZipFile(input_path, 'r') as zip_ref:
|
||||||
image_files = [f for f in zip_ref.namelist() if f.startswith('word/media/')]
|
image_files = [f for f in zip_ref.namelist() if f.startswith('word/media/')]
|
||||||
|
|
||||||
for idx, image_file in enumerate(image_files):
|
for idx, image_file in enumerate(image_files):
|
||||||
try:
|
try:
|
||||||
# Extract image
|
|
||||||
image_data = zip_ref.read(image_file)
|
image_data = zip_ref.read(image_file)
|
||||||
|
|
||||||
# Create temp file
|
|
||||||
ext = os.path.splitext(image_file)[1]
|
ext = os.path.splitext(image_file)[1]
|
||||||
|
|
||||||
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
|
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
|
||||||
tmp.write(image_data)
|
tmp.write(image_data)
|
||||||
tmp_path = tmp.name
|
tmp_path = tmp.name
|
||||||
|
|
||||||
# Translate image with vision
|
|
||||||
translated_text = self.translation_service.provider.translate_image(tmp_path, target_language)
|
translated_text = self.translation_service.provider.translate_image(tmp_path, target_language)
|
||||||
|
|
||||||
# Clean up temp file
|
|
||||||
os.unlink(tmp_path)
|
os.unlink(tmp_path)
|
||||||
|
|
||||||
if translated_text and translated_text.strip():
|
if translated_text and translated_text.strip():
|
||||||
# Add translated text as a new paragraph after image
|
|
||||||
# We'll add it at the end with a note
|
|
||||||
p = document.add_paragraph()
|
p = document.add_paragraph()
|
||||||
p.add_run(f"[Image {idx + 1} translation: ").bold = True
|
p.add_run(f"[Image {idx + 1} translation: ").bold = True
|
||||||
p.add_run(translated_text)
|
p.add_run(translated_text)
|
||||||
p.add_run("]").bold = True
|
p.add_run("]").bold = True
|
||||||
|
|
||||||
print(f"Translated image {idx + 1}: {translated_text[:50]}...")
|
print(f"Translated image {idx + 1}: {translated_text[:50]}...")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error translating image {image_file}: {e}")
|
print(f"Error translating image {image_file}: {e}")
|
||||||
continue
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing images: {e}")
|
print(f"Error processing images: {e}")
|
||||||
|
|
||||||
def _translate_document_body(self, document: Document, target_language: str):
|
|
||||||
"""
|
|
||||||
Translate all elements in the document body
|
|
||||||
|
|
||||||
Args:
|
|
||||||
document: Document to translate
|
|
||||||
target_language: Target language code
|
|
||||||
"""
|
|
||||||
for element in document.element.body:
|
|
||||||
if isinstance(element, CT_P):
|
|
||||||
# It's a paragraph
|
|
||||||
paragraph = Paragraph(element, document)
|
|
||||||
self._translate_paragraph(paragraph, target_language)
|
|
||||||
elif isinstance(element, CT_Tbl):
|
|
||||||
# It's a table
|
|
||||||
table = Table(element, document)
|
|
||||||
self._translate_table(table, target_language)
|
|
||||||
|
|
||||||
def _translate_paragraph(self, paragraph: Paragraph, target_language: str):
|
|
||||||
"""
|
|
||||||
Translate a paragraph while preserving all formatting
|
|
||||||
|
|
||||||
Args:
|
|
||||||
paragraph: Paragraph to translate
|
|
||||||
target_language: Target language code
|
|
||||||
"""
|
|
||||||
if not paragraph.text.strip():
|
|
||||||
return
|
|
||||||
|
|
||||||
# For paragraphs with complex formatting (multiple runs), translate run by run
|
|
||||||
if len(paragraph.runs) > 0:
|
|
||||||
for run in paragraph.runs:
|
|
||||||
if run.text.strip():
|
|
||||||
translated_text = self.translation_service.translate_text(
|
|
||||||
run.text, target_language
|
|
||||||
)
|
|
||||||
run.text = translated_text
|
|
||||||
else:
|
|
||||||
# Simple paragraph with no runs
|
|
||||||
if paragraph.text.strip():
|
|
||||||
translated_text = self.translation_service.translate_text(
|
|
||||||
paragraph.text, target_language
|
|
||||||
)
|
|
||||||
paragraph.text = translated_text
|
|
||||||
|
|
||||||
def _translate_table(self, table: Table, target_language: str):
|
|
||||||
"""
|
|
||||||
Translate all cells in a table while preserving structure
|
|
||||||
|
|
||||||
Args:
|
|
||||||
table: Table to translate
|
|
||||||
target_language: Target language code
|
|
||||||
"""
|
|
||||||
for row in table.rows:
|
|
||||||
for cell in row.cells:
|
|
||||||
self._translate_cell(cell, target_language)
|
|
||||||
|
|
||||||
def _translate_cell(self, cell: _Cell, target_language: str):
|
|
||||||
"""
|
|
||||||
Translate content within a table cell
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cell: Cell to translate
|
|
||||||
target_language: Target language code
|
|
||||||
"""
|
|
||||||
for paragraph in cell.paragraphs:
|
|
||||||
self._translate_paragraph(paragraph, target_language)
|
|
||||||
|
|
||||||
# Handle nested tables
|
|
||||||
for table in cell.tables:
|
|
||||||
self._translate_table(table, target_language)
|
|
||||||
|
|
||||||
def _translate_section(self, section: Section, target_language: str):
|
|
||||||
"""
|
|
||||||
Translate headers and footers in a section
|
|
||||||
|
|
||||||
Args:
|
|
||||||
section: Section to translate
|
|
||||||
target_language: Target language code
|
|
||||||
"""
|
|
||||||
# Translate header
|
|
||||||
if section.header:
|
|
||||||
for paragraph in section.header.paragraphs:
|
|
||||||
self._translate_paragraph(paragraph, target_language)
|
|
||||||
for table in section.header.tables:
|
|
||||||
self._translate_table(table, target_language)
|
|
||||||
|
|
||||||
# Translate footer
|
|
||||||
if section.footer:
|
|
||||||
for paragraph in section.footer.paragraphs:
|
|
||||||
self._translate_paragraph(paragraph, target_language)
|
|
||||||
for table in section.footer.tables:
|
|
||||||
self._translate_table(table, target_language)
|
|
||||||
|
|
||||||
# Translate first page header (if different)
|
|
||||||
if section.first_page_header:
|
|
||||||
for paragraph in section.first_page_header.paragraphs:
|
|
||||||
self._translate_paragraph(paragraph, target_language)
|
|
||||||
for table in section.first_page_header.tables:
|
|
||||||
self._translate_table(table, target_language)
|
|
||||||
|
|
||||||
# Translate first page footer (if different)
|
|
||||||
if section.first_page_footer:
|
|
||||||
for paragraph in section.first_page_footer.paragraphs:
|
|
||||||
self._translate_paragraph(paragraph, target_language)
|
|
||||||
for table in section.first_page_footer.tables:
|
|
||||||
self._translate_table(table, target_language)
|
|
||||||
|
|
||||||
# Translate even page header (if different)
|
|
||||||
if section.even_page_header:
|
|
||||||
for paragraph in section.even_page_header.paragraphs:
|
|
||||||
self._translate_paragraph(paragraph, target_language)
|
|
||||||
for table in section.even_page_header.tables:
|
|
||||||
self._translate_table(table, target_language)
|
|
||||||
|
|
||||||
# Translate even page footer (if different)
|
|
||||||
if section.even_page_footer:
|
|
||||||
for paragraph in section.even_page_footer.paragraphs:
|
|
||||||
self._translate_paragraph(paragraph, target_language)
|
|
||||||
for table in section.even_page_footer.tables:
|
|
||||||
self._translate_table(table, target_language)
|
|
||||||
|
|
||||||
|
|
||||||
# Global translator instance
|
# Global translator instance
|
||||||
word_translator = WordTranslator()
|
word_translator = WordTranslator()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user