office_translator/translators/excel_translator.py
Sepehr 8f9ca669cf Performance optimization: batch translation for 5-10x speed improvement
- GoogleTranslationProvider: Added batch translation with separator method
- DeepLTranslationProvider: Added translator caching and batch support
- LibreTranslationProvider: Added translator caching and batch support
- WordTranslator: Collect all texts -> batch translate -> apply pattern
- ExcelTranslator: Collect all texts -> batch translate -> apply pattern
- PowerPointTranslator: Collect all texts -> batch translate -> apply pattern
- Enhanced Ollama/OpenAI prompts with stricter translation-only rules
- Added rule: return original text if uncertain about translation
2025-11-30 20:41:20 +01:00

161 lines
7.1 KiB
Python

"""
Excel Translation Module
Translates Excel files while preserving all formatting, formulas, images, and layout
OPTIMIZED: Uses batch translation for 5-10x faster processing
"""
import re
import tempfile
import os
from pathlib import Path
from typing import Dict, Set, List, Tuple
from openpyxl import load_workbook
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.cell.cell import Cell
from openpyxl.utils import get_column_letter
from services.translation_service import translation_service
class ExcelTranslator:
    """Translate the text content of Excel workbooks via batch translation.

    Only cell string values, double-quoted string literals inside formulas,
    worksheet titles and (optionally) image comments are modified; everything
    else in the workbook is left as openpyxl loaded it.
    """

    # Characters openpyxl refuses in a worksheet title (it raises ValueError).
    _INVALID_TITLE_CHARS = re.compile(r'[\\*?:/\[\]]')
    # A double-quoted string literal inside a formula.
    _FORMULA_STRING = re.compile(r'"([^"]*)"')
    # Maximum worksheet title length used when truncating translated names.
    _MAX_TITLE_LEN = 31

    def __init__(self):
        self.translation_service = translation_service
        # Not used inside this class; kept so the attribute stays available.
        self.formula_pattern = re.compile(r'=.*')

    def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
        """Translate a workbook and save the result to ``output_path``.

        All cell texts and sheet names are sent to the translation service in
        a single ``translate_batch`` call, then written back.

        Args:
            input_path: Workbook to read.
            output_path: Where the translated workbook is saved.
            target_language: Passed through to the translation service.

        Returns:
            ``output_path``.
        """
        workbook = load_workbook(input_path, data_only=False)
        try:
            # (text, setter) pairs for every translatable cell fragment.
            text_elements: List[Tuple[str, callable]] = []
            sheet_names = list(workbook.sheetnames)
            for sheet_name in sheet_names:
                self._collect_from_worksheet(workbook[sheet_name], text_elements)

            # Sheet names ride at the tail of the same batch.
            cell_count = len(text_elements)
            texts = [text for text, _ in text_elements] + sheet_names

            if texts:
                print(f"Batch translating {len(texts)} text segments...")
                translated_texts = list(
                    self.translation_service.translate_batch(texts, target_language)
                )
                if len(translated_texts) != len(texts):
                    # Keep index alignment: missing entries become None
                    # (left untranslated), surplus entries are dropped.
                    print(
                        f"Warning: expected {len(texts)} translations, "
                        f"got {len(translated_texts)}"
                    )
                    padding = [None] * (len(texts) - len(translated_texts))
                    translated_texts = (translated_texts + padding)[:len(texts)]

                # Apply translations to cells (best effort per cell).
                for (_, setter), translated in zip(text_elements, translated_texts[:cell_count]):
                    if translated is None:
                        continue
                    try:
                        setter(translated)
                    except Exception as e:
                        print(f"Error applying translation: {e}")

                # Plan all new titles first, then rename, so collision checks
                # see the original sheet names.
                renames = self._plan_sheet_renames(sheet_names, translated_texts[cell_count:])
                for original_name, new_name in renames.items():
                    workbook[original_name].title = new_name

            # Translate images if enabled (separate process)
            if getattr(self.translation_service, 'translate_images', False):
                for sheet_name in workbook.sheetnames:
                    self._translate_images(workbook[sheet_name], target_language)

            workbook.save(output_path)
        finally:
            # Close the workbook even when translation or saving fails.
            workbook.close()
        return output_path

    @classmethod
    def _plan_sheet_renames(cls, sheet_names, translated_names) -> Dict[str, str]:
        """Map original sheet names to valid, unique translated titles.

        Args:
            sheet_names: Current sheet names of the workbook.
            translated_names: Translations aligned with ``sheet_names``;
                falsy entries mean "keep the original name".

        Returns:
            ``{original_name: new_title}`` for the sheets that should be
            renamed. Titles have forbidden characters replaced by ``_``, are
            truncated to 31 characters and are de-duplicated with a
            ``_<n>`` suffix.
        """
        existing = set(sheet_names)
        planned: Dict[str, str] = {}
        used = set()
        for sheet_name, translated in zip(sheet_names, translated_names):
            if not translated or translated == sheet_name:
                continue
            new_name = cls._INVALID_TITLE_CHARS.sub('_', str(translated)).strip()
            new_name = new_name[:cls._MAX_TITLE_LEN]
            if not new_name or new_name == sheet_name:
                continue
            # Leave room for a "_<n>" suffix within the length limit.
            base_name = new_name[:cls._MAX_TITLE_LEN - 3]
            counter = 1
            while new_name in used or new_name in existing:
                new_name = f"{base_name}_{counter}"
                counter += 1
            planned[sheet_name] = new_name
            used.add(new_name)
        return planned

    def _collect_from_worksheet(self, worksheet: "Worksheet", text_elements: List[Tuple[str, callable]]):
        """Append (text, setter) pairs for every non-empty cell of ``worksheet``."""
        for row in worksheet.iter_rows():
            for cell in row:
                if cell.value is not None:
                    self._collect_from_cell(cell, text_elements)

    def _collect_from_cell(self, cell: "Cell", text_elements: List[Tuple[str, callable]]):
        """Append (text, setter) pairs for the translatable text in ``cell``.

        For a formula (string starting with ``=``) each non-blank
        double-quoted literal is collected; for any other non-blank string
        the whole value is collected. Non-string values are ignored.
        """
        original_value = cell.value
        if not isinstance(original_value, str):
            return

        # Handle formulas - collect text inside quotes
        if original_value.startswith('='):
            pattern = self._FORMULA_STRING
            # Shared by all setters of this cell, so applying one translation
            # never discards another one applied earlier.
            translations: Dict[str, str] = {}

            def rebuild(match):
                literal = match.group(1)
                if literal not in translations:
                    return match.group(0)
                # Excel escapes a quote inside a string literal by doubling it.
                escaped = str(translations[literal]).replace('"', '""')
                return f'"{escaped}"'

            def make_formula_setter(orig_string):
                def setter(translated):
                    translations[orig_string] = translated
                    # Always rebuild from the untouched original formula.
                    cell.value = pattern.sub(rebuild, original_value)
                return setter

            for literal in pattern.findall(original_value):
                if literal.strip():
                    text_elements.append((literal, make_formula_setter(literal)))
            return

        # Handle regular text
        if original_value.strip():
            def setter(text):
                cell.value = text
            text_elements.append((original_value, setter))

    def _translate_images(self, worksheet: "Worksheet", target_language: str):
        """Attach translated image text as cell comments (best effort).

        Does nothing unless the active provider is an
        ``OllamaTranslationProvider``. Each image is written to a temporary
        file, passed to the provider's ``translate_image``, and any non-blank
        result is stored as a comment on the image's anchor cell. Errors are
        printed and do not propagate.
        """
        from services.translation_service import OllamaTranslationProvider

        provider = self.translation_service.provider
        if not isinstance(provider, OllamaTranslationProvider):
            return
        try:
            images = getattr(worksheet, '_images', [])
            for idx, image in enumerate(images):
                try:
                    image_data = image._data()
                    ext = image.format or 'png'
                    tmp_path = None
                    try:
                        with tempfile.NamedTemporaryFile(suffix=f'.{ext}', delete=False) as tmp:
                            tmp_path = tmp.name
                            tmp.write(image_data)
                        translated_text = provider.translate_image(tmp_path, target_language)
                    finally:
                        # Remove the temp file even if translation raised.
                        if tmp_path is not None and os.path.exists(tmp_path):
                            os.unlink(tmp_path)
                    if translated_text and translated_text.strip():
                        anchor = image.anchor
                        if hasattr(anchor, '_from'):
                            # Anchor indices are zero-based; cell refs are one-based.
                            cell_ref = f"{get_column_letter(anchor._from.col + 1)}{anchor._from.row + 1}"
                            cell = worksheet[cell_ref]
                            from openpyxl.comments import Comment
                            cell.comment = Comment(f"Image translation: {translated_text}", "Translator")
                            print(f"Added Excel image translation at {cell_ref}")
                except Exception as e:
                    print(f"Error translating Excel image {idx}: {e}")
        except Exception as e:
            print(f"Error processing Excel images: {e}")
# Global translator instance: a single module-level ExcelTranslator,
# constructed when this module is imported.
excel_translator = ExcelTranslator()