# office_translator/translators/excel_translator.py
#
# 216 lines
# 8.2 KiB
# Python

"""
Excel Translation Module
Translates Excel files while preserving all formatting, formulas, images, and layout
"""
import re
import tempfile
import os
from pathlib import Path
from typing import Dict, Set
from openpyxl import load_workbook
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.cell.cell import Cell
from openpyxl.utils import get_column_letter
from services.translation_service import translation_service
class ExcelTranslator:
    """Translate the text content of Excel workbooks through openpyxl.

    Plain-text cells, double-quoted string literals inside formulas and
    worksheet titles are sent through ``translation_service``. Non-string
    cell values are never modified by this class.
    """

    # Excel's limit on the length of a worksheet title.
    MAX_SHEET_TITLE_LENGTH = 31

    # Characters Excel forbids in worksheet titles (openpyxl rejects them).
    _INVALID_TITLE_CHARS = re.compile(r'[\\/*?:\[\]]')

    # A double-quoted formula string literal; a doubled quote ("") inside the
    # literal is Excel's escape for one literal quote character.
    _FORMULA_STRING = re.compile(r'"((?:[^"]|"")*)"')

    def __init__(self):
        self.translation_service = translation_service
        # Not used by any method of this class; kept because it is a public
        # attribute that other code may read.
        self.formula_pattern = re.compile(r'=.*')

    def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
        """
        Translate an Excel file and save the result to a new path.

        Every worksheet's cells are translated first; sheet titles are only
        renamed afterwards, so lookups by the original title keep working
        while the content pass is running.

        Args:
            input_path: Path to input Excel file
            output_path: Path to save translated Excel file
            target_language: Target language code

        Returns:
            Path to the translated file (``output_path``)
        """
        # data_only=False keeps formulas as formulas instead of cached values.
        workbook = load_workbook(input_path, data_only=False)
        try:
            translate_images = getattr(self.translation_service, 'translate_images', False)
            sheet_name_mapping = {}

            for sheet_name in workbook.sheetnames:
                worksheet = workbook[sheet_name]
                self._translate_worksheet(worksheet, target_language)

                if translate_images:
                    self._translate_images(worksheet, target_language)

                # Work out the translated title now, but do not rename yet.
                translated_sheet_name = self.translation_service.translate_text(
                    sheet_name, target_language
                )
                if (
                    not isinstance(translated_sheet_name, str)
                    or not translated_sheet_name
                    or translated_sheet_name == sheet_name
                ):
                    continue

                # Names already planned for other sheets and names currently
                # in the workbook are both off limits.
                taken = set(sheet_name_mapping.values()) | set(workbook.sheetnames)
                new_name = self._make_sheet_title(translated_sheet_name, taken)
                if new_name:
                    sheet_name_mapping[sheet_name] = new_name

            # NOTE(review): formulas that reference a sheet by its old title
            # are not rewritten here - confirm whether that needs handling.
            for original_name, new_name in sheet_name_mapping.items():
                workbook[original_name].title = new_name

            workbook.save(output_path)
        finally:
            # Release the workbook even when translation or saving fails.
            workbook.close()
        return output_path

    @classmethod
    def _make_sheet_title(cls, candidate: str, taken: Set[str]) -> str:
        """
        Build a worksheet title from a translated name.

        Forbidden characters are replaced with ``_``, runs of whitespace
        (including line breaks) are collapsed to single spaces, and the
        result is cut to ``MAX_SHEET_TITLE_LENGTH``. If the title collides
        with a name in ``taken`` (compared case-insensitively), a ``_N``
        suffix is appended and the base is shortened so the total length
        never exceeds the limit.

        Args:
            candidate: Translated sheet name as returned by the service
            taken: Titles that must not be reused

        Returns:
            A usable title, or an empty string if nothing usable remains
        """
        limit = cls.MAX_SHEET_TITLE_LENGTH
        cleaned = cls._INVALID_TITLE_CHARS.sub('_', candidate)
        cleaned = ' '.join(cleaned.split())
        base = cleaned[:limit].rstrip()
        if not base:
            return ''

        used = {name.casefold() for name in taken}
        title = base
        counter = 1
        while title.casefold() in used:
            suffix = f"_{counter}"
            title = f"{base[:limit - len(suffix)].rstrip()}{suffix}"
            counter += 1
        return title

    def _translate_worksheet(self, worksheet: Worksheet, target_language: str):
        """
        Translate every non-empty cell of a worksheet in place.

        Args:
            worksheet: Worksheet to translate
            target_language: Target language code
        """
        for row in worksheet.iter_rows():
            for cell in row:
                if cell.value is not None:
                    self._translate_cell(cell, target_language)

    def _translate_cell(self, cell: Cell, target_language: str):
        """
        Translate a single cell in place.

        Strings starting with ``=`` are handled as formulas; other strings
        are translated as a whole. Non-string values (None, numbers, dates,
        booleans) are left unchanged.

        Args:
            cell: Cell to translate
            target_language: Target language code
        """
        original_value = cell.value
        if not isinstance(original_value, str):
            return

        if original_value.startswith('='):
            self._translate_formula(cell, original_value, target_language)
            return

        # Whitespace-only text has nothing to translate.
        if not original_value.strip():
            return

        translated_text = self.translation_service.translate_text(
            original_value, target_language
        )
        # Keep the original text when the service returns nothing usable,
        # rather than wiping the cell.
        if isinstance(translated_text, str) and translated_text:
            cell.value = translated_text

    def _translate_formula(self, cell: Cell, formula: str, target_language: str):
        """
        Translate the string literals of a formula, leaving the rest intact.

        Each double-quoted literal is unescaped (``""`` -> ``"``), translated,
        re-escaped and substituted at its own position in a single pass, so a
        translation result is never picked up and translated again. Empty or
        whitespace-only literals, and literals for which the service returns
        nothing usable, are left exactly as written.

        Args:
            cell: Cell containing the formula
            formula: Formula string
            target_language: Target language code
        """
        # Translate each distinct literal only once per formula.
        cache = {}

        def replace_literal(match):
            text = match.group(1).replace('""', '"')
            if not text.strip():
                return match.group(0)
            if text not in cache:
                cache[text] = self.translation_service.translate_text(
                    text, target_language
                )
            translated = cache[text]
            if not isinstance(translated, str) or not translated:
                return match.group(0)
            # Quotes inside a formula string must be doubled again.
            return '"' + translated.replace('"', '""') + '"'

        translated_formula = self._FORMULA_STRING.sub(replace_literal, formula)
        if translated_formula != formula:
            cell.value = translated_formula

    def _should_translate(self, text: str) -> bool:
        """
        Determine if text should be translated

        Args:
            text: Text to check

        Returns:
            True if text should be translated, False otherwise
        """
        if not text or not isinstance(text, str):
            return False
        # Fewer than two characters after stripping whitespace: skip.
        if len(text.strip()) < 2:
            return False
        # Formulas are handled separately.
        if text.startswith('='):
            return False
        return True

    def _translate_images(self, worksheet: Worksheet, target_language: str):
        """
        Translate text in a worksheet's images and attach it as cell comments.

        Does nothing unless the translation service's provider is an
        ``OllamaTranslationProvider``. Failures are reported with ``print``
        and do not stop processing: an error on one image skips only that
        image.

        Args:
            worksheet: Worksheet whose images are processed
            target_language: Target language code
        """
        from services.translation_service import OllamaTranslationProvider

        provider = self.translation_service.provider
        if not isinstance(provider, OllamaTranslationProvider):
            return

        try:
            images = getattr(worksheet, '_images', [])
            for idx, image in enumerate(images):
                try:
                    self._annotate_image(worksheet, image, provider, target_language)
                except Exception as e:
                    print(f"Error translating Excel image {idx}: {e}")
                    continue
        except Exception as e:
            print(f"Error processing Excel images: {e}")

    def _annotate_image(self, worksheet: Worksheet, image, provider, target_language: str):
        """Translate one image via a temp file and comment its anchor cell."""
        from openpyxl.comments import Comment

        image_data = image._data()
        ext = image.format or 'png'

        # delete=False so the provider can open the file by path after it is
        # closed; the finally block guarantees it is removed again.
        tmp = tempfile.NamedTemporaryFile(suffix=f'.{ext}', delete=False)
        tmp_path = tmp.name
        try:
            with tmp:
                tmp.write(image_data)
            translated_text = provider.translate_image(tmp_path, target_language)
        finally:
            os.unlink(tmp_path)

        if not translated_text or not translated_text.strip():
            return

        anchor = image.anchor
        if not hasattr(anchor, '_from'):
            return

        # The anchor's col/row are incremented by one to build the A1-style
        # reference used to look up the cell.
        cell_ref = f"{get_column_letter(anchor._from.col + 1)}{anchor._from.row + 1}"
        cell = worksheet[cell_ref]
        cell.comment = Comment(f"Image translation: {translated_text}", "Translator")
        print(f"Added Excel image translation at {cell_ref}: {translated_text[:50]}...")
# Global translator instance: a single ExcelTranslator constructed when this
# module is imported and exposed under the module-level name below.
excel_translator = ExcelTranslator()