Enhance Excel translation functionality with robust formatting and error handling

This commit is contained in:
sepehr 2025-04-06 21:58:19 +02:00
parent 11b85d6111
commit cc1decc9ed

323
main.py
View File

@ -5,130 +5,279 @@ import os
from tqdm import tqdm from tqdm import tqdm
import copy import copy
import re import re
import shutil
import logging
import xml.etree.ElementTree as ET
import zipfile
import tempfile
from openpyxl.utils import get_column_letter from openpyxl.utils import get_column_letter
from openpyxl.styles import PatternFill, Border, Side, Alignment, Protection, Font
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
async def translate_text(translator, text, target_language):
    """Translate text to the target language, falling back to the original.

    Args:
        translator: Object exposing an awaitable ``translate(text, dest=...)``
            whose result carries the translation in a ``.text`` attribute.
        text: The string to translate.
        target_language: Language code forwarded to the translator as ``dest``.

    Returns:
        The translated string. ``text`` is returned unchanged when it is
        blank, when the translator raises, or when the result has no text.
    """
    # A blank string has nothing to translate; skip the remote call entirely.
    if isinstance(text, str) and not text.strip():
        return text

    try:
        translation = await translator.translate(text, dest=target_language)
    except Exception as e:
        # Deliberately best-effort: one failed request must not abort the
        # whole workbook, so log it and keep the original text.
        logging.error(f"Translation error: {e}")
        return text

    translated = translation.text
    # Never hand back None: callers write the result straight into a cell.
    return text if translated is None else translated
def is_formula(text):
    """Check if a cell value is a formula.

    Args:
        text: Any cell value.

    Returns:
        True only when ``text`` is a string that starts with ``=``;
        False for every other string and for all non-string values.
    """
    return isinstance(text, str) and text.startswith('=')
def should_translate(cell):
    """Determine if a cell should be translated.

    Args:
        cell: Object exposing the cell content as ``cell.value``.

    Returns:
        True when the value is a non-blank string that is not a formula;
        False for empty cells, non-string values, formulas and strings
        made up only of whitespace.
    """
    value = cell.value

    # Only string values are translatable; this also covers None.
    if not isinstance(value, str):
        return False

    # Skip formulas so they keep working after translation.
    if is_formula(value):
        return False

    # Whitespace-only text has nothing to translate.
    return bool(value.strip())
def copy_cell_formatting(source_cell, target_cell):
    """Copy the formatting of one cell onto another.

    Each style object is duplicated with ``copy.copy`` instead of being
    rebuilt from a hand-picked list of attributes, so every attribute the
    style carries is preserved and any fill type is handled.

    Args:
        source_cell: Cell to read ``font``, ``fill``, ``border``,
            ``alignment``, ``protection``, ``number_format`` and
            ``hyperlink`` from.
        target_cell: Cell that receives the copies.

    Returns:
        None. Does nothing when either argument is ``None``.
    """
    if source_cell is None or target_cell is None:
        return

    # Copies, not references: the two cells must not share style objects.
    for attr in ("font", "fill", "border", "alignment", "protection"):
        style = getattr(source_cell, attr)
        if style is not None:
            setattr(target_cell, attr, copy.copy(style))

    # The number format is a plain string, so it can be assigned directly.
    if source_cell.number_format:
        target_cell.number_format = source_cell.number_format

    # Copy the hyperlink so the source cell's own object is left untouched.
    if source_cell.hyperlink:
        target_cell.hyperlink = copy.copy(source_cell.hyperlink)
def copy_sheet_formatting(source_sheet, target_sheet):
    """Copy column and row dimension settings from one sheet to another.

    Iterates the dimension mappings themselves rather than index ranges, so
    every stored dimension is copied regardless of the sheet's used range.

    Args:
        source_sheet: Sheet whose ``column_dimensions`` and
            ``row_dimensions`` mappings are read.
        target_sheet: Sheet whose matching dimension entries are updated;
            entries are looked up by the same key as in the source.

    Returns:
        None.
    """
    # Column dimensions: width, visibility and (when present) outline level.
    for col_letter, source_dim in list(source_sheet.column_dimensions.items()):
        target_dim = target_sheet.column_dimensions[col_letter]
        target_dim.width = source_dim.width
        target_dim.hidden = source_dim.hidden
        outline_level = getattr(source_dim, "outlineLevel", None)
        if outline_level is not None:
            target_dim.outlineLevel = outline_level

    # Row dimensions: height, visibility and (when present) outline level.
    for row_idx, source_row in list(source_sheet.row_dimensions.items()):
        target_row = target_sheet.row_dimensions[row_idx]
        target_row.height = source_row.height
        target_row.hidden = source_row.hidden
        outline_level = getattr(source_row, "outlineLevel", None)
        if outline_level is not None:
            target_row.outlineLevel = outline_level
async def process_table_xml_safely(zip_path, target_language, translator, translated_cache):
    """Translate table column headers stored in the workbook's table XML parts.

    Every ``xl/tables/*.xml`` entry of the archive is parsed, the ``name``
    attribute of each ``tableColumn`` is translated, and the archive is
    rewritten next to the original and then swapped into place. Nothing is
    rewritten when the archive holds no table parts.

    The table's own ``name`` / ``displayName`` attributes are intentionally
    left alone: they are identifiers rather than display text, and a
    translated value can contain characters Excel does not accept in a table
    name or collide with another table.

    NOTE(review): formulas that refer to a table column by its header text
    are not updated here - confirm whether the workbooks in use need that.

    Args:
        zip_path: Path of the ``.xlsx`` file, modified in place.
        target_language: Language code forwarded to ``translate_text``.
        translator: Translator object forwarded to ``translate_text``.
        translated_cache: Dict mapping original text to its translation;
            consulted before translating and updated with new results, so
            headers get the same wording as cells translated earlier.

    Returns:
        None.
    """
    import io  # needed only here, for in-memory XML parsing and serialising

    with zipfile.ZipFile(zip_path, 'r') as source_zip:
        table_parts = [
            name for name in source_zip.namelist()
            if name.startswith('xl/tables/') and name.endswith('.xml')
        ]
        if not table_parts:
            logging.info("No table XML files found in the Excel file")
            return

        logging.info(f"Found {len(table_parts)} table XML files to process")

        rewritten = {}
        for part_name in table_parts:
            logging.info(f"Processing table XML: {os.path.basename(part_name)}")
            xml_bytes = source_zip.read(part_name)

            # Register the document's own prefixes (process-wide registry) so
            # ElementTree writes them back instead of inventing ns0/ns1, which
            # would leave prefix lists such as mc:Ignorable dangling.
            for _event, (prefix, uri) in ET.iterparse(io.BytesIO(xml_bytes), events=('start-ns',)):
                try:
                    ET.register_namespace(prefix, uri)
                except ValueError as e:
                    # Reserved or malformed prefixes cannot be registered;
                    # ElementTree falls back to a generated prefix for them.
                    logging.debug(f"Namespace prefix {prefix!r} not registered: {e}")

            root = ET.fromstring(xml_bytes)

            # "{uri}" qualifier of the root tag, or '' for un-namespaced XML.
            ns_prefix = root.tag[:root.tag.index('}') + 1] if root.tag.startswith('{') else ''

            columns_element = root.find(f".//{ns_prefix}tableColumns")
            if columns_element is not None:
                for column in columns_element.iter(f"{ns_prefix}tableColumn"):
                    header_text = column.attrib.get('name')
                    if header_text is None:
                        continue
                    if header_text not in translated_cache:
                        translated_cache[header_text] = await translate_text(
                            translator, header_text, target_language
                        )
                    column.attrib['name'] = translated_cache[header_text]

            buffer = io.BytesIO()
            ET.ElementTree(root).write(buffer, encoding='UTF-8', xml_declaration=True)
            rewritten[part_name] = buffer.getvalue()

        # Stream the archive entry by entry. Reusing each ZipInfo keeps the
        # original entry order and compression method, and nothing is
        # extracted to disk.
        new_zip_path = zip_path + '.new'
        try:
            with zipfile.ZipFile(new_zip_path, 'w') as new_zip:
                for info in source_zip.infolist():
                    payload = rewritten.get(info.filename)
                    if payload is None:
                        payload = source_zip.read(info.filename)
                    new_zip.writestr(info, payload)
        except BaseException:
            # Do not leave a half-written archive behind.
            if os.path.exists(new_zip_path):
                os.remove(new_zip_path)
            raise

    # The source archive is closed by now, so it can be replaced.
    os.replace(new_zip_path, zip_path)
async def translate_excel(file_path: str, target_language: str):
    """Translate an Excel file while preserving formatting, including tables.

    The workbook is copied to ``<name>_translated_<target_language>.xlsx``
    and the copy is edited in place: every cell accepted by
    ``should_translate`` is replaced by its translation, the workbook is
    saved, and the table XML parts are then processed so table headers
    match the translated cells.

    Args:
        file_path: Path of the workbook to translate.
        target_language: Language code forwarded to the translator.

    Returns:
        The path of the translated copy.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        logging.error(f"File not found: {file_path}")
        raise FileNotFoundError(f"The file {file_path} does not exist.")

    # Work on a copy so the original workbook is never modified.
    base_name = os.path.splitext(file_path)[0]
    translated_file_path = f"{base_name}_translated_{target_language}.xlsx"
    logging.info("Creating a copy of the original file...")
    shutil.copy2(file_path, translated_file_path)

    workbook = load_workbook(translated_file_path)
    translator = Translator()

    # original text -> translation; repeated values cost a single API call
    translated_cache = {}

    # The progress total covers every position the loops below visit.
    total_cells = sum(
        sheet.max_row * sheet.max_column
        for sheet in workbook.worksheets
    )

    with tqdm(total=total_cells, desc=f"Translating to {target_language}") as progress_bar:
        for sheet in workbook.worksheets:
            logging.info(f"Processing sheet: {sheet.title} ({sheet.max_row} rows × {sheet.max_column} columns)")

            for row in range(1, sheet.max_row + 1):
                for col in range(1, sheet.max_column + 1):
                    cell = sheet.cell(row=row, column=col)
                    progress_bar.update(1)

                    if not should_translate(cell):
                        continue

                    # should_translate only accepts str values.
                    original_text = cell.value
                    if original_text not in translated_cache:
                        translated_cache[original_text] = await translate_text(
                            translator, original_text, target_language
                        )
                    cell.value = translated_cache[original_text]

    workbook.save(translated_file_path)
    logging.info("Basic cell translation complete")

    # Table headers live in separate XML parts; the shared cache keeps them
    # worded identically to the header cells translated above.
    logging.info("Processing table structures...")
    await process_table_xml_safely(translated_file_path, target_language, translator, translated_cache)

    logging.info(f"Translation complete! File saved as: {translated_file_path}")
    return translated_file_path
async def main(): async def main():
input_file = r"F:\Dev\excel-translator\data\sample\test_sample.xlsx" input_file = r"F:\Dev\excel-translator\data\sample\test_sample.xlsx"