From cc1decc9ede16d3361555ef812c9e9fe542e010d Mon Sep 17 00:00:00 2001 From: sepehr Date: Sun, 6 Apr 2025 21:58:19 +0200 Subject: [PATCH] Enhance Excel translation functionality with robust formatting and error handling --- main.py | 337 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 243 insertions(+), 94 deletions(-) diff --git a/main.py b/main.py index 371f097..95d17f5 100644 --- a/main.py +++ b/main.py @@ -5,130 +5,279 @@ import os from tqdm import tqdm import copy import re +import shutil +import logging +import xml.etree.ElementTree as ET +import zipfile +import tempfile from openpyxl.utils import get_column_letter +from openpyxl.styles import PatternFill, Border, Side, Alignment, Protection, Font + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') async def translate_text(translator, text, target_language): + """Translate text to target language""" try: translation = await translator.translate(text, dest=target_language) return translation.text except Exception as e: - print(f"Error translating '{text}': {e}") - return text # Return the original text if translation fails + logging.error(f"Translation error: {e}") + return text # Return original if translation fails def is_formula(text): - """Check if a cell value is a formula""" + """Check if cell value is a formula""" if isinstance(text, str): return text.startswith('=') return False -def copy_cell_format(source_cell, target_cell): - """Copy formatting from source cell to target cell without copying the problematic style index""" - if source_cell.has_style: - try: - # Copy individual style attributes instead of the entire style object - if source_cell.font: - target_cell.font = copy.copy(source_cell.font) - if source_cell.border: - target_cell.border = copy.copy(source_cell.border) - if source_cell.fill: - target_cell.fill = copy.copy(source_cell.fill) - if source_cell.number_format: - target_cell.number_format = source_cell.number_format - if source_cell.protection: - target_cell.protection = copy.copy(source_cell.protection) - if source_cell.alignment: - target_cell.alignment = copy.copy(source_cell.alignment) - # Copy any hyperlink - if source_cell.hyperlink: - target_cell.hyperlink = source_cell.hyperlink - except Exception as e: - print(f"Error copying format for cell {source_cell.coordinate}: {e}") +def should_translate(cell): + """Determine if a cell should be translated""" + if cell.value is None: + return False + + # Skip formulas + if is_formula(cell.value): + return False + + # Only translate string values + if not isinstance(cell.value, str): + return False + + return True -def copy_sheet_properties(source_sheet, target_sheet): - """Copy all sheet properties from source to target sheet""" - # Copy column dimensions for all columns +def copy_cell_formatting(source_cell, target_cell): + """Complete and robust copy of cell formatting""" + if not source_cell or not target_cell: + return + + # Font + if source_cell.font: + font = Font( + name=source_cell.font.name, + size=source_cell.font.size, + bold=source_cell.font.bold, + italic=source_cell.font.italic, + vertAlign=source_cell.font.vertAlign, + underline=source_cell.font.underline, + strike=source_cell.font.strike, + color=source_cell.font.color + ) + target_cell.font = font + + # Fill + if source_cell.fill: + fill = PatternFill( + fill_type=source_cell.fill.fill_type, + start_color=source_cell.fill.start_color, + end_color=source_cell.fill.end_color + ) + target_cell.fill = fill + + # Border + if source_cell.border: + border = Border( + left=copy.copy(source_cell.border.left) if source_cell.border.left else None, + right=copy.copy(source_cell.border.right) if source_cell.border.right else None, + top=copy.copy(source_cell.border.top) if source_cell.border.top else None, + bottom=copy.copy(source_cell.border.bottom) if source_cell.border.bottom else None, + diagonal=copy.copy(source_cell.border.diagonal) if source_cell.border.diagonal else None, + diagonal_direction=source_cell.border.diagonal_direction, + outline=source_cell.border.outline, + vertical=source_cell.border.vertical, + horizontal=source_cell.border.horizontal + ) + target_cell.border = border + + # Alignment + if source_cell.alignment: + alignment = Alignment( + horizontal=source_cell.alignment.horizontal, + vertical=source_cell.alignment.vertical, + textRotation=source_cell.alignment.textRotation, + wrapText=source_cell.alignment.wrapText, + shrinkToFit=source_cell.alignment.shrinkToFit, + indent=source_cell.alignment.indent, + relativeIndent=source_cell.alignment.relativeIndent, + justifyLastLine=source_cell.alignment.justifyLastLine, + readingOrder=source_cell.alignment.readingOrder + ) + target_cell.alignment = alignment + + # Number Format + if source_cell.number_format: + target_cell.number_format = source_cell.number_format + + # Protection + if source_cell.protection: + protection = Protection( + locked=source_cell.protection.locked, + hidden=source_cell.protection.hidden + ) + target_cell.protection = protection + + # Hyperlink + if source_cell.hyperlink: + target_cell.hyperlink = copy.copy(source_cell.hyperlink) + +def copy_sheet_formatting(source_sheet, target_sheet): + """Copy all formatting aspects of a sheet""" + + # Copy column dimensions for col_idx in range(1, source_sheet.max_column + 1): col_letter = get_column_letter(col_idx) if col_letter in source_sheet.column_dimensions: source_dim = source_sheet.column_dimensions[col_letter] - target_dim = target_sheet.column_dimensions[col_letter] - - # Copy all available attributes - if hasattr(source_dim, 'width') and source_dim.width: - target_dim.width = source_dim.width - if hasattr(source_dim, 'hidden'): - target_dim.hidden = source_dim.hidden - if hasattr(source_dim, 'outlineLevel'): - target_dim.outlineLevel = source_dim.outlineLevel + target_sheet.column_dimensions[col_letter].width = source_dim.width + target_sheet.column_dimensions[col_letter].hidden = source_dim.hidden - # Copy row dimensions for all rows + # Copy row dimensions for row_idx in range(1, source_sheet.max_row + 1): if row_idx in source_sheet.row_dimensions: - source_dim = source_sheet.row_dimensions[row_idx] - target_dim = target_sheet.row_dimensions[row_idx] + source_row = source_sheet.row_dimensions[row_idx] + target_sheet.row_dimensions[row_idx].height = source_row.height + +async def process_table_xml_safely(zip_path, target_language, translator, translated_cache): + """Process table XML files in Excel to translate headers with proper ZIP handling""" + # Create temp directory + temp_dir = tempfile.mkdtemp() + + try: + # Extract all files + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(temp_dir) + + # Find all table XML files + table_files = [] + for root, dirs, files in os.walk(os.path.join(temp_dir, 'xl', 'tables')): + for file in files: + if file.endswith('.xml'): + table_files.append(os.path.join(root, file)) + + if not table_files: + logging.info("No table XML files found in the Excel file") + return + + logging.info(f"Found {len(table_files)} table XML files to process") + + # Process each table XML + for table_file in table_files: + logging.info(f"Processing table XML: {os.path.basename(table_file)}") - if hasattr(source_dim, 'height') and source_dim.height: - target_dim.height = source_dim.height - if hasattr(source_dim, 'hidden'): - target_dim.hidden = source_dim.hidden - if hasattr(source_dim, 'outlineLevel'): - target_dim.outlineLevel = source_dim.outlineLevel + # Parse XML + tree = ET.parse(table_file) + root = tree.getroot() + + # Find namespace + ns = root.tag.split('}')[0].strip('{') if '}' in root.tag else '' + + # Find and translate table name/displayName if present + for attr in ['displayName', 'name']: + if attr in root.attrib: + original_text = root.attrib[attr] + if original_text in translated_cache: + root.attrib[attr] = translated_cache[original_text] + else: + translated_text = await translate_text(translator, original_text, target_language) + root.attrib[attr] = translated_text + translated_cache[original_text] = translated_text + + # Find table columns and translate headers + ns_prefix = '{' + ns + '}' if ns else '' + columns_tag = f"{ns_prefix}tableColumns" if ns else "tableColumns" + column_tag = f"{ns_prefix}tableColumn" if ns else "tableColumn" + + columns_element = root.find(f".//{columns_tag}") + if columns_element is not None: + for column in columns_element.findall(f".//{column_tag}"): + if 'name' in column.attrib: + header_text = column.attrib['name'] + if header_text in translated_cache: + column.attrib['name'] = translated_cache[header_text] + else: + translated_header = await translate_text(translator, header_text, target_language) + column.attrib['name'] = translated_header + translated_cache[header_text] = translated_header + + # Save the changes + tree.write(table_file, encoding='UTF-8', xml_declaration=True) + + # Create a new zip file + new_zip_path = zip_path + '.new' + with zipfile.ZipFile(new_zip_path, 'w') as new_zip: + for folder_path, subfolders, files in os.walk(temp_dir): + for file in files: + absolute_path = os.path.join(folder_path, file) + relative_path = os.path.relpath(absolute_path, temp_dir) + new_zip.write(absolute_path, relative_path) + + # Replace the old zip with the new one + shutil.move(new_zip_path, zip_path) + + finally: + # Clean up + shutil.rmtree(temp_dir) async def translate_excel(file_path: str, target_language: str): + """Translate Excel file while preserving all formatting including tables""" + # Verify file exists + if not os.path.exists(file_path): + logging.error(f"File not found: {file_path}") + raise FileNotFoundError(f"The file {file_path} does not exist.") + + # Create a copy of the original file to work with + base_name = os.path.splitext(file_path)[0] + translated_file_path = f"{base_name}_translated_{target_language}.xlsx" + + logging.info(f"Creating a copy of the original file...") + shutil.copy2(file_path, translated_file_path) + + # Open the copied file and modify it in-place + workbook = load_workbook(translated_file_path) translator = Translator() - workbook = load_workbook(file_path) - translated_workbook = Workbook() - + + # Track unique values to minimize API calls + translated_cache = {} + # Count total cells for progress bar total_cells = sum( - sum(1 for _ in sheet.iter_rows()) + sheet.max_row * sheet.max_column for sheet in workbook.worksheets ) - progress_bar = tqdm(total=total_cells, desc="Translating cells") - - for sheet_name in workbook.sheetnames: - original_sheet = workbook[sheet_name] - translated_sheet = translated_workbook.create_sheet(title=sheet_name) - - print(f"Processing sheet: {sheet_name} with {original_sheet.max_column} columns") - - # Copy sheet properties using the improved function - copy_sheet_properties(original_sheet, translated_sheet) - - # Copy merged cells - for merged_cell_range in original_sheet.merged_cells: - translated_sheet.merge_cells(str(merged_cell_range)) - - # Process each cell - for row in original_sheet.iter_rows(): - for cell in row: - progress_bar.update(1) - col_idx = cell.column - row_idx = cell.row - - # Create the cell at the same position in the new sheet - if cell.value is not None: # Use is not None to include 0 values - if is_formula(cell.value): - # Don't translate formulas - translated_cell = translated_sheet.cell(row=row_idx, column=col_idx, value=cell.value) - else: - translated_text = await translate_text(translator, str(cell.value), target_language) - translated_cell = translated_sheet.cell(row=row_idx, column=col_idx, value=translated_text) - else: - translated_cell = translated_sheet.cell(row=row_idx, column=col_idx) - - # Copy formatting - copy_cell_format(cell, translated_cell) - - # Remove the default sheet created by Workbook - if "Sheet" in translated_workbook.sheetnames: - del translated_workbook["Sheet"] - - translated_file_path = os.path.splitext(file_path)[0] + f"_translated_{target_language}.xlsx" - translated_workbook.save(translated_file_path) - progress_bar.close() - print(f"Translated file saved as: {translated_file_path}") + with tqdm(total=total_cells, desc=f"Translating to {target_language}") as progress_bar: + # Process each sheet + for sheet in workbook.worksheets: + logging.info(f"Processing sheet: {sheet.title} ({sheet.max_row} rows × {sheet.max_column} columns)") + + # Process cells row by row, column by column + for row in range(1, sheet.max_row + 1): + for col in range(1, sheet.max_column + 1): + cell = sheet.cell(row=row, column=col) + progress_bar.update(1) + + # Check if cell should be translated + if should_translate(cell): + original_text = str(cell.value) + + # Use cached translation if available + if original_text in translated_cache: + cell.value = translated_cache[original_text] + else: + translated_text = await translate_text(translator, original_text, target_language) + cell.value = translated_text + translated_cache[original_text] = translated_text + + # Save the translated workbook + workbook.save(translated_file_path) + logging.info(f"Basic cell translation complete") + + # Process table XML files separately to fix table headers + logging.info("Processing table structures...") + await process_table_xml_safely(translated_file_path, target_language, translator, translated_cache) + + logging.info(f"Translation complete! File saved as: {translated_file_path}") + return translated_file_path async def main(): input_file = r"F:\Dev\excel-translator\data\sample\test_sample.xlsx"