Initial commit: Document Translation API with Excel, Word, PowerPoint support
This commit is contained in:
10
translators/__init__.py
Normal file
10
translators/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
"""Translators package initialization"""
|
||||
from .excel_translator import ExcelTranslator, excel_translator
|
||||
from .word_translator import WordTranslator, word_translator
|
||||
from .pptx_translator import PowerPointTranslator, pptx_translator
|
||||
|
||||
# Public API of the translators package: each translator class together with
# its ready-made module-level instance.
__all__ = [
    "ExcelTranslator",
    "excel_translator",
    "WordTranslator",
    "word_translator",
    "PowerPointTranslator",
    "pptx_translator",
]
|
||||
161
translators/excel_translator.py
Normal file
161
translators/excel_translator.py
Normal file
@@ -0,0 +1,161 @@
|
||||
"""
|
||||
Excel Translation Module
|
||||
Translates Excel files while preserving all formatting, formulas, images, and layout
|
||||
"""
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, Set
|
||||
from openpyxl import load_workbook
|
||||
from openpyxl.worksheet.worksheet import Worksheet
|
||||
from openpyxl.cell.cell import Cell
|
||||
from openpyxl.utils import get_column_letter
|
||||
from services.translation_service import translation_service
|
||||
|
||||
|
||||
class ExcelTranslator:
    """Translate the text of Excel workbooks through ``translation_service``.

    Plain string cells, double-quoted string literals inside formulas and
    worksheet titles are translated. Non-string cell values and everything in
    a formula outside its string literals are left untouched.
    """

    # Excel's limit on the length of a worksheet title.
    _MAX_SHEET_TITLE_LENGTH = 31
    # Characters that are not accepted in a worksheet title: \ * ? : / [ ]
    _INVALID_SHEET_TITLE_CHARS = re.compile(r'[\\*?:/\[\]]')
    # One double-quoted string literal; a doubled quote ("") inside it is an
    # escaped quote character, not the end of the literal.
    _FORMULA_STRING_LITERAL = re.compile(r'"((?:[^"]|"")*)"')

    def __init__(self):
        self.translation_service = translation_service
        # Not used by any method of this class; kept so the attribute stays
        # available to existing callers.
        self.formula_pattern = re.compile(r'=.*')

    def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
        """
        Translate an Excel file and save the result to a new path.

        Args:
            input_path: Path to input Excel file
            output_path: Path to save translated Excel file
            target_language: Target language code

        Returns:
            Path to the translated file
        """
        # data_only=False keeps formulas as formulas instead of cached values.
        workbook = load_workbook(input_path, data_only=False)
        try:
            taken_titles = set(workbook.sheetnames)
            sheet_name_mapping = {}

            for sheet_name in workbook.sheetnames:
                self._translate_worksheet(workbook[sheet_name], target_language)

                # Only decide the new title here; renaming happens after the
                # loop so lookups by original name keep working.
                translated_title = self.translation_service.translate_text(
                    sheet_name, target_language
                )
                new_title = self._make_sheet_title(
                    translated_title, sheet_name, taken_titles
                )
                if new_title is not None:
                    sheet_name_mapping[sheet_name] = new_title
                    taken_titles.add(new_title)

            # NOTE(review): formulas that reference a sheet by its old title
            # are not rewritten here -- confirm whether cross-sheet references
            # survive the rename.
            for original_name, new_title in sheet_name_mapping.items():
                workbook[original_name].title = new_title

            workbook.save(output_path)
        finally:
            # Release the workbook even when translation or saving fails.
            workbook.close()

        return output_path

    @classmethod
    def _make_sheet_title(cls, translated, original, taken):
        """
        Build a usable worksheet title from a translated sheet name.

        Args:
            translated: Translated sheet name (may be empty or None)
            original: Current title of the sheet
            taken: Titles already in use or already assigned

        Returns:
            A title of at most 31 characters that is not in ``taken``, or
            None when the sheet should keep its current title
        """
        if not translated or not isinstance(translated, str):
            return None

        cleaned = cls._INVALID_SHEET_TITLE_CHARS.sub("_", translated)
        limit = cls._MAX_SHEET_TITLE_LENGTH
        title = cleaned[:limit]
        if not title or title == original:
            return None

        counter = 1
        while title in taken:
            # Shorten the base by the suffix length so the result never
            # exceeds the limit, however many digits the counter has.
            suffix = f"_{counter}"
            title = cleaned[:limit - len(suffix)] + suffix
            counter += 1
        return title

    def _translate_worksheet(self, worksheet: "Worksheet", target_language: str):
        """
        Translate every non-empty cell of a worksheet.

        Args:
            worksheet: Worksheet to translate
            target_language: Target language code
        """
        for row in worksheet.iter_rows():
            for cell in row:
                if cell.value is not None:
                    self._translate_cell(cell, target_language)

    def _translate_cell(self, cell: "Cell", target_language: str):
        """
        Translate a single cell in place.

        Strings starting with '=' are handled as formulas; other non-blank
        strings are translated whole. Non-string values are left unchanged.

        Args:
            cell: Cell to translate
            target_language: Target language code
        """
        original_value = cell.value

        # Numbers, dates, booleans and empty cells remain unchanged.
        if not isinstance(original_value, str):
            return

        if original_value.startswith('='):
            self._translate_formula(cell, original_value, target_language)
            return

        # Whitespace-only text carries nothing to translate.
        if not original_value.strip():
            return

        translated_text = self.translation_service.translate_text(
            original_value, target_language
        )
        # Keep the original content when no translation comes back, rather
        # than blanking the cell.
        if translated_text:
            cell.value = translated_text

    def _translate_formula(self, cell: "Cell", formula: str, target_language: str):
        """
        Translate the string literals of a formula, leaving the rest intact.

        Each double-quoted literal is replaced at its own position in a single
        pass, so a translation that happens to equal another literal is never
        translated a second time. Doubled quotes inside a literal are
        unescaped before translation and re-escaped afterwards.

        Args:
            cell: Cell containing the formula
            formula: Formula string
            target_language: Target language code
        """
        # NOTE(review): every non-blank literal is translated, including ones
        # that may act as lookup keys or format codes -- confirm this is the
        # intended scope.
        def translate_literal(match):
            text = match.group(1).replace('""', '"')
            if not text.strip():
                return match.group(0)
            translated = self.translation_service.translate_text(
                text, target_language
            )
            if not translated:
                return match.group(0)
            return '"' + translated.replace('"', '""') + '"'

        translated_formula = self._FORMULA_STRING_LITERAL.sub(
            translate_literal, formula
        )
        if translated_formula != formula:
            cell.value = translated_formula

    def _should_translate(self, text: str) -> bool:
        """
        Determine if text should be translated.

        Not called by the other methods of this class.

        Args:
            text: Text to check

        Returns:
            True if text should be translated, False otherwise
        """
        if not text or not isinstance(text, str):
            return False

        # Fewer than two characters once surrounding whitespace is removed.
        if len(text.strip()) < 2:
            return False

        # Formulas are handled separately.
        if text.startswith('='):
            return False

        return True
|
||||
|
||||
|
||||
# Global translator instance
|
||||
# Created once when this module is imported; translators/__init__.py
# re-exports it alongside the ExcelTranslator class.
excel_translator = ExcelTranslator()
|
||||
158
translators/pptx_translator.py
Normal file
158
translators/pptx_translator.py
Normal file
@@ -0,0 +1,158 @@
|
||||
"""
|
||||
PowerPoint Translation Module
|
||||
Translates PowerPoint files while preserving all layouts, animations, and media
|
||||
"""
|
||||
from pathlib import Path
|
||||
from pptx import Presentation
|
||||
from pptx.shapes.base import BaseShape
|
||||
from pptx.shapes.group import GroupShape
|
||||
from pptx.util import Inches, Pt
|
||||
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
||||
from services.translation_service import translation_service
|
||||
|
||||
|
||||
class PowerPointTranslator:
    """Translate the text of PowerPoint presentations through ``translation_service``.

    Text is replaced run by run in slide shapes, tables, grouped shapes and
    speaker notes.
    """

    def __init__(self):
        self.translation_service = translation_service

    def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
        """
        Translate a PowerPoint presentation and save the result to a new path.

        Args:
            input_path: Path to input PowerPoint file
            output_path: Path to save translated PowerPoint file
            target_language: Target language code

        Returns:
            Path to the translated file
        """
        presentation = Presentation(input_path)

        for slide in presentation.slides:
            self._translate_slide(slide, target_language)

        presentation.save(output_path)

        return output_path

    def _translate_slide(self, slide, target_language: str):
        """
        Translate the speaker notes and all shapes of a slide.

        Args:
            slide: Slide to translate
            target_language: Target language code
        """
        # Check has_notes_slide before touching notes_slide.
        if slide.has_notes_slide:
            notes_text_frame = slide.notes_slide.notes_text_frame
            if notes_text_frame:
                self._translate_text_frame(notes_text_frame, target_language)

        for shape in slide.shapes:
            self._translate_shape(shape, target_language)

    def _translate_shape(self, shape: "BaseShape", target_language: str):
        """
        Translate the text carried by a shape, descending into grouped shapes.

        The shape is inspected by capability rather than by ``shape_type``:
        a text frame, a table and a ``shapes`` collection are each handled
        when present. Child shapes are visited exactly once.

        Args:
            shape: Shape to translate
            target_language: Target language code
        """
        if getattr(shape, "has_text_frame", False):
            self._translate_text_frame(shape.text_frame, target_language)

        if getattr(shape, "has_table", False):
            self._translate_table(shape.table, target_language)

        children = getattr(shape, "shapes", None)
        if children is None:
            return
        try:
            children = list(children)
        except TypeError:
            # `shapes` is present but cannot be iterated; nothing to descend into.
            return
        for sub_shape in children:
            self._translate_shape(sub_shape, target_language)

    def _translate_text_frame(self, text_frame, target_language: str):
        """
        Translate every paragraph of a text frame.

        Args:
            text_frame: Text frame to translate
            target_language: Target language code
        """
        if not text_frame.text.strip():
            return

        for paragraph in text_frame.paragraphs:
            self._translate_paragraph(paragraph, target_language)

    def _translate_paragraph(self, paragraph, target_language: str):
        """
        Translate a paragraph one run at a time.

        Working per run keeps each run object, and whatever formatting is
        attached to it, in place; only its text is replaced.

        Args:
            paragraph: Paragraph to translate
            target_language: Target language code
        """
        if not paragraph.text.strip():
            return

        for run in paragraph.runs:
            if not run.text.strip():
                continue
            translated_text = self.translation_service.translate_text(
                run.text, target_language
            )
            # Keep the original text when no translation comes back.
            if translated_text:
                run.text = translated_text

    def _translate_table(self, table, target_language: str):
        """
        Translate the text frame of every cell in a table.

        Args:
            table: Table to translate
            target_language: Target language code
        """
        for row in table.rows:
            for cell in row.cells:
                self._translate_text_frame(cell.text_frame, target_language)

    def _is_translatable(self, text: str) -> bool:
        """
        Determine if text should be translated.

        Not called by the other methods of this class.

        Args:
            text: Text to check

        Returns:
            True if text should be translated, False otherwise
        """
        if not text or not isinstance(text, str):
            return False

        # Fewer than two characters once surrounding whitespace is removed.
        if len(text.strip()) < 2:
            return False

        return True
|
||||
|
||||
|
||||
# Global translator instance
|
||||
# Created once when this module is imported; translators/__init__.py
# re-exports it alongside the PowerPointTranslator class.
pptx_translator = PowerPointTranslator()
|
||||
171
translators/word_translator.py
Normal file
171
translators/word_translator.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""
|
||||
Word Document Translation Module
|
||||
Translates Word files while preserving all formatting, styles, tables, and images
|
||||
"""
|
||||
from pathlib import Path
|
||||
from docx import Document
|
||||
from docx.text.paragraph import Paragraph
|
||||
from docx.table import Table, _Cell
|
||||
from docx.oxml.text.paragraph import CT_P
|
||||
from docx.oxml.table import CT_Tbl
|
||||
from docx.section import Section
|
||||
from services.translation_service import translation_service
|
||||
|
||||
|
||||
class WordTranslator:
    """Translate the text of Word documents through ``translation_service``.

    Body paragraphs, tables (including nested tables) and the headers and
    footers of every section are translated run by run.
    """

    # Section attributes that expose a header or footer object.
    _HEADER_FOOTER_ATTRIBUTES = (
        "header",
        "footer",
        "first_page_header",
        "first_page_footer",
        "even_page_header",
        "even_page_footer",
    )

    def __init__(self):
        self.translation_service = translation_service

    def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
        """
        Translate a Word document and save the result to a new path.

        Args:
            input_path: Path to input Word file
            output_path: Path to save translated Word file
            target_language: Target language code

        Returns:
            Path to the translated file
        """
        document = Document(input_path)

        self._translate_document_body(document, target_language)

        for section in document.sections:
            self._translate_section(section, target_language)

        document.save(output_path)

        return output_path

    def _translate_document_body(self, document: "Document", target_language: str):
        """
        Translate the paragraphs and tables of the document body in order.

        Args:
            document: Document to translate
            target_language: Target language code
        """
        # Walk the raw body elements so paragraphs and tables are visited in
        # the order they appear in the document.
        for element in document.element.body:
            if isinstance(element, CT_P):
                self._translate_paragraph(Paragraph(element, document), target_language)
            elif isinstance(element, CT_Tbl):
                self._translate_table(Table(element, document), target_language)

    def _translate_paragraph(self, paragraph: "Paragraph", target_language: str):
        """
        Translate a paragraph, one run at a time when it has runs.

        Working per run keeps each run object, and whatever formatting is
        attached to it, in place; only its text is replaced.

        Args:
            paragraph: Paragraph to translate
            target_language: Target language code
        """
        if not paragraph.text.strip():
            return

        runs = paragraph.runs
        if runs:
            for run in runs:
                if not run.text.strip():
                    continue
                translated_text = self.translation_service.translate_text(
                    run.text, target_language
                )
                # Keep the original text when no translation comes back.
                if translated_text:
                    run.text = translated_text
            return

        # The paragraph reports text but exposes no runs.
        # NOTE(review): assigning paragraph.text replaces the paragraph's
        # content wholesale -- confirm that is acceptable for text that lives
        # outside runs (e.g. hyperlinks).
        translated_text = self.translation_service.translate_text(
            paragraph.text, target_language
        )
        if translated_text:
            paragraph.text = translated_text

    def _translate_table(self, table: "Table", target_language: str):
        """
        Translate every distinct cell of a table.

        A merged cell can be returned once for each grid position it spans;
        cells are tracked by their underlying element so each one is
        translated a single time.

        Args:
            table: Table to translate
            target_language: Target language code
        """
        # id -> element; holding the element keeps its id from being reused
        # while the table is being processed.
        seen = {}
        for row in table.rows:
            for cell in row.cells:
                element = getattr(cell, "_tc", cell)
                if id(element) in seen:
                    continue
                seen[id(element)] = element
                self._translate_cell(cell, target_language)

    def _translate_cell(self, cell: "_Cell", target_language: str):
        """
        Translate the paragraphs and nested tables of a table cell.

        Args:
            cell: Cell to translate
            target_language: Target language code
        """
        for paragraph in cell.paragraphs:
            self._translate_paragraph(paragraph, target_language)

        for table in cell.tables:
            self._translate_table(table, target_language)

    def _translate_section(self, section: "Section", target_language: str):
        """
        Translate the headers and footers that a section defines itself.

        The default, first-page and even-page header and footer are each
        considered. One that is linked to the previous section is skipped:
        its content belongs to an earlier section and is translated there,
        so visiting it again would translate the same text twice.

        Args:
            section: Section to translate
            target_language: Target language code
        """
        for attribute in self._HEADER_FOOTER_ATTRIBUTES:
            part = getattr(section, attribute, None)
            if part is None or getattr(part, "is_linked_to_previous", False):
                continue

            for paragraph in part.paragraphs:
                self._translate_paragraph(paragraph, target_language)
            for table in part.tables:
                self._translate_table(table, target_language)
|
||||
|
||||
|
||||
# Global translator instance
|
||||
# Created once when this module is imported; translators/__init__.py
# re-exports it alongside the WordTranslator class.
word_translator = WordTranslator()
|
||||
Reference in New Issue
Block a user