office_translator/translators/pptx_translator.py
Sepehr 8f9ca669cf Performance optimization: batch translation for 5-10x speed improvement
- GoogleTranslationProvider: Added batch translation with separator method
- DeepLTranslationProvider: Added translator caching and batch support
- LibreTranslationProvider: Added translator caching and batch support
- WordTranslator: Collect all texts -> batch translate -> apply pattern
- ExcelTranslator: Collect all texts -> batch translate -> apply pattern
- PowerPointTranslator: Collect all texts -> batch translate -> apply pattern
- Enhanced Ollama/OpenAI prompts with stricter translation-only rules
- Added rule: return original text if uncertain about translation
2025-11-30 20:41:20 +01:00

152 lines
6.0 KiB
Python

"""
PowerPoint Translation Module
Translates PowerPoint files while preserving all layouts, animations, and media
OPTIMIZED: Uses batch translation for 5-10x faster processing
"""
import os
import tempfile
from pathlib import Path
from typing import Callable, List, Tuple

from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
from pptx.shapes.base import BaseShape
from pptx.shapes.group import GroupShape
from pptx.util import Inches, Pt

from services.translation_service import translation_service
class PowerPointTranslator:
    """Translate the text of a PowerPoint presentation in place.

    Text is gathered run by run (so run-level formatting is left alone),
    sent to the translation service as a single batch, and written back
    through per-run setter callbacks.
    """

    def __init__(self):
        # Module-level service object; this class uses its translate_batch(),
        # its `provider` attribute and the optional `translate_images` flag.
        self.translation_service = translation_service

    def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
        """Translate every text run of a presentation and save the result.

        Args:
            input_path: Presentation to read.
            output_path: Where the translated presentation is saved.
            target_language: Language identifier handed to the translation
                service unchanged.

        Returns:
            ``output_path``.
        """
        presentation = Presentation(input_path)

        # (original_text, setter) pairs; the setter writes a string back
        # into the run the text came from.
        text_elements: List[Tuple[str, Callable[[str], None]]] = []
        # (picture_shape, slide) pairs; handled separately, not batched.
        image_shapes = []

        for slide in presentation.slides:
            # Check has_notes_slide first: reading slide.notes_slide is only
            # attempted for slides that already have notes.
            if slide.has_notes_slide and slide.notes_slide.notes_text_frame:
                self._collect_from_text_frame(slide.notes_slide.notes_text_frame, text_elements)
            for shape in slide.shapes:
                self._collect_from_shape(shape, text_elements, slide, image_shapes)

        if text_elements:
            texts = [text for text, _ in text_elements]
            print(f"Batch translating {len(texts)} text segments...")
            translated_texts = list(
                self.translation_service.translate_batch(texts, target_language)
            )

            # zip() below stops at the shorter sequence, so a short reply
            # would otherwise leave trailing runs untranslated silently.
            if len(translated_texts) != len(text_elements):
                print(
                    f"Warning: expected {len(text_elements)} translations, "
                    f"got {len(translated_texts)}; unmatched segments are left unchanged"
                )

            for (_, setter), translated in zip(text_elements, translated_texts):
                if translated is None or setter is None:
                    continue
                try:
                    setter(translated)
                except Exception as e:
                    # Best effort: one failing run must not abort the file.
                    print(f"Error applying translation: {e}")

        # Image translation is opt-in via a flag on the service object.
        if getattr(self.translation_service, 'translate_images', False):
            for shape, slide in image_shapes:
                self._translate_image_shape(shape, target_language, slide)

        presentation.save(output_path)
        return output_path

    @staticmethod
    def _shape_type_or_none(shape):
        """Return ``shape.shape_type``, or None if the shape cannot be classified."""
        try:
            return shape.shape_type
        except NotImplementedError:
            # python-pptx raises this for shape kinds it has no type for;
            # such shapes are simply not pictures for our purposes.
            return None

    @staticmethod
    def _make_run_setter(run) -> Callable[[str], None]:
        """Build a callback that overwrites ``run.text`` with the given string."""
        def setter(text):
            run.text = text
        return setter

    def _collect_from_shape(
        self,
        shape: "BaseShape",
        text_elements: List[Tuple[str, Callable[[str], None]]],
        slide=None,
        image_shapes=None,
    ):
        """Collect translatable runs (and pictures) from a shape and its children.

        Args:
            shape: Shape to inspect.
            text_elements: List that ``(text, setter)`` pairs are appended to.
            slide: Slide owning the shape; stored alongside collected pictures.
            image_shapes: Optional list that ``(picture_shape, slide)`` pairs
                are appended to. Pictures are ignored when this is None.
        """
        if shape.has_text_frame:
            self._collect_from_text_frame(shape.text_frame, text_elements)

        # has_table is the documented flag on graphic frames; shape classes
        # without the attribute are treated as "no table".
        if getattr(shape, "has_table", False):
            for row in shape.table.rows:
                for cell in row.cells:
                    self._collect_from_text_frame(cell.text_frame, text_elements)

        # Only classify the shape when the caller actually wants pictures.
        if image_shapes is not None and self._shape_type_or_none(shape) == MSO_SHAPE_TYPE.PICTURE:
            image_shapes.append((shape, slide))

        # Any shape exposing `.shapes` (group shapes) is descended into exactly
        # once. Previously a group was walked by two separate branches, so each
        # child run and picture was collected twice.
        children = getattr(shape, "shapes", None)
        if children is not None:
            for child in children:
                self._collect_from_shape(child, text_elements, slide, image_shapes)

    def _collect_from_text_frame(
        self,
        text_frame,
        text_elements: List[Tuple[str, Callable[[str], None]]],
    ):
        """Append a ``(text, setter)`` pair for every non-blank run in a text frame.

        Frames and paragraphs that contain only whitespace are skipped
        without inspecting their runs.
        """
        if not text_frame.text.strip():
            return

        for paragraph in text_frame.paragraphs:
            if not paragraph.text.strip():
                continue
            for run in paragraph.runs:
                run_text = run.text
                if run_text and run_text.strip():
                    text_elements.append((run_text, self._make_run_setter(run)))

    def _translate_image_shape(self, shape, target_language: str, slide):
        """Add a caption under a picture with the translation of its text.

        Does nothing unless the active provider is an
        ``OllamaTranslationProvider``. Any failure is reported with print()
        and otherwise ignored, so a bad image never aborts the file.

        Args:
            shape: Picture shape whose image is sent to the provider.
            target_language: Language identifier passed to the provider.
            slide: Slide that receives the caption text box.
        """
        from services.translation_service import OllamaTranslationProvider

        provider = self.translation_service.provider
        if not isinstance(provider, OllamaTranslationProvider):
            return

        try:
            image = shape.image
            # delete=False: the provider is given a path, so the file must
            # outlive the handle and is removed explicitly below.
            tmp = tempfile.NamedTemporaryFile(suffix=f'.{image.ext}', delete=False)
            try:
                with tmp:
                    tmp.write(image.blob)
                translated_text = provider.translate_image(tmp.name, target_language)
            finally:
                # Runs on failure too, so the temp file is never left behind.
                os.unlink(tmp.name)

            if translated_text and translated_text.strip():
                # Caption sits just below the picture, same width.
                left = shape.left
                top = shape.top + shape.height + Inches(0.1)
                width = shape.width
                height = Inches(0.5)

                textbox = slide.shapes.add_textbox(left, top, width, height)
                p = textbox.text_frame.paragraphs[0]
                p.text = f"[{translated_text}]"
                p.font.size = Pt(10)
                p.font.italic = True

                print(f"Added image translation: {translated_text[:50]}...")
        except Exception as e:
            print(f"Error translating image: {e}")
# Module-level PowerPointTranslator instance, created once at import time so
# importers can reuse it instead of constructing their own.
pptx_translator = PowerPointTranslator()