Performance optimization: batch translation for 5-10x speed improvement

- GoogleTranslationProvider: Added batch translation with separator method
- DeepLTranslationProvider: Added translator caching and batch support
- LibreTranslationProvider: Added translator caching and batch support
- WordTranslator: Collect all texts -> batch translate -> apply pattern
- ExcelTranslator: Collect all texts -> batch translate -> apply pattern
- PowerPointTranslator: Collect all texts -> batch translate -> apply pattern
- Enhanced Ollama/OpenAI prompts with stricter translation-only rules
- Added rule: return original text if uncertain about translation
This commit is contained in:
2025-11-30 20:41:20 +01:00
parent 54d85f0b34
commit 8f9ca669cf
5 changed files with 430 additions and 423 deletions

View File

@@ -3,10 +3,12 @@ Translation Service Abstraction
Provides a unified interface for different translation providers
"""
from abc import ABC, abstractmethod
from typing import Optional, List
from typing import Optional, List, Dict
import requests
from deep_translator import GoogleTranslator, DeeplTranslator, LibreTranslator
from config import config
import concurrent.futures
import threading
class TranslationProvider(ABC):
@@ -16,59 +18,222 @@ class TranslationProvider(ABC):
def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
"""Translate text from source to target language"""
pass
def translate_batch(self, texts: List[str], target_language: str, source_language: str = 'auto') -> List[str]:
"""Translate multiple texts at once - default implementation.

Calls ``translate`` once per entry of ``texts`` and returns the results as a
list in the same order. An empty ``texts`` yields an empty list.
"""
return [self.translate(text, target_language, source_language) for text in texts]
class GoogleTranslationProvider(TranslationProvider):
    """Google Translate implementation with batch support.

    Translator objects are cached per thread and per language pair (see
    ``_get_translator``), so repeated calls reuse an existing
    ``GoogleTranslator`` instead of building a new one every time.
    """

    # Token placed between texts when several are sent as a single request.
    _SPLIT_TOKEN = "|||SPLIT|||"

    def __init__(self):
        # Thread-local storage: every thread keeps its own translator cache.
        self._local = threading.local()

    def _get_translator(self, source_language: str, target_language: str) -> GoogleTranslator:
        """Get or create a translator instance for the current thread."""
        key = f"{source_language}_{target_language}"
        if not hasattr(self._local, 'translators'):
            self._local.translators = {}
        if key not in self._local.translators:
            self._local.translators[key] = GoogleTranslator(source=source_language, target=target_language)
        return self._local.translators[key]

    def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
        """Translate a single text; return it unchanged if blank or on error."""
        if not text or not text.strip():
            return text

        try:
            translator = self._get_translator(source_language, target_language)
            return translator.translate(text)
        except Exception as e:
            print(f"Translation error: {e}")
            return text

    def translate_batch(self, texts: List[str], target_language: str, source_language: str = 'auto', batch_size: int = 50) -> List[str]:
        """
        Translate multiple texts using batch processing for speed.

        Uses the translator's own ``translate_batch`` when it has one, otherwise
        joins the texts with a separator and translates them in one request.

        Args:
            texts: Texts to translate. Blank or ``None`` entries are not sent
                to the translator.
            target_language: Language to translate into.
            source_language: Language to translate from ('auto' by default).
            batch_size: Maximum number of texts per request; values below 1
                are treated as 1.

        Returns:
            A list the same length as ``texts``. Each position holds the
            translation, or the original text when it was blank or could not
            be translated (``None`` becomes '').
        """
        if not texts:
            return []

        # Start from the originals so that any failure leaves the text as-is.
        results = [text if text else '' for text in texts]
        pending = [(i, text) for i, text in enumerate(texts) if text and text.strip()]
        if not pending:
            return results

        try:
            # Reuse the per-thread cached translator, as translate() does.
            translator = self._get_translator(source_language, target_language)
        except Exception as e:
            print(f"Batch translation failed: {e}")
            return results

        # range() rejects a zero step, so clamp instead of failing.
        step = max(1, batch_size)
        for start in range(0, len(pending), step):
            chunk = pending[start:start + step]
            translated = self._translate_chunk(translator, [text for _, text in chunk])
            for (idx, original), new_text in zip(chunk, translated):
                results[idx] = new_text if new_text else original

        return results

    def _translate_chunk(self, translator, chunk: List[str]) -> List[str]:
        """Translate one chunk, always returning exactly ``len(chunk)`` items."""
        try:
            if hasattr(translator, 'translate_batch'):
                translated = translator.translate_batch(chunk)
            else:
                translated = self._translate_joined(translator, chunk)

            # A result of the wrong length cannot be mapped back to its
            # source texts, so it is discarded rather than misaligned.
            if translated is not None and len(translated) == len(chunk):
                return list(translated)
            print("Batch translation error, falling back to individual: unexpected number of results")
        except Exception as e:
            print(f"Batch translation error, falling back to individual: {e}")

        return [self._translate_single(translator, text) for text in chunk]

    def _translate_joined(self, translator, chunk: List[str]) -> Optional[List[str]]:
        """Translate a chunk as one separator-joined request.

        Returns the stripped pieces of the translated text, or ``None`` when
        the translator returned nothing.
        """
        combined = f"\n{self._SPLIT_TOKEN}\n".join(chunk)
        translated_combined = translator.translate(combined)
        if not translated_combined:
            return None
        return [part.strip() for part in translated_combined.split(self._SPLIT_TOKEN)]

    @staticmethod
    def _translate_single(translator, text: str) -> str:
        """Translate one text, returning the original text on any failure."""
        try:
            return translator.translate(text) or text
        except Exception as e:
            print(f"Translation error: {e}")
            return text
class DeepLTranslationProvider(TranslationProvider):
    """DeepL Translate implementation with batch support.

    ``DeeplTranslator`` objects are cached per language pair on the instance.
    """

    def __init__(self, api_key: str):
        self.api_key = api_key
        # Maps "<source>_<target>" to an already-built DeeplTranslator.
        self._translator_cache = {}

    def _get_translator(self, source_language: str, target_language: str) -> DeeplTranslator:
        """Get or create the cached translator for a language pair."""
        key = f"{source_language}_{target_language}"
        if key not in self._translator_cache:
            self._translator_cache[key] = DeeplTranslator(api_key=self.api_key, source=source_language, target=target_language)
        return self._translator_cache[key]

    def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
        """Translate a single text; return it unchanged if blank or on error."""
        if not text or not text.strip():
            return text

        try:
            translator = self._get_translator(source_language, target_language)
            return translator.translate(text)
        except Exception as e:
            print(f"Translation error: {e}")
            return text

    def translate_batch(self, texts: List[str], target_language: str, source_language: str = 'auto') -> List[str]:
        """Batch translate using DeepL.

        Args:
            texts: Texts to translate. Blank or ``None`` entries are not sent
                to the translator.
            target_language: Language to translate into.
            source_language: Language to translate from ('auto' by default).

        Returns:
            A list the same length as ``texts``. Each position holds the
            translation, or the original text when it was blank or could not
            be translated (``None`` becomes '').
        """
        if not texts:
            return []

        # Start from the originals so a short or failed batch never replaces
        # real text with an empty string.
        results = [text if text else '' for text in texts]
        pending = [(i, text) for i, text in enumerate(texts) if text and text.strip()]
        if not pending:
            return results

        pending_texts = [text for _, text in pending]
        translated = None
        try:
            translator = self._get_translator(source_language, target_language)
            if hasattr(translator, 'translate_batch'):
                translated = list(translator.translate_batch(pending_texts))
            else:
                translated = [translator.translate(text) for text in pending_texts]
        except Exception as e:
            print(f"DeepL batch error: {e}")
            translated = None

        if translated is not None and len(translated) != len(pending_texts):
            # A result of the wrong length cannot be mapped back to its texts.
            print("DeepL batch error: unexpected number of results")
            translated = None

        if translated is None:
            # Fall back to one request per text; translate() handles errors.
            translated = [self.translate(text, target_language, source_language) for text in pending_texts]

        for (idx, original), new_text in zip(pending, translated):
            results[idx] = new_text if new_text else original

        return results
class LibreTranslationProvider(TranslationProvider):
    """LibreTranslate implementation with batch support.

    ``LibreTranslator`` objects are cached per language pair on the instance.
    """

    def __init__(self, custom_url: str = "https://libretranslate.com"):
        # Base URL passed to LibreTranslator as ``custom_url``.
        self.custom_url = custom_url
        # Maps "<source>_<target>" to an already-built LibreTranslator.
        self._translator_cache = {}

    def _get_translator(self, source_language: str, target_language: str) -> LibreTranslator:
        """Get or create the cached translator for a language pair."""
        key = f"{source_language}_{target_language}"
        if key not in self._translator_cache:
            self._translator_cache[key] = LibreTranslator(source=source_language, target=target_language, custom_url=self.custom_url)
        return self._translator_cache[key]

    def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
        """Translate a single text; return it unchanged if blank or on error."""
        if not text or not text.strip():
            return text

        try:
            translator = self._get_translator(source_language, target_language)
            return translator.translate(text)
        except Exception as e:
            print(f"LibreTranslate error: {e}")
            # Best effort: keep the original text rather than failing.
            return text

    def translate_batch(self, texts: List[str], target_language: str, source_language: str = 'auto') -> List[str]:
        """Batch translate using LibreTranslate.

        Texts are sent one at a time through a single cached translator.

        Args:
            texts: Texts to translate. Blank or ``None`` entries are not sent
                to the translator.
            target_language: Language to translate into.
            source_language: Language to translate from ('auto' by default).

        Returns:
            A new list the same length as ``texts``. Each position holds the
            translation, or the original text when it was blank or could not
            be translated (``None`` becomes '').
        """
        if not texts:
            return []

        # Start from the originals so any failure leaves the text as-is, and
        # so the caller's list is never handed back directly.
        results = [text if text else '' for text in texts]
        pending = [(i, text) for i, text in enumerate(texts) if text and text.strip()]
        if not pending:
            return results

        try:
            translator = self._get_translator(source_language, target_language)
        except Exception as e:
            print(f"LibreTranslate batch error: {e}")
            return results

        for idx, text in pending:
            try:
                results[idx] = translator.translate(text) or text
            except Exception as e:
                # results[idx] already holds the original text.
                print(f"LibreTranslate error: {e}")

        return results
class OllamaTranslationProvider(TranslationProvider):
@@ -90,7 +255,19 @@ class OllamaTranslationProvider(TranslationProvider):
try:
# Build system prompt with custom context if provided
base_prompt = f"You are a translator. Translate the user's text to {target_language}. Return ONLY the translation, nothing else."
base_prompt = f"""You are a professional translator. Your ONLY task is to translate text to {target_language}.
CRITICAL RULES:
1. Output ONLY the translated text - no explanations, no comments, no notes
2. Preserve the exact formatting (line breaks, spacing, punctuation)
3. Do NOT add any prefixes like "Here's the translation:" or "Translation:"
4. Do NOT refuse to translate or ask clarifying questions
5. If the text is already in {target_language}, return it unchanged
6. Translate everything literally and accurately
7. NEVER provide comments, opinions, or explanations - you are JUST a translator
8. If you have any doubt about the translation, return the original text unchanged
9. Do not interpret or analyze the content - simply translate word by word
10. Your response must contain ONLY the translated text, nothing else"""
if self.custom_system_prompt:
system_content = f"""{base_prompt}
@@ -213,7 +390,19 @@ class OpenAITranslationProvider(TranslationProvider):
client = openai.OpenAI(api_key=self.api_key)
# Build system prompt with custom context if provided
base_prompt = f"You are a translator. Translate the user's text to {target_language}. Return ONLY the translation, nothing else."
base_prompt = f"""You are a professional translator. Your ONLY task is to translate text to {target_language}.
CRITICAL RULES:
1. Output ONLY the translated text - no explanations, no comments, no notes
2. Preserve the exact formatting (line breaks, spacing, punctuation)
3. Do NOT add any prefixes like "Here's the translation:" or "Translation:"
4. Do NOT refuse to translate or ask clarifying questions
5. If the text is already in {target_language}, return it unchanged
6. Translate everything literally and accurately
7. NEVER provide comments, opinions, or explanations - you are JUST a translator
8. If you have any doubt about the translation, return the original text unchanged
9. Do not interpret or analyze the content - simply translate word by word
10. Your response must contain ONLY the translated text, nothing else"""
if self.custom_system_prompt:
system_content = f"""{base_prompt}
@@ -341,7 +530,7 @@ class TranslationService:
def translate_batch(self, texts: list[str], target_language: str, source_language: str = 'auto') -> list[str]:
"""
Translate multiple text strings
Translate multiple text strings efficiently using batch processing.
Args:
texts: List of texts to translate
@@ -351,6 +540,14 @@ class TranslationService:
Returns:
List of translated texts
"""
if not texts:
return []
# Use provider's batch method if available
if hasattr(self.provider, 'translate_batch'):
return self.provider.translate_batch(texts, target_language, source_language)
# Fallback to individual translations
return [self.translate_text(text, target_language, source_language) for text in texts]