281 lines
11 KiB
Python
281 lines
11 KiB
Python
"""
|
|
Translation Benchmark Script
|
|
Measures translation performance on text equivalent to a 200-page document
|
|
"""
|
|
import time
|
|
import random
|
|
import statistics
|
|
import sys
|
|
import os
|
|
|
|
# Add project root to path
|
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
|
|
from services.translation_service import (
|
|
GoogleTranslationProvider,
|
|
TranslationService,
|
|
_translation_cache
|
|
)
|
|
|
|
|
|
# Sample texts of varying complexity (simulating real document content).
# The benchmark functions in this file draw segments from this pool with
# random.choice, so every entry should contain at least one word.
SAMPLE_TEXTS = [
    # Short, single-sentence segments
    "Welcome to our company",
    "Please review the attached document",
    "The quarterly results exceeded expectations",
    "Meeting scheduled for next Monday at 10 AM",
    "Thank you for your continued support",
    "This report contains confidential information",
    "Please contact customer support for assistance",
    "The project deadline has been extended",
    "Annual revenue increased by 15% compared to last year",
    "Our team is committed to delivering excellence",
    "The new product launch was a great success",
    "Please find the updated specifications attached",
    "We appreciate your patience during this transition",
    "The contract terms have been finalized",
    "Quality assurance testing is now complete",
    "The budget allocation has been approved",
    "Employee satisfaction survey results are available",
    "The system maintenance is scheduled for this weekend",
    "Our partnership continues to grow stronger",
    "The training program will begin next month",
    "Customer feedback has been overwhelmingly positive",
    "The risk assessment has been completed",
    "Strategic planning session is confirmed",
    "Performance metrics indicate steady improvement",
    "The compliance audit was successful",
    "Innovation remains our top priority",
    "Market analysis shows promising trends",
    "The implementation phase is on track",
    "Stakeholder engagement continues to increase",
    "Operational efficiency has improved significantly",
    # Longer paragraphs
    "In accordance with the terms of our agreement, we are pleased to inform you that all deliverables have been completed on schedule and within budget.",
    "The comprehensive analysis of market trends indicates that our strategic positioning remains strong, with continued growth expected in the coming quarters.",
    "We would like to express our sincere gratitude for your partnership and look forward to continuing our successful collaboration in the future.",
    "Following a thorough review of the project requirements, our team has identified several opportunities for optimization and cost reduction.",
    "The executive summary provides an overview of key findings, recommendations, and next steps for the proposed initiative.",
]
|
|
|
|
# Average words per page (standard document); also used to convert word
# counts back into page counts when reporting results.
WORDS_PER_PAGE = 250
# Target: 200 pages
TARGET_PAGES = 200
# Total number of words the full benchmark generates.
TARGET_WORDS = WORDS_PER_PAGE * TARGET_PAGES  # 50,000 words
|
|
|
|
|
|
def generate_document_content(
    target_words: int,
    *,
    samples: "list[str] | None" = None,
    seed: "int | None" = None,
) -> list[str]:
    """Generate text segments simulating a multi-page document.

    Segments are drawn at random (with replacement) from the sample pool
    until their combined whitespace-separated word count reaches
    ``target_words``.

    Args:
        target_words: Minimum total number of words to generate. A value of
            zero or less yields an empty list.
        samples: Pool of texts to draw from. Defaults to the module-level
            ``SAMPLE_TEXTS``.
        seed: When given, a private ``random.Random(seed)`` is used so the
            generated document is reproducible. When omitted, the shared
            module-level random generator is used, as before.

    Returns:
        The list of chosen segments, in the order they were drawn.

    Raises:
        ValueError: If words are requested but the pool is empty or contains
            no words at all (the loop could never reach the target).
    """
    if target_words <= 0:
        return []

    pool = SAMPLE_TEXTS if samples is None else samples

    # Count the words of each sample once instead of re-splitting the same
    # text every time it is drawn.
    weighted_pool = [(text, len(text.split())) for text in pool]
    if not any(word_count for _, word_count in weighted_pool):
        raise ValueError("samples must contain at least one text with words")

    chooser = random.Random(seed) if seed is not None else random

    segments = []
    current_words = 0
    while current_words < target_words:
        text, word_count = chooser.choice(weighted_pool)
        segments.append(text)
        current_words += word_count

    return segments
|
|
|
|
|
|
def _per_second(amount: float, seconds: float) -> float:
    """Return ``amount / seconds``, or 0.0 when no measurable time elapsed."""
    return amount / seconds if seconds > 0 else 0.0


def run_benchmark(target_language: str = "fr", use_cache: bool = True):
    """Run the full translation benchmark and print a report to stdout.

    The run has three stages:

    1. One ``translate_text`` call per generated segment.
    2. ``translate_batch`` over the same segments in chunks of 50, 100 and
       200, with the shared translation cache cleared before each size.
    3. Two consecutive ``translate_batch`` passes over the whole document,
       with the cache cleared only before the first, to compare a first
       pass against a repeated pass.

    Args:
        target_language: Language code passed to every translation call.
        use_cache: When False, the shared translation cache is cleared once
            before the document is generated. The cache is always cleared
            again before stages 2 and 3 regardless of this flag.
    """
    banner = "=" * 60
    rule = "-" * 40

    print(banner)
    print(f"TRANSLATION BENCHMARK - {TARGET_PAGES} PAGES")
    print(banner)
    print(f"Target: {TARGET_PAGES} pages (~{TARGET_WORDS:,} words)")
    print(f"Target language: {target_language}")
    print(f"Cache enabled: {use_cache}")
    print()

    # Clear cache if needed
    if not use_cache:
        _translation_cache.clear()

    # Generate document content
    print("Generating document content...")
    segments = generate_document_content(TARGET_WORDS)
    total_words = sum(len(s.split()) for s in segments)
    total_chars = sum(len(s) for s in segments)
    total_pages = total_words / WORDS_PER_PAGE

    print(f"Generated {len(segments):,} text segments")
    print(f"Total words: {total_words:,}")
    print(f"Total characters: {total_chars:,}")
    print(f"Estimated pages: {total_pages:.1f}")
    print()

    # Initialize translation service
    provider = GoogleTranslationProvider()
    service = TranslationService(provider)

    # Warm-up call so one-time setup cost is kept out of the timed sections.
    print("Warming up...")
    service.translate_text("Hello world", target_language)
    print()

    # Benchmark 1: Individual translations
    print(rule)
    print("TEST 1: Individual Translations")
    print(rule)

    # perf_counter is monotonic and high-resolution, so short intervals are
    # not rounded down to zero the way wall-clock time can be.
    start_time = time.perf_counter()
    for done, text in enumerate(segments, start=1):
        service.translate_text(text, target_language)
        if done % 500 == 0:
            rate = _per_second(done, time.perf_counter() - start_time)
            print(f" Progress: {done:,}/{len(segments):,} ({rate:.1f} segments/sec)")

    individual_time = time.perf_counter() - start_time
    individual_rate = _per_second(len(segments), individual_time)
    individual_words_per_sec = _per_second(total_words, individual_time)
    individual_pages_per_min = _per_second(total_pages, individual_time) * 60

    print(f"\n Total time: {individual_time:.2f} seconds")
    print(f" Rate: {individual_rate:.1f} segments/second")
    print(f" Words/second: {individual_words_per_sec:.1f}")
    print(f" Pages/minute: {individual_pages_per_min:.1f}")

    # Get cache stats after individual translations
    cache_stats_1 = _translation_cache.stats()
    print(f" Cache: {cache_stats_1}")
    print()

    # Benchmark 2: Batch translations
    print(rule)
    print("TEST 2: Batch Translations")
    print(rule)

    # Measured wall time per batch size, kept for the final summary.
    batch_times: dict[int, float] = {}

    for batch_size in (50, 100, 200):
        # Clear cache so every batch size starts from the same state.
        _translation_cache.clear()

        start_time = time.perf_counter()
        translated = 0

        for offset in range(0, len(segments), batch_size):
            batch = segments[offset:offset + batch_size]
            results = service.translate_batch(batch, target_language)
            translated += len(results)

            # Report roughly once per 1,000 translated segments.
            if translated % 1000 < batch_size:
                rate = _per_second(translated, time.perf_counter() - start_time)
                print(f" [batch={batch_size}] Progress: {translated:,}/{len(segments):,} ({rate:.1f} seg/sec)")

        batch_time = time.perf_counter() - start_time
        batch_times[batch_size] = batch_time

        batch_rate = _per_second(len(segments), batch_time)
        batch_words_per_sec = _per_second(total_words, batch_time)
        batch_pages_per_min = _per_second(total_pages, batch_time) * 60
        speedup = _per_second(individual_time, batch_time)

        cache_stats = _translation_cache.stats()

        print(f"\n Batch size: {batch_size}")
        print(f" Total time: {batch_time:.2f} seconds")
        print(f" Rate: {batch_rate:.1f} segments/second")
        print(f" Words/second: {batch_words_per_sec:.1f}")
        print(f" Pages/minute: {batch_pages_per_min:.1f}")
        print(f" Speedup vs individual: {speedup:.2f}x")
        print(f" Cache: {cache_stats}")
        print()

    # Benchmark 3: With cache (simulating re-translation of similar content)
    print(rule)
    print("TEST 3: Cache Performance (Re-translation)")
    print(rule)

    # First pass - populate cache
    _translation_cache.clear()
    print(" First pass (populating cache)...")
    start_time = time.perf_counter()
    service.translate_batch(segments, target_language)
    first_pass_time = time.perf_counter() - start_time

    cache_after_first = _translation_cache.stats()
    print(f" First pass time: {first_pass_time:.2f} seconds")
    print(f" Cache after first pass: {cache_after_first}")

    # Second pass - should use cache
    print("\n Second pass (using cache)...")
    start_time = time.perf_counter()
    service.translate_batch(segments, target_language)
    second_pass_time = time.perf_counter() - start_time

    cache_after_second = _translation_cache.stats()
    cache_speedup = first_pass_time / second_pass_time if second_pass_time > 0 else float('inf')

    print(f" Second pass time: {second_pass_time:.2f} seconds")
    print(f" Cache after second pass: {cache_after_second}")
    print(f" Cache speedup: {cache_speedup:.1f}x")
    print()

    # Summary: report what was measured rather than fixed estimates.
    fastest_batch_size = min(batch_times, key=batch_times.get)

    print(banner)
    print("BENCHMARK SUMMARY")
    print(banner)
    print(f"Document size: {TARGET_PAGES} pages ({total_words:,} words)")
    print(f"Text segments: {len(segments):,}")
    print()
    print(f"Individual translation: {individual_time:.1f}s ({individual_pages_per_min:.1f} pages/min)")
    for batch_size, batch_time in batch_times.items():
        print(f"Batch translation ({batch_size}): {batch_time:.1f}s measured")
    print(f"With cache (2nd pass): {second_pass_time:.2f}s ({cache_speedup:.1f}x faster)")
    print()
    print("Recommendations:")
    print(f" - Use batch_size={fastest_batch_size} for optimal API performance (fastest in this run)")
    print(" - Enable caching for documents with repetitive content")
    print(" - For 200 pages, expect ~2-5 minutes with Google Translate")
    print(banner)
|
|
|
|
|
|
def quick_benchmark(num_segments: int = 100, target_language: str = "fr"):
    """Run a small batch-translation benchmark and print the results.

    Draws ``num_segments`` random entries from ``SAMPLE_TEXTS``, clears the
    shared translation cache, translates everything with a single
    ``translate_batch`` call, and prints the elapsed time, throughput, cache
    statistics and up to three sample translations.

    Args:
        num_segments: Number of random segments to translate.
        target_language: Language code passed to the translation service.
    """
    print(f"Quick benchmark: {num_segments} segments to {target_language}")
    print("-" * 40)

    provider = GoogleTranslationProvider()
    service = TranslationService(provider)

    # Generate test content
    segments = [random.choice(SAMPLE_TEXTS) for _ in range(num_segments)]

    # Test batch translation. perf_counter is monotonic and high-resolution,
    # which matters for a run this short.
    _translation_cache.clear()
    start = time.perf_counter()
    results = service.translate_batch(segments, target_language)
    elapsed = time.perf_counter() - start

    # Guard the division: a tiny run can finish in no measurable time.
    rate = len(results) / elapsed if elapsed > 0 else 0.0

    print(f"Translated {len(results)} segments in {elapsed:.2f}s")
    print(f"Rate: {rate:.1f} segments/second")
    print(f"Cache: {_translation_cache.stats()}")

    # Show sample translations (at most three source/result pairs).
    print("\nSample translations:")
    for source, translated in zip(segments[:3], results):
        print(f" '{source}' -> '{translated}'")
|
|
|
|
|
|
def build_arg_parser():
    """Build the command-line parser for the benchmark script.

    ``--quick`` and ``--full`` select the mode and are mutually exclusive, so
    a contradictory command line is rejected instead of silently running the
    full benchmark. With neither flag the quick benchmark runs.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    # Imported here so the module does not need argparse unless it is run
    # (or inspected) as a command-line tool.
    import argparse

    parser = argparse.ArgumentParser(description="Translation Benchmark")

    mode = parser.add_mutually_exclusive_group()
    mode.add_argument("--quick", action="store_true", help="Run quick benchmark (100 segments)")
    mode.add_argument("--full", action="store_true", help="Run full 200-page benchmark")

    parser.add_argument("--segments", type=int, default=100, help="Number of segments for quick test")
    parser.add_argument("--lang", type=str, default="fr", help="Target language code")
    return parser


if __name__ == "__main__":
    args = build_arg_parser().parse_args()

    # The quick benchmark is the default; only --full switches to the long run.
    if args.full:
        run_benchmark(target_language=args.lang)
    else:
        quick_benchmark(num_segments=args.segments, target_language=args.lang)
|