"""
Translation Benchmark Script

Tests translation performance for 200 pages equivalent of text.
"""

import time
import random
import statistics
import sys
import os

# Add project root to path (must run before the project-local import below)
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from services.translation_service import (
    GoogleTranslationProvider,
    TranslationService,
    _translation_cache,
)

# Sample texts of varying complexity (simulating real document content)
SAMPLE_TEXTS = [
    "Welcome to our company",
    "Please review the attached document",
    "The quarterly results exceeded expectations",
    "Meeting scheduled for next Monday at 10 AM",
    "Thank you for your continued support",
    "This report contains confidential information",
    "Please contact customer support for assistance",
    "The project deadline has been extended",
    "Annual revenue increased by 15% compared to last year",
    "Our team is committed to delivering excellence",
    "The new product launch was a great success",
    "Please find the updated specifications attached",
    "We appreciate your patience during this transition",
    "The contract terms have been finalized",
    "Quality assurance testing is now complete",
    "The budget allocation has been approved",
    "Employee satisfaction survey results are available",
    "The system maintenance is scheduled for this weekend",
    "Our partnership continues to grow stronger",
    "The training program will begin next month",
    "Customer feedback has been overwhelmingly positive",
    "The risk assessment has been completed",
    "Strategic planning session is confirmed",
    "Performance metrics indicate steady improvement",
    "The compliance audit was successful",
    "Innovation remains our top priority",
    "Market analysis shows promising trends",
    "The implementation phase is on track",
    "Stakeholder engagement continues to increase",
    "Operational efficiency has improved significantly",
    # Longer paragraphs
    "In accordance with the terms of our agreement, we are pleased to inform you that all deliverables have been completed on schedule and within budget.",
    "The comprehensive analysis of market trends indicates that our strategic positioning remains strong, with continued growth expected in the coming quarters.",
    "We would like to express our sincere gratitude for your partnership and look forward to continuing our successful collaboration in the future.",
    "Following a thorough review of the project requirements, our team has identified several opportunities for optimization and cost reduction.",
    "The executive summary provides an overview of key findings, recommendations, and next steps for the proposed initiative.",
]

# Average words per page (standard document)
WORDS_PER_PAGE = 250

# Target: 200 pages
TARGET_PAGES = 200
TARGET_WORDS = WORDS_PER_PAGE * TARGET_PAGES  # 50,000 words


def _rate(amount: float, seconds: float) -> float:
    """Return ``amount`` per second, or 0.0 when no measurable time elapsed.

    Guards every throughput figure against ZeroDivisionError, which can
    happen when a run is served entirely from cache or finishes below the
    timer resolution.
    """
    return amount / seconds if seconds > 0 else 0.0


def generate_document_content(target_words: int) -> list[str]:
    """Generate a list of text segments simulating a multi-page document.

    Segments are drawn at random (with repetition) from SAMPLE_TEXTS until
    their combined whitespace-separated word count reaches ``target_words``.

    Args:
        target_words: Minimum total number of words to generate. A value
            of zero or less yields an empty list.

    Returns:
        The chosen segments, in the order they were drawn.
    """
    segments = []
    current_words = 0

    while current_words < target_words:
        # Pick a random sample text
        text = random.choice(SAMPLE_TEXTS)
        segments.append(text)
        current_words += len(text.split())

    return segments


def run_benchmark(target_language: str = "fr", use_cache: bool = True):
    """Run the full translation benchmark and print a report.

    Three phases are timed against the same generated document:
    one-by-one translation, batch translation at several batch sizes, and
    a repeated batch pass that measures the effect of the translation cache.

    Args:
        target_language: Language code passed to the translation service.
        use_cache: When False, the shared translation cache is cleared once
            before the run starts. Later phases clear and repopulate the
            cache themselves regardless of this flag.
    """
    print("=" * 60)
    print("TRANSLATION BENCHMARK - 200 PAGES")
    print("=" * 60)
    print(f"Target: {TARGET_PAGES} pages (~{TARGET_WORDS:,} words)")
    print(f"Target language: {target_language}")
    print(f"Cache enabled: {use_cache}")
    print()

    # Clear cache if needed
    if not use_cache:
        _translation_cache.clear()

    # Generate document content
    print("Generating document content...")
    segments = generate_document_content(TARGET_WORDS)
    total_words = sum(len(s.split()) for s in segments)
    total_chars = sum(len(s) for s in segments)
    total_pages = total_words / WORDS_PER_PAGE

    print(f"Generated {len(segments):,} text segments")
    print(f"Total words: {total_words:,}")
    print(f"Total characters: {total_chars:,}")
    print(f"Estimated pages: {total_pages:.1f}")
    print()

    # Initialize translation service
    provider = GoogleTranslationProvider()
    service = TranslationService(provider)

    # Warm-up (optional)
    print("Warming up...")
    _ = service.translate_text("Hello world", target_language)
    print()

    # Benchmark 1: Individual translations
    print("-" * 40)
    print("TEST 1: Individual Translations")
    print("-" * 40)

    # perf_counter is monotonic, so system clock adjustments cannot skew
    # (or make negative) the measured intervals.
    start_time = time.perf_counter()
    translated_individual = []

    for done, text in enumerate(segments, start=1):
        result = service.translate_text(text, target_language)
        translated_individual.append(result)

        if done % 500 == 0:
            elapsed = time.perf_counter() - start_time
            rate = _rate(done, elapsed)
            print(f" Progress: {done:,}/{len(segments):,} ({rate:.1f} segments/sec)")

    individual_time = time.perf_counter() - start_time
    individual_rate = _rate(len(segments), individual_time)
    individual_words_per_sec = _rate(total_words, individual_time)
    individual_pages_per_min = _rate(total_pages, individual_time) * 60

    print(f"\n Total time: {individual_time:.2f} seconds")
    print(f" Rate: {individual_rate:.1f} segments/second")
    print(f" Words/second: {individual_words_per_sec:.1f}")
    print(f" Pages/minute: {individual_pages_per_min:.1f}")

    # Get cache stats after individual translations
    cache_stats_1 = _translation_cache.stats()
    print(f" Cache: {cache_stats_1}")
    print()

    # Clear cache for fair comparison
    _translation_cache.clear()

    # Benchmark 2: Batch translations
    print("-" * 40)
    print("TEST 2: Batch Translations")
    print("-" * 40)

    batch_sizes = [50, 100, 200]
    # Measured wall time per batch size, reported again in the summary.
    batch_times: dict[int, float] = {}

    for batch_size in batch_sizes:
        # Every batch size starts from an empty cache so runs are comparable.
        _translation_cache.clear()

        start_time = time.perf_counter()
        translated_batch = []

        for offset in range(0, len(segments), batch_size):
            batch = segments[offset:offset + batch_size]
            results = service.translate_batch(batch, target_language)
            translated_batch.extend(results)

            # Print roughly once per 1000 translated segments.
            if len(translated_batch) % 1000 < batch_size:
                elapsed = time.perf_counter() - start_time
                rate = _rate(len(translated_batch), elapsed)
                print(
                    f" [batch={batch_size}] Progress: "
                    f"{len(translated_batch):,}/{len(segments):,} ({rate:.1f} seg/sec)"
                )

        batch_time = time.perf_counter() - start_time
        batch_times[batch_size] = batch_time
        batch_rate = _rate(len(segments), batch_time)
        batch_words_per_sec = _rate(total_words, batch_time)
        batch_pages_per_min = _rate(total_pages, batch_time) * 60
        speedup = individual_time / batch_time if batch_time > 0 else 0

        cache_stats = _translation_cache.stats()

        print(f"\n Batch size: {batch_size}")
        print(f" Total time: {batch_time:.2f} seconds")
        print(f" Rate: {batch_rate:.1f} segments/second")
        print(f" Words/second: {batch_words_per_sec:.1f}")
        print(f" Pages/minute: {batch_pages_per_min:.1f}")
        print(f" Speedup vs individual: {speedup:.2f}x")
        print(f" Cache: {cache_stats}")
        print()

    # Benchmark 3: With cache (simulating re-translation of similar content)
    print("-" * 40)
    print("TEST 3: Cache Performance (Re-translation)")
    print("-" * 40)

    # First pass - populate cache
    _translation_cache.clear()
    print(" First pass (populating cache)...")
    start_time = time.perf_counter()
    _ = service.translate_batch(segments, target_language)
    first_pass_time = time.perf_counter() - start_time

    cache_after_first = _translation_cache.stats()
    print(f" First pass time: {first_pass_time:.2f} seconds")
    print(f" Cache after first pass: {cache_after_first}")

    # Second pass - should use cache
    print("\n Second pass (using cache)...")
    start_time = time.perf_counter()
    _ = service.translate_batch(segments, target_language)
    second_pass_time = time.perf_counter() - start_time

    cache_after_second = _translation_cache.stats()
    cache_speedup = (
        first_pass_time / second_pass_time if second_pass_time > 0 else float('inf')
    )

    print(f" Second pass time: {second_pass_time:.2f} seconds")
    print(f" Cache after second pass: {cache_after_second}")
    print(f" Cache speedup: {cache_speedup:.1f}x")
    print()

    # Summary
    print("=" * 60)
    print("BENCHMARK SUMMARY")
    print("=" * 60)
    print(f"Document size: {TARGET_PAGES} pages ({total_words:,} words)")
    print(f"Text segments: {len(segments):,}")
    print()
    print(
        f"Individual translation: {individual_time:.1f}s "
        f"({individual_pages_per_min:.1f} pages/min)"
    )
    # Report the batch timings that were actually measured in Test 2
    # instead of a guessed fraction of the individual time.
    for size, seconds in batch_times.items():
        pages_per_min = _rate(total_pages, seconds) * 60
        print(f"Batch translation ({size}): {seconds:.1f}s ({pages_per_min:.1f} pages/min)")
    print(f"With cache (2nd pass): {second_pass_time:.2f}s ({cache_speedup:.1f}x faster)")
    print()

    # Recommend the batch size that was fastest in this run.
    best_batch_size = min(batch_times, key=batch_times.get)
    print("Recommendations:")
    print(f" - Use batch_size={best_batch_size} (fastest measured in this run)")
    print(" - Enable caching for documents with repetitive content")
    print(" - For 200 pages, expect ~2-5 minutes with Google Translate")
    print("=" * 60)


def quick_benchmark(num_segments: int = 100, target_language: str = "fr"):
    """Quick benchmark with fewer segments for testing.

    Translates ``num_segments`` randomly chosen sample texts in a single
    batch call, then prints the elapsed time, throughput, cache statistics
    and up to three sample translations.

    Args:
        num_segments: Number of random sample texts to translate.
        target_language: Language code passed to the translation service.
    """
    print(f"Quick benchmark: {num_segments} segments to {target_language}")
    print("-" * 40)

    provider = GoogleTranslationProvider()
    service = TranslationService(provider)

    # Generate test content
    segments = [random.choice(SAMPLE_TEXTS) for _ in range(num_segments)]

    # Test batch translation
    _translation_cache.clear()
    start = time.perf_counter()
    results = service.translate_batch(segments, target_language)
    elapsed = time.perf_counter() - start

    print(f"Translated {len(results)} segments in {elapsed:.2f}s")
    # _rate avoids ZeroDivisionError for empty or sub-resolution runs.
    print(f"Rate: {_rate(len(results), elapsed):.1f} segments/second")
    print(f"Cache: {_translation_cache.stats()}")

    # Show sample translations
    print("\nSample translations:")
    for source_text, translated in zip(segments[:3], results):
        print(f" '{source_text}' -> '{translated}'")


def main(argv=None):
    """Parse command-line options and run the selected benchmark.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    The quick benchmark is the default. ``--full`` selects the 200-page
    run; ``--quick`` is accepted but has no additional effect.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Translation Benchmark")
    parser.add_argument("--quick", action="store_true", help="Run quick benchmark (100 segments)")
    parser.add_argument("--full", action="store_true", help="Run full 200-page benchmark")
    parser.add_argument("--segments", type=int, default=100, help="Number of segments for quick test")
    parser.add_argument("--lang", type=str, default="fr", help="Target language code")

    args = parser.parse_args(argv)

    if args.full:
        run_benchmark(target_language=args.lang)
    else:
        quick_benchmark(num_segments=args.segments, target_language=args.lang)


if __name__ == "__main__":
    main()