1010 lines
39 KiB
Python
1010 lines
39 KiB
Python
"""
|
|
Document Translation API
|
|
FastAPI application for translating complex documents while preserving formatting
|
|
SaaS-ready with rate limiting, validation, and robust error handling
|
|
"""
|
|
from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request, Depends, Header
|
|
from fastapi.responses import FileResponse, JSONResponse
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from fastapi.staticfiles import StaticFiles
|
|
from fastapi.security import HTTPBasic, HTTPBasicCredentials
|
|
from contextlib import asynccontextmanager
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import secrets
|
|
import hashlib
|
|
import time
|
|
|
|
from config import config
|
|
from translators import excel_translator, word_translator, pptx_translator
|
|
from utils import file_handler, handle_translation_error, DocumentProcessingError
|
|
from services.translation_service import _translation_cache
|
|
|
|
# Import auth routes
|
|
from routes.auth_routes import router as auth_router
|
|
|
|
# Import SaaS middleware
|
|
from middleware.rate_limiting import RateLimitMiddleware, RateLimitManager, RateLimitConfig
|
|
from middleware.security import SecurityHeadersMiddleware, RequestLoggingMiddleware, ErrorHandlingMiddleware
|
|
from middleware.cleanup import FileCleanupManager, MemoryMonitor, HealthChecker, create_cleanup_manager
|
|
from middleware.validation import FileValidator, LanguageValidator, ProviderValidator, InputSanitizer, ValidationError
|
|
|
|
# Configure structured logging.
# LOG_LEVEL is normalised to upper case and validated, so a lower-case value
# ("debug") or a typo falls back to INFO instead of raising AttributeError
# while the module is being imported.
_log_level_name = os.getenv("LOG_LEVEL", "INFO").strip().upper()
_log_level = getattr(logging, _log_level_name, None)
if not isinstance(_log_level, int):
    _log_level = logging.INFO

logging.basicConfig(
    level=_log_level,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
|
|
|
# ============== Admin Authentication ==============
# All values below are read from the environment once, at import time.
ADMIN_USERNAME: str = os.environ.get("ADMIN_USERNAME", "admin")
# Hex-encoded SHA256 digest of the admin password; empty string when unset.
ADMIN_PASSWORD_HASH: str = os.environ.get("ADMIN_PASSWORD_HASH", "")
# Plain-text password with an insecure default (change in production!).
ADMIN_PASSWORD: str = os.environ.get("ADMIN_PASSWORD", "changeme123")
# A random per-process secret is generated when ADMIN_TOKEN_SECRET is unset.
ADMIN_TOKEN_SECRET: str = os.environ.get("ADMIN_TOKEN_SECRET", secrets.token_hex(32))

# Active admin sessions, keyed by token; each value is the expiry timestamp.
admin_sessions: dict = {}
|
|
|
|
def hash_password(password: str) -> str:
    """Return the hex-encoded SHA256 digest of ``password``."""
    digest = hashlib.sha256()
    digest.update(password.encode())
    return digest.hexdigest()
|
|
|
|
def verify_admin_password(password: str) -> bool:
    """Check ``password`` against the configured admin credentials.

    When ``ADMIN_PASSWORD_HASH`` is set, the SHA256 hex digest of ``password``
    is compared with it; otherwise ``password`` is compared with the
    plain-text ``ADMIN_PASSWORD``.

    Args:
        password: The password supplied by the caller.

    Returns:
        True when the password matches, False otherwise.
    """
    if ADMIN_PASSWORD_HASH:
        candidate, expected = hash_password(password), ADMIN_PASSWORD_HASH
    else:
        candidate, expected = password, ADMIN_PASSWORD

    # compare_digest takes time independent of where the inputs first differ,
    # which a plain ``==`` does not guarantee. Both sides are encoded to bytes
    # because compare_digest rejects non-ASCII ``str`` arguments.
    return secrets.compare_digest(candidate.encode("utf-8"), expected.encode("utf-8"))
|
|
|
|
def create_admin_token() -> str:
    """Create, register and return a new admin session token.

    The token is stored in ``admin_sessions`` with an expiry 24 hours from
    now. Sessions that have already expired are purged first, because
    otherwise an expired entry is only removed when that same token happens
    to be presented again.

    Returns:
        The newly generated URL-safe token.
    """
    now = time.time()

    # Collect the stale keys first; deleting while iterating a dict is an error.
    stale_tokens = [tok for tok, expires_at in admin_sessions.items() if now > expires_at]
    for tok in stale_tokens:
        del admin_sessions[tok]

    token = secrets.token_urlsafe(32)
    # Token expires in 24 hours
    admin_sessions[token] = now + (24 * 60 * 60)
    return token
|
|
|
|
def verify_admin_token(token: str) -> bool:
    """Report whether ``token`` is a known admin session that has not expired.

    An expired token is removed from ``admin_sessions`` as a side effect.
    """
    expires_at = admin_sessions.get(token)
    if expires_at is None:
        # Unknown token.
        return False

    if time.time() <= expires_at:
        return True

    # Past its expiry: forget the session and reject.
    del admin_sessions[token]
    return False
|
|
|
|
async def require_admin(authorization: Optional[str] = Header(None)) -> bool:
    """FastAPI dependency that only lets authenticated admins through.

    Expects an ``Authorization: Bearer <token>`` header. Returns True on
    success and raises a 401 ``HTTPException`` when the header is missing,
    malformed, or carries an unknown/expired token.
    """
    if not authorization:
        raise HTTPException(status_code=401, detail="Authorization header required")

    # Expect exactly two space-separated pieces: the scheme and the token.
    pieces = authorization.split(" ")
    well_formed = len(pieces) == 2 and pieces[0].lower() == "bearer"
    if not well_formed:
        raise HTTPException(status_code=401, detail="Invalid authorization format. Use: Bearer <token>")

    if verify_admin_token(pieces[1]):
        return True

    raise HTTPException(status_code=401, detail="Invalid or expired token")
|
|
|
|
# Initialize SaaS components
def _env_int(name: str, default: str) -> int:
    """Read environment variable ``name`` as an int, using ``default`` when unset."""
    return int(os.getenv(name, default))


# Rate limits are tunable through the environment.
rate_limit_config = RateLimitConfig(
    requests_per_minute=_env_int("RATE_LIMIT_PER_MINUTE", "30"),
    requests_per_hour=_env_int("RATE_LIMIT_PER_HOUR", "200"),
    translations_per_minute=_env_int("TRANSLATIONS_PER_MINUTE", "10"),
    translations_per_hour=_env_int("TRANSLATIONS_PER_HOUR", "50"),
    max_concurrent_translations=_env_int("MAX_CONCURRENT_TRANSLATIONS", "5"),
)
rate_limit_manager = RateLimitManager(rate_limit_config)

# File cleanup, memory monitoring and the health checker built on top of both.
cleanup_manager = create_cleanup_manager(config)
_max_memory_percent = float(os.getenv("MAX_MEMORY_PERCENT", "80"))
memory_monitor = MemoryMonitor(max_memory_percent=_max_memory_percent)
health_checker = HealthChecker(cleanup_manager, memory_monitor)

# Upload validation uses the size and extension limits from the app config.
file_validator = FileValidator(
    max_size_mb=config.MAX_FILE_SIZE_MB,
    allowed_extensions=config.SUPPORTED_EXTENSIONS,
)
|
|
|
|
|
|
def build_full_prompt(system_prompt: str, glossary: str) -> str:
    """Merge a system prompt and a glossary into one prompt for LLM translation.

    Each input is stripped; blank or missing inputs are left out. The glossary
    is wrapped in a fixed instruction block. The sections are joined by a
    blank line, and an empty string is returned when neither is provided.
    """
    prompt_text = system_prompt.strip() if system_prompt else ""
    glossary_text = glossary.strip() if glossary else ""

    glossary_template = (
        "\n"
        "TECHNICAL GLOSSARY - Use these exact translations for the following terms:\n"
        "{}\n"
        "\n"
        "Always use the translations from this glossary when you encounter these terms."
    )

    sections = []
    if prompt_text:
        sections.append(prompt_text)
    if glossary_text:
        sections.append(glossary_template.format(glossary_text))

    return "\n\n".join(sections)
|
|
|
|
|
|
# Lifespan context manager for startup/shutdown
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup work before the app serves requests and shutdown work after.

    Startup creates the configured directories and starts the cleanup manager.
    Shutdown stops the cleanup manager. The shutdown steps sit in a ``finally``
    block so they also run when an exception or cancellation is delivered at
    the ``yield`` instead of a normal resume.
    """
    # Startup
    logger.info("Starting Document Translation API...")
    config.ensure_directories()
    await cleanup_manager.start()
    logger.info("API ready to accept requests")

    try:
        yield
    finally:
        # Shutdown
        logger.info("Shutting down...")
        await cleanup_manager.stop()
        logger.info("Cleanup completed")
|
|
|
|
|
|
# Create FastAPI app with lifespan
app = FastAPI(
    title=config.API_TITLE,
    version=config.API_VERSION,
    description=config.API_DESCRIPTION,
    lifespan=lifespan
)

# Add middleware. Order matters: each add_middleware() call wraps everything
# added before it, so the LAST middleware added is the outermost one and sees
# the request first.
app.add_middleware(ErrorHandlingMiddleware)
app.add_middleware(RequestLoggingMiddleware, log_body=False)
app.add_middleware(SecurityHeadersMiddleware, config={"enable_hsts": os.getenv("ENABLE_HSTS", "false").lower() == "true"})
app.add_middleware(RateLimitMiddleware, rate_limit_manager=rate_limit_manager)

# CORS - configure for production.
# CORS_ORIGINS is a comma-separated list. Entries are stripped and empty ones
# dropped, so "https://a.example, https://b.example" matches both origins
# (an un-stripped " https://b.example" would never match a real Origin header).
allowed_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ORIGINS", "*").split(",")
    if origin.strip()
]
# NOTE(review): the default pairs a wildcard origin with allow_credentials=True.
# Set CORS_ORIGINS to explicit origins in production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=allowed_origins,
    allow_credentials=True,
    allow_methods=["GET", "POST", "DELETE", "OPTIONS"],
    allow_headers=["*"],
    expose_headers=["X-Request-ID", "X-Original-Filename", "X-File-Size-MB", "X-Target-Language"]
)

# Mount static files (only when a "static" directory sits next to this module)
static_dir = Path(__file__).parent / "static"
if static_dir.exists():
    app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")

# Include auth routes
app.include_router(auth_router)
|
|
|
|
|
|
# Custom exception handler for ValidationError
@app.exception_handler(ValidationError)
async def validation_error_handler(request: Request, exc: ValidationError):
    """Turn a ``ValidationError`` into a 400 JSON response.

    The body carries the error's ``code``, ``message`` and ``details``.
    """
    payload = {
        "error": exc.code,
        "message": exc.message,
        "details": exc.details,
    }
    return JSONResponse(status_code=400, content=payload)
|
|
|
|
|
|
@app.get("/")
|
|
async def root():
|
|
"""Root endpoint with API information"""
|
|
return {
|
|
"name": config.API_TITLE,
|
|
"version": config.API_VERSION,
|
|
"status": "operational",
|
|
"supported_formats": list(config.SUPPORTED_EXTENSIONS),
|
|
"endpoints": {
|
|
"translate": "/translate",
|
|
"health": "/health",
|
|
"supported_languages": "/languages"
|
|
}
|
|
}
|
|
|
|
|
|
@app.get("/health")
|
|
async def health_check():
|
|
"""Health check endpoint with detailed system status"""
|
|
health_status = await health_checker.check_health()
|
|
status_code = 200 if health_status.get("status") == "healthy" else 503
|
|
|
|
return JSONResponse(
|
|
status_code=status_code,
|
|
content={
|
|
"status": health_status.get("status", "unknown"),
|
|
"translation_service": config.TRANSLATION_SERVICE,
|
|
"memory": health_status.get("memory", {}),
|
|
"disk": health_status.get("disk", {}),
|
|
"cleanup_service": health_status.get("cleanup_service", {}),
|
|
"rate_limits": {
|
|
"requests_per_minute": rate_limit_config.requests_per_minute,
|
|
"translations_per_minute": rate_limit_config.translations_per_minute,
|
|
},
|
|
"translation_cache": _translation_cache.stats()
|
|
}
|
|
)
|
|
|
|
|
|
@app.get("/languages")
|
|
async def get_supported_languages():
|
|
"""Get list of supported language codes"""
|
|
return {
|
|
"supported_languages": {
|
|
"es": "Spanish",
|
|
"fr": "French",
|
|
"de": "German",
|
|
"it": "Italian",
|
|
"pt": "Portuguese",
|
|
"ru": "Russian",
|
|
"zh": "Chinese (Simplified)",
|
|
"ja": "Japanese",
|
|
"ko": "Korean",
|
|
"ar": "Arabic",
|
|
"hi": "Hindi",
|
|
"nl": "Dutch",
|
|
"pl": "Polish",
|
|
"tr": "Turkish",
|
|
"sv": "Swedish",
|
|
"da": "Danish",
|
|
"no": "Norwegian",
|
|
"fi": "Finnish",
|
|
"cs": "Czech",
|
|
"el": "Greek",
|
|
"th": "Thai",
|
|
"vi": "Vietnamese",
|
|
"id": "Indonesian",
|
|
"uk": "Ukrainian",
|
|
"ro": "Romanian",
|
|
"hu": "Hungarian"
|
|
},
|
|
"note": "Supported languages may vary depending on the translation service configured"
|
|
}
|
|
|
|
|
|
@app.post("/translate")
|
|
async def translate_document(
|
|
request: Request,
|
|
file: UploadFile = File(..., description="Document file to translate (.xlsx, .docx, or .pptx)"),
|
|
target_language: str = Form(..., description="Target language code (e.g., 'es', 'fr', 'de')"),
|
|
source_language: str = Form(default="auto", description="Source language code (default: auto-detect)"),
|
|
provider: str = Form(default="openrouter", description="Translation provider (openrouter, google, ollama, deepl, libre, openai)"),
|
|
translate_images: bool = Form(default=False, description="Translate images with multimodal Ollama/OpenAI model"),
|
|
ollama_model: str = Form(default="", description="Ollama model to use (also used for vision if multimodal)"),
|
|
system_prompt: str = Form(default="", description="Custom system prompt with context or instructions for LLM translation"),
|
|
glossary: str = Form(default="", description="Technical glossary (format: source=target, one per line)"),
|
|
libre_url: str = Form(default="https://libretranslate.com", description="LibreTranslate server URL"),
|
|
openai_api_key: str = Form(default="", description="OpenAI API key"),
|
|
openai_model: str = Form(default="gpt-4o-mini", description="OpenAI model to use (gpt-4o-mini is cheapest with vision)"),
|
|
openrouter_api_key: str = Form(default="", description="OpenRouter API key"),
|
|
openrouter_model: str = Form(default="deepseek/deepseek-chat", description="OpenRouter model (deepseek/deepseek-chat is best value)"),
|
|
cleanup: bool = Form(default=True, description="Delete input file after translation")
|
|
):
|
|
"""
|
|
Translate a document while preserving all formatting, layout, and embedded media
|
|
|
|
**Supported File Types:**
|
|
- Excel (.xlsx) - Preserves formulas, merged cells, styling, and images
|
|
- Word (.docx) - Preserves headings, tables, images, headers/footers
|
|
- PowerPoint (.pptx) - Preserves layouts, animations, and media
|
|
|
|
**Parameters:**
|
|
- **file**: The document file to translate
|
|
- **target_language**: Target language code (e.g., 'es' for Spanish, 'fr' for French)
|
|
- **source_language**: Source language code (optional, default: auto-detect)
|
|
- **cleanup**: Whether to delete the uploaded file after translation (default: True)
|
|
|
|
**Returns:**
|
|
- Translated document file with preserved formatting
|
|
"""
|
|
input_path = None
|
|
output_path = None
|
|
request_id = getattr(request.state, 'request_id', 'unknown')
|
|
|
|
try:
|
|
# Validate inputs
|
|
sanitized_language = InputSanitizer.sanitize_language_code(target_language)
|
|
LanguageValidator.validate(sanitized_language)
|
|
ProviderValidator.validate(provider)
|
|
|
|
# Validate file before processing
|
|
validation_result = await file_validator.validate_async(file)
|
|
if not validation_result.is_valid:
|
|
raise ValidationError(
|
|
message=f"File validation failed: {'; '.join(validation_result.errors)}",
|
|
code="INVALID_FILE",
|
|
details={"errors": validation_result.errors, "warnings": validation_result.warnings}
|
|
)
|
|
|
|
# Log any warnings
|
|
if validation_result.warnings:
|
|
logger.warning(f"[{request_id}] File validation warnings: {validation_result.warnings}")
|
|
|
|
# Reset file position after validation read
|
|
await file.seek(0)
|
|
|
|
# Check rate limit for translations
|
|
client_ip = request.client.host if request.client else "unknown"
|
|
if not await rate_limit_manager.check_translation_limit(client_ip):
|
|
raise HTTPException(
|
|
status_code=429,
|
|
detail="Translation rate limit exceeded. Please try again later."
|
|
)
|
|
|
|
# Validate file extension
|
|
file_extension = file_handler.validate_file_extension(file.filename)
|
|
logger.info(f"[{request_id}] Processing {file_extension} file: {file.filename}")
|
|
|
|
# Validate file size
|
|
file_handler.validate_file_size(file)
|
|
|
|
# Generate unique filenames
|
|
input_filename = file_handler.generate_unique_filename(file.filename, "input")
|
|
output_filename = file_handler.generate_unique_filename(file.filename, "translated")
|
|
|
|
# Save uploaded file
|
|
input_path = config.UPLOAD_DIR / input_filename
|
|
output_path = config.OUTPUT_DIR / output_filename
|
|
|
|
await file_handler.save_upload_file(file, input_path)
|
|
logger.info(f"[{request_id}] Saved input file to: {input_path}")
|
|
|
|
# Track file for cleanup
|
|
await cleanup_manager.track_file(input_path, ttl_minutes=30)
|
|
await cleanup_manager.track_file(output_path, ttl_minutes=60)
|
|
|
|
# Configure translation provider
|
|
from services.translation_service import GoogleTranslationProvider, DeepLTranslationProvider, LibreTranslationProvider, OllamaTranslationProvider, OpenAITranslationProvider, OpenRouterTranslationProvider, translation_service
|
|
|
|
if provider.lower() == "openrouter":
|
|
api_key = openrouter_api_key.strip() if openrouter_api_key else os.getenv("OPENROUTER_API_KEY", "")
|
|
if not api_key:
|
|
raise HTTPException(status_code=400, detail="OpenRouter API key not provided. Get one at https://openrouter.ai/keys")
|
|
model_to_use = openrouter_model.strip() if openrouter_model else "deepseek/deepseek-chat"
|
|
custom_prompt = build_full_prompt(system_prompt, glossary)
|
|
logger.info(f"Using OpenRouter model: {model_to_use}")
|
|
if custom_prompt:
|
|
logger.info(f"Custom system prompt provided ({len(custom_prompt)} chars)")
|
|
translation_provider = OpenRouterTranslationProvider(api_key, model_to_use, custom_prompt)
|
|
elif provider.lower() == "deepl":
|
|
if not config.DEEPL_API_KEY:
|
|
raise HTTPException(status_code=400, detail="DeepL API key not configured")
|
|
translation_provider = DeepLTranslationProvider(config.DEEPL_API_KEY)
|
|
elif provider.lower() == "libre":
|
|
libre_server = libre_url.strip() if libre_url else "https://libretranslate.com"
|
|
logger.info(f"Using LibreTranslate server: {libre_server}")
|
|
translation_provider = LibreTranslationProvider(libre_server)
|
|
elif provider.lower() == "openai":
|
|
api_key = openai_api_key.strip() if openai_api_key else ""
|
|
if not api_key:
|
|
raise HTTPException(status_code=400, detail="OpenAI API key not provided")
|
|
model_to_use = openai_model.strip() if openai_model else "gpt-4o-mini"
|
|
# Combine system prompt and glossary
|
|
custom_prompt = build_full_prompt(system_prompt, glossary)
|
|
logger.info(f"Using OpenAI model: {model_to_use}")
|
|
if custom_prompt:
|
|
logger.info(f"Custom system prompt provided ({len(custom_prompt)} chars)")
|
|
translation_provider = OpenAITranslationProvider(api_key, model_to_use, custom_prompt)
|
|
elif provider.lower() == "ollama":
|
|
# Use the same model for text and vision (multimodal models like gemma3, qwen3-vl)
|
|
model_to_use = ollama_model.strip() if ollama_model else config.OLLAMA_MODEL
|
|
# Combine system prompt and glossary
|
|
custom_prompt = build_full_prompt(system_prompt, glossary)
|
|
logger.info(f"Using Ollama model: {model_to_use} (text + vision)")
|
|
if custom_prompt:
|
|
logger.info(f"Custom system prompt provided ({len(custom_prompt)} chars)")
|
|
translation_provider = OllamaTranslationProvider(config.OLLAMA_BASE_URL, model_to_use, model_to_use, custom_prompt)
|
|
elif provider.lower() == "google":
|
|
translation_provider = GoogleTranslationProvider()
|
|
else:
|
|
# Default to OpenRouter with DeepSeek (best value)
|
|
api_key = openrouter_api_key.strip() if openrouter_api_key else os.getenv("OPENROUTER_API_KEY", "")
|
|
if api_key:
|
|
translation_provider = OpenRouterTranslationProvider(api_key, "deepseek/deepseek-chat", build_full_prompt(system_prompt, glossary))
|
|
else:
|
|
translation_provider = GoogleTranslationProvider()
|
|
|
|
# Update the global translation service
|
|
translation_service.provider = translation_provider
|
|
|
|
# Store translate_images flag for translators to access
|
|
translation_service.translate_images = translate_images
|
|
|
|
# Translate based on file type
|
|
if file_extension == ".xlsx":
|
|
logger.info("Translating Excel file...")
|
|
excel_translator.translate_file(input_path, output_path, target_language)
|
|
elif file_extension == ".docx":
|
|
logger.info("Translating Word document...")
|
|
word_translator.translate_file(input_path, output_path, target_language)
|
|
elif file_extension == ".pptx":
|
|
logger.info("Translating PowerPoint presentation...")
|
|
pptx_translator.translate_file(input_path, output_path, target_language)
|
|
else:
|
|
raise DocumentProcessingError(f"Unsupported file type: {file_extension}")
|
|
|
|
logger.info(f"Translation completed: {output_path}")
|
|
|
|
# Get file info
|
|
output_info = file_handler.get_file_info(output_path)
|
|
|
|
# Cleanup input file if requested
|
|
if cleanup and input_path:
|
|
file_handler.cleanup_file(input_path)
|
|
logger.info(f"Cleaned up input file: {input_path}")
|
|
|
|
# Return the translated file
|
|
return FileResponse(
|
|
path=output_path,
|
|
filename=f"translated_{file.filename}",
|
|
media_type="application/octet-stream",
|
|
headers={
|
|
"X-Original-Filename": file.filename,
|
|
"X-File-Size-MB": str(output_info.get("size_mb", 0)),
|
|
"X-Target-Language": target_language
|
|
}
|
|
)
|
|
|
|
except HTTPException:
|
|
# Re-raise HTTP exceptions
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Translation error: {str(e)}", exc_info=True)
|
|
|
|
# Cleanup files on error
|
|
if input_path:
|
|
file_handler.cleanup_file(input_path)
|
|
if output_path:
|
|
file_handler.cleanup_file(output_path)
|
|
|
|
raise handle_translation_error(e)
|
|
|
|
|
|
@app.delete("/cleanup/{filename}")
|
|
async def cleanup_translated_file(filename: str):
|
|
"""
|
|
Cleanup a translated file after download
|
|
|
|
**Parameters:**
|
|
- **filename**: Name of the file to delete from the outputs directory
|
|
"""
|
|
try:
|
|
file_path = config.OUTPUT_DIR / filename
|
|
|
|
if not file_path.exists():
|
|
raise HTTPException(status_code=404, detail="File not found")
|
|
|
|
file_handler.cleanup_file(file_path)
|
|
|
|
return {"message": f"File {filename} deleted successfully"}
|
|
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Cleanup error: {str(e)}")
|
|
raise HTTPException(status_code=500, detail="Error cleaning up file")
|
|
|
|
|
|
@app.post("/translate-batch")
|
|
async def translate_batch_documents(
|
|
files: list[UploadFile] = File(..., description="Multiple document files to translate"),
|
|
target_language: str = Form(..., description="Target language code"),
|
|
source_language: str = Form(default="auto", description="Source language code")
|
|
):
|
|
"""
|
|
Translate multiple documents in batch
|
|
|
|
**Note:** This endpoint processes files sequentially. For large batches, consider
|
|
calling the single file endpoint multiple times with concurrent requests.
|
|
"""
|
|
results = []
|
|
|
|
for file in files:
|
|
try:
|
|
# Process each file using the same logic as single file translation
|
|
file_extension = file_handler.validate_file_extension(file.filename)
|
|
file_handler.validate_file_size(file)
|
|
|
|
input_filename = file_handler.generate_unique_filename(file.filename, "input")
|
|
output_filename = file_handler.generate_unique_filename(file.filename, "translated")
|
|
|
|
input_path = config.UPLOAD_DIR / input_filename
|
|
output_path = config.OUTPUT_DIR / output_filename
|
|
|
|
await file_handler.save_upload_file(file, input_path)
|
|
|
|
# Translate based on file type
|
|
if file_extension == ".xlsx":
|
|
excel_translator.translate_file(input_path, output_path, target_language)
|
|
elif file_extension == ".docx":
|
|
word_translator.translate_file(input_path, output_path, target_language)
|
|
elif file_extension == ".pptx":
|
|
pptx_translator.translate_file(input_path, output_path, target_language)
|
|
|
|
# Cleanup input file
|
|
file_handler.cleanup_file(input_path)
|
|
|
|
results.append({
|
|
"filename": file.filename,
|
|
"status": "success",
|
|
"output_file": output_filename,
|
|
"download_url": f"/download/{output_filename}"
|
|
})
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing {file.filename}: {str(e)}")
|
|
results.append({
|
|
"filename": file.filename,
|
|
"status": "error",
|
|
"error": str(e)
|
|
})
|
|
|
|
return {
|
|
"total_files": len(files),
|
|
"successful": len([r for r in results if r["status"] == "success"]),
|
|
"failed": len([r for r in results if r["status"] == "error"]),
|
|
"results": results
|
|
}
|
|
|
|
|
|
@app.get("/download/{filename}")
|
|
async def download_file(filename: str):
|
|
"""
|
|
Download a translated file by filename
|
|
|
|
**Parameters:**
|
|
- **filename**: Name of the file to download from the outputs directory
|
|
"""
|
|
file_path = config.OUTPUT_DIR / filename
|
|
|
|
if not file_path.exists():
|
|
raise HTTPException(status_code=404, detail="File not found")
|
|
|
|
return FileResponse(
|
|
path=file_path,
|
|
filename=filename,
|
|
media_type="application/octet-stream"
|
|
)
|
|
|
|
|
|
@app.get("/ollama/models")
|
|
async def list_ollama_models(base_url: Optional[str] = None):
|
|
"""
|
|
List available Ollama models
|
|
|
|
**Parameters:**
|
|
- **base_url**: Ollama server URL (default: from config)
|
|
"""
|
|
from services.translation_service import OllamaTranslationProvider
|
|
|
|
url = base_url or config.OLLAMA_BASE_URL
|
|
models = OllamaTranslationProvider.list_models(url)
|
|
|
|
return {
|
|
"ollama_url": url,
|
|
"models": models,
|
|
"count": len(models)
|
|
}
|
|
|
|
|
|
@app.post("/ollama/configure")
|
|
async def configure_ollama(base_url: str = Form(...), model: str = Form(...)):
|
|
"""
|
|
Configure Ollama settings
|
|
|
|
**Parameters:**
|
|
- **base_url**: Ollama server URL (e.g., http://localhost:11434)
|
|
- **model**: Model name to use for translation (e.g., llama3, mistral)
|
|
"""
|
|
config.OLLAMA_BASE_URL = base_url
|
|
config.OLLAMA_MODEL = model
|
|
|
|
return {
|
|
"status": "success",
|
|
"message": "Ollama configuration updated",
|
|
"ollama_url": base_url,
|
|
"model": model
|
|
}
|
|
|
|
|
|
@app.post("/extract-texts")
|
|
async def extract_texts_from_document(
|
|
file: UploadFile = File(..., description="Document file to extract texts from"),
|
|
):
|
|
"""
|
|
Extract all translatable texts from a document for client-side translation (WebLLM).
|
|
Returns a list of texts and a session ID to use for reconstruction.
|
|
|
|
**Parameters:**
|
|
- **file**: The document file to extract texts from
|
|
|
|
**Returns:**
|
|
- session_id: Unique ID to reference this extraction
|
|
- texts: Array of texts to translate
|
|
- file_type: Type of the document
|
|
"""
|
|
import uuid
|
|
import json
|
|
|
|
try:
|
|
# Validate file extension
|
|
file_extension = file_handler.validate_file_extension(file.filename)
|
|
logger.info(f"Extracting texts from {file_extension} file: {file.filename}")
|
|
|
|
# Validate file size
|
|
file_handler.validate_file_size(file)
|
|
|
|
# Generate session ID
|
|
session_id = str(uuid.uuid4())
|
|
|
|
# Save uploaded file
|
|
input_filename = f"session_{session_id}{file_extension}"
|
|
input_path = config.UPLOAD_DIR / input_filename
|
|
await file_handler.save_upload_file(file, input_path)
|
|
|
|
# Extract texts based on file type
|
|
texts = []
|
|
|
|
if file_extension == ".xlsx":
|
|
from openpyxl import load_workbook
|
|
wb = load_workbook(input_path)
|
|
for sheet in wb.worksheets:
|
|
for row in sheet.iter_rows():
|
|
for cell in row:
|
|
if cell.value and isinstance(cell.value, str) and cell.value.strip():
|
|
texts.append({
|
|
"id": f"{sheet.title}!{cell.coordinate}",
|
|
"text": cell.value
|
|
})
|
|
wb.close()
|
|
elif file_extension == ".docx":
|
|
from docx import Document
|
|
doc = Document(input_path)
|
|
para_idx = 0
|
|
for para in doc.paragraphs:
|
|
if para.text.strip():
|
|
texts.append({
|
|
"id": f"para_{para_idx}",
|
|
"text": para.text
|
|
})
|
|
para_idx += 1
|
|
# Also extract from tables
|
|
table_idx = 0
|
|
for table in doc.tables:
|
|
for row_idx, row in enumerate(table.rows):
|
|
for cell_idx, cell in enumerate(row.cells):
|
|
if cell.text.strip():
|
|
texts.append({
|
|
"id": f"table_{table_idx}_r{row_idx}_c{cell_idx}",
|
|
"text": cell.text
|
|
})
|
|
table_idx += 1
|
|
elif file_extension == ".pptx":
|
|
from pptx import Presentation
|
|
prs = Presentation(input_path)
|
|
for slide_idx, slide in enumerate(prs.slides):
|
|
for shape_idx, shape in enumerate(slide.shapes):
|
|
if shape.has_text_frame:
|
|
for para_idx, para in enumerate(shape.text_frame.paragraphs):
|
|
for run_idx, run in enumerate(para.runs):
|
|
if run.text.strip():
|
|
texts.append({
|
|
"id": f"slide_{slide_idx}_shape_{shape_idx}_para_{para_idx}_run_{run_idx}",
|
|
"text": run.text
|
|
})
|
|
|
|
# Save session metadata
|
|
session_data = {
|
|
"original_filename": file.filename,
|
|
"file_extension": file_extension,
|
|
"input_path": str(input_path),
|
|
"text_count": len(texts)
|
|
}
|
|
session_file = config.UPLOAD_DIR / f"session_{session_id}.json"
|
|
with open(session_file, "w", encoding="utf-8") as f:
|
|
json.dump(session_data, f)
|
|
|
|
logger.info(f"Extracted {len(texts)} texts from {file.filename}, session: {session_id}")
|
|
|
|
return {
|
|
"session_id": session_id,
|
|
"texts": texts,
|
|
"file_type": file_extension,
|
|
"text_count": len(texts)
|
|
}
|
|
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Text extraction error: {str(e)}", exc_info=True)
|
|
raise HTTPException(status_code=500, detail=f"Failed to extract texts: {str(e)}")
|
|
|
|
|
|
@app.post("/reconstruct-document")
|
|
async def reconstruct_document(
|
|
session_id: str = Form(..., description="Session ID from extract-texts"),
|
|
translations: str = Form(..., description="JSON array of {id, translated_text} objects"),
|
|
target_language: str = Form(..., description="Target language code"),
|
|
):
|
|
"""
|
|
Reconstruct a document with translated texts.
|
|
|
|
**Parameters:**
|
|
- **session_id**: The session ID from extract-texts
|
|
- **translations**: JSON array of translations with matching IDs
|
|
- **target_language**: Target language for filename
|
|
|
|
**Returns:**
|
|
- Translated document file
|
|
"""
|
|
import json
|
|
|
|
try:
|
|
# Load session data
|
|
session_file = config.UPLOAD_DIR / f"session_{session_id}.json"
|
|
if not session_file.exists():
|
|
raise HTTPException(status_code=404, detail="Session not found or expired")
|
|
|
|
with open(session_file, "r", encoding="utf-8") as f:
|
|
session_data = json.load(f)
|
|
|
|
input_path = Path(session_data["input_path"])
|
|
file_extension = session_data["file_extension"]
|
|
original_filename = session_data["original_filename"]
|
|
|
|
if not input_path.exists():
|
|
raise HTTPException(status_code=404, detail="Source file not found or expired")
|
|
|
|
# Parse translations
|
|
translation_list = json.loads(translations)
|
|
translation_map = {t["id"]: t["translated_text"] for t in translation_list}
|
|
|
|
# Generate output path
|
|
output_filename = file_handler.generate_unique_filename(original_filename, "translated")
|
|
output_path = config.OUTPUT_DIR / output_filename
|
|
|
|
# Reconstruct based on file type
|
|
if file_extension == ".xlsx":
|
|
from openpyxl import load_workbook
|
|
import shutil
|
|
shutil.copy(input_path, output_path)
|
|
wb = load_workbook(output_path)
|
|
for sheet in wb.worksheets:
|
|
for row in sheet.iter_rows():
|
|
for cell in row:
|
|
cell_id = f"{sheet.title}!{cell.coordinate}"
|
|
if cell_id in translation_map:
|
|
cell.value = translation_map[cell_id]
|
|
wb.save(output_path)
|
|
wb.close()
|
|
|
|
elif file_extension == ".docx":
|
|
from docx import Document
|
|
import shutil
|
|
shutil.copy(input_path, output_path)
|
|
doc = Document(output_path)
|
|
para_idx = 0
|
|
for para in doc.paragraphs:
|
|
para_id = f"para_{para_idx}"
|
|
if para_id in translation_map and para.text.strip():
|
|
# Replace text while keeping formatting
|
|
for run in para.runs:
|
|
run.text = ""
|
|
if para.runs:
|
|
para.runs[0].text = translation_map[para_id]
|
|
else:
|
|
para.text = translation_map[para_id]
|
|
para_idx += 1
|
|
# Also handle tables
|
|
table_idx = 0
|
|
for table in doc.tables:
|
|
for row_idx, row in enumerate(table.rows):
|
|
for cell_idx, cell in enumerate(row.cells):
|
|
cell_id = f"table_{table_idx}_r{row_idx}_c{cell_idx}"
|
|
if cell_id in translation_map:
|
|
# Clear and set new text
|
|
for para in cell.paragraphs:
|
|
for run in para.runs:
|
|
run.text = ""
|
|
if cell.paragraphs and cell.paragraphs[0].runs:
|
|
cell.paragraphs[0].runs[0].text = translation_map[cell_id]
|
|
elif cell.paragraphs:
|
|
cell.paragraphs[0].text = translation_map[cell_id]
|
|
table_idx += 1
|
|
doc.save(output_path)
|
|
|
|
elif file_extension == ".pptx":
|
|
from pptx import Presentation
|
|
import shutil
|
|
shutil.copy(input_path, output_path)
|
|
prs = Presentation(output_path)
|
|
for slide_idx, slide in enumerate(prs.slides):
|
|
for shape_idx, shape in enumerate(slide.shapes):
|
|
if shape.has_text_frame:
|
|
for para_idx, para in enumerate(shape.text_frame.paragraphs):
|
|
for run_idx, run in enumerate(para.runs):
|
|
run_id = f"slide_{slide_idx}_shape_{shape_idx}_para_{para_idx}_run_{run_idx}"
|
|
if run_id in translation_map:
|
|
run.text = translation_map[run_id]
|
|
prs.save(output_path)
|
|
|
|
# Cleanup session files
|
|
file_handler.cleanup_file(input_path)
|
|
file_handler.cleanup_file(session_file)
|
|
|
|
logger.info(f"Reconstructed document: {output_path}")
|
|
|
|
return FileResponse(
|
|
path=output_path,
|
|
filename=f"translated_{original_filename}",
|
|
media_type="application/octet-stream"
|
|
)
|
|
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Reconstruction error: {str(e)}", exc_info=True)
|
|
raise HTTPException(status_code=500, detail=f"Failed to reconstruct document: {str(e)}")
|
|
|
|
|
|
# ============== SaaS Management Endpoints ==============
|
|
|
|
@app.post("/admin/login")
|
|
async def admin_login(
|
|
username: str = Form(...),
|
|
password: str = Form(...)
|
|
):
|
|
"""
|
|
Admin login endpoint
|
|
Returns a bearer token for authenticated admin access
|
|
"""
|
|
if username != ADMIN_USERNAME:
|
|
logger.warning(f"Failed admin login attempt with username: {username}")
|
|
raise HTTPException(status_code=401, detail="Invalid credentials")
|
|
|
|
if not verify_admin_password(password):
|
|
logger.warning(f"Failed admin login attempt - wrong password")
|
|
raise HTTPException(status_code=401, detail="Invalid credentials")
|
|
|
|
token = create_admin_token()
|
|
logger.info(f"Admin login successful")
|
|
|
|
return {
|
|
"status": "success",
|
|
"token": token,
|
|
"expires_in": 86400, # 24 hours in seconds
|
|
"message": "Login successful"
|
|
}
|
|
|
|
|
|
@app.post("/admin/logout")
async def admin_logout(authorization: Optional[str] = Header(None)):
    """Logout and invalidate admin token

    The Authorization header is expected in the form "Bearer <token>".
    A missing or malformed header, or an unknown token, is ignored: the
    endpoint always answers with the same success payload.
    """
    logged_out = {"status": "success", "message": "Logged out"}

    if not authorization:
        return logged_out

    # Exactly two space-separated fields are required: scheme and token.
    fields = authorization.split(" ")
    if len(fields) != 2:
        return logged_out

    scheme, session_token = fields
    if scheme.lower() != "bearer":
        return logged_out

    if session_token in admin_sessions:
        del admin_sessions[session_token]
        logger.info("Admin logout successful")

    return logged_out
|
|
|
|
|
|
@app.get("/admin/verify")
async def verify_admin_session(is_admin: bool = Depends(require_admin)):
    """Verify admin token is still valid

    The check itself is delegated to the ``require_admin`` dependency; this
    body only runs once that dependency has resolved.
    """
    verification = {}
    verification["status"] = "valid"
    verification["authenticated"] = True
    return verification
|
|
|
|
|
|
@app.get("/admin/dashboard")
async def get_admin_dashboard(is_admin: bool = Depends(require_admin)):
    """Get comprehensive admin dashboard data

    Combines the health check result, cleanup statistics, rate limit
    statistics and selected configuration values into one response.
    Access is gated by the ``require_admin`` dependency.
    """
    health = await health_checker.check_health()
    cleanup_snapshot = cleanup_manager.get_stats()
    limits_snapshot = rate_limit_manager.get_stats()
    tracked = cleanup_manager.get_tracked_files()

    system_section = {
        "memory": health.get("memory", {}),
        "disk": health.get("disk", {}),
    }

    # The tracked file count is appended to (and would override a same-named
    # key in) the cleanup statistics.
    cleanup_section = {**cleanup_snapshot, "tracked_files_count": len(tracked)}

    config_section = {
        "max_file_size_mb": config.MAX_FILE_SIZE_MB,
        "supported_extensions": list(config.SUPPORTED_EXTENSIONS),
        "translation_service": config.TRANSLATION_SERVICE,
        "rate_limit_per_minute": rate_limit_config.requests_per_minute,
        "translations_per_minute": rate_limit_config.translations_per_minute,
    }

    dashboard = {
        "timestamp": health.get("timestamp"),
        "uptime": health.get("uptime_human"),
        "status": health.get("status"),
        "issues": health.get("issues", []),
        "system": system_section,
        "translations": health.get("translations", {}),
        "cleanup": cleanup_section,
        "rate_limits": limits_snapshot,
        "config": config_section,
    }
    return dashboard
|
|
|
|
|
|
@app.get("/metrics")
async def get_metrics():
    """Get system metrics and statistics for monitoring

    Returns the memory, disk and status fields of the health check, plus the
    cleanup statistics, rate limit statistics and a few configuration values.
    """
    health = await health_checker.check_health()
    cleanup_snapshot = cleanup_manager.get_stats()
    limits_snapshot = rate_limit_manager.get_stats()

    system_section = {
        "memory": health.get("memory", {}),
        "disk": health.get("disk", {}),
        "status": health.get("status", "unknown"),
    }
    config_section = {
        "max_file_size_mb": config.MAX_FILE_SIZE_MB,
        "supported_extensions": list(config.SUPPORTED_EXTENSIONS),
        "translation_service": config.TRANSLATION_SERVICE,
    }

    metrics = {
        "system": system_section,
        "cleanup": cleanup_snapshot,
        "rate_limits": limits_snapshot,
        "config": config_section,
    }
    return metrics
|
|
|
|
|
|
@app.get("/rate-limit/status")
async def get_rate_limit_status(request: Request):
    """Get current rate limit status for the requesting client

    The client is identified by the host of ``request.client``; when no
    client information is attached to the request, "unknown" is used.
    """
    if request.client:
        client_ip = request.client.host
    else:
        client_ip = "unknown"

    current_usage = await rate_limit_manager.get_client_status(client_ip)

    configured_limits = {
        "requests_per_minute": rate_limit_config.requests_per_minute,
        "requests_per_hour": rate_limit_config.requests_per_hour,
        "translations_per_minute": rate_limit_config.translations_per_minute,
        "translations_per_hour": rate_limit_config.translations_per_hour,
    }

    return {
        "client_ip": client_ip,
        "limits": configured_limits,
        "current_usage": current_usage,
    }
|
|
|
|
|
|
@app.post("/admin/cleanup/trigger")
async def trigger_cleanup(is_admin: bool = Depends(require_admin)):
    """Trigger manual cleanup of expired files (requires admin auth)

    Returns:
        Dict with "status", "files_cleaned" (the value returned by
        ``cleanup_manager.cleanup_expired()``) and a "message".

    Raises:
        HTTPException: 500 when the cleanup raises an unexpected error; an
            HTTPException raised by the cleanup itself is passed through
            unchanged.
    """
    try:
        cleaned = await cleanup_manager.cleanup_expired()
    except HTTPException:
        # Do not rewrap deliberate HTTP errors as a generic 500.
        raise
    except Exception as e:
        # exc_info keeps the traceback in the log; "from e" keeps the cause
        # attached to the HTTP error for upstream handlers.
        logger.error(f"Manual cleanup failed: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Cleanup failed: {str(e)}") from e

    return {
        "status": "success",
        "files_cleaned": cleaned,
        "message": f"Cleaned up {cleaned} expired files"
    }
|
|
|
|
|
|
@app.get("/admin/files/tracked")
async def get_tracked_files(is_admin: bool = Depends(require_admin)):
    """Get list of currently tracked files (requires admin auth)

    Returns the tracked files reported by the cleanup manager together
    with their count.
    """
    tracked_files = cleanup_manager.get_tracked_files()
    file_count = len(tracked_files)
    return {"count": file_count, "files": tracked_files}
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: serve "main:app" with uvicorn on all interfaces,
    # port 8000, with auto-reload enabled.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 8000, "reload": True}
    uvicorn.run("main:app", **server_options)