Files
office_translator/routes/translate_routes.py
2026-03-07 11:42:58 +01:00

1315 lines
49 KiB
Python

"""
API v1 Translate Endpoint (Story 2.10, 2.11, 2.12, 3.6)
POST /api/v1/translate - Submit document for translation
GET /api/v1/translations/{id} - Get translation status with real-time progress
GET /api/v1/download/{id} - Download translated file
Story 3.6: Complete OpenAPI documentation with examples and error codes
"""
import os
import re
import uuid
import time
import socket
import asyncio
import ipaddress
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Any, Literal, Dict
from urllib.parse import urlparse, unquote
import aiofiles
JOB_ID_PATTERN = re.compile(r"^tr_[a-zA-Z0-9_\-]+$")
import httpx
from fastapi import (
APIRouter,
File,
Form,
Header,
HTTPException,
Request,
UploadFile,
Depends,
)
from fastapi.responses import JSONResponse, FileResponse
from starlette.background import BackgroundTask
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from pydantic import BaseModel, Field, field_validator
from typing_extensions import Annotated
from config import config
from models.subscription import PlanType
from middleware.tier_quota import tier_quota_service
from middleware.validation import FileValidator, ValidationError, LanguageValidator, webhook_validator
from middleware.api_key_auth import get_authenticated_user, get_user_from_api_key
from utils import file_handler
# Import models from schemas (Story 3.6 - DRY principle)
from schemas.translation import (
TranslateResponseData,
TranslateResponseMeta,
TranslateResponse,
TranslationStatusData,
TranslationStatusMeta,
TranslationStatusResponse,
)
from schemas.errors import ErrorResponse
from utils.file_handler import FileHandler
from services.progress_tracker import ProgressTracker
from services.storage_tracker import storage_tracker
from services.glossary_service import get_glossary_terms, validate_glossary_access, build_full_prompt
from services.prompt_service import get_prompt_content, validate_prompt_access
from utils.exceptions import GlossaryNotFoundError, PromptNotFoundError
logger = logging.getLogger(__name__)
router_v1 = APIRouter(prefix="/api/v1", tags=["Translation v1"])
security = HTTPBearer(auto_error=False)
MAX_FILE_SIZE_MB = 50
OFFICE_MAGIC_BYTES = b"PK\x03\x04"
ACCEPTED_EXTENSIONS = {".xlsx", ".docx", ".pptx"}
class TranslateEndpointError(Exception):
    """Error raised by the translate endpoint, carrying a machine-readable code.

    Each instance exposes ``code`` (one of the class-level constants or any
    caller-supplied string), a human-readable ``message`` and an optional
    ``details`` dict. When no message is supplied, the default text for the
    code is looked up in ``ERROR_MESSAGES``.
    """

    # Machine-readable error codes.
    INVALID_FORMAT = "INVALID_FORMAT"
    CORRUPTED_FILE = "CORRUPTED_FILE"
    FILE_TOO_LARGE = "FILE_TOO_LARGE"
    QUOTA_EXCEEDED = "QUOTA_EXCEEDED"
    URL_DOWNLOAD_FAILED = "URL_DOWNLOAD_FAILED"
    URL_UNREACHABLE = "URL_UNREACHABLE"
    UNAUTHORIZED = "UNAUTHORIZED"
    MISSING_FILE = "MISSING_FILE"
    PRO_FEATURE_REQUIRED = "PRO_FEATURE_REQUIRED"

    # Default user-facing message for each code.
    ERROR_MESSAGES = {
        INVALID_FORMAT: "Format de fichier non supporte. Formats acceptes : .xlsx, .docx, .pptx",
        CORRUPTED_FILE: "Le fichier semble corrompu ou n'est pas un document Office valide.",
        FILE_TOO_LARGE: f"Le fichier est trop volumineux (max {MAX_FILE_SIZE_MB} Mo).",
        QUOTA_EXCEEDED: "Limite quotidienne atteinte.",
        URL_DOWNLOAD_FAILED: "Impossible de telecharger le fichier depuis l'URL.",
        URL_UNREACHABLE: "URL inaccessible.",
        UNAUTHORIZED: "Authentification requise.",
        MISSING_FILE: "Fichier ou URL requis.",
        PRO_FEATURE_REQUIRED: "Cette fonctionnalite necessite un abonnement Pro.",
    }

    def __init__(
        self, code: str, message: Optional[str] = None, details: Optional[dict] = None
    ):
        """Store the code, resolve the message and normalise ``details`` to a dict."""
        fallback_message = self.ERROR_MESSAGES.get(code, "Erreur inconnue")
        self.code = code
        self.message = message if message else fallback_message
        self.details = details if details else {}
        super().__init__(self.message)

    def to_dict(self) -> dict:
        """Return the JSON-serialisable payload; ``details`` is omitted when empty."""
        payload = {"error": self.code, "message": self.message}
        if self.details:
            payload["details"] = self.details
        return payload
# NOTE: Response models are now imported from schemas/ module (DRY principle)
# TranslateResponseData, TranslateResponseMeta, TranslateResponse,
# TranslationStatusData, TranslationStatusMeta, TranslationStatusResponse, ErrorResponse
# Module-level validator for direct uploads, configured with the size cap and
# the extension allow-list declared above.
file_validator = FileValidator(
max_size_mb=MAX_FILE_SIZE_MB, allowed_extensions=ACCEPTED_EXTENSIONS
)
# Module-level file helper; used below to generate unique filenames, save
# uploads, compute SHA-256 hashes and clean up files.
file_handler_util = FileHandler()
def _tier_for_quota(plan) -> str:
    """Translate a subscription plan into the quota tier name.

    PRO, BUSINESS and ENTERPRISE plans map to ``"pro"``; anything else
    (including ``None``) maps to ``"free"``.
    """
    paid_plans = (PlanType.PRO, PlanType.BUSINESS, PlanType.ENTERPRISE)
    return "pro" if plan in paid_plans else "free"
def _next_midnight_utc() -> datetime:
"""Get next midnight UTC."""
now = datetime.now(timezone.utc)
from datetime import timedelta
tomorrow = now.date() + timedelta(days=1)
return datetime(tomorrow.year, tomorrow.month, tomorrow.day, tzinfo=timezone.utc)
def _seconds_until_midnight_utc() -> int:
    """Return the whole seconds left before the next UTC midnight (never negative)."""
    started_at = datetime.now(timezone.utc)
    remaining = (_next_midnight_utc() - started_at).total_seconds()
    return max(0, int(remaining))
async def validate_file_content(content: bytes, extension: str) -> None:
    """Check that ``content`` starts with the ZIP signature used by Office files.

    Args:
        content: At least the first four bytes of the file.
        extension: File extension of the document (not used by the check itself).

    Raises:
        TranslateEndpointError: ``CORRUPTED_FILE`` when fewer than four bytes
            are supplied or the first four bytes are not the expected signature.
    """
    signature_length = 4
    if len(content) < signature_length:
        raise TranslateEndpointError(
            code=TranslateEndpointError.CORRUPTED_FILE,
            message="Le fichier est trop petit pour etre un document Office valide.",
            details={"reason": "File is too small"},
        )

    if content[:signature_length] == OFFICE_MAGIC_BYTES:
        return

    raise TranslateEndpointError(
        code=TranslateEndpointError.CORRUPTED_FILE,
        message="Le fichier n'est pas un document Office valide ou est corrompu.",
        details={
            "accepted_formats": list(ACCEPTED_EXTENSIONS),
            "hint": "Les fichiers .xlsx, .docx, .pptx doivent etre des archives ZIP valides.",
        },
    )
def _parse_content_disposition(content_disp: str) -> Optional[str]:
"""Parse filename from Content-Disposition header (RFC 5987 compliant)."""
import re
for part in content_disp.split(";"):
part = part.strip()
if part.lower().startswith("filename*="):
match = re.match(r"filename\*=([^']+)'([^']*)'(.+)", part, re.IGNORECASE)
if match:
from urllib.parse import unquote
return unquote(match.group(3))
if part.lower().startswith("filename="):
filename = part.split("=", 1)[1].strip().strip('"').strip("'")
if filename:
return filename
return None
def _is_ssrf_risk(hostname: str) -> bool:
"""Return True if hostname resolves to a private/reserved IP (SSRF prevention).
Blocks: loopback, private, link-local, reserved, multicast ranges.
Also blocks DNS resolution failures to avoid bypass via non-resolvable names.
"""
try:
ip_str = socket.gethostbyname(hostname)
addr = ipaddress.ip_address(ip_str)
return (
addr.is_loopback
or addr.is_private
or addr.is_link_local
or addr.is_reserved
or addr.is_multicast
or addr.is_unspecified
)
except Exception:
return True
async def download_from_url(url: str, timeout: int = 30) -> tuple[Path, str]:
    """Stream a remote document to the upload directory.

    The body is written to disk chunk by chunk so the whole file is never held
    in memory. The download is rejected when the scheme is not HTTP(S), when
    any hop (initial URL or redirect) targets a non-public address, when the
    response is not HTTP 200, when the file is larger than
    ``MAX_FILE_SIZE_MB``, when the extension is not accepted, or when the file
    does not start with the Office/ZIP signature.

    The filename taken from Content-Disposition or from the URL path is
    untrusted: it is reduced to its final path component before being used to
    build the on-disk path, so it cannot escape the upload directory.

    Args:
        url: HTTP or HTTPS URL of the document.
        timeout: httpx timeout in seconds.

    Returns:
        ``(temp_path, filename)``: where the file was stored and the
        sanitised filename reported for it.

    Raises:
        TranslateEndpointError: ``URL_UNREACHABLE``, ``URL_DOWNLOAD_FAILED``,
            ``FILE_TOO_LARGE``, ``INVALID_FORMAT`` or ``CORRUPTED_FILE``.
            Any partially written file is removed before raising.
    """
    max_size_bytes = MAX_FILE_SIZE_MB * 1024 * 1024
    temp_path: Optional[Path] = None

    def _discard_partial_file() -> None:
        """Delete the partially written download, if one exists."""
        if temp_path and temp_path.exists():
            temp_path.unlink()

    def _ssrf_blocked() -> TranslateEndpointError:
        """Build the error returned for a forbidden target address."""
        return TranslateEndpointError(
            code=TranslateEndpointError.URL_UNREACHABLE,
            message="L'URL pointe vers une adresse interdite (adresse privee ou interne).",
            details={"reason": "ssrf_blocked"},
        )

    async def _reject_unsafe_host(request: httpx.Request) -> None:
        """Request hook: httpx calls it for every hop, redirects included."""
        host = request.url.host or ""
        # DNS resolution blocks, so keep it off the event loop.
        if not host or await asyncio.to_thread(_is_ssrf_risk, host):
            raise _ssrf_blocked()

    parsed_url = urlparse(url)
    if parsed_url.scheme not in ("http", "https"):
        raise TranslateEndpointError(
            code=TranslateEndpointError.URL_UNREACHABLE,
            message="Seules les URLs HTTP/HTTPS sont acceptees.",
            details={"scheme": parsed_url.scheme or "none"},
        )
    hostname = parsed_url.hostname or ""
    if not hostname or _is_ssrf_risk(hostname):
        raise _ssrf_blocked()

    try:
        async with httpx.AsyncClient(
            timeout=timeout,
            follow_redirects=True,
            max_redirects=10,
            # Without this hook only the first URL would be checked and a
            # redirect could point the download at an internal address.
            event_hooks={"request": [_reject_unsafe_host]},
        ) as client:
            async with client.stream("GET", url) as response:
                if response.status_code != 200:
                    raise TranslateEndpointError(
                        code=TranslateEndpointError.URL_UNREACHABLE,
                        message=f"URL inaccessible (HTTP {response.status_code})",
                        details={"status_code": response.status_code, "url": url[:100]},
                    )

                # Early rejection based on the declared size; the streamed
                # byte count below is the authoritative limit.
                content_length = response.headers.get("content-length")
                if content_length:
                    try:
                        declared_size: Optional[int] = int(content_length)
                    except ValueError:
                        declared_size = None
                    if declared_size is not None and declared_size > max_size_bytes:
                        declared_mb = round(declared_size / (1024 * 1024), 2)
                        raise TranslateEndpointError(
                            code=TranslateEndpointError.FILE_TOO_LARGE,
                            message=f"Le fichier est trop volumineux ({declared_mb} Mo, max {MAX_FILE_SIZE_MB} Mo).",
                            details={
                                "size_mb": declared_mb,
                                "max_mb": MAX_FILE_SIZE_MB,
                            },
                        )

                filename = None
                content_disp = response.headers.get("content-disposition", "")
                if content_disp:
                    filename = _parse_content_disposition(content_disp)
                if not filename:
                    filename = unquote(Path(parsed_url.path).name) or "downloaded_file"
                # Keep only the last path component ("../../x.docx" -> "x.docx").
                filename = Path(filename.replace("\\", "/")).name or "downloaded_file"

                extension = Path(filename).suffix.lower()
                if extension not in ACCEPTED_EXTENSIONS:
                    raise TranslateEndpointError(
                        code=TranslateEndpointError.INVALID_FORMAT,
                        details={
                            "detected_extension": extension or "none",
                            "accepted_formats": list(ACCEPTED_EXTENSIONS),
                        },
                    )

                unique_id = str(uuid.uuid4())[:8]
                temp_path = config.UPLOAD_DIR / f"{unique_id}_{filename}"
                temp_path.parent.mkdir(parents=True, exist_ok=True)

                downloaded_bytes = 0
                async with aiofiles.open(temp_path, "wb") as f:
                    async for chunk in response.aiter_bytes(chunk_size=65536):
                        downloaded_bytes += len(chunk)
                        if downloaded_bytes > max_size_bytes:
                            # The file is closed by the context manager and
                            # removed by the handler below.
                            raise TranslateEndpointError(
                                code=TranslateEndpointError.FILE_TOO_LARGE,
                                details={
                                    "size_mb": round(downloaded_bytes / (1024 * 1024), 2),
                                    "max_mb": MAX_FILE_SIZE_MB,
                                },
                            )
                        await f.write(chunk)

                async with aiofiles.open(temp_path, "rb") as f:
                    header = await f.read(4)
                await validate_file_content(header, extension)
                return temp_path, filename
    except httpx.TimeoutException as e:
        _discard_partial_file()
        raise TranslateEndpointError(
            code=TranslateEndpointError.URL_UNREACHABLE,
            message="Timeout lors du telechargement.",
            details={"timeout_seconds": timeout},
        ) from e
    except httpx.RequestError as e:
        _discard_partial_file()
        raise TranslateEndpointError(
            code=TranslateEndpointError.URL_DOWNLOAD_FAILED,
            message=f"Erreur de telechargement: {str(e)}",
            details={"error": str(e)},
        ) from e
    except TranslateEndpointError:
        _discard_partial_file()
        raise
    except Exception as e:
        _discard_partial_file()
        raise TranslateEndpointError(
            code=TranslateEndpointError.URL_DOWNLOAD_FAILED,
            message=f"Erreur inattendue lors du telechargement: {str(e)}",
            details={"error": str(e), "error_type": type(e).__name__},
        ) from e
# In-memory registry of translation jobs, keyed by job id; each value is the
# job's mutable state dict (status, progress, file info, ...).
_translation_jobs: dict[str, dict] = {}
# Completed/failed jobs older than this many seconds are removed by cleanup.
_JOB_TTL_SECONDS = 3600
# time.time() of the most recent cleanup pass (0.0 until the first pass runs).
_last_cleanup_ts: float = 0.0
_CLEANUP_INTERVAL_SECONDS = 300 # run cleanup every 5 minutes at most
def _cleanup_old_jobs() -> None:
    """Drop finished jobs whose age exceeds the TTL from the in-memory registry.

    Only jobs whose status is "completed" or "failed" and that carry a
    ``completed_at`` / ``failed_at`` timestamp are considered. The pass is
    throttled: it does nothing when the previous pass ran less than
    ``_CLEANUP_INTERVAL_SECONDS`` ago, so the registry is not scanned on
    every translation request.
    """
    global _last_cleanup_ts

    now = time.time()
    if now - _last_cleanup_ts < _CLEANUP_INTERVAL_SECONDS:
        return
    _last_cleanup_ts = now

    # Collect first, delete afterwards: the dict must not change size while
    # it is being iterated.
    expired_job_ids = []
    for candidate_id, record in _translation_jobs.items():
        if record.get("status") not in ("completed", "failed"):
            continue
        finished_at = record.get("completed_at") or record.get("failed_at")
        if finished_at and _job_age_seconds(finished_at) > _JOB_TTL_SECONDS:
            expired_job_ids.append(candidate_id)

    for job_id in expired_job_ids:
        del _translation_jobs[job_id]
        logger.debug(f"Cleaned up expired job: {job_id}")
def _job_age_seconds(timestamp_str: str) -> float:
"""Return how many seconds ago a ISO timestamp was."""
try:
ts = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00")).timestamp()
return time.time() - ts
except Exception:
return 0.0
# Strong references to in-flight background translation tasks. The event loop
# only keeps weak references to tasks, so a task whose result is discarded can
# be garbage-collected before it finishes.
_background_tasks: set[asyncio.Task] = set()


@router_v1.post(
    "/translate",
    response_model=TranslateResponse,
    responses={
        202: {"description": "Translation job accepted", "model": TranslateResponse},
        400: {"description": "Invalid request", "model": ErrorResponse},
        401: {"description": "Unauthorized", "model": ErrorResponse},
        403: {"description": "Pro feature required", "model": ErrorResponse},
        413: {"description": "File too large", "model": ErrorResponse},
        429: {"description": "Quota exceeded", "model": ErrorResponse},
    },
    status_code=202,
)
async def translate_document_v1(
    request: Request,
    file: Optional[UploadFile] = File(
        None, description="Document file (.xlsx, .docx, .pptx)"
    ),
    file_url: Optional[str] = Form(None, description="URL to download file (Pro only)"),
    source_lang: str = Form(default="auto", description="Source language code"),
    target_lang: str = Form(..., description="Target language code"),
    mode: Literal["classic", "llm"] = Form(
        default="classic", description="Translation mode"
    ),
    provider: Optional[str] = Form(default=None, description="Provider override"),
    webhook_url: Optional[str] = Form(None, description="Webhook URL for notification"),
    glossary_id: Optional[str] = Form(None, description="Glossary ID (Pro only)"),
    custom_prompt: Optional[str] = Form(None, description="Custom prompt (Pro only)"),
    prompt_id: Optional[str] = Form(None, description="Prompt ID from saved prompts (Pro only)"),
    current_user: Optional[Any] = Depends(get_authenticated_user),
):
    """
    Submit a document for translation.

    **Authentication:**
    - JWT Bearer token in Authorization header (web users)
    - X-API-Key header (automation users)

    **File Input:**
    - `file`: Upload file directly (multipart/form-data)
    - `file_url`: URL to download file from (Pro feature)

    **Parameters:**
    - `source_lang`: Source language code (default: auto-detect)
    - `target_lang`: Target language code (required)
    - `mode`: Translation mode - "classic" or "llm" (default: classic)
    - `provider`: Provider override (google, deepl, ollama, openai, openrouter)
    - `webhook_url`: URL to receive POST notification when complete
    - `glossary_id`: Glossary ID for LLM translation (Pro only)
    - `custom_prompt`: Custom system prompt (Pro only)
    - `prompt_id`: Saved prompt ID to use (Pro only). Takes priority over custom_prompt.

    **Webhook Notification:**
    If `webhook_url` is provided, a POST request will be sent when translation completes.

    **Webhook Payload (Success):**
    ```json
    {
    "event_id": "evt_abc123def456xyz",
    "translation_id": "tr_abc123def456",
    "status": "completed",
    "timestamp": "2024-01-15T10:30:00Z",
    "file_name": "report.xlsx",
    "source_lang": "en",
    "target_lang": "fr",
    "error_message": null
    }
    ```

    **Webhook Payload (Failure):**
    ```json
    {
    "event_id": "evt_abc123def456xyz",
    "translation_id": "tr_abc123def456",
    "status": "failed",
    "timestamp": "2024-01-15T10:30:00Z",
    "file_name": "report.xlsx",
    "source_lang": "en",
    "target_lang": "fr",
    "error_message": "Provider unavailable: connection timeout"
    }
    ```

    **Webhook Fields:**
    - `event_id`: Unique identifier for webhook deduplication (format: evt_xxxxxxxxxxxxxxxx)
    - `translation_id`: The translation job ID
    - `status`: "completed" or "failed"
    - `timestamp`: ISO 8601 UTC timestamp
    - `file_name`: Original file name
    - `source_lang`: Source language code
    - `target_lang`: Target language code
    - `error_message`: Error description (null if successful)

    **Webhook Behavior:**
    - Timeout: 10 seconds
    - Fire & Forget: Translation succeeds even if webhook fails
    - Retries: None (implement retry logic on your server if needed)

    **Returns:**
    - HTTP 202 with job ID and status "processing"
    """
    request_id = getattr(request.state, "request_id", str(uuid.uuid4())[:8])
    # Tracked outside the try block so the error handler can remove a file
    # that was stored but never handed over to a background job.
    input_path: Optional[Path] = None
    job_scheduled = False
    try:
        if not file and not file_url:
            raise TranslateEndpointError(
                code=TranslateEndpointError.MISSING_FILE,
                details={"hint": "Provide either 'file' or 'file_url' parameter"},
            )

        tier = "free"
        user_id = None
        if current_user:
            tier = _tier_for_quota(current_user.plan)
            user_id = current_user.id

        if file_url:
            if tier == "free":
                raise TranslateEndpointError(
                    code=TranslateEndpointError.PRO_FEATURE_REQUIRED,
                    message="L'ingestion par URL est reservee aux utilisateurs Pro.",
                    details={"feature": "file_url", "tier": tier},
                )

        # Story 3.12: Include prompt_id in Pro feature check
        if (glossary_id or custom_prompt or prompt_id) and tier == "free":
            raise TranslateEndpointError(
                code=TranslateEndpointError.PRO_FEATURE_REQUIRED,
                message="Les glossaires et prompts personnalises sont reserves aux utilisateurs Pro.",
                details={"feature": "glossary_id, custom_prompt, or prompt_id", "tier": tier},
            )

        # Story 3.10: Validate glossary access before creating the job
        if glossary_id and user_id:
            try:
                validate_glossary_access(glossary_id, user_id)
            except GlossaryNotFoundError as e:
                raise TranslateEndpointError(
                    code="GLOSSARY_NOT_FOUND",
                    message=str(e),
                    details={"glossary_id": glossary_id}
                )

        # Story 3.12: Validate prompt access before creating the job
        if prompt_id and user_id:
            try:
                validate_prompt_access(prompt_id, user_id)
            except PromptNotFoundError as e:
                raise TranslateEndpointError(
                    code="PROMPT_NOT_FOUND",
                    message=str(e),
                    details={**e.details, "prompt_id": prompt_id} if e.details else {"prompt_id": prompt_id}
                )

        if webhook_url:
            is_valid, error_msg, error_details = webhook_validator.validate(webhook_url)
            if not is_valid:
                raise TranslateEndpointError(
                    code="INVALID_WEBHOOK_URL",
                    message=error_msg,
                    details=error_details,
                )

        # Quota is only checked for authenticated callers; -1 marks "unknown".
        if current_user:
            quota = await tier_quota_service.check_quota(user_id, tier)
            if not quota.allowed:
                retry_after = _seconds_until_midnight_utc()
                raise HTTPException(
                    status_code=429,
                    detail={
                        "error": "QUOTA_EXCEEDED",
                        "message": f"Limite quotidienne atteinte ({quota.current_usage}/{quota.limit} fichiers). Reessayez apres minuit UTC.",
                        "details": {
                            "current_usage": quota.current_usage,
                            "limit": quota.limit,
                            "tier": tier,
                            "reset_at": quota.reset_at_utc.isoformat(),
                        },
                    },
                    headers={"Retry-After": str(retry_after)},
                )
            rate_limit_remaining = quota.remaining
        else:
            rate_limit_remaining = -1

        try:
            LanguageValidator.validate(target_lang)
        except ValidationError:
            raise TranslateEndpointError(
                code="INVALID_FORMAT",
                message=f"Code langue cible invalide: {target_lang}",
                details={"field": "target_lang"},
            )
        if source_lang and source_lang != "auto":
            try:
                LanguageValidator.validate(source_lang)
            except ValidationError:
                raise TranslateEndpointError(
                    code="INVALID_FORMAT",
                    message=f"Code langue source invalide: {source_lang}",
                    details={"field": "source_lang"},
                )

        original_filename = None
        file_extension = None
        file_size = 0
        file_hash = None
        if file:
            validation_result = await file_validator.validate_async(file)
            if not validation_result.is_valid:
                error_msg = "; ".join(validation_result.errors)
                # Use structured error codes from validator
                if validation_result.error_code == "file_too_large":
                    raise TranslateEndpointError(
                        code=TranslateEndpointError.FILE_TOO_LARGE,
                        message=error_msg,
                        details={
                            "errors": validation_result.errors,
                            "max_size_mb": MAX_FILE_SIZE_MB,
                        },
                    )
                elif validation_result.error_code == "invalid_file_content":
                    raise TranslateEndpointError(
                        code=TranslateEndpointError.CORRUPTED_FILE,
                        message=error_msg,
                        details={"errors": validation_result.errors},
                    )
                else:
                    raise TranslateEndpointError(
                        code=TranslateEndpointError.INVALID_FORMAT,
                        message=error_msg,
                        details={"errors": validation_result.errors},
                    )
            original_filename = file.filename
            file_extension = validation_result.data.get("extension")
            file_size = validation_result.data.get("size_bytes", 0)
            input_filename = file_handler_util.generate_unique_filename(
                file.filename, "input"
            )
            input_path = config.UPLOAD_DIR / input_filename
            await file_handler_util.save_upload_file(file, input_path)
            file_hash = file_handler_util.calculate_sha256(input_path)
            if file_hash is None:
                file_handler_util.cleanup_file(input_path)
                raise TranslateEndpointError(
                    code=TranslateEndpointError.CORRUPTED_FILE,
                    message="Impossible de calculer le hash du fichier. Fichier potentiellement corrompu.",
                    details={"error": "sha256_calculation_failed"},
                )
        elif file_url:
            input_path, original_filename = await download_from_url(file_url)
            file_extension = Path(original_filename).suffix.lower()
            file_size = input_path.stat().st_size
            file_hash = file_handler_util.calculate_sha256(input_path)
            if file_hash is None:
                file_handler_util.cleanup_file(input_path)
                raise TranslateEndpointError(
                    code=TranslateEndpointError.CORRUPTED_FILE,
                    message="Impossible de calculer le hash du fichier telecharge.",
                    details={"error": "sha256_calculation_failed"},
                )

        job_id = f"tr_{uuid.uuid4().hex[:12]}"
        # Track file metadata in Redis with TTL
        await storage_tracker.track_file(
            job_id=job_id,
            metadata={
                "original_filename": original_filename,
                "file_size": file_size,
                "file_hash": file_hash,
                "input_path": str(input_path),
                "user_id": str(user_id) if user_id else None,
                "timestamp": datetime.now(timezone.utc).isoformat(),
            },
        )
        _cleanup_old_jobs()
        _translation_jobs[job_id] = {
            "id": job_id,
            "status": "queued",
            "progress_percent": 0,
            "current_step": "Initializing",
            "total_items": 0,
            "processed_items": 0,
            "error_message": None,
            "file_name": original_filename,
            "source_lang": source_lang,
            "target_lang": target_lang,
            "created_at": datetime.now(timezone.utc).isoformat(),
            "user_id": user_id,
            "input_path": str(input_path),
            "file_extension": file_extension,
            "provider": provider or mode,
            "webhook_url": webhook_url,
            "custom_prompt": custom_prompt,
            "glossary_id": glossary_id,
            "prompt_id": prompt_id,  # Story 3.12: Store prompt_id
        }
        provider_to_use = provider or ("openrouter" if mode == "llm" else "google")
        background_task = asyncio.create_task(
            _run_translation_job(
                job_id=job_id,
                input_path=input_path,
                file_extension=file_extension,
                target_lang=target_lang,
                source_lang=source_lang,
                provider=provider_to_use,
                user_id=user_id,
                custom_prompt=custom_prompt,
                glossary_id=glossary_id,
                prompt_id=prompt_id,  # Story 3.12: Pass prompt_id
                webhook_url=webhook_url,
            )
        )
        # Hold the task until it finishes, then let it go.
        _background_tasks.add(background_task)
        background_task.add_done_callback(_background_tasks.discard)
        job_scheduled = True
        logger.info(
            f"[{request_id}] Created translation job {job_id} for {original_filename}"
        )
        return JSONResponse(
            status_code=202,
            content={
                "data": {
                    "id": job_id,
                    "status": "processing",
                    "file_name": original_filename,
                    "source_lang": source_lang,
                    "target_lang": target_lang,
                },
                "meta": {
                    "rate_limit_remaining": rate_limit_remaining,
                    "estimated_time_seconds": 15,
                },
            },
        )
    except TranslateEndpointError as e:
        status_code = 400
        if e.code == TranslateEndpointError.FILE_TOO_LARGE:
            status_code = 413
        elif e.code == TranslateEndpointError.UNAUTHORIZED:
            status_code = 401
        elif e.code == TranslateEndpointError.PRO_FEATURE_REQUIRED:
            status_code = 403
        return JSONResponse(
            status_code=status_code,
            content=e.to_dict(),
        )
    except HTTPException:
        raise
    except Exception as e:
        # logger.exception keeps the traceback for diagnosis.
        logger.exception(f"[{request_id}] Unexpected error: {e}")
        # The file was stored but no job will ever process it: remove it so
        # failures (e.g. the storage tracker being down) do not fill the disk.
        if input_path is not None and not job_scheduled and input_path.exists():
            file_handler_util.cleanup_file(input_path)
        # NOTE(review): an unexpected server-side failure is reported as 400;
        # kept as-is because clients may rely on it - consider 500.
        return JSONResponse(
            status_code=400,
            content={
                "error": "PROCESSING_ERROR",
                "message": "Erreur lors du traitement de la requete.",
                "details": {"error_type": type(e).__name__},
            },
        )
async def _run_translation_job(
    job_id: str,
    input_path: Path,
    file_extension: str,
    target_lang: str,
    source_lang: str,
    provider: str,
    user_id: Optional[str],
    custom_prompt: Optional[str],
    glossary_id: Optional[str],
    prompt_id: Optional[str] = None,  # Story 3.12: Add prompt_id parameter
    webhook_url: Optional[str] = None,
) -> None:
    """
    Execute one translation job in the background and report its progress.

    The job's state dict in ``_translation_jobs`` is updated through a
    ``ProgressTracker``. Whatever the outcome, a webhook notification is sent
    at the end when ``webhook_url`` is set; webhook failures are only logged.

    Args:
        job_id: Unique job identifier
        input_path: Path to input file
        file_extension: File extension (.xlsx, .docx, .pptx)
        target_lang: Target language code
        source_lang: Source language code
        provider: Translation provider name
        user_id: Optional user ID for quota tracking
        custom_prompt: Optional custom prompt text (Pro only)
        glossary_id: Optional glossary ID for LLM translation (Pro only)
        prompt_id: Optional saved prompt ID - takes priority over custom_prompt (Pro only, Story 3.12)
        webhook_url: Optional webhook URL for completion notification
    """
    job = _translation_jobs.get(job_id)
    if not job:
        return
    tracker = ProgressTracker(job_id, _translation_jobs)
    try:
        job["status"] = "processing"
        tracker.update(10, "Validating file")
        translated_name = file_handler_util.generate_unique_filename(
            input_path.name.replace("input_", "translated_"), "translated"
        )
        output_path = config.OUTPUT_DIR / translated_name

        from translators import excel_translator, word_translator, pptx_translator
        from services.translation_service import (
            OpenRouterTranslationProvider,
            OllamaTranslationProvider,
            translation_service,
        )
        from routes.admin_routes import load_settings as _load_admin_settings

        _admin_cfg = _load_admin_settings()

        # Helper: prefer value from admin settings JSON, fall back to env var
        def _cfg(admin_val: str | None, env_var: str, default: str = "") -> str:
            return (admin_val or "").strip() or os.getenv(env_var, default)

        api_key = _cfg(_admin_cfg.openrouter.api_key, "OPENROUTER_API_KEY")
        model = _cfg(_admin_cfg.openrouter.model, "OPENROUTER_MODEL", "deepseek/deepseek-v3.2")

        # Story 3.10: Retrieve and format glossary terms for LLM prompt
        glossary_terms = None
        if glossary_id and user_id:
            try:
                glossary_terms = get_glossary_terms(glossary_id, user_id)
                logger.info(f"Job {job_id}: Loaded {len(glossary_terms)} glossary terms")
            except GlossaryNotFoundError as e:
                tracker.set_error(str(e))
                logger.error(f"Job {job_id}: Glossary error - {e}")
                return

        # Story 3.12: Retrieve prompt content if prompt_id provided
        # Priority: prompt_id > custom_prompt
        effective_prompt = None
        if prompt_id and user_id:
            try:
                effective_prompt = get_prompt_content(prompt_id, user_id)
                logger.info(f"Job {job_id}: Loaded prompt content from {prompt_id}")
            except PromptNotFoundError as e:
                tracker.set_error(str(e))
                logger.error(f"Job {job_id}: Prompt error - {e}")
                return
        elif custom_prompt:
            # Use custom_prompt if no prompt_id
            effective_prompt = custom_prompt

        # Build the full prompt combining effective prompt and glossary
        full_prompt = build_full_prompt(effective_prompt, glossary_terms)

        # Pick the provider. It stays None when the requested provider is
        # unknown or has no credentials configured.
        translation_provider = None
        requested = provider.lower()
        if requested in ("openrouter", "llm") and api_key:
            translation_provider = OpenRouterTranslationProvider(
                api_key, model, full_prompt
            )
        elif requested == "openrouter_premium":
            # Falls back to the main openrouter key when no premium key is set.
            premium_key = (
                _cfg(_admin_cfg.openrouter_premium.api_key, "OPENROUTER_API_KEY")
                or api_key
            )
            premium_model = _cfg(
                _admin_cfg.openrouter_premium.model,
                "OPENROUTER_PREMIUM_MODEL",
                "anthropic/claude-3.5-haiku",
            )
            if premium_key:
                translation_provider = OpenRouterTranslationProvider(
                    premium_key, premium_model, full_prompt
                )
        elif requested == "openai":
            from services.translation_service import OpenAITranslationProvider

            openai_key = _cfg(_admin_cfg.openai.api_key, "OPENAI_API_KEY")
            openai_model = _cfg(_admin_cfg.openai.model, "OPENAI_MODEL", "gpt-4o-mini")
            if openai_key:
                translation_provider = OpenAITranslationProvider(
                    api_key=openai_key,
                    model=openai_model,
                    system_prompt=full_prompt,
                )
        elif requested == "deepl":
            deepl_key = _cfg(_admin_cfg.deepl.api_key, "DEEPL_API_KEY")
            if deepl_key:
                from services.translation_service import DeepLTranslationProvider

                translation_provider = DeepLTranslationProvider(deepl_key, full_prompt)
        elif requested == "zai":
            from services.translation_service import OpenAITranslationProvider as _OAI

            zai_key = _cfg(_admin_cfg.zai.api_key, "ZAI_API_KEY")
            zai_model = _cfg(_admin_cfg.zai.model, "ZAI_MODEL", "grok-2-1212")
            zai_url = _cfg(_admin_cfg.zai.base_url, "ZAI_BASE_URL", "https://api.x.ai/v1")
            if zai_key:
                translation_provider = _OAI(
                    api_key=zai_key,
                    model=zai_model,
                    base_url=zai_url,
                    system_prompt=full_prompt,
                )
        elif requested == "ollama":
            ollama_url = _cfg(_admin_cfg.ollama.base_url, "OLLAMA_BASE_URL", "http://localhost:11434")
            ollama_model = _cfg(_admin_cfg.ollama.model, "OLLAMA_MODEL", "llama3")
            # NOTE(review): the model name is passed twice - confirm against
            # the OllamaTranslationProvider signature.
            translation_provider = OllamaTranslationProvider(
                ollama_url,
                ollama_model,
                ollama_model,
                full_prompt,
            )

        tracker.update(20, "Preparing translation")

        # Label shown in the progress step, chosen once per job.
        step_labels = {
            ".pptx": "Translating slide",
            ".xlsx": "Translating sheet",
            ".docx": "Processing paragraph",
        }
        item_name = step_labels.get(file_extension, "Translating")

        def _first_present(info: dict, keys: tuple, fallback: Any) -> Any:
            """Return the value of the first key found in ``info``, else ``fallback``."""
            for key in keys:
                if key in info:
                    return info[key]
            return fallback

        def progress_callback(progress_info: dict) -> None:
            """Forward translator progress to the tracker, accepting several key names."""
            current = _first_present(
                progress_info, ("current", "slide", "sheet", "paragraph", "element"), 1
            )
            total = _first_present(
                progress_info,
                ("total", "total_slides", "total_sheets", "total_paragraphs"),
                1,
            )
            # max_percent=95: the translator reaches current==total when its last
            # chunk finishes, but the file is not yet written. set_completed()
            # pushes to 100% once the file is saved.
            tracker.update_item(current, total, item_name, max_percent=95)

        # Run synchronous translators in a thread pool to avoid blocking the event loop.
        # Without this, status polling requests from the frontend would time out during
        # translation, causing the "Connection lost" error and frozen progress bar.
        # Always call set_provider (even with None) to reset any previously-set
        # provider on the singleton translator instances between jobs.
        translators_by_extension = {
            ".xlsx": excel_translator,
            ".docx": word_translator,
            ".pptx": pptx_translator,
        }
        translator = translators_by_extension.get(file_extension)
        if translator is None:
            raise ValueError(f"Unsupported file type: {file_extension}")
        translator.set_provider(translation_provider)
        await asyncio.to_thread(
            translator.translate_file,
            input_path,
            output_path,
            target_lang,
            source_lang,
            progress_callback=progress_callback,
        )

        if user_id:
            await tier_quota_service.increment_on_success(user_id)
        tracker.set_completed(str(output_path))
        logger.info(f"Job {job_id}: Completed successfully")
    except Exception as e:
        tracker.set_error(str(e))
        logger.error(f"Job {job_id}: Failed - {e}")
    finally:
        # Runs on success, failure and the early returns above.
        if webhook_url:
            try:
                # Generate unique event_id for webhook deduplication
                event_id = f"evt_{uuid.uuid4().hex[:16]}"
                async with httpx.AsyncClient(timeout=10) as client:
                    response = await client.post(
                        webhook_url,
                        json={
                            "event_id": event_id,
                            "translation_id": job_id,
                            "status": job["status"],
                            "timestamp": datetime.now(timezone.utc).isoformat(),
                            "file_name": job.get("file_name"),
                            "source_lang": job.get("source_lang"),
                            "target_lang": job.get("target_lang"),
                            "error_message": job.get("error_message"),
                        },
                    )
                    if not response.is_success:
                        # Log non-2xx response with body for debugging
                        try:
                            response_body = await response.aread()
                            body_preview = response_body[:500].decode('utf-8', errors='replace')
                        except Exception:
                            body_preview = "<unable to read body>"
                        logger.warning(
                            f"Job {job_id}: Webhook returned non-success status "
                            f"(status={response.status_code}, url={webhook_url}, event_id={event_id}, "
                            f"response_body={body_preview})"
                        )
                    else:
                        # Log successful webhook delivery
                        logger.info(
                            f"Job {job_id}: Webhook notification sent successfully to {webhook_url} "
                            f"(status={response.status_code}, event_id={event_id})"
                        )
            except httpx.TimeoutException:
                logger.warning(
                    f"Job {job_id}: Webhook notification timed out after 10s (url={webhook_url}, event_id={event_id})"
                )
            except httpx.RequestError as e:
                logger.warning(
                    f"Job {job_id}: Webhook notification failed - {type(e).__name__}: {e} "
                    f"(url={webhook_url}, event_id={event_id})"
                )
            except Exception as e:
                logger.warning(
                    f"Job {job_id}: Unexpected webhook error - {type(e).__name__}: {e} (event_id={event_id})"
                )
@router_v1.get(
    "/translations/{job_id}",
    response_model=TranslationStatusResponse,
    responses={
        200: {"description": "Translation status", "model": TranslationStatusResponse},
        403: {"description": "Job belongs to another user", "model": ErrorResponse},
        404: {"description": "Job not found", "model": ErrorResponse},
    },
)
async def get_translation_status(
    job_id: str,
    current_user: Optional[Any] = Depends(get_authenticated_user),
):
    """
    Get translation job status with real-time progress.

    Returns current status and progress of a translation job.

    **Status Values:**
    - `queued`: Job is waiting to be processed
    - `processing`: Job is actively being translated
    - `completed`: Translation finished successfully
    - `failed`: Translation encountered an error

    **Progress Fields:**
    - `progress_percent`: 0-100 indicating completion percentage
    - `current_step`: Human-readable description of current operation
    - `error_message`: Present only when status is "failed"

    **Meta Fields:**
    - `estimated_remaining_seconds`: Linear extrapolation from elapsed time and
      `progress_percent`; `null` unless the job is processing with progress > 0

    **Error Codes:**
    - `NOT_FOUND`: No job with this id
    - `ACCESS_DENIED`: Job belongs to a different user

    **Example Response (Processing):**
    ```json
    {
        "data": {
            "id": "tr_abc123",
            "status": "processing",
            "progress_percent": 45,
            "current_step": "Translating slide 5/10",
            "file_name": "presentation.pptx",
            "source_lang": "en",
            "target_lang": "fr",
            "created_at": "2024-01-15T10:30:00Z"
        },
        "meta": {"estimated_remaining_seconds": 55}
    }
    ```
    """
    job = _translation_jobs.get(job_id)
    if not job:
        return JSONResponse(
            status_code=404,
            content={
                "error": "NOT_FOUND",
                "message": "Job de traduction non trouve.",
                "details": {"job_id": job_id},
            },
        )

    # Apply the same ownership rule as the download endpoint: status exposes
    # file name, languages and error text, which must not leak across users.
    job_user_id = job.get("user_id")
    if current_user and job_user_id and str(job_user_id) != str(current_user.id):
        return JSONResponse(
            status_code=403,
            content={
                "error": "ACCESS_DENIED",
                "message": "Vous n'avez pas acces a ce job de traduction.",
                "details": {"job_id": job_id},
            },
        )

    status = job["status"]
    progress_percent = job.get("progress_percent", 0)

    response_data = {
        "id": job["id"],
        "status": status,
        "progress_percent": progress_percent,
        "current_step": job.get("current_step", "Unknown"),
        "file_name": job.get("file_name"),
        "source_lang": job.get("source_lang"),
        "target_lang": job.get("target_lang"),
        "created_at": job.get("created_at"),
    }

    estimated_remaining = None
    if status == "processing" and progress_percent > 0:
        # The ETA is best-effort: a malformed or naive timestamp must never
        # fail the status request, but the reason is logged instead of hidden.
        try:
            created_at_str = job.get("created_at")
            if created_at_str:
                created_at = datetime.fromisoformat(
                    created_at_str.replace("Z", "+00:00")
                )
                elapsed_seconds = (
                    datetime.now(timezone.utc) - created_at
                ).total_seconds()
                total_estimated = elapsed_seconds / (progress_percent / 100)
                # Never report less than 1s while the job is still running.
                estimated_remaining = max(1, int(total_estimated - elapsed_seconds))
        except Exception as exc:
            logger.debug(
                f"Job {job_id}: Could not estimate remaining time - "
                f"{type(exc).__name__}: {exc}"
            )

    if status == "completed":
        response_data["completed_at"] = job.get("completed_at")
    elif status == "failed":
        response_data["failed_at"] = job.get("failed_at")
        response_data["error_message"] = job.get("error_message")

    return {
        "data": response_data,
        "meta": {"estimated_remaining_seconds": estimated_remaining},
    }
@router_v1.get("/translate/health")
async def translate_health():
    """Health check for the translation endpoint.

    Returns a static payload; no job state or dependency is inspected.
    """
    payload = {"status": "healthy", "endpoint": "/api/v1/translate"}
    return payload
# Content-Type served for each supported Office extension. Keys are lowercase
# and include the leading dot; the download endpoint looks the extension up
# here and falls back to "application/octet-stream" when it is not listed.
MIME_TYPES = {
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
}
def _cleanup_files(input_path: Optional[str], output_path: Optional[str]) -> None:
    """Delete a job's output and input files after the download completes.

    Removal is best-effort: the output file is handled first, then the input
    file, and a failure on one is logged as a warning without preventing the
    attempt on the other. Empty or ``None`` paths are skipped.

    Args:
        input_path: Path of the uploaded source file, if any.
        output_path: Path of the translated file, if any.
    """
    for label, raw_path in (("output", output_path), ("input", input_path)):
        if not raw_path:
            continue
        try:
            target = Path(raw_path)
            if target.exists():
                target.unlink()
                logger.info(f"Deleted {label} file: {raw_path}")
        except Exception as e:
            logger.warning(f"Failed to delete {label} file {raw_path}: {e}")
@router_v1.get(
    "/download/{job_id}",
    responses={
        200: {
            "description": "Translated file download",
            "content": {"application/octet-stream": {}},
        },
        400: {"description": "Malformed job id", "model": ErrorResponse},
        403: {"description": "Job belongs to another user", "model": ErrorResponse},
        404: {"description": "File not found or not ready", "model": ErrorResponse},
    },
)
async def download_translated_file(
    job_id: str,
    current_user: Optional[Any] = Depends(get_authenticated_user),
):
    """
    Download a translated file.

    Returns the translated file as a binary download with proper Content-Type
    and Content-Disposition headers. The input and output files are
    automatically deleted after the download completes.

    **Status Requirements:**
    - Job must exist and have status "completed"
    - Job must have an output_path field

    **Error Codes:**
    - `INVALID_JOB_ID` (400): `job_id` does not match the `tr_...` format
    - `ACCESS_DENIED` (403): Job belongs to a different user
    - `FILE_EXPIRED` (404): Job not found, expired, or no output file
    - `NOT_READY` (404): Job exists but translation is not complete (or failed)

    **Response Headers:**
    - `Content-Type`: Appropriate MIME type for the file format
    - `Content-Disposition`: attachment with filename containing "_translated" suffix

    **Example:**
    ```
    GET /api/v1/download/tr_abc123def456
    → Returns file with Content-Disposition: attachment; filename="report_translated.xlsx"
    ```
    """
    if not JOB_ID_PATTERN.match(job_id):
        return JSONResponse(
            status_code=400,
            content={
                "error": "INVALID_JOB_ID",
                "message": "Format d'identifiant de travail invalide.",
                "details": {"job_id": job_id, "expected_format": "tr_xxxxxxxxxxxx"},
            },
        )

    job = _translation_jobs.get(job_id)
    if not job:
        return JSONResponse(
            status_code=404,
            content={
                "error": "FILE_EXPIRED",
                "message": "Le fichier traduit n'est plus disponible ou a expire.",
                "details": {"job_id": job_id, "status": "not_found"},
            },
        )

    # Only enforced when both sides are known: jobs without a recorded owner
    # and requests without a resolved user are let through.
    job_user_id = job.get("user_id")
    if current_user and job_user_id and str(job_user_id) != str(current_user.id):
        return JSONResponse(
            status_code=403,
            content={
                "error": "ACCESS_DENIED",
                "message": "Vous n'avez pas acces a ce fichier.",
                "details": {"job_id": job_id},
            },
        )

    status = job.get("status")
    if status != "completed":
        # A failed job will never become downloadable, so do not tell the
        # client it is still running. Error code and HTTP status are unchanged.
        if status == "failed":
            not_ready_message = "La traduction a echoue."
        else:
            not_ready_message = "La traduction est encore en cours."
        return JSONResponse(
            status_code=404,
            content={
                "error": "NOT_READY",
                "message": not_ready_message,
                "details": {
                    "job_id": job_id,
                    "status": status,
                    "progress_percent": job.get("progress_percent", 0),
                },
            },
        )

    output_path_str = job.get("output_path")
    if not output_path_str:
        return JSONResponse(
            status_code=404,
            content={
                "error": "FILE_EXPIRED",
                "message": "Le fichier traduit n'est plus disponible ou a expire.",
                "details": {"job_id": job_id, "status": "no_output_path"},
            },
        )

    output_path = Path(output_path_str)
    if not output_path.exists():
        # Typically a repeat download: the first one already cleaned up the file.
        return JSONResponse(
            status_code=404,
            content={
                "error": "FILE_EXPIRED",
                "message": "Le fichier traduit n'est plus disponible ou a expire.",
                "details": {"job_id": job_id, "status": "file_deleted"},
            },
        )

    # No default here: a missing file_name must reach the file_extension
    # fallback below instead of producing an extension-less download name.
    original_filename = job.get("file_name")
    if original_filename:
        source_name = Path(original_filename)
        extension = source_name.suffix.lower()
        download_filename = f"{source_name.stem}_translated{extension}"
    else:
        extension = job.get("file_extension", ".xlsx")
        download_filename = f"document_translated{extension}"

    mime_type = MIME_TYPES.get(extension, "application/octet-stream")
    input_path_str = job.get("input_path")

    logger.info(f"Download requested for job {job_id}: {download_filename}")

    # Cleanup runs as a background task so the files are removed only after
    # the response body has been sent.
    return FileResponse(
        path=str(output_path),
        media_type=mime_type,
        filename=download_filename,
        background=BackgroundTask(_cleanup_files, input_path_str, output_path_str),
    )