Files
office_translator/routes/translate_routes.py
sepehr 4b52f4d9df
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 2m5s
fix: fallback to free Google Translate when Cloud API key is invalid/blocked
Google Cloud Translation API was returning "Requests to this API are
blocked" which got wrapped as a misleading "Erreur lors de la lecture
du fichier PowerPoint". Now probes the key once (cached 10min) and
falls back to deep_translator (free) when the Cloud key is invalid.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-20 00:02:29 +02:00

1634 lines
64 KiB
Python

"""
API v1 Translate Endpoint (Story 2.10, 2.11, 2.12, 3.6)
POST /api/v1/translate - Submit document for translation
GET /api/v1/translations/{id} - Get translation status with real-time progress
GET /api/v1/download/{id} - Download translated file
Story 3.6: Complete OpenAPI documentation with examples and error codes
"""
import os
import re
import uuid
import time
import socket
import asyncio
import ipaddress
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Any, Literal, Dict
from urllib.parse import urlparse, unquote
import aiofiles
JOB_ID_PATTERN = re.compile(r"^tr_[a-zA-Z0-9_\-]+$")
import httpx
from fastapi import (
APIRouter,
File,
Form,
Header,
HTTPException,
Request,
UploadFile,
Depends,
)
from fastapi.responses import JSONResponse, FileResponse
from starlette.background import BackgroundTask
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from pydantic import BaseModel, Field, field_validator
from typing_extensions import Annotated
from config import config
from models.subscription import PlanType
from middleware.tier_quota import tier_quota_service
from services.auth_service import record_usage
from middleware.validation import FileValidator, ValidationError, LanguageValidator, webhook_validator
from middleware.api_key_auth import get_authenticated_user, get_user_from_api_key
from utils import file_handler
# Import models from schemas (Story 3.6 - DRY principle)
from schemas.translation import (
TranslateResponseData,
TranslateResponseMeta,
TranslateResponse,
TranslationStatusData,
TranslationStatusMeta,
TranslationStatusResponse,
)
from schemas.errors import ErrorResponse
from utils.file_handler import FileHandler
from middleware.metrics import record_translation, record_file_size
from services.progress_tracker import ProgressTracker
from services.storage_tracker import storage_tracker
from core.redis import set_job_status_async, get_job_status_async
from services.glossary_service import get_glossary_terms, validate_glossary_access, build_full_prompt
from services.prompt_service import get_prompt_content, validate_prompt_access
from utils.exceptions import GlossaryNotFoundError, PromptNotFoundError
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Router for the v1 translation endpoints; its routes share the /api/v1 prefix.
router_v1 = APIRouter(prefix="/api/v1", tags=["Translation v1"])
# HTTP Bearer scheme created with auto_error=False.
# NOTE(review): presumably so a missing Authorization header does not reject
# the request by itself — confirm against the auth dependencies.
security = HTTPBearer(auto_error=False)
# Reference config for file constraints (avoids duplicating values)
MAX_FILE_SIZE_MB = config.MAX_FILE_SIZE_MB
# Leading bytes compared by validate_file_content: Office documents are checked
# against the ZIP signature, PDFs against "%PDF".
OFFICE_MAGIC_BYTES = b"PK\x03\x04"
PDF_MAGIC_BYTES = b"%PDF"
# Extensions accepted for uploads and URL downloads, taken from config.
ACCEPTED_EXTENSIONS = config.SUPPORTED_EXTENSIONS
class TranslateEndpointError(Exception):
    """Structured failure raised while handling a translate request.

    Each instance carries a machine-readable ``code``, a human-readable
    ``message`` (taken from ``ERROR_MESSAGES`` when the raiser supplies none)
    and an optional ``details`` mapping. ``to_dict`` renders the error as a
    JSON-ready dictionary.
    """

    # Error codes; every value equals its attribute name.
    INVALID_FORMAT = "INVALID_FORMAT"
    CORRUPTED_FILE = "CORRUPTED_FILE"
    FILE_TOO_LARGE = "FILE_TOO_LARGE"
    QUOTA_EXCEEDED = "QUOTA_EXCEEDED"
    URL_DOWNLOAD_FAILED = "URL_DOWNLOAD_FAILED"
    URL_UNREACHABLE = "URL_UNREACHABLE"
    UNAUTHORIZED = "UNAUTHORIZED"
    MISSING_FILE = "MISSING_FILE"
    PRO_FEATURE_REQUIRED = "PRO_FEATURE_REQUIRED"

    # Default message per code, used when no explicit message is given.
    ERROR_MESSAGES = {
        INVALID_FORMAT: "Unsupported file format. Accepted formats: .xlsx, .docx, .pptx",
        CORRUPTED_FILE: "The file appears corrupted or is not a valid Office document.",
        FILE_TOO_LARGE: f"File is too large (max {MAX_FILE_SIZE_MB} MB).",
        QUOTA_EXCEEDED: "Monthly translation limit reached.",
        URL_DOWNLOAD_FAILED: "Failed to download file from URL.",
        URL_UNREACHABLE: "URL unreachable.",
        UNAUTHORIZED: "Authentication required.",
        MISSING_FILE: "File or URL required.",
        PRO_FEATURE_REQUIRED: "This feature requires a Pro subscription.",
    }

    def __init__(
        self, code: str, message: Optional[str] = None, details: Optional[dict] = None
    ):
        """Store the code, resolve the message and normalise ``details``.

        Args:
            code: Error code; codes without a default message map to
                "Unknown error".
            message: Explicit message; a falsy value selects the default.
            details: Extra context; a falsy value becomes an empty dict.
        """
        fallback_text = self.ERROR_MESSAGES.get(code, "Unknown error")
        self.code = code
        self.message = message if message else fallback_text
        self.details = details if details else {}
        super().__init__(self.message)

    def to_dict(self) -> dict:
        """Return the error as a dict; ``details`` is included only when non-empty."""
        payload = {"error": self.code, "message": self.message}
        if self.details:
            payload["details"] = self.details
        return payload
# NOTE: Response models are now imported from schemas/ module (DRY principle)
# TranslateResponseData, TranslateResponseMeta, TranslateResponse,
# TranslationStatusData, TranslationStatusMeta, TranslationStatusResponse, ErrorResponse
# Shared validator for uploaded files, configured with this module's size
# limit and accepted extensions.
file_validator = FileValidator(
    max_size_mb=MAX_FILE_SIZE_MB, allowed_extensions=ACCEPTED_EXTENSIONS
)
# Shared FileHandler instance; used below to generate unique file names, save
# uploads, compute SHA-256 hashes and remove files.
file_handler_util = FileHandler()
def _tier_for_quota(plan) -> str:
    """Translate a subscription plan into the quota tier name.

    PRO, BUSINESS and ENTERPRISE plans map to ``"pro"``; every other value
    (including ``None``) maps to ``"free"``.
    """
    pro_equivalent_plans = (PlanType.PRO, PlanType.BUSINESS, PlanType.ENTERPRISE)
    return "pro" if plan in pro_equivalent_plans else "free"
def _next_midnight_utc() -> datetime:
"""Get next midnight UTC."""
now = datetime.now(timezone.utc)
from datetime import timedelta
tomorrow = now.date() + timedelta(days=1)
return datetime(tomorrow.year, tomorrow.month, tomorrow.day, tzinfo=timezone.utc)
def _seconds_until_midnight_utc() -> int:
    """Return the whole seconds left before the next UTC midnight (never negative)."""
    reference = datetime.now(timezone.utc)
    remaining = (_next_midnight_utc() - reference).total_seconds()
    # int() truncates the fractional part; max() guards against a negative value.
    return max(0, int(remaining))
async def validate_file_content(content: bytes, extension: str) -> None:
    """Check that the leading bytes of ``content`` match the expected file type.

    Args:
        content: At least the first four bytes of the file.
        extension: File extension including the dot; compared case-insensitively.

    Raises:
        TranslateEndpointError: ``CORRUPTED_FILE`` when fewer than four bytes
            are supplied, when a ``.pdf`` does not start with ``%PDF``, or when
            any other file does not start with the ZIP signature.
    """
    if len(content) < 4:
        raise TranslateEndpointError(
            code=TranslateEndpointError.CORRUPTED_FILE,
            message="File is too small to be a valid document.",
            details={"reason": "File is too small"},
        )

    signature = content[:4]
    expects_pdf = extension.lower() == ".pdf"

    # PDF files start with %PDF
    if expects_pdf and signature != PDF_MAGIC_BYTES:
        raise TranslateEndpointError(
            code=TranslateEndpointError.CORRUPTED_FILE,
            message="File is not a valid PDF.",
            details={"reason": "Invalid PDF header"},
        )

    # Office files (xlsx, docx, pptx) are ZIP archives
    if not expects_pdf and signature != OFFICE_MAGIC_BYTES:
        raise TranslateEndpointError(
            code=TranslateEndpointError.CORRUPTED_FILE,
            message="File is not a valid Office document.",
            details={
                "accepted_formats": list(ACCEPTED_EXTENSIONS),
                "hint": "Office files (.xlsx, .docx, .pptx) must be valid ZIP archives.",
            },
        )
def _parse_content_disposition(content_disp: str) -> Optional[str]:
"""Parse filename from Content-Disposition header (RFC 5987 compliant)."""
import re
for part in content_disp.split(";"):
part = part.strip()
if part.lower().startswith("filename*="):
match = re.match(r"filename\*=([^']+)'([^']*)'(.+)", part, re.IGNORECASE)
if match:
from urllib.parse import unquote
return unquote(match.group(3))
if part.lower().startswith("filename="):
filename = part.split("=", 1)[1].strip().strip('"').strip("'")
if filename:
return filename
return None
def _is_ssrf_risk(hostname: str) -> bool:
"""Return True if hostname resolves to a private/reserved IP (SSRF prevention).
Blocks: loopback, private, link-local, reserved, multicast ranges.
Also blocks DNS resolution failures to avoid bypass via non-resolvable names.
"""
try:
ip_str = socket.gethostbyname(hostname)
addr = ipaddress.ip_address(ip_str)
return (
addr.is_loopback
or addr.is_private
or addr.is_link_local
or addr.is_reserved
or addr.is_multicast
or addr.is_unspecified
)
except Exception:
return True
async def download_from_url(url: str, timeout: int = 30) -> tuple[Path, str]:
    """Stream a remote document into the upload directory.

    The URL and every redirect hop are validated (scheme and SSRF guard)
    before they are requested; redirects are followed manually so an allowed
    URL cannot bounce the download to an internal address. The file name comes
    from the Content-Disposition header when present, otherwise from the URL
    path, and is reduced to its final path component before it is used on
    disk. The body is streamed to disk in chunks so the whole file is never
    held in memory, and the size limit is enforced while streaming.

    Args:
        url: HTTP or HTTPS address of the document.
        timeout: Timeout in seconds handed to the HTTP client.

    Returns:
        ``(temp_path, filename)``: where the file was stored under
        ``config.UPLOAD_DIR`` and the sanitized file name.

    Raises:
        TranslateEndpointError: ``URL_UNREACHABLE`` (bad scheme, blocked host,
            non-200 status, timeout), ``FILE_TOO_LARGE``, ``INVALID_FORMAT``,
            ``CORRUPTED_FILE`` (from ``validate_file_content``) or
            ``URL_DOWNLOAD_FAILED`` (transport error, too many redirects, or
            any unexpected failure).
    """
    max_redirects = 10
    redirect_statuses = (301, 302, 303, 307, 308)
    max_size_bytes = MAX_FILE_SIZE_MB * 1024 * 1024
    temp_path: Optional[Path] = None

    def _discard_partial_download() -> None:
        """Remove the partially written file, if one was created."""
        if temp_path and temp_path.exists():
            temp_path.unlink()

    async def _ensure_allowed(candidate_url: str) -> None:
        """Reject non-HTTP(S) URLs and hosts flagged by the SSRF guard."""
        candidate = urlparse(candidate_url)
        if candidate.scheme not in ("http", "https"):
            raise TranslateEndpointError(
                code=TranslateEndpointError.URL_UNREACHABLE,
                message="Only HTTP/HTTPS URLs are accepted.",
                details={"scheme": candidate.scheme or "none"},
            )
        hostname = candidate.hostname or ""
        # DNS resolution blocks, so it runs in a worker thread.
        if not hostname or await asyncio.to_thread(_is_ssrf_risk, hostname):
            raise TranslateEndpointError(
                code=TranslateEndpointError.URL_UNREACHABLE,
                message="The URL points to a blocked address (private or internal network).",
                details={"reason": "ssrf_blocked"},
            )

    await _ensure_allowed(url)
    parsed_url = urlparse(url)

    try:
        # Automatic redirects are disabled: each hop is validated first.
        async with httpx.AsyncClient(timeout=timeout, follow_redirects=False) as client:
            current_url = url
            for _hop in range(max_redirects + 1):
                async with client.stream("GET", current_url) as response:
                    location = response.headers.get("location")
                    if response.status_code in redirect_statuses and location:
                        current_url = str(response.url.join(location))
                        await _ensure_allowed(current_url)
                        continue

                    if response.status_code != 200:
                        raise TranslateEndpointError(
                            code=TranslateEndpointError.URL_UNREACHABLE,
                            message=f"URL unreachable (HTTP {response.status_code})",
                            details={"status_code": response.status_code, "url": url[:100]},
                        )

                    # Early rejection based on the advertised size, when usable.
                    content_length = response.headers.get("content-length")
                    if content_length:
                        try:
                            declared_size = int(content_length)
                        except ValueError:
                            # Malformed header: rely on the streamed-size check.
                            declared_size = None
                        if declared_size is not None and declared_size > max_size_bytes:
                            size_mb = round(declared_size / (1024 * 1024), 2)
                            raise TranslateEndpointError(
                                code=TranslateEndpointError.FILE_TOO_LARGE,
                                message=f"File is too large ({size_mb} MB, max {MAX_FILE_SIZE_MB} MB).",
                                details={
                                    "size_mb": size_mb,
                                    "max_mb": MAX_FILE_SIZE_MB,
                                },
                            )

                    filename = None
                    content_disp = response.headers.get("content-disposition", "")
                    if content_disp:
                        filename = _parse_content_disposition(content_disp)
                    if not filename:
                        filename = unquote(Path(parsed_url.path).name) or "downloaded_file"
                    # The name is server-controlled: keep only its last path
                    # component so it cannot escape the upload directory.
                    filename = (
                        Path(filename.replace("\\", "/")).name.replace("\x00", "")
                        or "downloaded_file"
                    )

                    extension = Path(filename).suffix.lower()
                    if extension not in ACCEPTED_EXTENSIONS:
                        raise TranslateEndpointError(
                            code=TranslateEndpointError.INVALID_FORMAT,
                            details={
                                "detected_extension": extension or "none",
                                "accepted_formats": list(ACCEPTED_EXTENSIONS),
                            },
                        )

                    unique_id = str(uuid.uuid4())[:8]
                    safe_filename = f"{unique_id}_{filename}"
                    temp_path = config.UPLOAD_DIR / safe_filename
                    temp_path.parent.mkdir(parents=True, exist_ok=True)

                    downloaded_bytes = 0
                    async with aiofiles.open(temp_path, "wb") as f:
                        async for chunk in response.aiter_bytes(chunk_size=65536):
                            downloaded_bytes += len(chunk)
                            if downloaded_bytes > max_size_bytes:
                                # Leaving the `async with` closes the file; the
                                # handler below then deletes the partial file.
                                raise TranslateEndpointError(
                                    code=TranslateEndpointError.FILE_TOO_LARGE,
                                    details={
                                        "size_mb": round(
                                            downloaded_bytes / (1024 * 1024), 2
                                        ),
                                        "max_mb": MAX_FILE_SIZE_MB,
                                    },
                                )
                            await f.write(chunk)

                    async with aiofiles.open(temp_path, "rb") as f:
                        header = await f.read(4)
                    await validate_file_content(header, extension)
                    return temp_path, filename

            raise TranslateEndpointError(
                code=TranslateEndpointError.URL_DOWNLOAD_FAILED,
                message="Too many redirects.",
                details={"max_redirects": max_redirects},
            )
    except httpx.TimeoutException as e:
        _discard_partial_download()
        raise TranslateEndpointError(
            code=TranslateEndpointError.URL_UNREACHABLE,
            message="Download timed out.",
            details={"timeout_seconds": timeout},
        ) from e
    except httpx.RequestError as e:
        _discard_partial_download()
        raise TranslateEndpointError(
            code=TranslateEndpointError.URL_DOWNLOAD_FAILED,
            message=f"Download error: {str(e)}",
            details={"error": str(e)},
        ) from e
    except TranslateEndpointError:
        _discard_partial_download()
        raise
    except Exception as e:
        _discard_partial_download()
        raise TranslateEndpointError(
            code=TranslateEndpointError.URL_DOWNLOAD_FAILED,
            message=f"Unexpected download error: {str(e)}",
            details={"error": str(e), "error_type": type(e).__name__},
        ) from e
# In-process registry of translation jobs, keyed by job id.
_translation_jobs: dict[str, dict] = {}
# Age in seconds after which a completed/failed job is removed by _cleanup_old_jobs.
_JOB_TTL_SECONDS = 3600
# time.time() value recorded by the most recent cleanup pass (0.0 initially).
_last_cleanup_ts: float = 0.0
# Google Cloud API key validity cache — avoids probing the API on every request.
# Maps api_key -> (is_valid, time.time() taken when the probe started).
_gc_key_cache: dict[str, tuple[bool, float]] = {}
_GC_KEY_CACHE_TTL = 600 # 10 minutes
def _google_cloud_key_valid(api_key: str, job_id: str) -> bool:
    """Report whether a Google Cloud API key works, caching the answer.

    A cached verdict younger than ``_GC_KEY_CACHE_TTL`` seconds is returned
    as-is. Otherwise the key is probed with a one-word translation and the
    outcome (success or failure) is cached.

    Args:
        api_key: Google Cloud Translation API key to check.
        job_id: Job identifier, attached to the warning logged on failure.

    Returns:
        True if the probe (or the cached probe) succeeded, False otherwise.
    """
    checked_at = time.time()
    entry = _gc_key_cache.get(api_key)
    if entry:
        verdict, stamped_at = entry
        if checked_at - stamped_at < _GC_KEY_CACHE_TTL:
            return verdict

    # Probe the API with a tiny translation
    try:
        from services.providers.google_cloud_provider import LegacyGoogleCloudAdapter

        LegacyGoogleCloudAdapter(api_key).translate("test", "fr", "en")
    except Exception as probe_error:
        logger.warning(
            "google_cloud_key_invalid",
            extra={"job_id": job_id, "error": str(probe_error)[:200]},
        )
        usable = False
    else:
        usable = True

    _gc_key_cache[api_key] = (usable, checked_at)
    return usable
_CLEANUP_INTERVAL_SECONDS = 300 # run cleanup every 5 minutes at most


def _cleanup_old_jobs() -> None:
    """Drop finished jobs older than the TTL from the in-memory registry.

    Only jobs whose status is "completed" or "failed" and that carry a
    ``completed_at`` / ``failed_at`` timestamp older than ``_JOB_TTL_SECONDS``
    are removed. The scan is skipped when the previous one ran less than
    ``_CLEANUP_INTERVAL_SECONDS`` ago, so the full dict is not walked on every
    translation request.
    """
    global _last_cleanup_ts

    started = time.time()
    if started - _last_cleanup_ts < _CLEANUP_INTERVAL_SECONDS:
        return
    _last_cleanup_ts = started

    # Collect first, delete afterwards: the dict must not change while iterated.
    stale_ids = []
    for candidate_id, record in _translation_jobs.items():
        if record.get("status") not in ("completed", "failed"):
            continue
        finished_at = record.get("completed_at") or record.get("failed_at")
        if finished_at and _job_age_seconds(finished_at) > _JOB_TTL_SECONDS:
            stale_ids.append(candidate_id)

    for stale_id in stale_ids:
        del _translation_jobs[stale_id]
        logger.debug(f"Cleaned up expired job: {stale_id}")
def _job_age_seconds(timestamp_str: str) -> float:
"""Return how many seconds ago a ISO timestamp was."""
try:
ts = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00")).timestamp()
return time.time() - ts
except Exception:
return 0.0
@router_v1.post(
    "/translate",
    response_model=TranslateResponse,
    responses={
        202: {"description": "Translation job accepted", "model": TranslateResponse},
        400: {"description": "Invalid request", "model": ErrorResponse},
        401: {"description": "Unauthorized", "model": ErrorResponse},
        403: {"description": "Pro feature required", "model": ErrorResponse},
        413: {"description": "File too large", "model": ErrorResponse},
        429: {"description": "Quota exceeded", "model": ErrorResponse},
    },
    status_code=202,
)
async def translate_document_v1(
    request: Request,
    file: Optional[UploadFile] = File(
        None, description="Document file (.xlsx, .docx, .pptx)"
    ),
    file_url: Optional[str] = Form(None, description="URL to download file (Pro only)"),
    source_lang: str = Form(default="auto", description="Source language code"),
    target_lang: str = Form(..., description="Target language code"),
    mode: Literal["classic", "llm"] = Form(
        default="classic", description="Translation mode"
    ),
    provider: Optional[str] = Form(default=None, description="Provider override"),
    webhook_url: Optional[str] = Form(None, description="Webhook URL for notification"),
    glossary_id: Optional[str] = Form(None, description="Glossary ID (Pro only)"),
    custom_prompt: Optional[str] = Form(None, description="Custom prompt (Pro only)"),
    prompt_id: Optional[str] = Form(None, description="Prompt ID from saved prompts (Pro only)"),
    pdf_mode: Optional[Literal["layout", "text_only"]] = Form(
        default=None, description="PDF translation mode: 'layout' (preserve layout) or 'text_only' (clean text output). PDF only."
    ),
    current_user: Optional[Any] = Depends(get_authenticated_user),
):
    """
    Submit a document for translation.

    **Authentication:**
    - JWT Bearer token in Authorization header (web users)
    - X-API-Key header (automation users)

    **File Input:**
    - `file`: Upload file directly (multipart/form-data)
    - `file_url`: URL to download file from (Pro feature)

    **Parameters:**
    - `source_lang`: Source language code (default: auto-detect)
    - `target_lang`: Target language code (required)
    - `mode`: Translation mode - "classic" or "llm" (default: classic)
    - `provider`: Provider override (google, deepl, ollama, openai, openrouter)
    - `webhook_url`: URL to receive POST notification when complete
    - `glossary_id`: Glossary ID for LLM translation (Pro only)
    - `custom_prompt`: Custom system prompt (Pro only)
    - `prompt_id`: Saved prompt ID to use (Pro only). Takes priority over custom_prompt.

    **Webhook Notification:**
    If `webhook_url` is provided, a POST request will be sent when translation completes.

    **Webhook Payload (Success):**
    ```json
    {
    "event_id": "evt_abc123def456xyz",
    "translation_id": "tr_abc123def456",
    "status": "completed",
    "timestamp": "2024-01-15T10:30:00Z",
    "file_name": "report.xlsx",
    "source_lang": "en",
    "target_lang": "fr",
    "error_message": null
    }
    ```

    **Webhook Payload (Failure):**
    ```json
    {
    "event_id": "evt_abc123def456xyz",
    "translation_id": "tr_abc123def456",
    "status": "failed",
    "timestamp": "2024-01-15T10:30:00Z",
    "file_name": "report.xlsx",
    "source_lang": "en",
    "target_lang": "fr",
    "error_message": "Provider unavailable: connection timeout"
    }
    ```

    **Webhook Fields:**
    - `event_id`: Unique identifier for webhook deduplication (format: evt_xxxxxxxxxxxxxxxx)
    - `translation_id`: The translation job ID
    - `status`: "completed" or "failed"
    - `timestamp`: ISO 8601 UTC timestamp
    - `file_name`: Original file name
    - `source_lang`: Source language code
    - `target_lang`: Target language code
    - `error_message`: Error description (null if successful)

    **Webhook Behavior:**
    - Timeout: 10 seconds
    - Fire & Forget: Translation succeeds even if webhook fails
    - Retries: None (implement retry logic on your server if needed)

    **Returns:**
    - HTTP 202 with job ID and status "processing"
    """
    request_id = getattr(request.state, "request_id", str(uuid.uuid4())[:8])
    try:
        if not file and not file_url:
            raise TranslateEndpointError(
                code=TranslateEndpointError.MISSING_FILE,
                details={"hint": "Provide either 'file' or 'file_url' parameter"},
            )

        tier = "free"
        user_id = None
        if current_user:
            tier = _tier_for_quota(current_user.plan)
            user_id = current_user.id

        # Clean up form data (frontend might send "null" or "undefined" as strings)
        if glossary_id in ("null", "undefined", ""):
            glossary_id = None
        if custom_prompt in ("null", "undefined", ""):
            custom_prompt = None
        if prompt_id in ("null", "undefined", ""):
            prompt_id = None
        if file_url in ("null", "undefined", ""):
            file_url = None

        # Story 3.12 / Bugfix: If user is on the free tier, they might have stale Pro features
        # in their frontend localStorage (e.g., they downgraded, or switched accounts).
        # Instead of hard-blocking the translation, we simply strip the Pro features.
        if tier == "free":
            glossary_id = None
            custom_prompt = None
            prompt_id = None

        if file_url:
            if tier == "free":
                raise TranslateEndpointError(
                    code=TranslateEndpointError.PRO_FEATURE_REQUIRED,
                    message="URL ingestion is reserved for Pro users.",
                    details={"feature": "file_url", "tier": tier},
                )

        # Story 3.10: Validate glossary access before creating the job
        if glossary_id and user_id:
            try:
                validate_glossary_access(glossary_id, user_id)
            except GlossaryNotFoundError as e:
                raise TranslateEndpointError(
                    code="GLOSSARY_NOT_FOUND",
                    message=str(e),
                    details={"glossary_id": glossary_id}
                )

        # Story 3.12: Validate prompt access before creating the job
        if prompt_id and user_id:
            try:
                validate_prompt_access(prompt_id, user_id)
            except PromptNotFoundError as e:
                raise TranslateEndpointError(
                    code="PROMPT_NOT_FOUND",
                    message=str(e),
                    details={**e.details, "prompt_id": prompt_id} if e.details else {"prompt_id": prompt_id}
                )

        if webhook_url:
            is_valid, error_msg, error_details = webhook_validator.validate(webhook_url)
            if not is_valid:
                raise TranslateEndpointError(
                    code="INVALID_WEBHOOK_URL",
                    message=error_msg,
                    details=error_details,
                )

        # Quota is enforced for authenticated users only; anonymous callers
        # get -1 as the "remaining" value in the response meta.
        if current_user:
            quota = await tier_quota_service.check_quota(user_id, tier)
            if not quota.allowed:
                retry_after = tier_quota_service.seconds_until_reset()
                raise HTTPException(
                    status_code=429,
                    detail={
                        "error": "QUOTA_EXCEEDED",
                        "message": f"Monthly limit reached ({quota.current_usage}/{quota.limit} documents). Upgrade your plan for more.",
                        "details": {
                            "current_usage": quota.current_usage,
                            "limit": quota.limit,
                            "tier": tier,
                            "reset_at": quota.reset_at_utc.isoformat(),
                        },
                    },
                    headers={"Retry-After": str(retry_after)},
                )
            rate_limit_remaining = quota.remaining
        else:
            rate_limit_remaining = -1

        try:
            LanguageValidator.validate(target_lang)
        except ValidationError:
            raise TranslateEndpointError(
                code="INVALID_FORMAT",
                message=f"Invalid target language code: {target_lang}",
                details={"field": "target_lang"},
            )
        if source_lang and source_lang != "auto":
            try:
                LanguageValidator.validate(source_lang)
            except ValidationError:
                raise TranslateEndpointError(
                    code="INVALID_FORMAT",
                    message=f"Invalid source language code: {source_lang}",
                    details={"field": "source_lang"},
                )

        input_path = None
        original_filename = None
        file_extension = None
        file_size = 0
        file_hash = None

        if file:
            validation_result = await file_validator.validate_async(file)
            if not validation_result.is_valid:
                error_msg = "; ".join(validation_result.errors)
                # Use structured error codes from validator
                if validation_result.error_code == "file_too_large":
                    raise TranslateEndpointError(
                        code=TranslateEndpointError.FILE_TOO_LARGE,
                        message=error_msg,
                        details={
                            "errors": validation_result.errors,
                            "max_size_mb": MAX_FILE_SIZE_MB,
                        },
                    )
                elif validation_result.error_code == "invalid_file_content":
                    raise TranslateEndpointError(
                        code=TranslateEndpointError.CORRUPTED_FILE,
                        message=error_msg,
                        details={"errors": validation_result.errors},
                    )
                else:
                    raise TranslateEndpointError(
                        code=TranslateEndpointError.INVALID_FORMAT,
                        message=error_msg,
                        details={"errors": validation_result.errors},
                    )
            original_filename = file.filename
            file_extension = validation_result.data.get("extension")
            file_size = validation_result.data.get("size_bytes", 0)
            input_filename = file_handler_util.generate_unique_filename(
                file.filename, "input"
            )
            input_path = config.UPLOAD_DIR / input_filename
            await file_handler_util.save_upload_file(file, input_path)
            file_hash = file_handler_util.calculate_sha256(input_path)
            if file_hash is None:
                file_handler_util.cleanup_file(input_path)
                raise TranslateEndpointError(
                    code=TranslateEndpointError.CORRUPTED_FILE,
                    message="Failed to calculate file hash. File may be corrupted.",
                    details={"error": "sha256_calculation_failed"},
                )
        elif file_url:
            input_path, original_filename = await download_from_url(file_url)
            file_extension = Path(original_filename).suffix.lower()
            file_size = input_path.stat().st_size
            file_hash = file_handler_util.calculate_sha256(input_path)
            if file_hash is None:
                file_handler_util.cleanup_file(input_path)
                raise TranslateEndpointError(
                    code=TranslateEndpointError.CORRUPTED_FILE,
                    message="Failed to calculate downloaded file hash.",
                    details={"error": "sha256_calculation_failed"},
                )

        job_id = f"tr_{uuid.uuid4().hex[:12]}"

        # Track file metadata in Redis with TTL
        await storage_tracker.track_file(
            job_id=job_id,
            metadata={
                "original_filename": original_filename,
                "file_size": file_size,
                "file_hash": file_hash,
                "input_path": str(input_path),
                "user_id": str(user_id) if user_id else None,
                "timestamp": datetime.now(timezone.utc).isoformat(),
            },
        )
        _cleanup_old_jobs()

        # Record file size metric
        if file_extension and file_size:
            record_file_size(file_extension, file_size)

        _translation_jobs[job_id] = {
            "id": job_id,
            "status": "queued",
            "progress_percent": 0,
            "current_step": "Initializing",
            "total_items": 0,
            "processed_items": 0,
            "error_message": None,
            "file_name": original_filename,
            "source_lang": source_lang,
            "target_lang": target_lang,
            "created_at": datetime.now(timezone.utc).isoformat(),
            "user_id": user_id,
            "input_path": str(input_path),
            "file_extension": file_extension,
            "provider": provider or mode,
            "webhook_url": webhook_url,
            "custom_prompt": custom_prompt,
            "glossary_id": glossary_id,
            "prompt_id": prompt_id,  # Story 3.12: Store prompt_id
            "pdf_mode": pdf_mode,  # PDF translation mode
        }
        await set_job_status_async(job_id, _translation_jobs[job_id])

        provider_to_use = provider or ("openrouter" if mode == "llm" else "google")

        # google_cloud (the official paid API) is reserved for Pro plans and above.
        # Free/starter plans are silently redirected to the free Google Translate.
        if provider_to_use == "google_cloud":
            _paid_plans = (PlanType.PRO, PlanType.BUSINESS, PlanType.ENTERPRISE)
            if not current_user or current_user.plan not in _paid_plans:
                # `logger` is a stdlib logger: structured fields must travel in
                # `extra`, arbitrary keyword arguments raise TypeError.
                logger.info(
                    "google_cloud_downgraded_to_google",
                    extra={
                        "reason": "plan_restriction",
                        "plan": str(current_user.plan) if current_user else "anonymous",
                        "user_id": str(current_user.id) if current_user else None,
                    },
                )
                provider_to_use = "google"

        # The translation itself runs as a background task; the response below
        # is returned without waiting for it.
        asyncio.create_task(
            _run_translation_job(
                job_id=job_id,
                input_path=input_path,
                file_extension=file_extension,
                target_lang=target_lang,
                source_lang=source_lang,
                provider=provider_to_use,
                user_id=user_id,
                custom_prompt=custom_prompt,
                glossary_id=glossary_id,
                prompt_id=prompt_id,
                webhook_url=webhook_url,
                user_plan=str(current_user.plan) if current_user else "free",
                pdf_mode=pdf_mode,
            )
        )
        logger.info(
            f"[{request_id}] Created translation job {job_id} for {original_filename}"
        )
        return JSONResponse(
            status_code=202,
            content={
                "data": {
                    "id": job_id,
                    "status": "processing",
                    "file_name": original_filename,
                    "source_lang": source_lang,
                    "target_lang": target_lang,
                },
                "meta": {
                    "rate_limit_remaining": rate_limit_remaining,
                    "estimated_time_seconds": 15,
                },
            },
        )
    except TranslateEndpointError as e:
        # Map structured error codes to HTTP statuses; everything else is 400.
        status_code = 400
        if e.code == TranslateEndpointError.FILE_TOO_LARGE:
            status_code = 413
        elif e.code == TranslateEndpointError.UNAUTHORIZED:
            status_code = 401
        elif e.code == TranslateEndpointError.PRO_FEATURE_REQUIRED:
            status_code = 403
        return JSONResponse(
            status_code=status_code,
            content=e.to_dict(),
        )
    except HTTPException:
        raise
    except Exception as e:
        # exc_info keeps the traceback in the logs; the client only sees the type.
        logger.error(f"[{request_id}] Unexpected error: {e}", exc_info=True)
        return JSONResponse(
            status_code=400,
            content={
                "error": "PROCESSING_ERROR",
                "message": "Error processing the request.",
                "details": {"error_type": type(e).__name__},
            },
        )
def _estimate_pages(file_path: Path, file_extension: str) -> int:
"""
Lightweight page-count estimate for usage accounting.
- .pptx : number of slides (exact)
- .xlsx : number of visible sheets
- .docx : rough estimate (~2 500 chars per page)
Returns at least 1.
"""
try:
ext = file_extension.lower()
if ext == ".pptx":
from pptx import Presentation # already a dep
prs = Presentation(str(file_path))
return max(1, len(prs.slides))
elif ext == ".xlsx":
import openpyxl # already a dep
wb = openpyxl.load_workbook(str(file_path), read_only=True, data_only=True)
count = len(wb.sheetnames)
wb.close()
return max(1, count)
elif ext == ".docx":
import docx # already a dep
doc = docx.Document(str(file_path))
char_count = sum(len(p.text) for p in doc.paragraphs)
return max(1, round(char_count / 2500))
except Exception as exc:
logger.warning(f"_estimate_pages failed for {file_extension}: {exc}")
return 1
class _GoogleCloudWithFallback:
    """Tries Google Cloud API first, falls back to deep_translator on error.

    This avoids a hard crash when the Cloud API key is invalid, quota is
    exceeded, or the network is unreachable. The legacy GoogleTranslator
    (deep_translator, free, no key) is used as a best-effort fallback.

    Failures of the Cloud call are logged as warnings. The structured fields
    are passed through ``extra`` because the module logger is a stdlib logger,
    which rejects arbitrary keyword arguments with ``TypeError``; raising
    inside the ``except`` block would prevent the fallback from ever running.
    """

    def __init__(self, cloud_adapter, legacy_provider):
        """Store the primary (Cloud) adapter and the fallback (legacy) provider."""
        self.cloud = cloud_adapter
        self.legacy = legacy_provider
        self.provider_name = "google_cloud_with_fallback"

    def translate(self, text, target_language, source_language="auto"):
        """Translate one text via the Cloud adapter, or the legacy provider on error."""
        try:
            return self.cloud.translate(text, target_language, source_language)
        except Exception as e:
            logger.warning(
                "google_cloud_failed_fallback_to_legacy",
                extra={"error": str(e)[:200]},
            )
            return self.legacy.translate(text, target_language, source_language)

    def translate_batch(
        self, texts, target_language, source_language="auto", **kwargs
    ):
        """Translate several texts via the Cloud adapter, or the legacy provider on error.

        Extra keyword arguments are accepted for call compatibility and are
        not forwarded to either provider.
        """
        try:
            return self.cloud.translate_batch(
                texts, target_language, source_language
            )
        except Exception as e:
            logger.warning(
                "google_cloud_batch_failed_fallback_to_legacy",
                extra={"error": str(e)[:200], "text_count": len(texts)},
            )
            return self.legacy.translate_batch(
                texts, target_language, source_language
            )
async def _run_translation_job(
job_id: str,
input_path: Path,
file_extension: str,
target_lang: str,
source_lang: str,
provider: str,
user_id: Optional[str],
custom_prompt: Optional[str],
glossary_id: Optional[str],
prompt_id: Optional[str] = None, # Story 3.12: Add prompt_id parameter
webhook_url: Optional[str] = None,
user_plan: Optional[str] = None, # Plan name for watermark decision
pdf_mode: Optional[str] = None, # PDF translation mode: "layout" or "text_only"
) -> None:
"""
Run translation job in background with progress tracking.
Args:
job_id: Unique job identifier
input_path: Path to input file
file_extension: File extension (.xlsx, .docx, .pptx)
target_lang: Target language code
source_lang: Source language code
provider: Translation provider name
user_id: Optional user ID for quota tracking
custom_prompt: Optional custom prompt text (Pro only)
glossary_id: Optional glossary ID for LLM translation (Pro only)
prompt_id: Optional saved prompt ID - takes priority over custom_prompt (Pro only, Story 3.12)
webhook_url: Optional webhook URL for completion notification
"""
job = _translation_jobs.get(job_id)
if not job:
return
tracker = ProgressTracker(job_id, _translation_jobs)
try:
job["status"] = "processing"
await set_job_status_async(job_id, dict(job))
tracker.update(10, "Validating file")
async def _sync_job_to_redis():
"""Sync job status to Redis every 0.5s until completed/failed or job removed."""
while True:
await asyncio.sleep(0.5)
j = _translation_jobs.get(job_id)
if not j:
break
await set_job_status_async(job_id, dict(j))
if j.get("status") in ("completed", "failed"):
break
asyncio.create_task(_sync_job_to_redis())
output_filename = file_handler_util.generate_unique_filename(
input_path.name.replace("input_", "translated_"), "translated"
)
output_path = config.OUTPUT_DIR / output_filename
from translators import ExcelTranslator, WordTranslator, PowerPointTranslator
from services.translation_service import (
OpenRouterTranslationProvider,
OllamaTranslationProvider,
translation_service,
)
from routes.admin_routes import load_settings as _load_admin_settings
_admin_cfg = _load_admin_settings()
# Helper: prefer value from admin settings JSON, fall back to env var
def _cfg(admin_val: str | None, env_var: str, default: str = "") -> str:
return (admin_val or "").strip() or os.getenv(env_var, default)
api_key = _cfg(_admin_cfg.openrouter.api_key, "OPENROUTER_API_KEY")
model = _cfg(_admin_cfg.openrouter.model, "OPENROUTER_MODEL", "deepseek/deepseek-v3.2")
# Story 3.10: Retrieve and format glossary terms for LLM prompt
glossary_terms = None
glossary_source_lang = "fr"
if glossary_id and user_id:
try:
glossary_data = get_glossary_terms(glossary_id, user_id)
glossary_terms = glossary_data["terms"]
glossary_source_lang = glossary_data.get("source_language", "fr")
logger.info(f"Job {job_id}: Loaded {len(glossary_terms)} glossary terms (source: {glossary_source_lang})")
except GlossaryNotFoundError as e:
tracker.set_error(str(e))
logger.error(f"Job {job_id}: Glossary error - {e}")
return
# Story 3.12: Retrieve prompt content if prompt_id provided
# Priority: prompt_id > custom_prompt
effective_prompt = None
if prompt_id and user_id:
try:
effective_prompt = get_prompt_content(prompt_id, user_id)
logger.info(f"Job {job_id}: Loaded prompt content from {prompt_id}")
except PromptNotFoundError as e:
tracker.set_error(str(e))
logger.error(f"Job {job_id}: Prompt error - {e}")
return
elif custom_prompt:
# Use custom_prompt if no prompt_id
effective_prompt = custom_prompt
# Build the full prompt combining effective prompt and glossary
full_prompt = build_full_prompt(
effective_prompt, glossary_terms,
source_lang=glossary_source_lang, target_lang=target_lang,
)
translation_provider = None
_p = provider.lower()
# "google" (default classic mode): use Google Cloud API key if available.
# If the Cloud API key is invalid or the API is not enabled, fall back
# to the free legacy Google Translate (deep_translator) instead of failing.
if _p == "google":
# the user might have set GOOGLE_API_KEY instead of GOOGLE_CLOUD_API_KEY
gc_key = _cfg(
getattr(_admin_cfg.google_cloud, "api_key", None),
"GOOGLE_CLOUD_API_KEY",
) or os.getenv("GOOGLE_API_KEY", "").strip()
if gc_key and _google_cloud_key_valid(gc_key, job_id):
from services.providers.google_cloud_provider import LegacyGoogleCloudAdapter
translation_provider = LegacyGoogleCloudAdapter(gc_key)
logger.info("google_provider_using_cloud_api", extra={"job_id": job_id})
else:
from services.translation_service import GoogleTranslationProvider
translation_provider = GoogleTranslationProvider()
logger.info("google_provider_using_legacy", extra={"job_id": job_id})
elif _p in ("openrouter", "llm") and api_key:
translation_provider = OpenRouterTranslationProvider(
api_key, model, full_prompt
)
elif _p == "openrouter_premium":
premium_key = _cfg(_admin_cfg.openrouter_premium.api_key, "OPENROUTER_API_KEY")
premium_model = _cfg(_admin_cfg.openrouter_premium.model, "OPENROUTER_PREMIUM_MODEL", "anthropic/claude-3.5-haiku")
if not premium_key:
premium_key = api_key # fall back to main openrouter key
if premium_key:
translation_provider = OpenRouterTranslationProvider(
premium_key, premium_model, full_prompt
)
elif _p == "openai":
from services.translation_service import OpenAITranslationProvider
openai_key = _cfg(_admin_cfg.openai.api_key, "OPENAI_API_KEY")
openai_model = _cfg(_admin_cfg.openai.model, "OPENAI_MODEL", "gpt-4o-mini")
if openai_key:
translation_provider = OpenAITranslationProvider(
api_key=openai_key,
model=openai_model,
system_prompt=full_prompt,
)
elif _p == "deepseek":
from services.translation_service import OpenAITranslationProvider as _OAI
ds_key = _cfg(getattr(_admin_cfg, "deepseek", None) and _admin_cfg.deepseek.api_key, "DEEPSEEK_API_KEY")
ds_model = _cfg(getattr(_admin_cfg, "deepseek", None) and _admin_cfg.deepseek.model, "DEEPSEEK_MODEL", "deepseek-chat")
ds_url = "https://api.deepseek.com/v1"
if ds_key:
translation_provider = _OAI(
api_key=ds_key,
model=ds_model,
base_url=ds_url,
system_prompt=full_prompt,
)
elif _p == "minimax":
from services.translation_service import OpenAITranslationProvider as _OAI
mm_key = _cfg(getattr(_admin_cfg, "minimax", None) and _admin_cfg.minimax.api_key, "MINIMAX_API_KEY")
mm_model = _cfg(getattr(_admin_cfg, "minimax", None) and _admin_cfg.minimax.model, "MINIMAX_MODEL", "abab6.5s-chat")
mm_url = "https://api.minimax.chat/v1"
if mm_key:
translation_provider = _OAI(
api_key=mm_key,
model=mm_model,
base_url=mm_url,
system_prompt=full_prompt,
)
elif _p == "deepl":
deepl_key = _cfg(_admin_cfg.deepl.api_key, "DEEPL_API_KEY")
if deepl_key:
from services.translation_service import DeepLTranslationProvider
translation_provider = DeepLTranslationProvider(deepl_key)
elif _p == "zai":
from services.translation_service import OpenAITranslationProvider as _OAI
zai_key = _cfg(_admin_cfg.zai.api_key, "ZAI_API_KEY")
zai_model = _cfg(_admin_cfg.zai.model, "ZAI_MODEL", "grok-2-1212")
zai_url = _cfg(_admin_cfg.zai.base_url, "ZAI_BASE_URL", "https://api.x.ai/v1")
if zai_key:
translation_provider = _OAI(
api_key=zai_key,
model=zai_model,
base_url=zai_url,
system_prompt=full_prompt,
)
elif _p == "ollama":
ollama_url = _cfg(_admin_cfg.ollama.base_url, "OLLAMA_BASE_URL", "http://localhost:11434")
ollama_model = _cfg(_admin_cfg.ollama.model, "OLLAMA_MODEL", "llama3")
translation_provider = OllamaTranslationProvider(
ollama_url,
ollama_model,
ollama_model,
full_prompt,
)
elif _p == "google_cloud":
from services.providers.google_cloud_provider import GoogleCloudTranslationProvider
gc_key = _cfg(
getattr(_admin_cfg.google_cloud, "api_key", None),
"GOOGLE_CLOUD_API_KEY",
)
if gc_key:
translation_provider = GoogleCloudTranslationProvider(
api_key=gc_key,
timeout=int(os.getenv("GOOGLE_CLOUD_TIMEOUT", "30")),
max_retries=int(os.getenv("GOOGLE_CLOUD_MAX_RETRIES", "3")),
retry_delay=float(os.getenv("GOOGLE_CLOUD_RETRY_DELAY", "1.0")),
)
logger.info(
"google_cloud_provider_selected",
extra={"job_id": job_id},
)
else:
logger.warning(
"google_cloud_key_missing_fallback_to_google",
extra={"job_id": job_id},
)
# translation_provider stays None → free legacy Google Translate is used
tracker.update(20, "Preparing translation")
def progress_callback(progress_info: dict) -> None:
    """Forward a translator progress payload to the job tracker.

    Translators report their position under different key names; the first
    key found in ``progress_info`` wins, in the order listed below, and ``1``
    is used when none of them is present.

    Args:
        progress_info: Progress payload emitted by the running translator.
    """

    def _first_present(candidates: tuple, fallback: int):
        # Membership (not truthiness) decides, so an explicit 0 or None
        # stored under an earlier key still takes precedence.
        for key in candidates:
            if key in progress_info:
                return progress_info[key]
        return fallback

    current = _first_present(
        ("current", "slide", "sheet", "paragraph", "element"), 1
    )
    total = _first_present(
        ("total", "total_slides", "total_sheets", "total_paragraphs"), 1
    )

    # Step label shown to the user, chosen from the document type.
    step_labels = {
        ".pptx": "Translating slide",
        ".xlsx": "Translating sheet",
        ".docx": "Processing paragraph",
    }
    item_name = step_labels.get(file_extension, "Translating")

    # max_percent=95: the translator reaches current==total when its last
    # chunk finishes, but the file is not yet written. set_completed()
    # pushes to 100% once the file is saved.
    tracker.update_item(current, total, item_name, max_percent=95)
# Run synchronous translators in a thread pool to avoid blocking the event loop.
# Without this, status polling requests from the frontend would time out during
# translation, causing the "Connection lost" error and frozen progress bar.
# One translator instance per job so concurrent jobs never share mutable
# provider state (singleton set_provider was racy under parallel translations).
if file_extension == ".xlsx":
job_translator = ExcelTranslator(provider=translation_provider)
await asyncio.to_thread(
job_translator.translate_file,
input_path,
output_path,
target_lang,
source_lang,
progress_callback=progress_callback,
)
elif file_extension == ".docx":
job_translator = WordTranslator(provider=translation_provider)
await asyncio.to_thread(
job_translator.translate_file,
input_path,
output_path,
target_lang,
source_lang,
progress_callback=progress_callback,
)
elif file_extension == ".pptx":
job_translator = PowerPointTranslator(provider=translation_provider)
await asyncio.to_thread(
job_translator.translate_file,
input_path,
output_path,
target_lang,
source_lang,
progress_callback=progress_callback,
)
elif file_extension == ".pdf":
from translators.pdf_translator import PDFTranslator
job_translator = PDFTranslator(provider=translation_provider)
actual_output = await asyncio.to_thread(
job_translator.translate_file,
input_path,
output_path,
target_lang,
source_lang,
progress_callback=progress_callback,
pdf_mode=pdf_mode or "layout",
)
# PDF translation may output .docx (if no LibreOffice); use actual path
if actual_output and Path(actual_output).exists():
output_path = Path(actual_output)
else:
raise ValueError(f"Unsupported file type: {file_extension}")
# ── Verify translation actually produced results ──
if not output_path.exists() or output_path.stat().st_size == 0:
error_msg = "Translation failed: output file is empty or missing. The translation provider may be unavailable."
logger.error(f"Job {job_id}: {error_msg}")
tracker.set_error(error_msg)
return
stats = job_translator.get_translation_stats()
attempted = stats.get("attempted", 0)
changed = stats.get("changed", 0)
if attempted == 0 and file_extension in ('.docx', '.xlsx', '.pptx'):
error_msg = (
"Aucun texte traduisible détecté dans le document. "
"Le fichier est peut-être vide, protégé, ou ne contient que des images."
)
logger.error(f"Job {job_id}: {error_msg}")
tracker.set_error(error_msg)
return
if attempted > 0:
ratio = changed / attempted
logger.info(f"Job {job_id}: translation stats — {changed}/{attempted} texts changed ({ratio:.0%})")
if changed == 0:
error_msg = (
f"0 textes sur {attempted} ont été traduits. "
f"Le moteur ({provider}) est peut-être indisponible ou mal configuré. "
f"Vérifiez les clés API dans les paramètres admin."
)
logger.error(f"Job {job_id}: {error_msg}")
tracker.set_error(error_msg)
return
elif ratio < 0.05:
# Very suspicious — likely partial failure, warn but don't block
logger.warning(
f"Job {job_id}: suspiciously low translation rate: "
f"{changed}/{attempted} ({ratio:.1%})"
)
if user_id:
await tier_quota_service.increment_on_success(user_id)
# Persist monthly usage counters in PostgreSQL (docs + pages)
pages = await asyncio.to_thread(
_estimate_pages, input_path, file_extension
)
await asyncio.to_thread(record_usage, user_id, pages)
logger.info(f"Job {job_id}: usage recorded — {pages} page(s)")
# Apply watermark for Free-tier users
plan_name = (user_plan or "free").lower()
if plan_name in ("free", "plantype.free"):
try:
from translators.watermark import add_watermark
actual_ext = output_path.suffix.lower()
await asyncio.to_thread(add_watermark, output_path, actual_ext)
logger.info(f"Job {job_id}: watermark applied (free plan)")
except Exception as wm_err:
logger.warning(f"Job {job_id}: watermark failed: {wm_err}")
tracker.set_completed(str(output_path))
# Record translation metric
duration = time.time() - time.mktime(datetime.fromisoformat(job["created_at"].replace("Z", "+00:00")).timetuple())
record_translation(provider=provider, file_type=file_extension or "unknown", duration=duration, status="success")
logger.info(f"Job {job_id}: Completed successfully")
except Exception as e:
# Check if this is our structured TranslationProviderError
if type(e).__name__ == "TranslationProviderError":
tracker.set_error(e.message)
logger.error(f"Job {job_id}: Provider Failed - {e.code}: {e.message}")
else:
tracker.set_error(str(e))
logger.error(f"Job {job_id}: Failed - {e}")
# Record translation failure metric
record_translation(provider=provider, file_type=file_extension or "unknown", duration=0, status="error")
finally:
if webhook_url:
try:
# Generate unique event_id for webhook deduplication
event_id = f"evt_{uuid.uuid4().hex[:16]}"
async with httpx.AsyncClient(timeout=10) as client:
response = await client.post(
webhook_url,
json={
"event_id": event_id,
"translation_id": job_id,
"status": job["status"],
"timestamp": datetime.now(timezone.utc).isoformat(),
"file_name": job.get("file_name"),
"source_lang": job.get("source_lang"),
"target_lang": job.get("target_lang"),
"error_message": job.get("error_message"),
},
)
# Log successful webhook delivery
if response.is_success:
logger.info(
f"Job {job_id}: Webhook notification sent successfully to {webhook_url} "
f"(status={response.status_code}, event_id={event_id})"
)
else:
# Log non-2xx response with body for debugging
try:
response_body = await response.aread()
body_preview = response_body[:500].decode('utf-8', errors='replace')
except Exception:
body_preview = "<unable to read body>"
logger.warning(
f"Job {job_id}: Webhook returned non-success status "
f"(status={response.status_code}, url={webhook_url}, event_id={event_id}, "
f"response_body={body_preview})"
)
except httpx.TimeoutException:
logger.warning(
f"Job {job_id}: Webhook notification timed out after 10s (url={webhook_url}, event_id={event_id})"
)
except httpx.RequestError as e:
logger.warning(
f"Job {job_id}: Webhook notification failed - {type(e).__name__}: {e} "
f"(url={webhook_url}, event_id={event_id})"
)
except Exception as e:
logger.warning(
f"Job {job_id}: Unexpected webhook error - {type(e).__name__}: {e} (event_id={event_id})"
)
def _estimate_remaining_seconds(job: Dict[str, Any]) -> Optional[int]:
    """Extrapolate how many seconds a processing job still needs.

    The estimate is linear: the time elapsed since ``created_at`` divided by
    the completed fraction gives the projected total, from which the elapsed
    time is subtracted.

    Args:
        job: Job record; reads ``status``, ``progress_percent`` and
            ``created_at`` (ISO-8601 string, a trailing ``Z`` is accepted).

    Returns:
        At least ``1`` while an estimate can be computed, ``None`` when the
        job is not processing, has no progress yet, has no creation
        timestamp, or the stored values cannot be interpreted.
    """
    if job.get("status") != "processing":
        return None
    try:
        # ``or 0`` also covers a stored None, which would otherwise make the
        # comparison below raise.
        progress_percent = job.get("progress_percent") or 0
        created_at_str = job.get("created_at")
        if progress_percent <= 0 or not created_at_str:
            return None
        created_at = datetime.fromisoformat(created_at_str.replace("Z", "+00:00"))
        elapsed_seconds = (datetime.now(timezone.utc) - created_at).total_seconds()
        total_estimated = elapsed_seconds / (progress_percent / 100)
        return max(1, int(total_estimated - elapsed_seconds))
    except (AttributeError, TypeError, ValueError, OverflowError) as exc:
        # The estimate is best-effort metadata: a malformed timestamp, a
        # naive/aware datetime mismatch or a non-numeric progress value must
        # not fail the status request, but should remain diagnosable.
        logger.debug(
            "Job %s: could not estimate remaining time: %s", job.get("id"), exc
        )
        return None


@router_v1.get(
    "/translations/{job_id}",
    response_model=TranslationStatusResponse,
    responses={
        200: {"description": "Translation status", "model": TranslationStatusResponse},
        403: {"description": "Job belongs to another user", "model": ErrorResponse},
        404: {"description": "Job not found", "model": ErrorResponse},
    },
)
async def get_translation_status(
    job_id: str,
    current_user: Optional[Any] = Depends(get_authenticated_user),
):
    """
    Get translation job status with real-time progress.

    Returns current status and progress of a translation job.

    **Status Values:**
    - `queued`: Job is waiting to be processed
    - `processing`: Job is actively being translated
    - `completed`: Translation finished successfully
    - `failed`: Translation encountered an error

    **Progress Fields:**
    - `progress_percent`: 0-100 indicating completion percentage
    - `current_step`: Human-readable description of current operation
    - `error_message`: Present only when status is "failed"

    **Error Codes:**
    - `NOT_FOUND` (404): No job exists with this ID
    - `ACCESS_DENIED` (403): The job belongs to a different authenticated user

    **Example Response (Processing):**
    ```json
    {
      "data": {
        "id": "tr_abc123",
        "status": "processing",
        "progress_percent": 45,
        "current_step": "Translating slide 5/10",
        "file_name": "presentation.pptx",
        "source_lang": "en",
        "target_lang": "fr",
        "created_at": "2024-01-15T10:30:00Z"
      },
      "meta": {}
    }
    ```
    """
    # Primary job store first, then the in-process registry as a fallback.
    job = await get_job_status_async(job_id)
    if not job:
        job = _translation_jobs.get(job_id)

    if not job:
        return JSONResponse(
            status_code=404,
            content={
                "error": "NOT_FOUND",
                "message": "Translation job not found.",
                "details": {"job_id": job_id},
            },
        )

    # Same ownership rule as the download endpoint: when both the caller and
    # the job owner are known and differ, do not disclose the job's metadata
    # (file name, languages, error message).
    job_user_id = job.get("user_id")
    if current_user and job_user_id and str(job_user_id) != str(current_user.id):
        return JSONResponse(
            status_code=403,
            content={
                "error": "ACCESS_DENIED",
                "message": "You do not have access to this translation job.",
                "details": {"job_id": job_id},
            },
        )

    response_data = {
        "id": job["id"],
        "status": job["status"],
        "progress_percent": job.get("progress_percent", 0),
        "current_step": job.get("current_step", "Unknown"),
        "file_name": job.get("file_name"),
        "source_lang": job.get("source_lang"),
        "target_lang": job.get("target_lang"),
        "created_at": job.get("created_at"),
    }

    estimated_remaining = _estimate_remaining_seconds(job)

    # Terminal states carry their own extra fields.
    if job["status"] == "completed":
        response_data["completed_at"] = job.get("completed_at")
    elif job["status"] == "failed":
        response_data["failed_at"] = job.get("failed_at")
        response_data["error_message"] = job.get("error_message")

    return {
        "data": response_data,
        "meta": {"estimated_remaining_seconds": estimated_remaining},
    }
@router_v1.get("/translate/health")
async def translate_health():
    """Report that the translate endpoint is reachable.

    Returns:
        A static payload; no downstream dependency is probed.
    """
    payload = {"status": "healthy", "endpoint": "/api/v1/translate"}
    return payload
# Content-Type served for each supported output extension (lower-case, with
# the leading dot). The three Office formats share the OOXML vendor prefix.
_OOXML_MIME_PREFIX = "application/vnd.openxmlformats-officedocument"

MIME_TYPES = {
    ".xlsx": f"{_OOXML_MIME_PREFIX}.spreadsheetml.sheet",
    ".docx": f"{_OOXML_MIME_PREFIX}.wordprocessingml.document",
    ".pptx": f"{_OOXML_MIME_PREFIX}.presentationml.presentation",
    ".pdf": "application/pdf",
}
def _cleanup_files(input_path: Optional[str], output_path: Optional[str]) -> None:
"""Delete input and output files after download completes."""
try:
if output_path:
out_path = Path(output_path)
if out_path.exists():
out_path.unlink()
logger.info(f"Deleted output file: {output_path}")
except Exception as e:
logger.warning(f"Failed to delete output file {output_path}: {e}")
try:
if input_path:
in_path = Path(input_path)
if in_path.exists():
in_path.unlink()
logger.info(f"Deleted input file: {input_path}")
except Exception as e:
logger.warning(f"Failed to delete input file {input_path}: {e}")
@router_v1.get(
    "/download/{job_id}",
    responses={
        200: {
            "description": "Translated file download",
            "content": {"application/octet-stream": {}},
        },
        404: {"description": "File not found or not ready", "model": ErrorResponse},
    },
)
async def download_translated_file(
    job_id: str,
    current_user: Optional[Any] = Depends(get_authenticated_user),
):
    """
    Download a translated file.

    Streams the translated document as an attachment. Once the response has
    been sent, both the translated file and the uploaded source file are
    removed from disk.

    **Status Requirements:**
    - Job must exist and have status "completed"
    - Job must have an output_path pointing at a file that still exists

    **Error Codes:**
    - `INVALID_JOB_ID` (400): The ID does not match the `tr_...` format
    - `ACCESS_DENIED` (403): The job belongs to a different authenticated user
    - `FILE_EXPIRED` (404): Job not found, expired, or no output file
    - `NOT_READY` (404): Job exists but its status is not "completed"

    **Response Headers:**
    - `Content-Type`: MIME type matching the output file's extension
    - `Content-Disposition`: attachment whose filename carries a "_translated" suffix

    **Example:**
    ```
    GET /api/v1/download/tr_abc123def456
    → Returns file with Content-Disposition: attachment; filename="report_translated.xlsx"
    ```
    """

    def _file_expired(reason: str) -> JSONResponse:
        # All "file unavailable" outcomes share one payload; only the
        # machine-readable reason in details.status differs.
        return JSONResponse(
            status_code=404,
            content={
                "error": "FILE_EXPIRED",
                "message": "The translated file is no longer available or has expired.",
                "details": {"job_id": job_id, "status": reason},
            },
        )

    if JOB_ID_PATTERN.match(job_id) is None:
        return JSONResponse(
            status_code=400,
            content={
                "error": "INVALID_JOB_ID",
                "message": "Invalid job ID format.",
                "details": {"job_id": job_id, "expected_format": "tr_xxxxxxxxxxxx"},
            },
        )

    # Primary job store first, then the in-process registry as a fallback.
    job = await get_job_status_async(job_id) or _translation_jobs.get(job_id)
    if not job:
        return _file_expired("not_found")

    # Refuse only when both the caller and the job owner are known and differ.
    owner_id = job.get("user_id")
    if current_user and owner_id and str(owner_id) != str(current_user.id):
        return JSONResponse(
            status_code=403,
            content={
                "error": "ACCESS_DENIED",
                "message": "You do not have access to this file.",
                "details": {"job_id": job_id},
            },
        )

    job_status = job.get("status")
    if job_status != "completed":
        return JSONResponse(
            status_code=404,
            content={
                "error": "NOT_READY",
                "message": "Translation is still in progress.",
                "details": {
                    "job_id": job_id,
                    "status": job_status,
                    "progress_percent": job.get("progress_percent", 0),
                },
            },
        )

    output_path_str = job.get("output_path")
    if not output_path_str:
        return _file_expired("no_output_path")

    output_path = Path(output_path_str)
    if not output_path.exists():
        return _file_expired("file_deleted")

    # Use the actual output file extension (PDF→DOCX conversion changes extension)
    actual_extension = output_path.suffix.lower()
    original_filename = job.get("file_name", "document")
    base_name = Path(original_filename).stem if original_filename else "document"
    download_filename = f"{base_name}_translated{actual_extension}"

    logger.info(f"Download requested for job {job_id}: {download_filename}")

    # Deletion runs as a background task so it happens after the file is sent.
    cleanup = BackgroundTask(
        _cleanup_files, job.get("input_path"), output_path_str
    )
    return FileResponse(
        path=str(output_path),
        media_type=MIME_TYPES.get(actual_extension, "application/octet-stream"),
        filename=download_filename,
        background=cleanup,
    )