All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 3m16s
1728 lines
68 KiB
Python
1728 lines
68 KiB
Python
"""
|
|
API v1 Translate Endpoint (Story 2.10, 2.11, 2.12, 3.6)
|
|
POST /api/v1/translate - Submit document for translation
|
|
GET /api/v1/translations/{id} - Get translation status with real-time progress
|
|
GET /api/v1/download/{id} - Download translated file
|
|
|
|
Story 3.6: Documentation OpenAPI complète avec exemples et codes d'erreur
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import uuid
|
|
import time
|
|
import socket
|
|
import asyncio
|
|
import ipaddress
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional, Any, Literal, Dict
|
|
from urllib.parse import urlparse, unquote
|
|
|
|
import aiofiles
|
|
|
|
JOB_ID_PATTERN = re.compile(r"^tr_[a-zA-Z0-9_\-]+$")
|
|
|
|
import httpx
|
|
from fastapi import (
|
|
APIRouter,
|
|
File,
|
|
Form,
|
|
Header,
|
|
HTTPException,
|
|
Request,
|
|
UploadFile,
|
|
Depends,
|
|
)
|
|
from fastapi.responses import JSONResponse, FileResponse
|
|
from starlette.background import BackgroundTask
|
|
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
|
|
from pydantic import BaseModel, Field, field_validator
|
|
from typing_extensions import Annotated
|
|
|
|
from config import config
|
|
from translators import ExcelTranslator, WordTranslator, PowerPointTranslator
|
|
from models.subscription import PlanType
|
|
from services.auth_service import (
|
|
record_usage,
|
|
check_usage_limits,
|
|
reserve_translation_quota,
|
|
release_translation_quota,
|
|
)
|
|
from middleware.tier_quota import _seconds_until_next_month, _next_month_utc
|
|
from middleware.validation import FileValidator, ValidationError, LanguageValidator, webhook_validator
|
|
from middleware.api_key_auth import get_authenticated_user, get_user_from_api_key
|
|
from utils import file_handler
|
|
|
|
# Import models from schemas (Story 3.6 - DRY principle)
|
|
from schemas.translation import (
|
|
TranslateResponseData,
|
|
TranslateResponseMeta,
|
|
TranslateResponse,
|
|
TranslationStatusData,
|
|
TranslationStatusMeta,
|
|
TranslationStatusResponse,
|
|
)
|
|
from schemas.errors import ErrorResponse
|
|
from utils.file_handler import FileHandler
|
|
from middleware.metrics import record_translation, record_file_size
|
|
from services.progress_tracker import ProgressTracker
|
|
from services.storage_tracker import storage_tracker
|
|
from core.redis import set_job_status_async, get_job_status_async
|
|
from services.glossary_service import get_glossary_terms, validate_glossary_access, build_full_prompt
|
|
from services.prompt_service import get_prompt_content, validate_prompt_access
|
|
from utils.exceptions import GlossaryNotFoundError, PromptNotFoundError
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Router carrying the v1 translation routes; every path is served under /api/v1.
router_v1 = APIRouter(tags=["Translation v1"], prefix="/api/v1")

# Bearer-token scheme. auto_error=False means a missing Authorization header
# is not rejected by the scheme itself.
security = HTTPBearer(auto_error=False)

# File constraints are read from the shared config (avoids duplicating values).
MAX_FILE_SIZE_MB = config.MAX_FILE_SIZE_MB
ACCEPTED_EXTENSIONS = config.SUPPORTED_EXTENSIONS

# Leading bytes used to sniff file content: Office documents are ZIP archives
# ("PK\x03\x04") and PDF files start with "%PDF".
OFFICE_MAGIC_BYTES = b"PK\x03\x04"
PDF_MAGIC_BYTES = b"%PDF"
|
|
|
|
|
|
class TranslateEndpointError(Exception):
    """Error raised by the translate endpoint with a machine-readable code.

    Attributes:
        code: One of the error-code constants below (callers may also pass
            other strings; they then fall back to the "Unknown error" text).
        message: Human-readable message; defaults to ``ERROR_MESSAGES[code]``.
        details: Optional extra context; always a dict (empty when omitted).
    """

    # Structured error codes.
    INVALID_FORMAT = "INVALID_FORMAT"
    CORRUPTED_FILE = "CORRUPTED_FILE"
    FILE_TOO_LARGE = "FILE_TOO_LARGE"
    QUOTA_EXCEEDED = "QUOTA_EXCEEDED"
    URL_DOWNLOAD_FAILED = "URL_DOWNLOAD_FAILED"
    URL_UNREACHABLE = "URL_UNREACHABLE"
    UNAUTHORIZED = "UNAUTHORIZED"
    MISSING_FILE = "MISSING_FILE"
    PRO_FEATURE_REQUIRED = "PRO_FEATURE_REQUIRED"

    # Default message for each code, used when the raiser gives no message.
    ERROR_MESSAGES = {
        INVALID_FORMAT: "Unsupported file format. Accepted formats: .xlsx, .docx, .pptx",
        CORRUPTED_FILE: "The file appears corrupted or is not a valid Office document.",
        FILE_TOO_LARGE: f"File is too large (max {MAX_FILE_SIZE_MB} MB).",
        QUOTA_EXCEEDED: "Monthly translation limit reached.",
        URL_DOWNLOAD_FAILED: "Failed to download file from URL.",
        URL_UNREACHABLE: "URL unreachable.",
        UNAUTHORIZED: "Authentication required.",
        MISSING_FILE: "File or URL required.",
        PRO_FEATURE_REQUIRED: "This feature requires a Pro subscription.",
    }

    def __init__(
        self, code: str, message: Optional[str] = None, details: Optional[dict] = None
    ):
        """Store the code, resolve the message and normalise ``details``."""
        if not message:
            message = self.ERROR_MESSAGES.get(code, "Unknown error")
        self.code = code
        self.message = message
        self.details = details if details else {}
        super().__init__(message)

    def to_dict(self) -> dict:
        """Return the JSON payload; ``details`` is included only when non-empty."""
        payload = {"error": self.code, "message": self.message}
        if self.details:
            payload["details"] = self.details
        return payload
|
|
|
|
|
|
# NOTE: Response models are now imported from schemas/ module (DRY principle)
|
|
# TranslateResponseData, TranslateResponseMeta, TranslateResponse,
|
|
# TranslationStatusData, TranslationStatusMeta, TranslationStatusResponse, ErrorResponse
|
|
|
|
# Shared validator for directly uploaded files, configured with the module's
# size limit and accepted extensions.
file_validator = FileValidator(
    allowed_extensions=ACCEPTED_EXTENSIONS,
    max_size_mb=MAX_FILE_SIZE_MB,
)

# Shared helper used below to build unique filenames, save uploads, hash files
# and clean them up.
file_handler_util = FileHandler()
|
|
|
|
|
|
def _tier_for_quota(plan) -> str:
    """Return the quota tier name for a subscription plan.

    PRO, BUSINESS and ENTERPRISE map to "pro", STARTER maps to "starter",
    and every other value (including ``None``) maps to "free".
    """
    pro_equivalent_plans = (PlanType.PRO, PlanType.BUSINESS, PlanType.ENTERPRISE)
    if plan in pro_equivalent_plans:
        return "pro"
    return "starter" if plan == PlanType.STARTER else "free"
|
|
|
|
|
|
def _next_midnight_utc() -> datetime:
|
|
"""Get next midnight UTC."""
|
|
now = datetime.now(timezone.utc)
|
|
from datetime import timedelta
|
|
|
|
tomorrow = now.date() + timedelta(days=1)
|
|
return datetime(tomorrow.year, tomorrow.month, tomorrow.day, tzinfo=timezone.utc)
|
|
|
|
|
|
def _seconds_until_midnight_utc() -> int:
    """Return the whole seconds left until the next UTC midnight (never negative)."""
    started_at = datetime.now(timezone.utc)
    remaining = int((_next_midnight_utc() - started_at).total_seconds())
    return remaining if remaining > 0 else 0
|
|
|
|
|
|
async def validate_file_content(content: bytes, extension: str) -> None:
    """Check that ``content`` starts with the signature expected for ``extension``.

    Args:
        content: Leading bytes of the file (at least 4 are required).
        extension: File extension including the dot; compared case-insensitively.

    Raises:
        TranslateEndpointError: ``CORRUPTED_FILE`` when fewer than 4 bytes are
            given, when a ``.pdf`` does not start with ``%PDF``, or when any
            other file does not start with the ZIP signature.
    """
    if len(content) < 4:
        raise TranslateEndpointError(
            code=TranslateEndpointError.CORRUPTED_FILE,
            message="File is too small to be a valid document.",
            details={"reason": "File is too small"},
        )

    signature = content[:4]

    # PDF files start with %PDF.
    if extension.lower() == ".pdf":
        if signature == PDF_MAGIC_BYTES:
            return
        raise TranslateEndpointError(
            code=TranslateEndpointError.CORRUPTED_FILE,
            message="File is not a valid PDF.",
            details={"reason": "Invalid PDF header"},
        )

    # Office files (xlsx, docx, pptx) are ZIP archives.
    if signature == OFFICE_MAGIC_BYTES:
        return
    raise TranslateEndpointError(
        code=TranslateEndpointError.CORRUPTED_FILE,
        message="File is not a valid Office document.",
        details={
            "accepted_formats": list(ACCEPTED_EXTENSIONS),
            "hint": "Office files (.xlsx, .docx, .pptx) must be valid ZIP archives.",
        },
    )
|
|
|
|
|
|
def _parse_content_disposition(content_disp: str) -> Optional[str]:
|
|
"""Parse filename from Content-Disposition header (RFC 5987 compliant)."""
|
|
import re
|
|
|
|
for part in content_disp.split(";"):
|
|
part = part.strip()
|
|
if part.lower().startswith("filename*="):
|
|
match = re.match(r"filename\*=([^']+)'([^']*)'(.+)", part, re.IGNORECASE)
|
|
if match:
|
|
from urllib.parse import unquote
|
|
|
|
return unquote(match.group(3))
|
|
if part.lower().startswith("filename="):
|
|
filename = part.split("=", 1)[1].strip().strip('"').strip("'")
|
|
if filename:
|
|
return filename
|
|
return None
|
|
|
|
|
|
def _is_ssrf_risk(hostname: str) -> bool:
|
|
"""Return True if hostname resolves to a private/reserved IP (SSRF prevention).
|
|
|
|
Blocks: loopback, private, link-local, reserved, multicast ranges.
|
|
Also blocks DNS resolution failures to avoid bypass via non-resolvable names.
|
|
"""
|
|
try:
|
|
ip_str = socket.gethostbyname(hostname)
|
|
addr = ipaddress.ip_address(ip_str)
|
|
return (
|
|
addr.is_loopback
|
|
or addr.is_private
|
|
or addr.is_link_local
|
|
or addr.is_reserved
|
|
or addr.is_multicast
|
|
or addr.is_unspecified
|
|
)
|
|
except Exception:
|
|
return True
|
|
|
|
|
|
# Status codes that redirect a GET request to the URL in the Location header.
_REDIRECT_STATUS_CODES = frozenset({301, 302, 303, 307, 308})
# Redirect budget; same value previously passed to httpx as max_redirects.
_MAX_DOWNLOAD_REDIRECTS = 10


async def _require_public_http_url(url: str):
    """Return the parsed ``url`` after checking that it is safe to fetch.

    Raises:
        TranslateEndpointError: ``URL_UNREACHABLE`` when the scheme is not
            http/https, the host is missing, or ``_is_ssrf_risk`` flags it.
    """
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise TranslateEndpointError(
            code=TranslateEndpointError.URL_UNREACHABLE,
            message="Only HTTP/HTTPS URLs are accepted.",
            details={"scheme": parsed.scheme or "none"},
        )

    hostname = parsed.hostname or ""
    # _is_ssrf_risk does a blocking DNS lookup; run it off the event loop.
    if not hostname or await asyncio.to_thread(_is_ssrf_risk, hostname):
        raise TranslateEndpointError(
            code=TranslateEndpointError.URL_UNREACHABLE,
            message="The URL points to a blocked address (private or internal network).",
            details={"reason": "ssrf_blocked"},
        )
    return parsed


def _download_basename(name: Optional[str]) -> str:
    """Reduce an untrusted filename to a single safe path component.

    Directory parts (``/`` or ``\\``) and non-printable characters are
    dropped; an empty or dot-only result becomes ``"downloaded_file"``.
    """
    last_component = (name or "").replace("\\", "/").rsplit("/", 1)[-1]
    cleaned = "".join(ch for ch in last_component if ch.isprintable()).strip()
    if cleaned in ("", ".", ".."):
        return "downloaded_file"
    return cleaned


async def download_from_url(url: str, timeout: int = 30) -> tuple[Path, str]:
    """Download a remote document into the upload directory.

    The body is streamed to disk in 64 KiB chunks so the file is never held
    in memory. Redirects are followed manually (at most
    ``_MAX_DOWNLOAD_REDIRECTS``) so that every hop passes the same
    scheme/SSRF check as the initial URL.

    Args:
        url: http(s) URL of the document.
        timeout: Timeout in seconds handed to the HTTP client.

    Returns:
        ``(temp_path, filename)``: where the file was written, and the
        sanitised filename taken from Content-Disposition or, failing that,
        from the path of ``url``.

    Raises:
        TranslateEndpointError:
            ``URL_UNREACHABLE`` for a blocked or non-http URL, a non-200
            response, or a timeout.
            ``FILE_TOO_LARGE`` when the declared or actual size exceeds
            ``MAX_FILE_SIZE_MB``.
            ``INVALID_FORMAT`` when the extension is not accepted.
            ``CORRUPTED_FILE`` when the magic bytes do not match.
            ``URL_DOWNLOAD_FAILED`` for any other transport or unexpected
            error.
    """
    from urllib.parse import urljoin

    max_size_bytes = MAX_FILE_SIZE_MB * 1024 * 1024
    temp_path: Optional[Path] = None
    completed = False

    parsed_url = await _require_public_http_url(url)

    try:
        # Automatic redirects are disabled: each Location target must be
        # validated before it is requested, otherwise a public URL could
        # bounce the request to an internal host.
        async with httpx.AsyncClient(
            timeout=timeout, follow_redirects=False
        ) as client:
            current_url = url
            for _hop in range(_MAX_DOWNLOAD_REDIRECTS + 1):
                async with client.stream("GET", current_url) as response:
                    location = response.headers.get("location")
                    if response.status_code in _REDIRECT_STATUS_CODES and location:
                        current_url = urljoin(str(response.url), location)
                        await _require_public_http_url(current_url)
                        continue

                    if response.status_code != 200:
                        raise TranslateEndpointError(
                            code=TranslateEndpointError.URL_UNREACHABLE,
                            message=f"URL unreachable (HTTP {response.status_code})",
                            details={"status_code": response.status_code, "url": url[:100]},
                        )

                    # Early rejection based on the declared size. A malformed
                    # header is ignored because the streaming cap below still applies.
                    declared_size = None
                    content_length = response.headers.get("content-length")
                    if content_length:
                        try:
                            declared_size = int(content_length)
                        except ValueError:
                            declared_size = None
                    if declared_size is not None and declared_size > max_size_bytes:
                        size_mb = round(declared_size / (1024 * 1024), 2)
                        raise TranslateEndpointError(
                            code=TranslateEndpointError.FILE_TOO_LARGE,
                            message=f"File is too large ({size_mb} MB, max {MAX_FILE_SIZE_MB} MB).",
                            details={"size_mb": size_mb, "max_mb": MAX_FILE_SIZE_MB},
                        )

                    filename = None
                    content_disp = response.headers.get("content-disposition", "")
                    if content_disp:
                        filename = _parse_content_disposition(content_disp)
                    if not filename:
                        filename = unquote(Path(parsed_url.path).name)
                    # Header and URL are both attacker-controlled. Strip any
                    # directory components before the name is joined into a path.
                    filename = _download_basename(filename)

                    extension = Path(filename).suffix.lower()
                    if extension not in ACCEPTED_EXTENSIONS:
                        raise TranslateEndpointError(
                            code=TranslateEndpointError.INVALID_FORMAT,
                            details={
                                "detected_extension": extension or "none",
                                "accepted_formats": list(ACCEPTED_EXTENSIONS),
                            },
                        )

                    unique_id = str(uuid.uuid4())[:8]
                    temp_path = config.UPLOAD_DIR / f"{unique_id}_{filename}"
                    temp_path.parent.mkdir(parents=True, exist_ok=True)

                    downloaded_bytes = 0
                    async with aiofiles.open(temp_path, "wb") as f:
                        async for chunk in response.aiter_bytes(chunk_size=65536):
                            downloaded_bytes += len(chunk)
                            if downloaded_bytes > max_size_bytes:
                                # The partial file is removed in ``finally``.
                                raise TranslateEndpointError(
                                    code=TranslateEndpointError.FILE_TOO_LARGE,
                                    details={
                                        "size_mb": round(
                                            downloaded_bytes / (1024 * 1024), 2
                                        ),
                                        "max_mb": MAX_FILE_SIZE_MB,
                                    },
                                )
                            await f.write(chunk)

                    async with aiofiles.open(temp_path, "rb") as f:
                        header = await f.read(4)
                    await validate_file_content(header, extension)

                    completed = True
                    return temp_path, filename

            raise TranslateEndpointError(
                code=TranslateEndpointError.URL_DOWNLOAD_FAILED,
                message="Download error: too many redirects",
                details={"max_redirects": _MAX_DOWNLOAD_REDIRECTS},
            )

    except TranslateEndpointError:
        raise
    except httpx.TimeoutException as exc:
        raise TranslateEndpointError(
            code=TranslateEndpointError.URL_UNREACHABLE,
            message="Download timed out.",
            details={"timeout_seconds": timeout},
        ) from exc
    except httpx.RequestError as exc:
        raise TranslateEndpointError(
            code=TranslateEndpointError.URL_DOWNLOAD_FAILED,
            message=f"Download error: {str(exc)}",
            details={"error": str(exc)},
        ) from exc
    except Exception as exc:
        raise TranslateEndpointError(
            code=TranslateEndpointError.URL_DOWNLOAD_FAILED,
            message=f"Unexpected download error: {str(exc)}",
            details={"error": str(exc), "error_type": type(exc).__name__},
        ) from exc
    finally:
        # Single cleanup point; also runs when the coroutine is cancelled,
        # which the ``except Exception`` handlers above would not see.
        if not completed and temp_path is not None:
            try:
                temp_path.unlink(missing_ok=True)
            except OSError as cleanup_error:
                logger.warning(
                    "Could not remove partial download %s: %s", temp_path, cleanup_error
                )
|
|
|
|
|
|
|
|
_translation_jobs: dict[str, dict] = {}
|
|
_JOB_TTL_SECONDS = 3600
|
|
_last_cleanup_ts: float = 0.0
|
|
|
|
# Google Cloud API key validity cache — avoids probing the API on every request.
|
|
_gc_key_cache: dict[str, tuple[bool, float]] = {}
|
|
_GC_KEY_CACHE_TTL = 600 # 10 minutes
|
|
|
|
|
|
def _google_cloud_key_valid(api_key: str, job_id: str) -> bool:
    """Tell whether a Google Cloud API key works, caching the verdict for 10 minutes.

    Args:
        api_key: Key to check; also used as the cache key.
        job_id: Only used to tag the warning logged when the probe fails.

    Returns:
        The cached verdict when it is younger than ``_GC_KEY_CACHE_TTL``.
        Otherwise the key is probed with a one-word translation; the result
        is True if the probe succeeds and False if it raises anything.
    """
    import time

    checked_at = time.time()
    entry = _gc_key_cache.get(api_key)
    if entry and checked_at - entry[1] < _GC_KEY_CACHE_TTL:
        return entry[0]

    # Cache miss or stale entry: probe the API with a tiny translation.
    try:
        from services.providers.google_cloud_provider import LegacyGoogleCloudAdapter

        LegacyGoogleCloudAdapter(api_key).translate("test", "fr", "en")
    except Exception as probe_error:
        logger.warning(
            "google_cloud_key_invalid",
            extra={"job_id": job_id, "error": str(probe_error)[:200]},
        )
        key_is_valid = False
    else:
        key_is_valid = True

    _gc_key_cache[api_key] = (key_is_valid, checked_at)
    return key_is_valid
|
|
_CLEANUP_INTERVAL_SECONDS = 5 * 60  # run cleanup every 5 minutes at most


def _cleanup_old_jobs() -> None:
    """Drop finished jobs older than ``_JOB_TTL_SECONDS`` from ``_translation_jobs``.

    Only jobs whose status is "completed" or "failed" and that carry a
    ``completed_at``/``failed_at`` timestamp are considered. The pass is
    throttled to once per ``_CLEANUP_INTERVAL_SECONDS`` so the registry is
    not scanned on every translation request.
    """
    global _last_cleanup_ts

    now = time.time()
    if now - _last_cleanup_ts < _CLEANUP_INTERVAL_SECONDS:
        return
    _last_cleanup_ts = now

    # Collect ids first; the dict must not change size while it is iterated.
    stale_ids = []
    for candidate_id, record in _translation_jobs.items():
        if record.get("status") not in ("completed", "failed"):
            continue
        finished_at = record.get("completed_at") or record.get("failed_at")
        if finished_at and _job_age_seconds(finished_at) > _JOB_TTL_SECONDS:
            stale_ids.append(candidate_id)

    for stale_id in stale_ids:
        del _translation_jobs[stale_id]
        logger.debug(f"Cleaned up expired job: {stale_id}")
|
|
|
|
|
|
def _job_age_seconds(timestamp_str: str) -> float:
|
|
"""Return how many seconds ago a ISO timestamp was."""
|
|
try:
|
|
ts = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00")).timestamp()
|
|
return time.time() - ts
|
|
except Exception:
|
|
return 0.0
|
|
|
|
|
|
# Strong references to in-flight background translation tasks. The event loop
# only keeps weak references to tasks, so a task nobody else references can be
# garbage collected before it finishes.
_background_translation_tasks: set = set()


@router_v1.post(
    "/translate",
    response_model=TranslateResponse,
    responses={
        202: {"description": "Translation job accepted", "model": TranslateResponse},
        400: {"description": "Invalid request", "model": ErrorResponse},
        401: {"description": "Unauthorized", "model": ErrorResponse},
        403: {"description": "Pro feature required", "model": ErrorResponse},
        413: {"description": "File too large", "model": ErrorResponse},
        429: {"description": "Quota exceeded", "model": ErrorResponse},
    },
    status_code=202,
)
async def translate_document_v1(
    request: Request,
    file: Optional[UploadFile] = File(
        None, description="Document file (.xlsx, .docx, .pptx)"
    ),
    file_url: Optional[str] = Form(None, description="URL to download file (Pro only)"),
    source_lang: str = Form(default="auto", description="Source language code"),
    target_lang: str = Form(..., description="Target language code"),
    mode: Literal["classic", "llm"] = Form(
        default="classic", description="Translation mode"
    ),
    provider: Optional[str] = Form(default=None, description="Provider override"),
    webhook_url: Optional[str] = Form(None, description="Webhook URL for notification"),
    glossary_id: Optional[str] = Form(None, description="Glossary ID (Pro only)"),
    custom_prompt: Optional[str] = Form(None, description="Custom prompt (Pro only)"),
    prompt_id: Optional[str] = Form(None, description="Prompt ID from saved prompts (Pro only)"),
    pdf_mode: Optional[Literal["layout", "text_only"]] = Form(
        default=None, description="PDF translation mode: 'layout' (preserve layout) or 'text_only' (clean text output). PDF only."
    ),
    translate_images: bool = Form(
        default=False, description="Translate text inside images using AI vision"
    ),
    current_user: Optional[Any] = Depends(get_authenticated_user),
):
    """
    Submit a document for translation.

    **Authentication:**
    - JWT Bearer token in Authorization header (web users)
    - X-API-Key header (automation users)

    **File Input:**
    - `file`: Upload file directly (multipart/form-data)
    - `file_url`: URL to download file from (Pro feature)

    **Parameters:**
    - `source_lang`: Source language code (default: auto-detect)
    - `target_lang`: Target language code (required)
    - `mode`: Translation mode - "classic" or "llm" (default: classic)
    - `provider`: Provider override (google, deepl, ollama, openai, openrouter)
    - `webhook_url`: URL to receive POST notification when complete
    - `glossary_id`: Glossary ID for LLM translation (Pro only)
    - `custom_prompt`: Custom system prompt (Pro only)
    - `prompt_id`: Saved prompt ID to use (Pro only). Takes priority over custom_prompt.

    **Webhook Notification:**
    If `webhook_url` is provided, a POST request will be sent when translation completes.

    **Webhook Payload (Success):**
    ```json
    {
        "event_id": "evt_abc123def456xyz",
        "translation_id": "tr_abc123def456",
        "status": "completed",
        "timestamp": "2024-01-15T10:30:00Z",
        "file_name": "report.xlsx",
        "source_lang": "en",
        "target_lang": "fr",
        "error_message": null
    }
    ```

    **Webhook Payload (Failure):**
    ```json
    {
        "event_id": "evt_abc123def456xyz",
        "translation_id": "tr_abc123def456",
        "status": "failed",
        "timestamp": "2024-01-15T10:30:00Z",
        "file_name": "report.xlsx",
        "source_lang": "en",
        "target_lang": "fr",
        "error_message": "Provider unavailable: connection timeout"
    }
    ```

    **Webhook Fields:**
    - `event_id`: Unique identifier for webhook deduplication (format: evt_xxxxxxxxxxxxxxxx)
    - `translation_id`: The translation job ID
    - `status`: "completed" or "failed"
    - `timestamp`: ISO 8601 UTC timestamp
    - `file_name`: Original file name
    - `source_lang`: Source language code
    - `target_lang`: Target language code
    - `error_message`: Error description (null if successful)

    **Webhook Behavior:**
    - Timeout: 10 seconds
    - Fire & Forget: Translation succeeds even if webhook fails
    - Retries: None (implement retry logic on your server if needed)

    **Returns:**
    - HTTP 202 with job ID and status "processing"
    """
    request_id = getattr(request.state, "request_id", str(uuid.uuid4())[:8])

    # State consulted by _undo_side_effects; defined before the ``try`` so the
    # handlers can always read it.
    quota_reserved = False
    user_id = None
    # File written by this request that nothing else is responsible for yet.
    unowned_input_path: Optional[Path] = None
    # Job registered in _translation_jobs whose background task is not yet scheduled.
    unscheduled_job_id: Optional[str] = None

    async def _undo_side_effects() -> None:
        """Roll back what a failed request left behind (quota, file, job entry)."""
        if quota_reserved and user_id:
            await asyncio.to_thread(release_translation_quota, user_id)
        if unowned_input_path is not None:
            try:
                if unowned_input_path.exists():
                    file_handler_util.cleanup_file(unowned_input_path)
            except Exception as cleanup_error:
                logger.warning(
                    "[%s] Could not remove input file %s: %s",
                    request_id,
                    unowned_input_path,
                    cleanup_error,
                )
        if unscheduled_job_id is not None:
            # A job stuck in "queued" is never purged by _cleanup_old_jobs.
            _translation_jobs.pop(unscheduled_job_id, None)

    try:
        # Clean up form data (frontend might send "null" or "undefined" as strings).
        # This runs before the missing-file check so file_url="null" without a
        # file is rejected instead of creating a job with no input.
        placeholder_values = ("null", "undefined", "")
        if glossary_id in placeholder_values:
            glossary_id = None
        if custom_prompt in placeholder_values:
            custom_prompt = None
        if prompt_id in placeholder_values:
            prompt_id = None
        if file_url in placeholder_values:
            file_url = None

        if not file and not file_url:
            raise TranslateEndpointError(
                code=TranslateEndpointError.MISSING_FILE,
                details={"hint": "Provide either 'file' or 'file_url' parameter"},
            )

        tier = "free"
        if current_user:
            tier = _tier_for_quota(current_user.plan)
            user_id = current_user.id

        # Story 3.12 / Bugfix: If user is on the free tier, they might have stale Pro features
        # in their frontend localStorage (e.g., they downgraded, or switched accounts).
        # Instead of hard-blocking the translation, we simply strip the Pro features.
        if tier == "free":
            glossary_id = None
            custom_prompt = None
            prompt_id = None

        if file_url and tier == "free":
            raise TranslateEndpointError(
                code=TranslateEndpointError.PRO_FEATURE_REQUIRED,
                message="URL ingestion is reserved for Pro users.",
                details={"feature": "file_url", "tier": tier},
            )

        # Story 3.10: Validate glossary access before creating the job
        if glossary_id and user_id:
            try:
                validate_glossary_access(glossary_id, user_id)
            except GlossaryNotFoundError as e:
                raise TranslateEndpointError(
                    code="GLOSSARY_NOT_FOUND",
                    message=str(e),
                    details={"glossary_id": glossary_id},
                )

        # Story 3.12: Validate prompt access before creating the job
        if prompt_id and user_id:
            try:
                validate_prompt_access(prompt_id, user_id)
            except PromptNotFoundError as e:
                raise TranslateEndpointError(
                    code="PROMPT_NOT_FOUND",
                    message=str(e),
                    details={**e.details, "prompt_id": prompt_id} if e.details else {"prompt_id": prompt_id},
                )

        if webhook_url:
            is_valid, error_msg, error_details = webhook_validator.validate(webhook_url)
            if not is_valid:
                raise TranslateEndpointError(
                    code="INVALID_WEBHOOK_URL",
                    message=error_msg,
                    details=error_details,
                )

        if current_user:
            usage = check_usage_limits(current_user)
            if not usage["can_translate"]:
                retry_after = _seconds_until_next_month()
                raise HTTPException(
                    status_code=429,
                    detail={
                        "error": "QUOTA_EXCEEDED",
                        "message": f"Monthly limit reached ({usage['docs_used']}/{usage['docs_limit']} documents). Upgrade your plan for more.",
                        "details": {
                            "current_usage": usage['docs_used'],
                            "limit": usage['docs_limit'],
                            "tier": tier,
                            "reset_at": _next_month_utc().isoformat(),
                        },
                    },
                    headers={"Retry-After": str(retry_after)},
                )
            # Atomically reserve one document slot now so concurrent requests cannot
            # overshoot the monthly quota while background jobs are still running.
            reserved = await asyncio.to_thread(reserve_translation_quota, user_id)
            if not reserved:
                retry_after = _seconds_until_next_month()
                raise HTTPException(
                    status_code=429,
                    detail={
                        "error": "QUOTA_EXCEEDED",
                        "message": "Monthly limit reached. Upgrade your plan for more.",
                        "details": {
                            "tier": tier,
                            "reset_at": _next_month_utc().isoformat(),
                        },
                    },
                    headers={"Retry-After": str(retry_after)},
                )
            quota_reserved = True
            rate_limit_remaining = usage["docs_remaining"]
        else:
            rate_limit_remaining = -1

        try:
            LanguageValidator.validate(target_lang)
        except ValidationError:
            raise TranslateEndpointError(
                code="INVALID_FORMAT",
                message=f"Invalid target language code: {target_lang}",
                details={"field": "target_lang"},
            )

        if source_lang and source_lang != "auto":
            try:
                LanguageValidator.validate(source_lang)
            except ValidationError:
                raise TranslateEndpointError(
                    code="INVALID_FORMAT",
                    message=f"Invalid source language code: {source_lang}",
                    details={"field": "source_lang"},
                )

        input_path = None
        original_filename = None
        file_extension = None
        file_size = 0
        file_hash = None

        if file:
            validation_result = await file_validator.validate_async(file)
            if not validation_result.is_valid:
                error_msg = "; ".join(validation_result.errors)

                # Use structured error codes from validator
                if validation_result.error_code == "file_too_large":
                    raise TranslateEndpointError(
                        code=TranslateEndpointError.FILE_TOO_LARGE,
                        message=error_msg,
                        details={
                            "errors": validation_result.errors,
                            "max_size_mb": MAX_FILE_SIZE_MB,
                        },
                    )
                elif validation_result.error_code == "invalid_file_content":
                    raise TranslateEndpointError(
                        code=TranslateEndpointError.CORRUPTED_FILE,
                        message=error_msg,
                        details={"errors": validation_result.errors},
                    )
                else:
                    raise TranslateEndpointError(
                        code=TranslateEndpointError.INVALID_FORMAT,
                        message=error_msg,
                        details={"errors": validation_result.errors},
                    )

            original_filename = file.filename
            file_extension = validation_result.data.get("extension")
            file_size = validation_result.data.get("size_bytes", 0)

            input_filename = file_handler_util.generate_unique_filename(
                file.filename, "input"
            )
            input_path = config.UPLOAD_DIR / input_filename
            # From here on a failure must remove the (possibly partial) file.
            unowned_input_path = input_path
            await file_handler_util.save_upload_file(file, input_path)

            file_hash = file_handler_util.calculate_sha256(input_path)
            if file_hash is None:
                # The saved file is removed by _undo_side_effects.
                raise TranslateEndpointError(
                    code=TranslateEndpointError.CORRUPTED_FILE,
                    message="Failed to calculate file hash. File may be corrupted.",
                    details={"error": "sha256_calculation_failed"},
                )

        elif file_url:
            input_path, original_filename = await download_from_url(file_url)
            unowned_input_path = input_path
            file_extension = Path(original_filename).suffix.lower()
            file_size = input_path.stat().st_size
            file_hash = file_handler_util.calculate_sha256(input_path)
            if file_hash is None:
                # The downloaded file is removed by _undo_side_effects.
                raise TranslateEndpointError(
                    code=TranslateEndpointError.CORRUPTED_FILE,
                    message="Failed to calculate downloaded file hash.",
                    details={"error": "sha256_calculation_failed"},
                )

        job_id = f"tr_{uuid.uuid4().hex[:12]}"

        # Track file metadata in Redis with TTL
        await storage_tracker.track_file(
            job_id=job_id,
            metadata={
                "original_filename": original_filename,
                "file_size": file_size,
                "file_hash": file_hash,
                "input_path": str(input_path),
                "user_id": str(user_id) if user_id else None,
                "timestamp": datetime.now(timezone.utc).isoformat(),
            },
        )
        # NOTE(review): presumably the storage tracker's TTL handling is now
        # responsible for the file, so it is no longer deleted on failure —
        # confirm against services.storage_tracker.
        unowned_input_path = None

        _cleanup_old_jobs()

        # Record file size metric
        if file_extension and file_size:
            record_file_size(file_extension, file_size)

        _translation_jobs[job_id] = {
            "id": job_id,
            "status": "queued",
            "progress_percent": 0,
            "current_step": "Initializing",
            "total_items": 0,
            "processed_items": 0,
            "error_message": None,
            "file_name": original_filename,
            "source_lang": source_lang,
            "target_lang": target_lang,
            "created_at": datetime.now(timezone.utc).isoformat(),
            "user_id": user_id,
            "input_path": str(input_path),
            "file_extension": file_extension,
            "provider": provider or mode,
            "webhook_url": webhook_url,
            "custom_prompt": custom_prompt,
            "glossary_id": glossary_id,
            "prompt_id": prompt_id,  # Story 3.12: Store prompt_id
            "pdf_mode": pdf_mode,  # PDF translation mode
            "translate_images": translate_images,
        }
        unscheduled_job_id = job_id
        await set_job_status_async(job_id, _translation_jobs[job_id])

        provider_to_use = provider or ("openrouter" if mode == "llm" else "google")

        # google_cloud (the official paid API) is reserved for Pro plans and above.
        # Free/starter plans are silently redirected to the free Google Translate.
        if provider_to_use == "google_cloud":
            _paid_plans = (PlanType.PRO, PlanType.BUSINESS, PlanType.ENTERPRISE)
            if not current_user or current_user.plan not in _paid_plans:
                # Structured fields go through ``extra``: the stdlib logger
                # rejects arbitrary keyword arguments with a TypeError.
                logger.info(
                    "google_cloud_downgraded_to_google",
                    extra={
                        "reason": "plan_restriction",
                        "plan": str(current_user.plan) if current_user else "anonymous",
                        "user_id": str(current_user.id) if current_user else None,
                    },
                )
                provider_to_use = "google"

        background_task = asyncio.create_task(
            _run_translation_job(
                job_id=job_id,
                input_path=input_path,
                file_extension=file_extension,
                target_lang=target_lang,
                source_lang=source_lang,
                provider=provider_to_use,
                user_id=user_id,
                custom_prompt=custom_prompt,
                glossary_id=glossary_id,
                prompt_id=prompt_id,
                webhook_url=webhook_url,
                user_plan=str(current_user.plan) if current_user else "free",
                pdf_mode=pdf_mode,
                translate_images=translate_images,
            )
        )
        # Hold a reference until the task finishes, then let it go.
        _background_translation_tasks.add(background_task)
        background_task.add_done_callback(_background_translation_tasks.discard)
        # The background job owns the registry entry from now on.
        unscheduled_job_id = None

        logger.info(
            f"[{request_id}] Created translation job {job_id} for {original_filename}"
        )

        return JSONResponse(
            status_code=202,
            content={
                "data": {
                    "id": job_id,
                    "status": "processing",
                    "file_name": original_filename,
                    "source_lang": source_lang,
                    "target_lang": target_lang,
                },
                "meta": {
                    "rate_limit_remaining": rate_limit_remaining,
                    "estimated_time_seconds": 15,
                },
            },
        )

    except TranslateEndpointError as e:
        await _undo_side_effects()
        status_code = 400
        if e.code == TranslateEndpointError.FILE_TOO_LARGE:
            status_code = 413
        elif e.code == TranslateEndpointError.UNAUTHORIZED:
            status_code = 401
        elif e.code == TranslateEndpointError.PRO_FEATURE_REQUIRED:
            status_code = 403

        return JSONResponse(
            status_code=status_code,
            content=e.to_dict(),
        )
    except HTTPException:
        await _undo_side_effects()
        raise
    except Exception as e:
        await _undo_side_effects()
        # exc_info keeps the traceback; the client only sees the error type.
        logger.error(f"[{request_id}] Unexpected error: {e}", exc_info=True)
        return JSONResponse(
            status_code=400,
            content={
                "error": "PROCESSING_ERROR",
                "message": "Error processing the request.",
                "details": {"error_type": type(e).__name__},
            },
        )
|
|
|
|
|
|
def _estimate_pages(file_path: Path, file_extension: str) -> int:
|
|
"""
|
|
Lightweight page-count estimate for usage accounting.
|
|
- .pptx : number of slides (exact)
|
|
- .xlsx : number of visible sheets
|
|
- .docx : rough estimate (~2 500 chars per page)
|
|
Returns at least 1.
|
|
"""
|
|
try:
|
|
ext = file_extension.lower()
|
|
if ext == ".pptx":
|
|
from pptx import Presentation # already a dep
|
|
prs = Presentation(str(file_path))
|
|
return max(1, len(prs.slides))
|
|
elif ext == ".xlsx":
|
|
import openpyxl # already a dep
|
|
wb = openpyxl.load_workbook(str(file_path), read_only=True, data_only=True)
|
|
count = len(wb.sheetnames)
|
|
wb.close()
|
|
return max(1, count)
|
|
elif ext == ".docx":
|
|
import docx # already a dep
|
|
doc = docx.Document(str(file_path))
|
|
char_count = sum(len(p.text) for p in doc.paragraphs)
|
|
return max(1, round(char_count / 2500))
|
|
except Exception as exc:
|
|
logger.warning(f"_estimate_pages failed for {file_extension}: {exc}")
|
|
return 1
|
|
|
|
|
|
class _GoogleCloudWithFallback:
|
|
"""Tries Google Cloud API first, falls back to deep_translator on error.
|
|
|
|
This avoids a hard crash when the Cloud API key is invalid, quota is
|
|
exceeded, or the network is unreachable. The legacy GoogleTranslator
|
|
(deep_translator, free, no key) is used as a best-effort fallback.
|
|
"""
|
|
|
|
def __init__(self, cloud_adapter, legacy_provider):
|
|
self.cloud = cloud_adapter
|
|
self.legacy = legacy_provider
|
|
self.provider_name = "google_cloud_with_fallback"
|
|
|
|
def translate(self, text, target_language, source_language="auto"):
|
|
try:
|
|
return self.cloud.translate(text, target_language, source_language)
|
|
except Exception as e:
|
|
logger.warning(
|
|
"google_cloud_failed_fallback_to_legacy",
|
|
error=str(e)[:200],
|
|
)
|
|
return self.legacy.translate(text, target_language, source_language)
|
|
|
|
def translate_batch(
|
|
self, texts, target_language, source_language="auto", **kwargs
|
|
):
|
|
try:
|
|
return self.cloud.translate_batch(
|
|
texts, target_language, source_language
|
|
)
|
|
except Exception as e:
|
|
logger.warning(
|
|
"google_cloud_batch_failed_fallback_to_legacy",
|
|
error=str(e)[:200],
|
|
text_count=len(texts),
|
|
)
|
|
return self.legacy.translate_batch(
|
|
texts, target_language, source_language
|
|
)
|
|
|
|
|
|
# Strong references to in-flight Redis sync tasks. The event loop only keeps
# weak references to tasks, so a task nobody references can be garbage
# collected before it finishes.
_job_sync_tasks: set = set()


async def _run_translation_job(
    job_id: str,
    input_path: Path,
    file_extension: str,
    target_lang: str,
    source_lang: str,
    provider: str,
    user_id: Optional[str],
    custom_prompt: Optional[str],
    glossary_id: Optional[str],
    prompt_id: Optional[str] = None,  # Story 3.12: Add prompt_id parameter
    webhook_url: Optional[str] = None,
    user_plan: Optional[str] = None,  # Plan name for watermark decision
    pdf_mode: Optional[str] = None,  # PDF translation mode: "layout" or "text_only"
    translate_images: bool = False,
) -> None:
    """
    Run translation job in background with progress tracking.

    The job record is looked up in ``_translation_jobs``; if it is missing the
    function returns immediately. Progress is reported through
    ``ProgressTracker`` and mirrored to Redis by a background task. When a
    ``webhook_url`` is given, a notification is POSTed once the job ends,
    whatever the outcome.

    Args:
        job_id: Unique job identifier
        input_path: Path to input file
        file_extension: File extension (.xlsx, .docx, .pptx, .pdf)
        target_lang: Target language code
        source_lang: Source language code
        provider: Translation provider name
        user_id: Optional user ID for quota tracking
        custom_prompt: Optional custom prompt text (Pro only)
        glossary_id: Optional glossary ID for LLM translation (Pro only)
        prompt_id: Optional saved prompt ID - takes priority over custom_prompt (Pro only, Story 3.12)
        webhook_url: Optional webhook URL for completion notification
        user_plan: Plan name; a missing or "free" plan gets a watermark applied
        pdf_mode: Mode forwarded to the PDF translator ("layout" when omitted)
        translate_images: Forwarded to the translator's ``translate_file``

    Raises:
        asyncio.CancelledError: Re-raised when the task is cancelled; the
            reserved quota is deliberately kept in that case.
    """
    job = _translation_jobs.get(job_id)
    if not job:
        return

    tracker = ProgressTracker(job_id, _translation_jobs)
    usage_recorded = False

    try:
        job["status"] = "processing"
        await set_job_status_async(job_id, dict(job))
        tracker.update(10, "Validating file")

        async def _sync_job_to_redis():
            """Sync job status to Redis every 0.5s until completed/failed or job removed."""
            while True:
                await asyncio.sleep(0.5)
                j = _translation_jobs.get(job_id)
                if not j:
                    break
                await set_job_status_async(job_id, dict(j))
                if j.get("status") in ("completed", "failed"):
                    break

        # Keep a reference until the task finishes so it cannot be collected
        # mid-flight (it may outlive this coroutine by one polling interval).
        sync_task = asyncio.create_task(_sync_job_to_redis())
        _job_sync_tasks.add(sync_task)
        sync_task.add_done_callback(_job_sync_tasks.discard)

        output_filename = file_handler_util.generate_unique_filename(
            input_path.name.replace("input_", "translated_"), "translated"
        )
        output_path = config.OUTPUT_DIR / output_filename

        from services.translation_service import (
            OpenRouterTranslationProvider,
            OllamaTranslationProvider,
            translation_service,
        )
        from routes.admin_routes import load_settings as _load_admin_settings

        _admin_cfg = _load_admin_settings()

        # Helper: prefer value from admin settings JSON, fall back to env var
        def _cfg(admin_val: str | None, env_var: str, default: str = "") -> str:
            return (admin_val or "").strip() or os.getenv(env_var, default)

        api_key = _cfg(_admin_cfg.openrouter.api_key, "OPENROUTER_API_KEY")
        model = _cfg(_admin_cfg.openrouter.model, "OPENROUTER_MODEL", "google/gemini-3.5-flash")
        # These two configured model ids are remapped to the default model.
        if model in ("deepseek/deepseek-v3.2", "google/gemini-2.0-flash-001"):
            model = "google/gemini-3.5-flash"

        # Story 3.10: Retrieve and format glossary terms for LLM prompt
        glossary_terms = None
        glossary_source_lang = "fr"
        if glossary_id and user_id:
            try:
                glossary_data = get_glossary_terms(glossary_id, user_id)
                glossary_terms = glossary_data["terms"]
                glossary_source_lang = glossary_data.get("source_language", "fr")
                logger.info(f"Job {job_id}: Loaded {len(glossary_terms)} glossary terms (source: {glossary_source_lang})")
            except GlossaryNotFoundError as e:
                # NOTE(review): this and the other early-return failures below
                # do not release the reserved quota or record a failure metric
                # (only the `except Exception` path does) - confirm intended.
                tracker.set_error(str(e))
                logger.error(f"Job {job_id}: Glossary error - {e}")
                return

        # Story 3.12: Retrieve prompt content if prompt_id provided
        # Priority: prompt_id > custom_prompt
        effective_prompt = None
        if prompt_id and user_id:
            try:
                effective_prompt = get_prompt_content(prompt_id, user_id)
                logger.info(f"Job {job_id}: Loaded prompt content from {prompt_id}")
            except PromptNotFoundError as e:
                tracker.set_error(str(e))
                logger.error(f"Job {job_id}: Prompt error - {e}")
                return
        elif custom_prompt:
            # Use custom_prompt if no prompt_id
            effective_prompt = custom_prompt

        # Build the full prompt combining effective prompt and glossary
        full_prompt = build_full_prompt(
            effective_prompt, glossary_terms,
            source_lang=glossary_source_lang, target_lang=target_lang,
        )

        from services.providers.google_provider import GoogleTranslationProvider
        from services.providers.google_cloud_provider import GoogleCloudTranslationProvider
        from services.providers.deepl_provider import DeepLTranslationProvider
        from services.providers.openai_provider import OpenAITranslationProvider
        from services.providers.deepseek_provider import DeepSeekTranslationProvider
        from services.providers.minimax_provider import MinimaxTranslationProvider

        # Stays None when the requested provider has no usable key; the
        # translators are then constructed with provider=None.
        translation_provider = None
        _p = provider.lower()

        # "google" (default classic mode): use Google Cloud API key if available.
        # If the Cloud API key is invalid or the API is not enabled, fall back
        # to the free legacy Google Translate (deep_translator) instead of failing.
        if _p == "google":
            gc_key = _cfg(
                getattr(_admin_cfg.google_cloud, "api_key", None),
                "GOOGLE_CLOUD_API_KEY",
            ) or os.getenv("GOOGLE_API_KEY", "").strip()

            if gc_key and _google_cloud_key_valid(gc_key, job_id):
                translation_provider = GoogleCloudTranslationProvider(
                    api_key=gc_key,
                    timeout=int(os.getenv("GOOGLE_CLOUD_TIMEOUT", "30")),
                    max_retries=int(os.getenv("GOOGLE_CLOUD_MAX_RETRIES", "3")),
                    retry_delay=float(os.getenv("GOOGLE_CLOUD_RETRY_DELAY", "1.0")),
                )
                logger.info("google_provider_using_cloud_api", extra={"job_id": job_id})
            else:
                translation_provider = GoogleTranslationProvider(
                    use_cache=True,
                    timeout=int(os.getenv("GOOGLE_TRANSLATE_TIMEOUT", "30")),
                    max_retries=int(os.getenv("GOOGLE_TRANSLATE_MAX_RETRIES", "3")),
                    retry_delay=float(os.getenv("GOOGLE_TRANSLATE_RETRY_DELAY", "1.0")),
                )
                logger.info("google_provider_using_legacy", extra={"job_id": job_id})

        elif _p in ("openrouter", "llm") and api_key:
            translation_provider = OpenAITranslationProvider(
                api_key=api_key,
                model=model,
                base_url="https://openrouter.ai/api/v1",
                timeout=int(os.getenv("OPENROUTER_TIMEOUT", "60")),
            )
        elif _p == "openrouter_premium":
            premium_key = _cfg(_admin_cfg.openrouter_premium.api_key, "OPENROUTER_API_KEY")
            premium_model = _cfg(_admin_cfg.openrouter_premium.model, "OPENROUTER_PREMIUM_MODEL", "anthropic/claude-sonnet-4.6")
            if not premium_key:
                premium_key = api_key  # fall back to main openrouter key
            if premium_key:
                translation_provider = OpenAITranslationProvider(
                    api_key=premium_key,
                    model=premium_model,
                    base_url="https://openrouter.ai/api/v1",
                    timeout=int(os.getenv("OPENROUTER_TIMEOUT", "60")),
                )
        elif _p == "openai":
            openai_key = _cfg(_admin_cfg.openai.api_key, "OPENAI_API_KEY")
            openai_model = _cfg(_admin_cfg.openai.model, "OPENAI_MODEL", "gpt-4o-mini")
            if openai_key:
                translation_provider = OpenAITranslationProvider(
                    api_key=openai_key,
                    model=openai_model,
                    timeout=int(os.getenv("OPENAI_TIMEOUT", "60")),
                )
        elif _p == "deepseek":
            ds_key = _cfg(getattr(_admin_cfg, "deepseek", None) and _admin_cfg.deepseek.api_key, "DEEPSEEK_API_KEY")
            ds_model = _cfg(getattr(_admin_cfg, "deepseek", None) and _admin_cfg.deepseek.model, "DEEPSEEK_MODEL", "deepseek/deepseek-chat")
            if not ds_key and api_key:
                # No dedicated DeepSeek key: route the model through OpenRouter.
                translation_provider = OpenAITranslationProvider(
                    api_key=api_key,
                    model=ds_model,
                    base_url="https://openrouter.ai/api/v1",
                    timeout=int(os.getenv("OPENROUTER_TIMEOUT", "60")),
                )
            elif ds_key:
                translation_provider = DeepSeekTranslationProvider(
                    api_key=ds_key,
                    model=ds_model,
                    timeout=int(os.getenv("DEEPSEEK_TIMEOUT", "60")),
                )
        elif _p == "minimax":
            mm_key = _cfg(getattr(_admin_cfg, "minimax", None) and _admin_cfg.minimax.api_key, "MINIMAX_API_KEY")
            mm_model = _cfg(getattr(_admin_cfg, "minimax", None) and _admin_cfg.minimax.model, "MINIMAX_MODEL", "MiniMax-M1")
            if mm_key:
                translation_provider = MinimaxTranslationProvider(
                    api_key=mm_key,
                    model=mm_model,
                    timeout=int(os.getenv("MINIMAX_TIMEOUT", "60")),
                )
        elif _p == "deepl":
            deepl_key = _cfg(_admin_cfg.deepl.api_key, "DEEPL_API_KEY")
            if deepl_key:
                translation_provider = DeepLTranslationProvider(
                    api_key=deepl_key,
                    timeout=int(os.getenv("DEEPL_TIMEOUT", "30")),
                )
        elif _p == "zai":
            zai_key = _cfg(_admin_cfg.zai.api_key, "ZAI_API_KEY")
            zai_model = _cfg(_admin_cfg.zai.model, "ZAI_MODEL", "grok-2-1212")
            zai_url = _cfg(_admin_cfg.zai.base_url, "ZAI_BASE_URL", "https://api.x.ai/v1")
            if zai_key:
                translation_provider = OpenAITranslationProvider(
                    api_key=zai_key,
                    model=zai_model,
                    base_url=zai_url,
                    timeout=int(os.getenv("ZAI_TIMEOUT", "60")),
                )
        elif _p == "google_cloud":
            gc_key = _cfg(
                getattr(_admin_cfg.google_cloud, "api_key", None),
                "GOOGLE_CLOUD_API_KEY",
            )
            if gc_key:
                translation_provider = GoogleCloudTranslationProvider(
                    api_key=gc_key,
                    timeout=int(os.getenv("GOOGLE_CLOUD_TIMEOUT", "30")),
                    max_retries=int(os.getenv("GOOGLE_CLOUD_MAX_RETRIES", "3")),
                    retry_delay=float(os.getenv("GOOGLE_CLOUD_RETRY_DELAY", "1.0")),
                )
                logger.info(
                    "google_cloud_provider_selected",
                    extra={"job_id": job_id},
                )
            else:
                logger.warning(
                    "google_cloud_key_missing_fallback_to_google",
                    extra={"job_id": job_id},
                )

        tracker.update(20, "Preparing translation")

        def progress_callback(progress_info: dict) -> None:
            """Callback for translator progress updates with standardized key handling."""
            current = progress_info.get(
                "current",
                progress_info.get(
                    "slide",
                    progress_info.get(
                        "sheet",
                        progress_info.get("paragraph", progress_info.get("element", 1)),
                    ),
                ),
            )
            total = progress_info.get(
                "total",
                progress_info.get(
                    "total_slides",
                    progress_info.get(
                        "total_sheets", progress_info.get("total_paragraphs", 1)
                    ),
                ),
            )

            item_name = "Translating"
            if file_extension == ".pptx":
                item_name = "Translating slide"
            elif file_extension == ".xlsx":
                item_name = "Translating sheet"
            elif file_extension == ".docx":
                item_name = "Processing paragraph"

            # max_percent=95: the translator reaches current==total when its last
            # chunk finishes, but the file is not yet written. set_completed()
            # pushes to 100% once the file is saved.
            tracker.update_item(current, total, item_name, max_percent=95)

        # Run synchronous translators in a thread pool to avoid blocking the event loop.
        # Without this, status polling requests from the frontend would time out during
        # translation, causing the "Connection lost" error and frozen progress bar.
        # One translator instance per job so concurrent jobs never share mutable
        # provider state (singleton set_provider was racy under parallel translations).
        office_translators = {
            ".xlsx": ExcelTranslator,
            ".docx": WordTranslator,
            ".pptx": PowerPointTranslator,
        }
        if file_extension in office_translators:
            if file_extension == ".xlsx":
                logger.debug(f"DEBUG: ExcelTranslator class is {ExcelTranslator} and translate_file is {ExcelTranslator.translate_file}")
            job_translator = office_translators[file_extension](provider=translation_provider)
            if hasattr(job_translator, "set_custom_prompt"):
                job_translator.set_custom_prompt(full_prompt)
            await asyncio.to_thread(
                job_translator.translate_file,
                input_path,
                output_path,
                target_lang,
                source_lang,
                progress_callback=progress_callback,
                translate_images=translate_images,
            )
        elif file_extension == ".pdf":
            from translators.pdf_translator import PDFTranslator

            job_translator = PDFTranslator(provider=translation_provider)
            if hasattr(job_translator, "set_custom_prompt"):
                job_translator.set_custom_prompt(full_prompt)
            actual_output = await asyncio.to_thread(
                job_translator.translate_file,
                input_path,
                output_path,
                target_lang,
                source_lang,
                progress_callback=progress_callback,
                pdf_mode=pdf_mode or "layout",
                translate_images=translate_images,
            )
            # PDF translation may output .docx (if no LibreOffice); use actual path
            if actual_output and Path(actual_output).exists():
                output_path = Path(actual_output)
        else:
            raise ValueError(f"Unsupported file type: {file_extension}")

        # ── Verify translation actually produced results ──
        if not output_path.exists() or output_path.stat().st_size == 0:
            error_msg = "Translation failed: output file is empty or missing. The translation provider may be unavailable."
            logger.error(f"Job {job_id}: {error_msg}")
            tracker.set_error(error_msg)
            return

        stats = job_translator.get_translation_stats()
        attempted = stats.get("attempted", 0)
        changed = stats.get("changed", 0)

        if attempted == 0 and file_extension in ('.docx', '.xlsx', '.pptx'):
            error_msg = (
                "Aucun texte traduisible détecté dans le document. "
                "Le fichier est peut-être vide, protégé, ou ne contient que des images."
            )
            logger.error(f"Job {job_id}: {error_msg}")
            tracker.set_error(error_msg)
            return

        if attempted > 0:
            ratio = changed / attempted
            logger.info(f"Job {job_id}: translation stats — {changed}/{attempted} texts changed ({ratio:.0%})")
            if changed == 0:
                error_msg = (
                    f"0 textes sur {attempted} ont été traduits. "
                    f"Le moteur ({provider}) est peut-être indisponible ou mal configuré. "
                    f"Vérifiez les clés API dans les paramètres admin."
                )
                logger.error(f"Job {job_id}: {error_msg}")
                tracker.set_error(error_msg)
                return
            elif ratio < 0.05:
                # Very suspicious — likely partial failure, warn but don't block
                logger.warning(
                    f"Job {job_id}: suspiciously low translation rate: "
                    f"{changed}/{attempted} ({ratio:.1%})"
                )

        if user_id:
            # Determine cost factor based on selected provider and model
            cost_factor = 1
            provider_lower = (provider or "").lower()

            prov_model = ""
            if translation_provider:
                prov_model = getattr(translation_provider, "model", "") or ""

            prov_model_lower = prov_model.lower()
            if any(k in prov_model_lower for k in ["claude", "fable", "gpt-4"]) or provider_lower == "openrouter_premium":
                if "haiku" in prov_model_lower:
                    cost_factor = 1
                else:
                    cost_factor = 5

            # Persist monthly usage counters in PostgreSQL (docs + pages)
            pages = await asyncio.to_thread(
                _estimate_pages, input_path, file_extension
            )
            await asyncio.to_thread(
                record_usage, user_id, pages, cost_factor, reserved_docs=1
            )
            usage_recorded = True
            logger.info(f"Job {job_id}: usage recorded — {pages} page(s) with cost factor {cost_factor}")

        # Apply watermark for Free-tier users
        plan_name = (user_plan or "free").lower()
        if plan_name in ("free", "plantype.free"):
            try:
                from translators.watermark import add_watermark

                actual_ext = output_path.suffix.lower()
                await asyncio.to_thread(add_watermark, output_path, actual_ext)
                logger.info(f"Job {job_id}: watermark applied (free plan)")
            except Exception as wm_err:
                # Best effort: a watermark failure must not fail the job.
                logger.warning(f"Job {job_id}: watermark failed: {wm_err}")

        tracker.set_completed(str(output_path))
        # Record translation metric. Compare timezone-aware datetimes:
        # time.mktime() would interpret the UTC timestamp as local time and
        # skew the duration by the server's UTC offset. A malformed timestamp
        # must not turn an already-completed job into a failed one.
        try:
            started_at = datetime.fromisoformat(job["created_at"].replace("Z", "+00:00"))
            if started_at.tzinfo is None:
                # NOTE(review): naive timestamps are assumed to be UTC - confirm.
                started_at = started_at.replace(tzinfo=timezone.utc)
            duration = max(0.0, (datetime.now(timezone.utc) - started_at).total_seconds())
        except (KeyError, AttributeError, TypeError, ValueError) as ts_err:
            logger.warning(f"Job {job_id}: could not compute duration: {ts_err}")
            duration = 0
        record_translation(provider=provider, file_type=file_extension or "unknown", duration=duration, status="success")
        logger.info(f"Job {job_id}: Completed successfully")

    except asyncio.CancelledError:
        # Background task cancelled (e.g. TestClient teardown or server shutdown).
        # The document slot was already reserved at request time; keep it consumed
        # so quota enforcement remains deterministic.
        logger.warning(f"Job {job_id}: translation task cancelled, keeping reserved quota")
        raise
    except Exception as e:
        # Check if this is our structured TranslationProviderError
        if type(e).__name__ == "TranslationProviderError":
            tracker.set_error(e.message)
            logger.error(f"Job {job_id}: Provider Failed - {e.code}: {e.message}")
        else:
            tracker.set_error(str(e))
            logger.error(f"Job {job_id}: Failed - {e}")

        # Record translation failure metric
        record_translation(provider=provider, file_type=file_extension or "unknown", duration=0, status="error")

        # Give the reserved document slot back unless usage was already billed.
        if user_id and not usage_recorded:
            try:
                await asyncio.to_thread(release_translation_quota, user_id)
                logger.info(f"Job {job_id}: released reserved quota after failure")
            except Exception as release_err:
                logger.exception(f"Job {job_id}: failed to release reserved quota: {release_err}")

    finally:
        # Runs on success, failure, early return and cancellation alike.
        if webhook_url:
            try:
                # Generate unique event_id for webhook deduplication
                event_id = f"evt_{uuid.uuid4().hex[:16]}"

                async with httpx.AsyncClient(timeout=10) as client:
                    response = await client.post(
                        webhook_url,
                        json={
                            "event_id": event_id,
                            "translation_id": job_id,
                            "status": job["status"],
                            "timestamp": datetime.now(timezone.utc).isoformat(),
                            "file_name": job.get("file_name"),
                            "source_lang": job.get("source_lang"),
                            "target_lang": job.get("target_lang"),
                            "error_message": job.get("error_message"),
                        },
                    )

                    # Log successful webhook delivery
                    if response.is_success:
                        logger.info(
                            f"Job {job_id}: Webhook notification sent successfully to {webhook_url} "
                            f"(status={response.status_code}, event_id={event_id})"
                        )
                    else:
                        # Log non-2xx response with body for debugging
                        try:
                            response_body = await response.aread()
                            body_preview = response_body[:500].decode('utf-8', errors='replace')
                        except Exception:
                            body_preview = "<unable to read body>"
                        logger.warning(
                            f"Job {job_id}: Webhook returned non-success status "
                            f"(status={response.status_code}, url={webhook_url}, event_id={event_id}, "
                            f"response_body={body_preview})"
                        )

            except httpx.TimeoutException:
                logger.warning(
                    f"Job {job_id}: Webhook notification timed out after 10s (url={webhook_url}, event_id={event_id})"
                )
            except httpx.RequestError as e:
                logger.warning(
                    f"Job {job_id}: Webhook notification failed - {type(e).__name__}: {e} "
                    f"(url={webhook_url}, event_id={event_id})"
                )
            except Exception as e:
                logger.warning(
                    f"Job {job_id}: Unexpected webhook error - {type(e).__name__}: {e} (event_id={event_id})"
                )
|
|
|
|
|
|
@router_v1.get(
    "/translations/{job_id}",
    response_model=TranslationStatusResponse,
    responses={
        200: {"description": "Translation status", "model": TranslationStatusResponse},
        403: {"description": "Job belongs to another user", "model": ErrorResponse},
        404: {"description": "Job not found", "model": ErrorResponse},
    },
)
async def get_translation_status(
    job_id: str,
    current_user: Optional[Any] = Depends(get_authenticated_user),
):
    """
    Get translation job status with real-time progress.

    Returns current status and progress of a translation job. The job is
    looked up in the shared job store first, then in the in-process job table.
    An authenticated caller may only read jobs recorded under their own user
    id (same rule as the download endpoint).

    **Status Values:**
    - `queued`: Job is waiting to be processed
    - `processing`: Job is actively being translated
    - `completed`: Translation finished successfully
    - `failed`: Translation encountered an error

    **Progress Fields:**
    - `progress_percent`: 0-100 indicating completion percentage
    - `current_step`: Human-readable description of current operation
    - `error_message`: Present only when status is "failed"

    **Meta:**
    - `estimated_remaining_seconds`: Linear extrapolation from elapsed time
      and progress while processing; `null` otherwise

    **Example Response (Processing):**
    ```json
    {
        "data": {
            "id": "tr_abc123",
            "status": "processing",
            "progress_percent": 45,
            "current_step": "Translating slide 5/10",
            "file_name": "presentation.pptx",
            "source_lang": "en",
            "target_lang": "fr",
            "created_at": "2024-01-15T10:30:00Z"
        },
        "meta": {}
    }
    ```
    """
    job = await get_job_status_async(job_id)
    if not job:
        job = _translation_jobs.get(job_id)

    if not job:
        return JSONResponse(
            status_code=404,
            content={
                "error": "NOT_FOUND",
                "message": "Translation job not found.",
                "details": {"job_id": job_id},
            },
        )

    # Same ownership rule as the download endpoint: only enforced when both
    # the caller and the job carry a user id.
    job_user_id = job.get("user_id")
    if current_user and job_user_id and str(job_user_id) != str(current_user.id):
        return JSONResponse(
            status_code=403,
            content={
                "error": "ACCESS_DENIED",
                "message": "You do not have access to this translation job.",
                "details": {"job_id": job_id},
            },
        )

    response_data = {
        "id": job["id"],
        "status": job["status"],
        "progress_percent": job.get("progress_percent", 0),
        "current_step": job.get("current_step", "Unknown"),
        "file_name": job.get("file_name"),
        "source_lang": job.get("source_lang"),
        "target_lang": job.get("target_lang"),
        "created_at": job.get("created_at"),
    }

    estimated_remaining = None
    if job["status"] == "processing":
        try:
            progress_percent = job.get("progress_percent") or 0
            created_at_str = job.get("created_at")
            if progress_percent > 0 and created_at_str:
                created_at = datetime.fromisoformat(
                    created_at_str.replace("Z", "+00:00")
                )
                if created_at.tzinfo is None:
                    # NOTE(review): naive timestamps are assumed to be UTC - confirm.
                    created_at = created_at.replace(tzinfo=timezone.utc)
                elapsed_seconds = (
                    datetime.now(timezone.utc) - created_at
                ).total_seconds()
                # Linear extrapolation: total = elapsed / fraction done.
                total_estimated = elapsed_seconds / (progress_percent / 100)
                estimated_remaining = max(1, int(total_estimated - elapsed_seconds))
        except (AttributeError, TypeError, ValueError, OverflowError) as exc:
            # The ETA is optional; report the malformed record and omit it.
            logger.debug(f"Job {job_id}: could not estimate remaining time: {exc}")

    if job["status"] == "completed":
        response_data["completed_at"] = job.get("completed_at")
    elif job["status"] == "failed":
        response_data["failed_at"] = job.get("failed_at")
        response_data["error_message"] = job.get("error_message")

    return {
        "data": response_data,
        "meta": {"estimated_remaining_seconds": estimated_remaining},
    }
|
|
|
|
|
|
@router_v1.get("/translate/health")
async def translate_health():
    """Return a static liveness payload for the translation endpoint."""
    health_payload = {
        "status": "healthy",
        "endpoint": "/api/v1/translate",
    }
    return health_payload
|
|
|
|
|
|
# Content-Type served for each supported output extension (lower-case, with
# the leading dot). The download endpoint falls back to
# "application/octet-stream" for extensions not listed here.
MIME_TYPES = {
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ".pdf": "application/pdf",
}
|
|
|
|
|
|
def _cleanup_files(input_path: Optional[str], output_path: Optional[str]) -> None:
    """Delete input and output files after download completes.

    Best effort: the output file is removed first, then the input file. A
    missing path (``None``/empty) or a file that is already gone is skipped
    silently; any other failure is logged as a warning and never raised, so
    a problem with one file does not prevent removing the other.

    Args:
        input_path: Path of the uploaded source file, if any.
        output_path: Path of the translated file, if any.
    """
    for label, raw_path in (("output", output_path), ("input", input_path)):
        if not raw_path:
            continue
        try:
            # Unlink directly instead of exists()+unlink(): another cleanup
            # (e.g. a concurrent download of the same job) may remove the
            # file between the check and the delete.
            Path(raw_path).unlink()
            logger.info(f"Deleted {label} file: {raw_path}")
        except FileNotFoundError:
            # Already removed - nothing to do, and nothing worth warning about.
            pass
        except Exception as e:
            logger.warning(f"Failed to delete {label} file {raw_path}: {e}")
|
|
|
|
|
|
def _download_error_response(
    status_code: int, error: str, message: str, details: dict
) -> JSONResponse:
    """Build the JSON error envelope returned by the download endpoint."""
    return JSONResponse(
        status_code=status_code,
        content={"error": error, "message": message, "details": details},
    )


@router_v1.get(
    "/download/{job_id}",
    responses={
        200: {
            "description": "Translated file download",
            "content": {"application/octet-stream": {}},
        },
        400: {"description": "Invalid job ID format", "model": ErrorResponse},
        403: {"description": "File belongs to another user", "model": ErrorResponse},
        404: {"description": "File not found or not ready", "model": ErrorResponse},
    },
)
async def download_translated_file(
    job_id: str,
    current_user: Optional[Any] = Depends(get_authenticated_user),
):
    """
    Download a translated file.

    Returns the translated file as a binary download with proper Content-Type
    and Content-Disposition headers. The input and output files are deleted
    by a background task after the response has been sent.

    **Status Requirements:**
    - Job must exist and have status "completed"
    - Job must have an output_path field pointing at an existing file

    **Error Codes:**
    - `INVALID_JOB_ID` (400): Job ID does not match the expected format
    - `ACCESS_DENIED` (403): Job is recorded under a different user
    - `FILE_EXPIRED` (404): Job not found, expired, or no output file
    - `NOT_READY` (404): Job exists but has no finished translation
      (still running, or failed)

    **Response Headers:**
    - `Content-Type`: Appropriate MIME type for the file format
    - `Content-Disposition`: attachment with filename containing "_translated" suffix

    **Example:**
    ```
    GET /api/v1/download/tr_abc123def456
    → Returns file with Content-Disposition: attachment; filename="report_translated.xlsx"
    ```
    """
    expired_message = "The translated file is no longer available or has expired."

    if not JOB_ID_PATTERN.match(job_id):
        return _download_error_response(
            400,
            "INVALID_JOB_ID",
            "Invalid job ID format.",
            {"job_id": job_id, "expected_format": "tr_xxxxxxxxxxxx"},
        )

    job = await get_job_status_async(job_id)
    if not job:
        job = _translation_jobs.get(job_id)

    if not job:
        return _download_error_response(
            404, "FILE_EXPIRED", expired_message,
            {"job_id": job_id, "status": "not_found"},
        )

    # Ownership is only enforced when both the caller and the job carry a
    # user id.
    job_user_id = job.get("user_id")
    if current_user and job_user_id and str(job_user_id) != str(current_user.id):
        return _download_error_response(
            403,
            "ACCESS_DENIED",
            "You do not have access to this file.",
            {"job_id": job_id},
        )

    job_status = job.get("status")
    if job_status != "completed":
        # The error code stays NOT_READY for client compatibility, but a
        # failed job must not be described as still running.
        if job_status == "failed":
            not_ready_message = "Translation failed; no file is available for download."
        else:
            not_ready_message = "Translation is still in progress."
        return _download_error_response(
            404,
            "NOT_READY",
            not_ready_message,
            {
                "job_id": job_id,
                "status": job_status,
                "progress_percent": job.get("progress_percent", 0),
            },
        )

    output_path_str = job.get("output_path")
    if not output_path_str:
        return _download_error_response(
            404, "FILE_EXPIRED", expired_message,
            {"job_id": job_id, "status": "no_output_path"},
        )

    output_path = Path(output_path_str)
    # is_file() rather than exists(): a directory cannot be served.
    if not output_path.is_file():
        return _download_error_response(
            404, "FILE_EXPIRED", expired_message,
            {"job_id": job_id, "status": "file_deleted"},
        )

    original_filename = job.get("file_name", "document")
    # Use the actual output file extension (PDF→DOCX conversion changes extension)
    actual_extension = output_path.suffix.lower()
    if original_filename:
        name_without_ext = Path(original_filename).stem
        download_filename = f"{name_without_ext}_translated{actual_extension}"
    else:
        download_filename = f"document_translated{actual_extension}"

    mime_type = MIME_TYPES.get(actual_extension, "application/octet-stream")

    input_path_str = job.get("input_path")

    logger.info(f"Download requested for job {job_id}: {download_filename}")

    return FileResponse(
        path=str(output_path),
        media_type=mime_type,
        filename=download_filename,
        # Runs after the response is sent; removes both files.
        background=BackgroundTask(_cleanup_files, input_path_str, output_path_str),
    )
|