From 26dfa08730c599f0b31a0a4d1f7f4feefce05f3b Mon Sep 17 00:00:00 2001 From: sepehr Date: Fri, 15 May 2026 14:33:10 +0200 Subject: [PATCH] feat: add Prometheus metrics + fix CI/CD health check port - Add prometheus-client dependency - Create middleware/metrics.py with PrometheusMiddleware - Expose /metrics endpoint in Prometheus text format - Track http_requests_total, translation_total, translation_duration_seconds, file_size_bytes - Instrument translate routes with record_translation() and record_file_size() - Fix deploy.yml health check: localhost:8000 -> localhost:8001 (Portainer conflict) Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/deploy.yml | 4 +- main.py | 8 ++++ middleware/metrics.py | 85 +++++++++++++++++++++++++++++++++++++ requirements.txt | 2 + routes/translate_routes.py | 10 +++++ 5 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 middleware/metrics.py diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml index 79e3a49..22262b4 100644 --- a/.gitea/workflows/deploy.yml +++ b/.gitea/workflows/deploy.yml @@ -52,7 +52,7 @@ jobs: # Final health check on backend for i in $(seq 1 10); do - if curl -sf http://localhost:8000/health > /dev/null 2>&1; then + if curl -sf http://localhost:8001/health > /dev/null 2>&1; then echo "Backend health check: OK" break fi @@ -77,7 +77,7 @@ jobs: echo "=========================================" docker compose ps echo "" - echo "Health: $(curl -sf http://localhost:8000/health 2>/dev/null || echo 'FAILED')" + echo "Health: $(curl -sf http://localhost:8001/health 2>/dev/null || echo 'FAILED')" echo "=========================================" # Optional: deploy monitoring stack diff --git a/main.py b/main.py index f45d04e..3632ec3 100644 --- a/main.py +++ b/main.py @@ -56,6 +56,7 @@ from middleware.security import ( RequestLoggingMiddleware, ) from middleware.error_handler import ErrorHandlingMiddleware, format_error_response +from middleware.metrics import PrometheusMiddleware, 
get_metrics from middleware.cleanup import ( MemoryMonitor, HealthChecker, @@ -359,6 +360,7 @@ app = FastAPI( app.openapi = custom_openapi app.add_middleware(ErrorHandlingMiddleware) +app.add_middleware(PrometheusMiddleware) app.add_middleware(RequestLoggingMiddleware, log_body=False) app.add_middleware( SecurityHeadersMiddleware, @@ -574,6 +576,12 @@ async def root(): } +@app.get("/metrics", tags=["Health"]) +async def metrics_endpoint(): + """Prometheus metrics endpoint""" + return get_metrics() + + @app.get("/health", tags=["Health"]) async def health_check(): """Health check endpoint with detailed system status (Kubernetes liveness probe)""" diff --git a/middleware/metrics.py b/middleware/metrics.py new file mode 100644 index 0000000..2d6f50d --- /dev/null +++ b/middleware/metrics.py @@ -0,0 +1,85 @@ +""" +Prometheus metrics middleware for FastAPI. + +Exposes /metrics endpoint in Prometheus text format. +Tracks HTTP requests, translations, and file uploads. +""" + +import time +import logging +from starlette.middleware.base import BaseHTTPMiddleware +from starlette.requests import Request +from starlette.responses import Response +from prometheus_client import Counter, Histogram, generate_latest, CONTENT_TYPE_LATEST + +logger = logging.getLogger(__name__) + +# ---- Metrics definitions ---- + +http_requests_total = Counter( + "http_requests_total", + "Total HTTP requests", + ["method", "path", "status"], +) + +translation_total = Counter( + "translation_total", + "Total translations processed", + ["provider", "file_type", "status"], +) + +translation_duration_seconds = Histogram( + "translation_duration_seconds", + "Translation processing duration in seconds", + ["provider", "file_type"], + buckets=(0.5, 1, 2, 5, 10, 30, 60, 120, 300), +) + +file_size_bytes = Histogram( + "file_size_bytes", + "Uploaded file size in bytes", + ["file_type"], + buckets=(100_000, 500_000, 1_000_000, 5_000_000, 10_000_000, 25_000_000, 50_000_000), +) + +# Paths to skip from metrics
(noisy health checks) +_SKIP_PATHS = {"/health", "/ready", "/metrics", "/favicon.ico"} + + +def record_translation(provider: str, file_type: str, duration: float, status: str = "success"): + translation_total.labels(provider=provider, file_type=file_type, status=status).inc() + translation_duration_seconds.labels(provider=provider, file_type=file_type).observe(duration) + + +def record_file_size(file_type: str, size_bytes: int): + file_size_bytes.labels(file_type=file_type).observe(size_bytes) + + +class PrometheusMiddleware(BaseHTTPMiddleware): + async def dispatch(self, request: Request, call_next): + if request.url.path in _SKIP_PATHS: + return await call_next(request) + + start = time.time() + response: Response = await call_next(request) + duration = time.time() - start + + path = request.url.path + # Group dynamic paths to avoid label explosion + if path.startswith("/api/v1/translations/"): + path = "/api/v1/translations/{id}" + elif path.startswith("/api/v1/download/"): + path = "/api/v1/download/{id}" + + http_requests_total.labels( + method=request.method, + path=path, + status=str(response.status_code), + ).inc() + + return response + + +def get_metrics() -> Response: + body = generate_latest() + return Response(content=body, media_type=CONTENT_TYPE_LATEST) diff --git a/requirements.txt b/requirements.txt index 91a917f..868f8b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -46,3 +46,5 @@ aiosmtplib>=3.0.0 pytest>=7.0.0 pytest-asyncio>=0.21.0 + +prometheus-client==0.20.0 diff --git a/routes/translate_routes.py b/routes/translate_routes.py index 63a181d..79d1105 100644 --- a/routes/translate_routes.py +++ b/routes/translate_routes.py @@ -60,6 +60,7 @@ from schemas.translation import ( ) from schemas.errors import ErrorResponse from utils.file_handler import FileHandler +from middleware.metrics import record_translation, record_file_size from services.progress_tracker import ProgressTracker from services.storage_tracker import storage_tracker from
core.redis import set_job_status_async, get_job_status_async @@ -696,6 +697,10 @@ async def translate_document_v1( _cleanup_old_jobs() + # Record file size metric + if file_extension and file_size: + record_file_size(file_extension, file_size) + _translation_jobs[job_id] = { "id": job_id, "status": "queued", @@ -1124,10 +1129,15 @@ async def _run_translation_job( logger.warning(f"Job {job_id}: watermark failed: {wm_err}") tracker.set_completed(str(output_path)) + # Record translation metric + duration = time.time() - datetime.fromisoformat(job["created_at"].replace("Z", "+00:00")).timestamp() + record_translation(provider=provider, file_type=file_extension or "unknown", duration=duration, status="success") logger.info(f"Job {job_id}: Completed successfully") except Exception as e: tracker.set_error(str(e)) + # Record translation failure metric + record_translation(provider=provider, file_type=file_extension or "unknown", duration=0, status="error") logger.error(f"Job {job_id}: Failed - {e}") finally: