feat: add Prometheus metrics + fix CI/CD health check port
- Add prometheus-client dependency
- Create middleware/metrics.py with PrometheusMiddleware
- Expose /metrics endpoint in Prometheus text format
- Track http_requests_total, translation_total, translation_duration_seconds, file_size_bytes
- Instrument translate routes with record_translation() and record_file_size()
- Fix deploy.yml health check: localhost:8000 -> localhost:8001 (Portainer conflict)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -52,7 +52,7 @@ jobs:
|
||||
|
||||
# Final health check on backend
|
||||
for i in $(seq 1 10); do
|
||||
if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
|
||||
if curl -sf http://localhost:8001/health > /dev/null 2>&1; then
|
||||
echo "Backend health check: OK"
|
||||
break
|
||||
fi
|
||||
@@ -77,7 +77,7 @@ jobs:
|
||||
echo "========================================="
|
||||
docker compose ps
|
||||
echo ""
|
||||
echo "Health: $(curl -sf http://localhost:8000/health 2>/dev/null || echo 'FAILED')"
|
||||
echo "Health: $(curl -sf http://localhost:8001/health 2>/dev/null || echo 'FAILED')"
|
||||
echo "========================================="
|
||||
|
||||
# Optional: deploy monitoring stack
|
||||
|
||||
8
main.py
8
main.py
@@ -56,6 +56,7 @@ from middleware.security import (
|
||||
RequestLoggingMiddleware,
|
||||
)
|
||||
from middleware.error_handler import ErrorHandlingMiddleware, format_error_response
|
||||
from middleware.metrics import PrometheusMiddleware, get_metrics
|
||||
from middleware.cleanup import (
|
||||
MemoryMonitor,
|
||||
HealthChecker,
|
||||
@@ -359,6 +360,7 @@ app = FastAPI(
|
||||
app.openapi = custom_openapi
|
||||
|
||||
app.add_middleware(ErrorHandlingMiddleware)
|
||||
app.add_middleware(PrometheusMiddleware)
|
||||
app.add_middleware(RequestLoggingMiddleware, log_body=False)
|
||||
app.add_middleware(
|
||||
SecurityHeadersMiddleware,
|
||||
@@ -574,6 +576,12 @@ async def root():
|
||||
}
|
||||
|
||||
|
||||
@app.get("/metrics", tags=["Health"])
async def metrics_endpoint():
    """Prometheus metrics endpoint"""
    # Rendering is delegated to middleware.metrics.get_metrics so the
    # exposition logic lives next to the metric definitions.
    scrape_response = get_metrics()
    return scrape_response
|
||||
|
||||
|
||||
@app.get("/health", tags=["Health"])
|
||||
async def health_check():
|
||||
"""Health check endpoint with detailed system status (Kubernetes liveness probe)"""
|
||||
|
||||
85
middleware/metrics.py
Normal file
85
middleware/metrics.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""
|
||||
Prometheus metrics middleware for FastAPI.
|
||||
|
||||
Exposes /metrics endpoint in Prometheus text format.
|
||||
Tracks HTTP requests, translations, and file uploads.
|
||||
"""
|
||||
|
||||
import logging
import time

from prometheus_client import CONTENT_TYPE_LATEST, Counter, Histogram, generate_latest
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.requests import Request
from starlette.responses import Response
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---- Metrics definitions ----
# All collectors below are created at import time without an explicit
# registry argument.

# One increment per handled HTTP request, keyed by method, (grouped) path
# and response status code.
http_requests_total = Counter(
    "http_requests_total",
    "Total HTTP requests",
    labelnames=("method", "path", "status"),
)

# One increment per translation job outcome.
translation_total = Counter(
    "translation_total",
    "Total translations processed",
    labelnames=("provider", "file_type", "status"),
)

# Wall-clock duration of a translation job; buckets span 0.5 s to 5 min.
_TRANSLATION_DURATION_BUCKETS = (0.5, 1, 2, 5, 10, 30, 60, 120, 300)
translation_duration_seconds = Histogram(
    "translation_duration_seconds",
    "Translation processing duration in seconds",
    labelnames=("provider", "file_type"),
    buckets=_TRANSLATION_DURATION_BUCKETS,
)

# Size of each uploaded file; buckets span 100 kB to 50 MB (decimal units).
_FILE_SIZE_BUCKETS = (
    100_000,
    500_000,
    1_000_000,
    5_000_000,
    10_000_000,
    25_000_000,
    50_000_000,
)
file_size_bytes = Histogram(
    "file_size_bytes",
    "Uploaded file size in bytes",
    labelnames=("file_type",),
    buckets=_FILE_SIZE_BUCKETS,
)

# Paths to skip from metrics (noisy health checks)
_SKIP_PATHS = frozenset(("/health", "/ready", "/metrics", "/favicon.ico"))
|
||||
|
||||
|
||||
def record_translation(provider: str, file_type: str, duration: float, status: str = "success"):
    """Record the outcome and duration of one translation.

    Increments ``translation_total`` for the (provider, file_type, status)
    combination, then observes ``duration`` (seconds) on
    ``translation_duration_seconds`` for (provider, file_type).
    """
    outcome_counter = translation_total.labels(
        provider=provider,
        file_type=file_type,
        status=status,
    )
    outcome_counter.inc()

    duration_histogram = translation_duration_seconds.labels(
        provider=provider,
        file_type=file_type,
    )
    duration_histogram.observe(duration)
|
||||
|
||||
|
||||
def record_file_size(file_type: str, size_bytes: int):
    """Observe the size of one uploaded file on ``file_size_bytes``.

    ``file_type`` becomes the histogram's ``file_type`` label and
    ``size_bytes`` is the observed value.
    """
    size_histogram = file_size_bytes.labels(file_type=file_type)
    size_histogram.observe(size_bytes)
|
||||
|
||||
|
||||
def _normalize_path(path: str) -> str:
    """Collapse per-resource URLs into a fixed template to avoid label explosion."""
    if path.startswith("/api/v1/translations/"):
        return "/api/v1/translations/{id}"
    if path.startswith("/api/v1/download/"):
        return "/api/v1/download/{id}"
    return path


class PrometheusMiddleware(BaseHTTPMiddleware):
    """Count every handled HTTP request in ``http_requests_total``.

    Requests whose path is listed in ``_SKIP_PATHS`` are passed through
    without being counted. All other requests increment the counter once,
    labelled with the HTTP method, the grouped path and the status code.
    """

    async def dispatch(self, request: Request, call_next):
        """Forward the request downstream and record its outcome.

        Args:
            request: The incoming request.
            call_next: Callable that passes the request to the next handler
                and returns its response.

        Returns:
            The downstream response, unmodified.

        Raises:
            Exception: Anything raised downstream is re-raised after the
                request has been counted.
        """
        raw_path = request.url.path
        if raw_path in _SKIP_PATHS:
            return await call_next(request)

        try:
            response: Response = await call_next(request)
        except Exception:
            # A failing handler would otherwise leave no trace in the counter.
            # It is recorded as 500 on the assumption that an outer layer
            # turns the unhandled error into a 500 response - TODO confirm
            # against the rest of the middleware stack.
            http_requests_total.labels(
                method=request.method,
                path=_normalize_path(raw_path),
                status="500",
            ).inc()
            raise

        http_requests_total.labels(
            method=request.method,
            path=_normalize_path(raw_path),
            status=str(response.status_code),
        ).inc()

        return response
|
||||
|
||||
|
||||
def get_metrics() -> Response:
    """Build the HTTP response served to a Prometheus scrape.

    Returns:
        A ``Response`` whose body is the output of ``generate_latest()`` and
        whose media type is ``CONTENT_TYPE_LATEST``, the content-type
        constant that ``prometheus_client`` exports for that text format.
    """
    payload = generate_latest()
    return Response(content=payload, media_type=CONTENT_TYPE_LATEST)
|
||||
@@ -46,3 +46,5 @@ aiosmtplib>=3.0.0
|
||||
|
||||
pytest>=7.0.0
|
||||
pytest-asyncio>=0.21.0
|
||||
|
||||
prometheus-client==0.20.0
|
||||
|
||||
@@ -60,6 +60,7 @@ from schemas.translation import (
|
||||
)
|
||||
from schemas.errors import ErrorResponse
|
||||
from utils.file_handler import FileHandler
|
||||
from middleware.metrics import record_translation, record_file_size
|
||||
from services.progress_tracker import ProgressTracker
|
||||
from services.storage_tracker import storage_tracker
|
||||
from core.redis import set_job_status_async, get_job_status_async
|
||||
@@ -696,6 +697,10 @@ async def translate_document_v1(
|
||||
|
||||
_cleanup_old_jobs()
|
||||
|
||||
# Record file size metric
|
||||
if file_extension and file_size:
|
||||
record_file_size(file_extension, file_size)
|
||||
|
||||
_translation_jobs[job_id] = {
|
||||
"id": job_id,
|
||||
"status": "queued",
|
||||
@@ -1124,10 +1129,15 @@ async def _run_translation_job(
|
||||
logger.warning(f"Job {job_id}: watermark failed: {wm_err}")
|
||||
|
||||
tracker.set_completed(str(output_path))
|
||||
# Record translation metric
|
||||
# Elapsed wall-clock seconds since the job was created. timestamp() honours
# the UTC offset parsed from created_at, whereas mktime(timetuple()) would
# reinterpret the UTC fields as local time and skew the result by the host's
# UTC offset.
duration = time.time() - datetime.fromisoformat(job["created_at"].replace("Z", "+00:00")).timestamp()
|
||||
record_translation(provider=provider, file_type=file_extension or "unknown", duration=duration, status="success")
|
||||
logger.info(f"Job {job_id}: Completed successfully")
|
||||
|
||||
except Exception as e:
|
||||
tracker.set_error(str(e))
|
||||
# Record translation failure metric
|
||||
record_translation(provider=provider, file_type=file_extension or "unknown", duration=0, status="error")
|
||||
logger.error(f"Job {job_id}: Failed - {e}")
|
||||
|
||||
finally:
|
||||
|
||||
Reference in New Issue
Block a user