feat: add Prometheus metrics + fix CI/CD health check port
Some checks failed
Deploy to Homelab / Deploy Wordly to 192.168.1.151 (push) Has been cancelled
Deploy to Homelab / Deploy Monitoring (if configured) (push) Has been cancelled

- Add prometheus-client dependency
- Create middleware/metrics.py with PrometheusMiddleware
- Expose /metrics endpoint in Prometheus text format
- Track http_requests_total, translation_total, translation_duration_seconds,
  file_size_bytes
- Instrument translate routes with record_translation() and record_file_size()
- Fix deploy.yml health check: localhost:8000 -> localhost:8001 (Portainer conflict)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-15 14:33:10 +02:00
parent a76442b382
commit 26dfa08730
5 changed files with 107 additions and 2 deletions

View File

@@ -52,7 +52,7 @@ jobs:
# Final health check on backend
for i in $(seq 1 10); do
if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
if curl -sf http://localhost:8001/health > /dev/null 2>&1; then
echo "Backend health check: OK"
break
fi
@@ -77,7 +77,7 @@ jobs:
echo "========================================="
docker compose ps
echo ""
echo "Health: $(curl -sf http://localhost:8000/health 2>/dev/null || echo 'FAILED')"
echo "Health: $(curl -sf http://localhost:8001/health 2>/dev/null || echo 'FAILED')"
echo "========================================="
# Optional: deploy monitoring stack

View File

@@ -56,6 +56,7 @@ from middleware.security import (
RequestLoggingMiddleware,
)
from middleware.error_handler import ErrorHandlingMiddleware, format_error_response
from middleware.metrics import PrometheusMiddleware, get_metrics
from middleware.cleanup import (
MemoryMonitor,
HealthChecker,
@@ -359,6 +360,7 @@ app = FastAPI(
app.openapi = custom_openapi
app.add_middleware(ErrorHandlingMiddleware)
app.add_middleware(PrometheusMiddleware)
app.add_middleware(RequestLoggingMiddleware, log_body=False)
app.add_middleware(
SecurityHeadersMiddleware,
@@ -574,6 +576,12 @@ async def root():
}
@app.get("/metrics", tags=["Health"])
async def metrics_endpoint():
    """Prometheus metrics endpoint"""
    # The docstring above is kept short on purpose: FastAPI publishes it as
    # the route description. The response itself is built entirely by
    # get_metrics() from middleware.metrics and is returned unchanged.
    metrics_response = get_metrics()
    return metrics_response
@app.get("/health", tags=["Health"])
async def health_check():
"""Health check endpoint with detailed system status (Kubernetes liveness probe)"""

85
middleware/metrics.py Normal file
View File

@@ -0,0 +1,85 @@
"""
Prometheus metrics middleware for FastAPI.
Provides get_metrics(), which renders the /metrics response in Prometheus text format.
Tracks HTTP requests, translations, and file uploads.
"""
import logging
import time

from prometheus_client import CONTENT_TYPE_LATEST, Counter, Histogram, generate_latest
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.requests import Request
from starlette.responses import Response
logger = logging.getLogger(__name__)
# ---- Metrics definitions ----

# Counter of handled HTTP requests, labelled by HTTP method, (grouped) request
# path and response status code.
http_requests_total = Counter(
    name="http_requests_total",
    documentation="Total HTTP requests",
    labelnames=["method", "path", "status"],
)

# Counter of translation jobs, labelled by provider, file type and outcome.
translation_total = Counter(
    name="translation_total",
    documentation="Total translations processed",
    labelnames=["provider", "file_type", "status"],
)

# Histogram of translation durations in seconds; buckets span 0.5 s to 5 min.
translation_duration_seconds = Histogram(
    name="translation_duration_seconds",
    documentation="Translation processing duration in seconds",
    labelnames=["provider", "file_type"],
    buckets=(0.5, 1, 2, 5, 10, 30, 60, 120, 300),
)

# Histogram of uploaded file sizes in bytes; buckets span 100 kB to 50 MB.
file_size_bytes = Histogram(
    name="file_size_bytes",
    documentation="Uploaded file size in bytes",
    labelnames=["file_type"],
    buckets=(
        100_000,
        500_000,
        1_000_000,
        5_000_000,
        10_000_000,
        25_000_000,
        50_000_000,
    ),
)

# Paths to skip from metrics (noisy health checks)
_SKIP_PATHS = {
    "/health",
    "/ready",
    "/metrics",
    "/favicon.ico",
}
def record_translation(provider: str, file_type: str, duration: float, status: str = "success"):
    """Record one processed translation in the translation metrics.

    Increments ``translation_total`` for the (provider, file_type, status)
    combination, then observes ``duration`` (seconds) in
    ``translation_duration_seconds`` for the (provider, file_type) pair.
    The duration is observed for every status, not only for successes.
    """
    shared_labels = {"provider": provider, "file_type": file_type}

    outcome_counter = translation_total.labels(status=status, **shared_labels)
    outcome_counter.inc()

    duration_histogram = translation_duration_seconds.labels(**shared_labels)
    duration_histogram.observe(duration)
def record_file_size(file_type: str, size_bytes: int):
    """Observe an uploaded file's size (bytes) in the ``file_size_bytes`` histogram.

    The observation is labelled with ``file_type``.
    """
    size_histogram = file_size_bytes.labels(file_type=file_type)
    size_histogram.observe(size_bytes)
class PrometheusMiddleware(BaseHTTPMiddleware):
    """Count handled HTTP requests in the ``http_requests_total`` counter.

    Requests whose path is listed in ``_SKIP_PATHS`` are passed straight
    through and are not counted.
    """

    async def dispatch(self, request: Request, call_next):
        """Forward the request downstream and count it by method, path and status.

        Args:
            request: The incoming request.
            call_next: Callable that passes the request to the next handler
                and returns its response.

        Returns:
            The downstream response, unmodified.

        Raises:
            Exception: Any exception raised downstream is re-raised after the
                request has been counted with status ``"500"``.
        """
        raw_path = request.url.path
        if raw_path in _SKIP_PATHS:
            return await call_next(request)

        # Group dynamic paths to avoid label explosion
        if raw_path.startswith("/api/v1/translations/"):
            path_label = "/api/v1/translations/{id}"
        elif raw_path.startswith("/api/v1/download/"):
            path_label = "/api/v1/download/{id}"
        else:
            path_label = raw_path

        try:
            response: Response = await call_next(request)
        except Exception:
            # Without this branch a failing request would never be counted.
            # No response object exists here; "500" assumes an outer layer
            # turns the unhandled exception into an Internal Server Error.
            http_requests_total.labels(
                method=request.method,
                path=path_label,
                status="500",
            ).inc()
            raise

        http_requests_total.labels(
            method=request.method,
            path=path_label,
            status=str(response.status_code),
        ).inc()
        return response
def get_metrics() -> Response:
    """Build the HTTP response served by the ``/metrics`` endpoint.

    Serialises the collected metrics with ``generate_latest()`` (called with
    no arguments) and labels the payload with prometheus_client's
    ``CONTENT_TYPE_LATEST`` media type.

    Returns:
        A Starlette ``Response`` whose body is the serialised metrics.
    """
    # prometheus_client exports CONTENT_TYPE_LATEST; the previously used
    # CONTENT_TYPE_LSP does not exist in that package.
    payload = generate_latest()
    return Response(content=payload, media_type=CONTENT_TYPE_LATEST)

View File

@@ -46,3 +46,5 @@ aiosmtplib>=3.0.0
pytest>=7.0.0
pytest-asyncio>=0.21.0
prometheus-client==0.20.0

View File

@@ -60,6 +60,7 @@ from schemas.translation import (
)
from schemas.errors import ErrorResponse
from utils.file_handler import FileHandler
from middleware.metrics import record_translation, record_file_size
from services.progress_tracker import ProgressTracker
from services.storage_tracker import storage_tracker
from core.redis import set_job_status_async, get_job_status_async
@@ -696,6 +697,10 @@ async def translate_document_v1(
_cleanup_old_jobs()
# Record file size metric
if file_extension and file_size:
record_file_size(file_extension, file_size)
_translation_jobs[job_id] = {
"id": job_id,
"status": "queued",
@@ -1124,10 +1129,15 @@ async def _run_translation_job(
logger.warning(f"Job {job_id}: watermark failed: {wm_err}")
tracker.set_completed(str(output_path))
# Record translation metric
duration = time.time() - time.mktime(datetime.fromisoformat(job["created_at"].replace("Z", "+00:00")).timetuple())
record_translation(provider=provider, file_type=file_extension or "unknown", duration=duration, status="success")
logger.info(f"Job {job_id}: Completed successfully")
except Exception as e:
tracker.set_error(str(e))
# Record translation failure metric
record_translation(provider=provider, file_type=file_extension or "unknown", duration=0, status="error")
logger.error(f"Job {job_id}: Failed - {e}")
finally: