feat: add Prometheus metrics + fix CI/CD health check port
- Add prometheus-client dependency
- Create middleware/metrics.py with PrometheusMiddleware
- Expose /metrics endpoint in Prometheus text format
- Track http_requests_total, translation_total, translation_duration_seconds, file_size_bytes
- Instrument translate routes with record_translation() and record_file_size()
- Fix deploy.yml health check: localhost:8000 -> localhost:8001 (Portainer conflict)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -52,7 +52,7 @@ jobs:
|
|||||||
|
|
||||||
# Final health check on backend
|
# Final health check on backend
|
||||||
for i in $(seq 1 10); do
|
for i in $(seq 1 10); do
|
||||||
if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
|
if curl -sf http://localhost:8001/health > /dev/null 2>&1; then
|
||||||
echo "Backend health check: OK"
|
echo "Backend health check: OK"
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
@@ -77,7 +77,7 @@ jobs:
|
|||||||
echo "========================================="
|
echo "========================================="
|
||||||
docker compose ps
|
docker compose ps
|
||||||
echo ""
|
echo ""
|
||||||
echo "Health: $(curl -sf http://localhost:8000/health 2>/dev/null || echo 'FAILED')"
|
echo "Health: $(curl -sf http://localhost:8001/health 2>/dev/null || echo 'FAILED')"
|
||||||
echo "========================================="
|
echo "========================================="
|
||||||
|
|
||||||
# Optional: deploy monitoring stack
|
# Optional: deploy monitoring stack
|
||||||
|
|||||||
8
main.py
8
main.py
@@ -56,6 +56,7 @@ from middleware.security import (
|
|||||||
RequestLoggingMiddleware,
|
RequestLoggingMiddleware,
|
||||||
)
|
)
|
||||||
from middleware.error_handler import ErrorHandlingMiddleware, format_error_response
|
from middleware.error_handler import ErrorHandlingMiddleware, format_error_response
|
||||||
|
from middleware.metrics import PrometheusMiddleware, get_metrics
|
||||||
from middleware.cleanup import (
|
from middleware.cleanup import (
|
||||||
MemoryMonitor,
|
MemoryMonitor,
|
||||||
HealthChecker,
|
HealthChecker,
|
||||||
@@ -359,6 +360,7 @@ app = FastAPI(
|
|||||||
app.openapi = custom_openapi
|
app.openapi = custom_openapi
|
||||||
|
|
||||||
app.add_middleware(ErrorHandlingMiddleware)
|
app.add_middleware(ErrorHandlingMiddleware)
|
||||||
|
app.add_middleware(PrometheusMiddleware)
|
||||||
app.add_middleware(RequestLoggingMiddleware, log_body=False)
|
app.add_middleware(RequestLoggingMiddleware, log_body=False)
|
||||||
app.add_middleware(
|
app.add_middleware(
|
||||||
SecurityHeadersMiddleware,
|
SecurityHeadersMiddleware,
|
||||||
@@ -574,6 +576,12 @@ async def root():
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/metrics", tags=["Health"])
async def metrics_endpoint():
    """Prometheus metrics endpoint"""
    # The response (body and content type) is built entirely by
    # middleware.metrics.get_metrics(); this route only hands it back.
    prometheus_response = get_metrics()
    return prometheus_response
|
||||||
|
|
||||||
|
|
||||||
@app.get("/health", tags=["Health"])
|
@app.get("/health", tags=["Health"])
|
||||||
async def health_check():
|
async def health_check():
|
||||||
"""Health check endpoint with detailed system status (Kubernetes liveness probe)"""
|
"""Health check endpoint with detailed system status (Kubernetes liveness probe)"""
|
||||||
|
|||||||
85
middleware/metrics.py
Normal file
85
middleware/metrics.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
"""
|
||||||
|
Prometheus metrics middleware for FastAPI.

Provides get_metrics(), which renders the collected metrics in Prometheus
text format for the application's /metrics route.
Tracks HTTP requests, translations, and file uploads.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
import time

from prometheus_client import CONTENT_TYPE_LATEST, Counter, Histogram, generate_latest
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.requests import Request
from starlette.responses import Response
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ---- Metrics definitions ----
|
||||||
|
|
||||||
|
# Handled HTTP requests, keyed by method, (grouped) path and status code.
http_requests_total = Counter(
    name="http_requests_total",
    documentation="Total HTTP requests",
    labelnames=("method", "path", "status"),
)

# Translation jobs, keyed by provider, document type and outcome.
translation_total = Counter(
    name="translation_total",
    documentation="Total translations processed",
    labelnames=("provider", "file_type", "status"),
)

# Translation processing time; bucket bounds run from 0.5 s up to 300 s.
translation_duration_seconds = Histogram(
    name="translation_duration_seconds",
    documentation="Translation processing duration in seconds",
    labelnames=("provider", "file_type"),
    buckets=(
        0.5,
        1,
        2,
        5,
        10,
        30,
        60,
        120,
        300,
    ),
)

# Upload sizes; bucket bounds run from 100,000 bytes up to 50,000,000 bytes.
file_size_bytes = Histogram(
    name="file_size_bytes",
    documentation="Uploaded file size in bytes",
    labelnames=("file_type",),
    buckets=(
        100_000,
        500_000,
        1_000_000,
        5_000_000,
        10_000_000,
        25_000_000,
        50_000_000,
    ),
)

# Request paths the middleware passes through without counting
# (noisy health checks and the scrape endpoint itself).
_SKIP_PATHS = frozenset(
    (
        "/health",
        "/ready",
        "/metrics",
        "/favicon.ico",
    )
)
|
||||||
|
|
||||||
|
|
||||||
|
def record_translation(provider: str, file_type: str, duration: float, status: str = "success") -> None:
    """Record the outcome of one translation job.

    Always increments ``translation_total`` for the given
    (provider, file_type, status) combination. The duration is added to
    ``translation_duration_seconds`` unless it is negative.

    Args:
        provider: Label value identifying the translation provider.
        file_type: Label value for the document type.
        duration: Elapsed processing time in seconds.
        status: Outcome label; defaults to ``"success"``.
    """
    translation_total.labels(provider=provider, file_type=file_type, status=status).inc()

    # A negative elapsed time is never real; it comes from clock skew or from
    # comparing timestamps in different time zones. Observing it would make
    # the histogram's running sum decrease, which breaks rate()/average
    # queries, so the sample is dropped and the problem is logged instead.
    if duration < 0:
        logger.warning(
            "Ignoring negative translation duration %.3fs (provider=%s, file_type=%s)",
            duration,
            provider,
            file_type,
        )
        return

    translation_duration_seconds.labels(provider=provider, file_type=file_type).observe(duration)
|
||||||
|
|
||||||
|
|
||||||
|
def record_file_size(file_type: str, size_bytes: int):
    """Add one uploaded file's size to the ``file_size_bytes`` histogram.

    Args:
        file_type: Label value for the document type.
        size_bytes: Size of the upload in bytes.
    """
    size_histogram = file_size_bytes.labels(file_type=file_type)
    size_histogram.observe(size_bytes)
|
||||||
|
|
||||||
|
|
||||||
|
class PrometheusMiddleware(BaseHTTPMiddleware):
    """Count each handled HTTP request in ``http_requests_total``.

    Requests whose path is listed in ``_SKIP_PATHS`` are passed through
    without being counted. Every other request increments the counter with
    its method, a bounded path label and the response status code.
    """

    # Path prefixes whose trailing segment is a per-resource identifier,
    # mapped to the single label that stands in for all of them.
    _GROUPED_PREFIXES = (
        ("/api/v1/translations/", "/api/v1/translations/{id}"),
        ("/api/v1/download/", "/api/v1/download/{id}"),
    )

    # Label shared by every request that matched no route. Without it, each
    # distinct probed URL (scanners, typos) would create a new time series.
    _UNMATCHED_PATH = "<unmatched>"

    async def dispatch(self, request: Request, call_next):
        """Forward the request and count it once a response is available.

        Args:
            request: The incoming request.
            call_next: Callable that runs the rest of the application.

        Returns:
            The downstream response, unmodified.
        """
        if request.url.path in _SKIP_PATHS:
            return await call_next(request)

        response: Response = await call_next(request)

        http_requests_total.labels(
            method=request.method,
            path=self._path_label(request),
            status=str(response.status_code),
        ).inc()

        return response

    @classmethod
    def _path_label(cls, request: Request) -> str:
        """Return the bounded ``path`` label value for a finished request."""
        path = request.url.path

        # Group dynamic paths to avoid label explosion
        for prefix, label in cls._GROUPED_PREFIXES:
            if path.startswith(prefix):
                return label

        # Starlette's router stores the matched endpoint in the ASGI scope.
        # When it is absent after the request has been handled, nothing
        # matched, so the raw URL must not become a label value.
        if "endpoint" not in request.scope:
            return cls._UNMATCHED_PATH

        return path
|
||||||
|
|
||||||
|
|
||||||
|
def get_metrics() -> Response:
    """Render the collected metrics as an HTTP response.

    Returns:
        A response whose body is the output of ``generate_latest()`` and
        whose media type is prometheus_client's ``CONTENT_TYPE_LATEST``,
        the content type that matches that output.
    """
    payload = generate_latest()
    # CONTENT_TYPE_LATEST is the constant prometheus_client exports for this
    # payload; the library has no ``CONTENT_TYPE_LSP``.
    return Response(content=payload, media_type=CONTENT_TYPE_LATEST)
|
||||||
@@ -46,3 +46,5 @@ aiosmtplib>=3.0.0
|
|||||||
|
|
||||||
pytest>=7.0.0
|
pytest>=7.0.0
|
||||||
pytest-asyncio>=0.21.0
|
pytest-asyncio>=0.21.0
|
||||||
|
|
||||||
|
prometheus-client==0.20.0
|
||||||
|
|||||||
@@ -60,6 +60,7 @@ from schemas.translation import (
|
|||||||
)
|
)
|
||||||
from schemas.errors import ErrorResponse
|
from schemas.errors import ErrorResponse
|
||||||
from utils.file_handler import FileHandler
|
from utils.file_handler import FileHandler
|
||||||
|
from middleware.metrics import record_translation, record_file_size
|
||||||
from services.progress_tracker import ProgressTracker
|
from services.progress_tracker import ProgressTracker
|
||||||
from services.storage_tracker import storage_tracker
|
from services.storage_tracker import storage_tracker
|
||||||
from core.redis import set_job_status_async, get_job_status_async
|
from core.redis import set_job_status_async, get_job_status_async
|
||||||
@@ -696,6 +697,10 @@ async def translate_document_v1(
|
|||||||
|
|
||||||
_cleanup_old_jobs()
|
_cleanup_old_jobs()
|
||||||
|
|
||||||
|
# Record file size metric
|
||||||
|
if file_extension and file_size:
|
||||||
|
record_file_size(file_extension, file_size)
|
||||||
|
|
||||||
_translation_jobs[job_id] = {
|
_translation_jobs[job_id] = {
|
||||||
"id": job_id,
|
"id": job_id,
|
||||||
"status": "queued",
|
"status": "queued",
|
||||||
@@ -1124,10 +1129,15 @@ async def _run_translation_job(
|
|||||||
logger.warning(f"Job {job_id}: watermark failed: {wm_err}")
|
logger.warning(f"Job {job_id}: watermark failed: {wm_err}")
|
||||||
|
|
||||||
tracker.set_completed(str(output_path))
|
tracker.set_completed(str(output_path))
|
||||||
|
# Record translation metric
|
||||||
|
duration = time.time() - time.mktime(datetime.fromisoformat(job["created_at"].replace("Z", "+00:00")).timetuple())
|
||||||
|
record_translation(provider=provider, file_type=file_extension or "unknown", duration=duration, status="success")
|
||||||
logger.info(f"Job {job_id}: Completed successfully")
|
logger.info(f"Job {job_id}: Completed successfully")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
tracker.set_error(str(e))
|
tracker.set_error(str(e))
|
||||||
|
# Record translation failure metric
|
||||||
|
record_translation(provider=provider, file_type=file_extension or "unknown", duration=0, status="error")
|
||||||
logger.error(f"Job {job_id}: Failed - {e}")
|
logger.error(f"Job {job_id}: Failed - {e}")
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
|
|||||||
Reference in New Issue
Block a user