fix(monitoring): fix production monitoring startup, alertmanager configuration, prometheus alert syntax, and mcp healthcheck
Some checks failed
CI / Lint, Unit Tests & Build (push) Successful in 5m5s
CI / Deploy production (on server) (push) Has been cancelled

This commit is contained in:
Antigravity
2026-05-30 11:42:32 +00:00
parent c266359f63
commit ff0fae9ae7
4 changed files with 30 additions and 32 deletions

View File

@@ -150,7 +150,7 @@ services:
 cpus: '0.25'
 memory: 128M
 healthcheck:
-test: ["CMD-SHELL", "wget --header \"x-api-key: ${MCP_API_KEY:-dev-key}\" -q -O /dev/null http://localhost:3001/ || exit 1"]
+test: ["CMD-SHELL", "wget --header \"x-api-key: $${MCP_API_KEY:-dev-key}\" -q -O /dev/null http://localhost:3001/ || exit 1"]
 interval: 30s
 timeout: 10s
 retries: 3

View File

@@ -37,7 +37,7 @@ groups:
 severity: warning
 annotations:
 summary: "⚠️ Disk space below 15%"
-description: "Only {{ humanizePercentage (div (node_filesystem_avail_bytes{mountpoint='/'}) (node_filesystem_size_bytes{mountpoint='/'})) }} disk space remaining."
+description: "Only {{ $value | humanizePercentage }} disk space remaining."
 - alert: DiskSpaceCritical
 expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.05

View File

@@ -55,11 +55,14 @@ services:
 image: metalmatze/alertmanager-bot:0.4.3
 container_name: memento-alertmanager-telegram
 restart: unless-stopped
+profiles:
+- telegram
 environment:
-TELEGRAM_TOKEN: ${TELEGRAM_BOT_TOKEN:-}
-TELEGRAM_ADMIN: ${TELEGRAM_CHAT_ID:-}
+TELEGRAM_TOKEN: ${TELEGRAM_BOT_TOKEN:-dummy_token}
+TELEGRAM_ADMIN: ${TELEGRAM_CHAT_ID:-0}
 ALERTMANAGER_URL: http://alertmanager:9093
-STORE: /data/bolt.db
+STORE: bolt
+BOLT_PATH: /data/bolt.db
 LISTEN_ADDR: 0.0.0.0:8080
 volumes:
 - alertmanager-bot-data:/data
@@ -91,7 +94,7 @@ services:
 container_name: memento-postgres-exporter
 restart: unless-stopped
 env_file:
-- /opt/memento/.env.docker
+- ../.env.docker
 environment:
 DATA_SOURCE_NAME: "postgresql://${POSTGRES_USER:-memento}:${POSTGRES_PASSWORD:-memento}@memento-postgres:5432/${POSTGRES_DB:-memento}?sslmode=disable"
 ports:
@@ -141,4 +144,4 @@ networks:
 driver: bridge
 memento-net:
 external: true
-name: memento_memento-network
+name: ${MEMENTO_NETWORK_NAME:-memento_memento-network}

View File

@@ -130,13 +130,7 @@ docker compose up -d --remove-orphans --force-recreate memento-note
 docker compose up -d memento-socket
 docker compose up -d mcp-server 2>/dev/null || true
-# Redémarrer les exporters monitoring pour appliquer les configs à jour
-if docker ps --format '{{.Names}}' | grep -q "^memento-grafana$"; then
-echo "=== Updating monitoring exporters ==="
-env $(cat /opt/memento/.env.docker | grep -v '^#' | xargs) \
-docker compose -f monitoring/docker-compose.monitoring.yml up -d \
-postgres-exporter cadvisor node-exporter redis-exporter 2>&1 || true
-fi
+# Monitoring stack updates are handled at the end of successful deployment
 echo "=== Migrations (Prisma CLI via node, pas npx) ==="
 if docker compose exec -T memento-note test -f ./node_modules/prisma/build/index.js 2>/dev/null; then
@@ -152,24 +146,25 @@ for i in $(seq 1 "$HEALTH_CHECK_MAX_ITERATIONS"); do
 ACTUAL=$(echo "$BODY" | jq -r '.commit // empty' 2>/dev/null || true)
 if [ "$ACTUAL" = "$GIT_COMMIT" ]; then
 echo "OK build-info commit=$ACTUAL"
-if docker ps --format '{{.Names}}' | grep -q "^memento-grafana$"; then
-# Ne recréer Prometheus que si sa config a changé (préserve l'historique TSDB)
-PROM_CHANGED=$(git diff --name-only HEAD~1 HEAD 2>/dev/null | grep -E '^monitoring/(prometheus\.yml|alerts\.yml)' || true)
-GRAFANA_CHANGED=$(git diff --name-only HEAD~1 HEAD 2>/dev/null | grep -E '^monitoring/' || true)
-if [ -n "$PROM_CHANGED" ]; then
-echo "=== Prometheus config changed — recreating ==="
-docker compose -f monitoring/docker-compose.monitoring.yml up -d --force-recreate prometheus
+echo "=== Updating monitoring stack ==="
+if [ -f /opt/memento/.env.docker ]; then
+export $(cat /opt/memento/.env.docker | grep -v '^#' | xargs)
+fi
+if [ -n "${TELEGRAM_BOT_TOKEN:-}" ] && [ -n "${TELEGRAM_CHAT_ID:-}" ]; then
+echo "=== Starting Monitoring Stack (with Telegram bot) ==="
+docker compose -f monitoring/docker-compose.monitoring.yml --profile telegram up -d --remove-orphans 2>&1 || echo "WARN: Failed to bring up monitoring stack"
 else
-echo "=== Prometheus config unchanged — keeping TSDB history ==="
-docker compose -f monitoring/docker-compose.monitoring.yml up -d prometheus
+echo "=== Starting Monitoring Stack (without Telegram bot) ==="
+docker compose -f monitoring/docker-compose.monitoring.yml up -d --remove-orphans 2>&1 || echo "WARN: Failed to bring up monitoring stack"
 fi
-if [ -n "$GRAFANA_CHANGED" ]; then
-echo "=== Grafana config changed — recreating ==="
-docker compose -f monitoring/docker-compose.monitoring.yml up -d --force-recreate grafana
-else
-echo "=== Grafana config unchanged — keeping state ==="
-docker compose -f monitoring/docker-compose.monitoring.yml up -d grafana
-fi
+
+if docker ps --format '{{.Names}}' | grep -q "^memento-prometheus$"; then
+echo "=== Reloading Prometheus configuration ==="
+docker compose -f monitoring/docker-compose.monitoring.yml exec -T prometheus kill -SIGHUP 1 2>/dev/null || true
+fi
+if docker ps --format '{{.Names}}' | grep -q "^memento-alertmanager$"; then
+echo "=== Reloading Alertmanager configuration ==="
+docker compose -f monitoring/docker-compose.monitoring.yml exec -T alertmanager kill -SIGHUP 1 2>/dev/null || true
 fi
 docker compose ps
 telegram_notify "success" "Deployment successful — app is healthy"