fix(monitoring): fix production monitoring startup, alertmanager configuration, prometheus alert syntax, and mcp healthcheck
This commit is contained in:
@@ -150,7 +150,7 @@ services:
|
||||
cpus: '0.25'
|
||||
memory: 128M
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget --header \"x-api-key: ${MCP_API_KEY:-dev-key}\" -q -O /dev/null http://localhost:3001/ || exit 1"]
|
||||
test: ["CMD-SHELL", "wget --header \"x-api-key: $${MCP_API_KEY:-dev-key}\" -q -O /dev/null http://localhost:3001/ || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
@@ -37,7 +37,7 @@ groups:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "⚠️ Disk space below 15%"
|
||||
description: "Only {{ humanizePercentage (div (node_filesystem_avail_bytes{mountpoint='/'}) (node_filesystem_size_bytes{mountpoint='/'})) }} disk space remaining."
|
||||
description: "Only {{ $value | humanizePercentage }} disk space remaining."
|
||||
|
||||
- alert: DiskSpaceCritical
|
||||
expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.05
|
||||
|
||||
@@ -55,11 +55,14 @@ services:
|
||||
image: metalmatze/alertmanager-bot:0.4.3
|
||||
container_name: memento-alertmanager-telegram
|
||||
restart: unless-stopped
|
||||
profiles:
|
||||
- telegram
|
||||
environment:
|
||||
TELEGRAM_TOKEN: ${TELEGRAM_BOT_TOKEN:-}
|
||||
TELEGRAM_ADMIN: ${TELEGRAM_CHAT_ID:-}
|
||||
TELEGRAM_TOKEN: ${TELEGRAM_BOT_TOKEN:-dummy_token}
|
||||
TELEGRAM_ADMIN: ${TELEGRAM_CHAT_ID:-0}
|
||||
ALERTMANAGER_URL: http://alertmanager:9093
|
||||
STORE: /data/bolt.db
|
||||
STORE: bolt
|
||||
BOLT_PATH: /data/bolt.db
|
||||
LISTEN_ADDR: 0.0.0.0:8080
|
||||
volumes:
|
||||
- alertmanager-bot-data:/data
|
||||
@@ -91,7 +94,7 @@ services:
|
||||
container_name: memento-postgres-exporter
|
||||
restart: unless-stopped
|
||||
env_file:
|
||||
- /opt/memento/.env.docker
|
||||
- ../.env.docker
|
||||
environment:
|
||||
DATA_SOURCE_NAME: "postgresql://${POSTGRES_USER:-memento}:${POSTGRES_PASSWORD:-memento}@memento-postgres:5432/${POSTGRES_DB:-memento}?sslmode=disable"
|
||||
ports:
|
||||
@@ -141,4 +144,4 @@ networks:
|
||||
driver: bridge
|
||||
memento-net:
|
||||
external: true
|
||||
name: memento_memento-network
|
||||
name: ${MEMENTO_NETWORK_NAME:-memento_memento-network}
|
||||
|
||||
@@ -130,13 +130,7 @@ docker compose up -d --remove-orphans --force-recreate memento-note
|
||||
docker compose up -d memento-socket
|
||||
docker compose up -d mcp-server 2>/dev/null || true
|
||||
|
||||
# Redémarrer les exporters monitoring pour appliquer les configs à jour
|
||||
if docker ps --format '{{.Names}}' | grep -q "^memento-grafana$"; then
|
||||
echo "=== Updating monitoring exporters ==="
|
||||
env $(cat /opt/memento/.env.docker | grep -v '^#' | xargs) \
|
||||
docker compose -f monitoring/docker-compose.monitoring.yml up -d \
|
||||
postgres-exporter cadvisor node-exporter redis-exporter 2>&1 || true
|
||||
fi
|
||||
# Monitoring stack updates are handled at the end of successful deployment
|
||||
|
||||
echo "=== Migrations (Prisma CLI via node, pas npx) ==="
|
||||
if docker compose exec -T memento-note test -f ./node_modules/prisma/build/index.js 2>/dev/null; then
|
||||
@@ -152,24 +146,25 @@ for i in $(seq 1 "$HEALTH_CHECK_MAX_ITERATIONS"); do
|
||||
ACTUAL=$(echo "$BODY" | jq -r '.commit // empty' 2>/dev/null || true)
|
||||
if [ "$ACTUAL" = "$GIT_COMMIT" ]; then
|
||||
echo "OK build-info commit=$ACTUAL"
|
||||
if docker ps --format '{{.Names}}' | grep -q "^memento-grafana$"; then
|
||||
# Ne recréer Prometheus que si sa config a changé (préserve l'historique TSDB)
|
||||
PROM_CHANGED=$(git diff --name-only HEAD~1 HEAD 2>/dev/null | grep -E '^monitoring/(prometheus\.yml|alerts\.yml)' || true)
|
||||
GRAFANA_CHANGED=$(git diff --name-only HEAD~1 HEAD 2>/dev/null | grep -E '^monitoring/' || true)
|
||||
if [ -n "$PROM_CHANGED" ]; then
|
||||
echo "=== Prometheus config changed — recreating ==="
|
||||
docker compose -f monitoring/docker-compose.monitoring.yml up -d --force-recreate prometheus
|
||||
else
|
||||
echo "=== Prometheus config unchanged — keeping TSDB history ==="
|
||||
docker compose -f monitoring/docker-compose.monitoring.yml up -d prometheus
|
||||
fi
|
||||
if [ -n "$GRAFANA_CHANGED" ]; then
|
||||
echo "=== Grafana config changed — recreating ==="
|
||||
docker compose -f monitoring/docker-compose.monitoring.yml up -d --force-recreate grafana
|
||||
else
|
||||
echo "=== Grafana config unchanged — keeping state ==="
|
||||
docker compose -f monitoring/docker-compose.monitoring.yml up -d grafana
|
||||
fi
|
||||
echo "=== Updating monitoring stack ==="
|
||||
if [ -f /opt/memento/.env.docker ]; then
|
||||
export $(cat /opt/memento/.env.docker | grep -v '^#' | xargs)
|
||||
fi
|
||||
if [ -n "${TELEGRAM_BOT_TOKEN:-}" ] && [ -n "${TELEGRAM_CHAT_ID:-}" ]; then
|
||||
echo "=== Starting Monitoring Stack (with Telegram bot) ==="
|
||||
docker compose -f monitoring/docker-compose.monitoring.yml --profile telegram up -d --remove-orphans 2>&1 || echo "WARN: Failed to bring up monitoring stack"
|
||||
else
|
||||
echo "=== Starting Monitoring Stack (without Telegram bot) ==="
|
||||
docker compose -f monitoring/docker-compose.monitoring.yml up -d --remove-orphans 2>&1 || echo "WARN: Failed to bring up monitoring stack"
|
||||
fi
|
||||
|
||||
if docker ps --format '{{.Names}}' | grep -q "^memento-prometheus$"; then
|
||||
echo "=== Reloading Prometheus configuration ==="
|
||||
docker compose -f monitoring/docker-compose.monitoring.yml exec -T prometheus kill -SIGHUP 1 2>/dev/null || true
|
||||
fi
|
||||
if docker ps --format '{{.Names}}' | grep -q "^memento-alertmanager$"; then
|
||||
echo "=== Reloading Alertmanager configuration ==="
|
||||
docker compose -f monitoring/docker-compose.monitoring.yml exec -T alertmanager kill -SIGHUP 1 2>/dev/null || true
|
||||
fi
|
||||
docker compose ps
|
||||
telegram_notify "success" "Deployment successful — app is healthy"
|
||||
|
||||
Reference in New Issue
Block a user