From ff0fae9ae783193460da178a1af847c2eae5accf Mon Sep 17 00:00:00 2001 From: Antigravity Date: Sat, 30 May 2026 11:42:32 +0000 Subject: [PATCH] fix(monitoring): fix production monitoring startup, alertmanager configuration, prometheus alert syntax, and mcp healthcheck --- docker-compose.yml | 2 +- monitoring/alerts.yml | 2 +- monitoring/docker-compose.monitoring.yml | 13 ++++--- scripts/deploy-prod.sh | 45 +++++++++++------------- 4 files changed, 30 insertions(+), 32 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 8ded3d7..3d6d805 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -150,7 +150,7 @@ services: cpus: '0.25' memory: 128M healthcheck: - test: ["CMD-SHELL", "wget --header \"x-api-key: ${MCP_API_KEY:-dev-key}\" -q -O /dev/null http://localhost:3001/ || exit 1"] + test: ["CMD-SHELL", "wget --header \"x-api-key: $${MCP_API_KEY:-dev-key}\" -q -O /dev/null http://localhost:3001/ || exit 1"] interval: 30s timeout: 10s retries: 3 diff --git a/monitoring/alerts.yml b/monitoring/alerts.yml index 25898d2..0cf771e 100644 --- a/monitoring/alerts.yml +++ b/monitoring/alerts.yml @@ -37,7 +37,7 @@ groups: severity: warning annotations: summary: "⚠️ Disk space below 15%" - description: "Only {{ humanizePercentage (div (node_filesystem_avail_bytes{mountpoint='/'}) (node_filesystem_size_bytes{mountpoint='/'})) }} disk space remaining." + description: "Only {{ $value | humanizePercentage }} disk space remaining." - alert: DiskSpaceCritical expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.05 diff --git a/monitoring/docker-compose.monitoring.yml b/monitoring/docker-compose.monitoring.yml index fedfdf4..b471d3d 100644 --- a/monitoring/docker-compose.monitoring.yml +++ b/monitoring/docker-compose.monitoring.yml @@ -55,11 +55,14 @@ services: image: metalmatze/alertmanager-bot:0.4.3 container_name: memento-alertmanager-telegram restart: unless-stopped + profiles: + - telegram environment: - TELEGRAM_TOKEN: ${TELEGRAM_BOT_TOKEN:-} - TELEGRAM_ADMIN: ${TELEGRAM_CHAT_ID:-} + TELEGRAM_TOKEN: ${TELEGRAM_BOT_TOKEN:-dummy_token} + TELEGRAM_ADMIN: ${TELEGRAM_CHAT_ID:-0} ALERTMANAGER_URL: http://alertmanager:9093 - STORE: /data/bolt.db + STORE: bolt + BOLT_PATH: /data/bolt.db LISTEN_ADDR: 0.0.0.0:8080 volumes: - alertmanager-bot-data:/data @@ -91,7 +94,7 @@ services: container_name: memento-postgres-exporter restart: unless-stopped env_file: - - /opt/memento/.env.docker + - ../.env.docker environment: DATA_SOURCE_NAME: "postgresql://${POSTGRES_USER:-memento}:${POSTGRES_PASSWORD:-memento}@memento-postgres:5432/${POSTGRES_DB:-memento}?sslmode=disable" ports: @@ -141,4 +144,4 @@ networks: driver: bridge memento-net: external: true - name: memento_memento-network + name: ${MEMENTO_NETWORK_NAME:-memento_memento-network} diff --git a/scripts/deploy-prod.sh b/scripts/deploy-prod.sh index 5bbf3ab..a80431f 100755 --- a/scripts/deploy-prod.sh +++ b/scripts/deploy-prod.sh @@ -130,13 +130,7 @@ docker compose up -d --remove-orphans --force-recreate memento-note docker compose up -d memento-socket docker compose up -d mcp-server 2>/dev/null || true -# Redémarrer les exporters monitoring pour appliquer les configs à jour -if docker ps --format '{{.Names}}' | grep -q "^memento-grafana$"; then - echo "=== Updating monitoring exporters ===" - env $(cat /opt/memento/.env.docker | grep -v '^#' | xargs) \ - docker compose -f monitoring/docker-compose.monitoring.yml up -d \ - postgres-exporter cadvisor node-exporter redis-exporter 2>&1 || true -fi +# Monitoring stack updates are handled at the end of successful deployment echo "=== Migrations (Prisma CLI via node, pas npx) ===" if docker compose exec -T memento-note test -f ./node_modules/prisma/build/index.js 2>/dev/null; then @@ -152,24 +146,25 @@ for i in $(seq 1 "$HEALTH_CHECK_MAX_ITERATIONS"); do ACTUAL=$(echo "$BODY" | jq -r '.commit // empty' 2>/dev/null || true) if [ "$ACTUAL" = "$GIT_COMMIT" ]; then echo "OK build-info commit=$ACTUAL" - if docker ps --format '{{.Names}}' | grep -q "^memento-grafana$"; then - # Ne recréer Prometheus que si sa config a changé (préserve l'historique TSDB) - PROM_CHANGED=$(git diff --name-only HEAD~1 HEAD 2>/dev/null | grep -E '^monitoring/(prometheus\.yml|alerts\.yml)' || true) - GRAFANA_CHANGED=$(git diff --name-only HEAD~1 HEAD 2>/dev/null | grep -E '^monitoring/' || true) - if [ -n "$PROM_CHANGED" ]; then - echo "=== Prometheus config changed — recreating ===" - docker compose -f monitoring/docker-compose.monitoring.yml up -d --force-recreate prometheus - else - echo "=== Prometheus config unchanged — keeping TSDB history ===" - docker compose -f monitoring/docker-compose.monitoring.yml up -d prometheus - fi - if [ -n "$GRAFANA_CHANGED" ]; then - echo "=== Grafana config changed — recreating ===" - docker compose -f monitoring/docker-compose.monitoring.yml up -d --force-recreate grafana - else - echo "=== Grafana config unchanged — keeping state ===" - docker compose -f monitoring/docker-compose.monitoring.yml up -d grafana - fi + echo "=== Updating monitoring stack ===" + if [ -f /opt/memento/.env.docker ]; then + export $(cat /opt/memento/.env.docker | grep -v '^#' | xargs) + fi + if [ -n "${TELEGRAM_BOT_TOKEN:-}" ] && [ -n "${TELEGRAM_CHAT_ID:-}" ]; then + echo "=== Starting Monitoring Stack (with Telegram bot) ===" + docker compose -f monitoring/docker-compose.monitoring.yml --profile telegram up -d --remove-orphans 2>&1 || echo "WARN: Failed to bring up monitoring stack" + else + echo "=== Starting Monitoring Stack (without Telegram bot) ===" + docker compose -f monitoring/docker-compose.monitoring.yml up -d --remove-orphans 2>&1 || echo "WARN: Failed to bring up monitoring stack" + fi + + if docker ps --format '{{.Names}}' | grep -q "^memento-prometheus$"; then + echo "=== Reloading Prometheus configuration ===" + docker compose -f monitoring/docker-compose.monitoring.yml exec -T prometheus kill -SIGHUP 1 2>/dev/null || true + fi + if docker ps --format '{{.Names}}' | grep -q "^memento-alertmanager$"; then + echo "=== Reloading Alertmanager configuration ===" + docker compose -f monitoring/docker-compose.monitoring.yml exec -T alertmanager kill -SIGHUP 1 2>/dev/null || true fi docker compose ps telegram_notify "success" "Deployment successful — app is healthy"