From 79fd6553b7e47febf53d6c951faea123bb6bc923 Mon Sep 17 00:00:00 2001 From: Antigravity Date: Fri, 29 May 2026 14:49:34 +0000 Subject: [PATCH] =?UTF-8?q?feat(monitoring):=20business=20metrics=20+=20ha?= =?UTF-8?q?rdening=20s=C3=A9curit=C3=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Métriques business dans /api/metrics : - Abonnements par tier/status (BASIC/PRO/ENTERPRISE × ACTIVE/CANCELED) - Nouveaux abonnements ce mois vs mois dernier - Désabonnements / churn ce mois vs mois dernier - Utilisateurs actifs 7j / 30j (proxy : note modifiée) - Nouvelles inscriptions 7j / ce mois - Runs agents IA par status (30j + aujourd'hui) + tokens consommés - Usage IA par feature (requêtes + tokens ce mois) - Logins aujourd'hui / ce mois (via AuditLog) - Sessions brainstorm ce mois - Flashcards total + reviews ce mois Alertes Prometheus : - HighChurnRate (> 10 désabonnements ce mois) - NoNewUsersLast7Days (aucune inscription 7j) - AgentRunsHighErrorRate (> 20% erreurs agents) - BusinessMetricsCollectionFailed Hardening monitoring : - Ports monitoring → 127.0.0.1 (plus exposés publiquement) - Images pinned (prometheus v2.53.0, grafana 11.1.0, etc.) - alertmanager-bridge fake → metalmatze/alertmanager-bot:0.4.3 - /api/metrics sécurisé avec METRICS_TOKEN bearer - Prometheus auth bearer via credentials_file - Redis AOF + 256mb, healthcheck → /api/build-info - repeat_interval 4h, inhibit_rules alertmanager - Secrets CI/CD : AUTH_GOOGLE_SECRET, METRICS_TOKEN, GRAFANA, MCP_API_KEY Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitea/workflows/ci.yaml | 8 + .gitea/workflows/deploy.yaml | 7 + .gitignore | 1 + docker-compose.yml | 6 +- memento-note/app/api/metrics/route.ts | 187 ++++++++++++++++++++++- monitoring/alertmanager.yml | 20 ++- monitoring/alerts.yml | 102 +++++++++++-- monitoring/docker-compose.monitoring.yml | 70 +++++---- monitoring/prometheus.yml | 2 + 9 files changed, 352 insertions(+), 51 deletions(-) diff --git a/.gitea/workflows/ci.yaml b/.gitea/workflows/ci.yaml index a62aad5..38910dd 100644 --- a/.gitea/workflows/ci.yaml +++ b/.gitea/workflows/ci.yaml @@ -174,6 +174,9 @@ jobs: NEXT_PUBLIC_SOCKET_URL: ${{ vars.NEXT_PUBLIC_SOCKET_URL }} TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }} TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }} + METRICS_TOKEN: ${{ secrets.METRICS_TOKEN }} + GRAFANA_ADMIN_PASSWORD: ${{ secrets.GRAFANA_ADMIN_PASSWORD }} + MCP_API_KEY: ${{ secrets.MCP_API_KEY }} run: | ENV_FILE="/opt/memento/.env.docker" touch "$ENV_FILE" @@ -226,6 +229,11 @@ jobs: upsert NEXT_PUBLIC_SOCKET_URL "$NEXT_PUBLIC_SOCKET_URL" upsert TELEGRAM_BOT_TOKEN "$TELEGRAM_BOT_TOKEN" upsert TELEGRAM_CHAT_ID "$TELEGRAM_CHAT_ID" + upsert METRICS_TOKEN "$METRICS_TOKEN" + upsert GRAFANA_ADMIN_PASSWORD "$GRAFANA_ADMIN_PASSWORD" + upsert MCP_API_KEY "$MCP_API_KEY" + # Write metrics token file for Prometheus (same secret) + [ -n "$METRICS_TOKEN" ] && echo "$METRICS_TOKEN" > /opt/memento/monitoring/metrics-token && chmod 600 /opt/memento/monitoring/metrics-token || true - name: Deploy on 192.168.1.190 env: diff --git a/.gitea/workflows/deploy.yaml b/.gitea/workflows/deploy.yaml index 3012fc2..1e56506 100644 --- a/.gitea/workflows/deploy.yaml +++ b/.gitea/workflows/deploy.yaml @@ -61,6 +61,9 @@ jobs: NEXT_PUBLIC_SOCKET_URL: ${{ vars.NEXT_PUBLIC_SOCKET_URL }} TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }} TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }} + METRICS_TOKEN: ${{ secrets.METRICS_TOKEN }} + GRAFANA_ADMIN_PASSWORD: ${{ secrets.GRAFANA_ADMIN_PASSWORD }} + MCP_API_KEY: ${{ secrets.MCP_API_KEY }} run: | ENV_FILE="/opt/memento/.env.docker" touch "$ENV_FILE" @@ -113,6 +116,10 @@ jobs: upsert REDIS_HOST "redis" upsert TELEGRAM_BOT_TOKEN "$TELEGRAM_BOT_TOKEN" upsert TELEGRAM_CHAT_ID "$TELEGRAM_CHAT_ID" + upsert METRICS_TOKEN "$METRICS_TOKEN" + upsert GRAFANA_ADMIN_PASSWORD "$GRAFANA_ADMIN_PASSWORD" + upsert MCP_API_KEY "$MCP_API_KEY" + [ -n "$METRICS_TOKEN" ] && echo "$METRICS_TOKEN" > /opt/memento/monitoring/metrics-token && chmod 600 /opt/memento/monitoring/metrics-token || true - name: Deploy (full build, no CI artifact) env: diff --git a/.gitignore b/.gitignore index 7f40959..ebaf100 100644 --- a/.gitignore +++ b/.gitignore @@ -50,3 +50,4 @@ docker-data/ # Misc *.tsbuildinfo next-env.d.ts +monitoring/metrics-token diff --git a/docker-compose.yml b/docker-compose.yml index 31d08df..8ded3d7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -30,7 +30,7 @@ services: image: redis:7-alpine container_name: memento-redis restart: unless-stopped - command: redis-server --maxmemory 128mb --maxmemory-policy allkeys-lru + command: redis-server --maxmemory 256mb --maxmemory-policy allkeys-lru --appendonly yes --appendfsync everysec volumes: - redis-data:/data ports: @@ -73,7 +73,7 @@ services: condition: service_healthy restart: unless-stopped healthcheck: - test: ["CMD", "node", "-e", "require('http').get('http://localhost:3000/',r=>process.exit(r.statusCode<500?0:1)).on('error',()=>process.exit(1))"] + test: ["CMD", "node", "-e", "require('http').get('http://localhost:3000/api/build-info',r=>process.exit(r.statusCode<500?0:1)).on('error',()=>process.exit(1))"] interval: 15s timeout: 10s retries: 5 @@ -150,7 +150,7 @@ services: cpus: '0.25' memory: 128M healthcheck: - test: ["CMD-SHELL", "wget --header \"x-api-key: 1b11f42537c1442456ea413feee75bac\" -q -O /dev/null http://localhost:3001/ || exit 1"] + test: ["CMD-SHELL", "wget --header \"x-api-key: ${MCP_API_KEY:-dev-key}\" -q -O /dev/null http://localhost:3001/ || exit 1"] interval: 30s timeout: 10s retries: 3 diff --git a/memento-note/app/api/metrics/route.ts b/memento-note/app/api/metrics/route.ts index 8eb4283..c98b34f 100644 --- a/memento-note/app/api/metrics/route.ts +++ b/memento-note/app/api/metrics/route.ts @@ -4,7 +4,17 @@ import { redis } from '@/lib/redis' export const dynamic = 'force-dynamic' -export async function GET() { +export async function GET(req: Request) { + // Secure endpoint with bearer token (METRICS_TOKEN env var) + const metricsToken = process.env.METRICS_TOKEN + if (metricsToken) { + const authHeader = req.headers.get('authorization') ?? '' + const token = authHeader.startsWith('Bearer ') ? authHeader.slice(7) : '' + if (token !== metricsToken) { + return new NextResponse('Unauthorized', { status: 401 }) + } + } + const lines: string[] = [] const metric = (name: string, help: string, type: string, value: number | string, labels = '') => { @@ -13,10 +23,19 @@ export async function GET() { lines.push(labels ? `${name}{${labels}} ${value}` : `${name} ${value}`) } - // Uptime + // Multiple labeled values for the same metric name + const metricLabeled = (name: string, help: string, type: string, rows: Array<{ labels: string; value: number }>) => { + lines.push(`# HELP ${name} ${help}`) + lines.push(`# TYPE ${name} ${type}`) + for (const row of rows) { + lines.push(`${name}{${row.labels}} ${row.value}`) + } + } + + // ── Uptime ────────────────────────────────────────────────────────────── metric('memento_uptime_seconds', 'Application uptime in seconds', 'gauge', process.uptime().toFixed(2)) - // Database + // ── Infrastructure ─────────────────────────────────────────────────────── try { const dbStart = Date.now() const [noteCount, notebookCount, userCount] = await Promise.all([ @@ -34,7 +53,6 @@ export async function GET() { metric('memento_db_up', 'Database connectivity (1=up, 0=down)', 'gauge', 0) } - // Redis try { const redisStart = Date.now() await redis.ping() @@ -51,12 +69,171 @@ export async function GET() { metric('memento_redis_up', 'Redis connectivity (1=up, 0=down)', 'gauge', 0) } - // Node.js process memory const mem = process.memoryUsage() metric('memento_process_heap_used_bytes', 'Node.js heap used in bytes', 'gauge', mem.heapUsed) metric('memento_process_heap_total_bytes', 'Node.js heap total in bytes', 'gauge', mem.heapTotal) metric('memento_process_rss_bytes', 'Node.js RSS memory in bytes', 'gauge', mem.rss) + // ── Business metrics ───────────────────────────────────────────────────── + try { + const now = new Date() + const startOfMonth = new Date(now.getFullYear(), now.getMonth(), 1) + const startOfLastMonth = new Date(now.getFullYear(), now.getMonth() - 1, 1) + const endOfLastMonth = new Date(now.getFullYear(), now.getMonth(), 0, 23, 59, 59) + const last7days = new Date(now.getTime() - 7 * 24 * 60 * 60 * 1000) + const last30days = new Date(now.getTime() - 30 * 24 * 60 * 60 * 1000) + + // ── Subscriptions par tier ── + const subsByTier = await prisma.subscription.groupBy({ + by: ['tier', 'status'], + _count: { _all: true }, + }) + const subRows = subsByTier.map(r => ({ + labels: `tier="${r.tier}",status="${r.status}"`, + value: r._count._all, + })) + metricLabeled( + 'memento_subscriptions_total', + 'Total subscriptions by tier and status', + 'gauge', + subRows, + ) + + // Totaux agrégés utiles + const activeSubs = subsByTier + .filter(r => r.status === 'ACTIVE') + .reduce((s, r) => s + r._count._all, 0) + const canceledSubs = subsByTier + .filter(r => r.status === 'CANCELED') + .reduce((s, r) => s + r._count._all, 0) + metric('memento_subscriptions_active_total', 'Total active subscriptions (all tiers)', 'gauge', activeSubs) + metric('memento_subscriptions_canceled_total', 'Total canceled subscriptions (all tiers)', 'gauge', canceledSubs) + + // ── Nouveaux abonnements ce mois ── + const newSubsThisMonth = await prisma.subscription.count({ + where: { createdAt: { gte: startOfMonth }, status: 'ACTIVE' }, + }) + const newSubsLastMonth = await prisma.subscription.count({ + where: { createdAt: { gte: startOfLastMonth, lte: endOfLastMonth }, status: 'ACTIVE' }, + }) + metric('memento_subscriptions_new_this_month', 'New active subscriptions created this month', 'gauge', newSubsThisMonth) + metric('memento_subscriptions_new_last_month', 'New active subscriptions created last month', 'gauge', newSubsLastMonth) + + // ── Désabonnements (cancelAtPeriodEnd ou canceledAt ce mois) ── + const churnsThisMonth = await prisma.subscription.count({ + where: { + OR: [ + { canceledAt: { gte: startOfMonth } }, + { cancelAtPeriodEnd: true, updatedAt: { gte: startOfMonth } }, + ], + }, + }) + const churnsLastMonth = await prisma.subscription.count({ + where: { + OR: [ + { canceledAt: { gte: startOfLastMonth, lte: endOfLastMonth } }, + { cancelAtPeriodEnd: true, updatedAt: { gte: startOfLastMonth, lte: endOfLastMonth } }, + ], + }, + }) + metric('memento_churn_this_month', 'Cancellations / pending cancellations this month', 'gauge', churnsThisMonth) + metric('memento_churn_last_month', 'Cancellations / pending cancellations last month', 'gauge', churnsLastMonth) + + // ── Utilisateurs actifs ── + const activeUsers7d = await prisma.note.groupBy({ + by: ['userId'], + where: { updatedAt: { gte: last7days } }, + }) + const activeUsers30d = await prisma.note.groupBy({ + by: ['userId'], + where: { updatedAt: { gte: last30days } }, + }) + metric('memento_active_users_7d', 'Users who modified at least one note in the last 7 days', 'gauge', activeUsers7d.length) + metric('memento_active_users_30d', 'Users who modified at least one note in the last 30 days', 'gauge', activeUsers30d.length) + + // Nouveaux utilisateurs + const newUsers7d = await prisma.user.count({ where: { createdAt: { gte: last7days } } }) + const newUsersThisMonth = await prisma.user.count({ where: { createdAt: { gte: startOfMonth } } }) + metric('memento_new_users_7d', 'New user registrations in the last 7 days', 'gauge', newUsers7d) + metric('memento_new_users_this_month', 'New user registrations this month', 'gauge', newUsersThisMonth) + + // ── Agents IA ── + const agentsByStatus = await prisma.agentAction.groupBy({ + by: ['status'], + _count: { _all: true }, + where: { createdAt: { gte: last30days } }, + }) + const agentRows = agentsByStatus.map(r => ({ + labels: `status="${r.status}"`, + value: r._count._all, + })) + metricLabeled( + 'memento_agent_runs_30d', + 'Agent runs by status in the last 30 days', + 'gauge', + agentRows, + ) + + const agentRunsToday = await prisma.agentAction.count({ + where: { createdAt: { gte: new Date(now.getFullYear(), now.getMonth(), now.getDate()) } }, + }) + metric('memento_agent_runs_today', 'Agent runs triggered today', 'gauge', agentRunsToday) + + // Tokens consommés par les agents + const agentTokens = await prisma.agentAction.aggregate({ + _sum: { tokensUsed: true }, + where: { createdAt: { gte: startOfMonth } }, + }) + metric('memento_agent_tokens_this_month', 'Total tokens consumed by agents this month', 'gauge', agentTokens._sum.tokensUsed ?? 0) + + // ── Usage IA par feature (ce mois) ── + const usageByFeature = await prisma.usageLog.groupBy({ + by: ['feature'], + _sum: { requestsCount: true, tokensUsed: true }, + where: { periodStart: { gte: startOfMonth } }, + }) + const usageRequestRows = usageByFeature.map(r => ({ + labels: `feature="${r.feature}"`, + value: r._sum.requestsCount ?? 0, + })) + const usageTokenRows = usageByFeature.map(r => ({ + labels: `feature="${r.feature}"`, + value: r._sum.tokensUsed ?? 0, + })) + metricLabeled('memento_ai_requests_this_month', 'AI API requests by feature this month', 'gauge', usageRequestRows) + metricLabeled('memento_ai_tokens_this_month', 'AI tokens consumed by feature this month', 'gauge', usageTokenRows) + + // ── Logins (AuditLog) ── + const loginsToday = await prisma.auditLog.count({ + where: { + action: 'LOGIN', + createdAt: { gte: new Date(now.getFullYear(), now.getMonth(), now.getDate()) }, + }, + }) + const loginsThisMonth = await prisma.auditLog.count({ + where: { action: 'LOGIN', createdAt: { gte: startOfMonth } }, + }) + metric('memento_logins_today', 'Login events today', 'gauge', loginsToday) + metric('memento_logins_this_month', 'Login events this month', 'gauge', loginsThisMonth) + + // ── Brainstorm sessions ── + const brainstormThisMonth = await prisma.brainstormSession.count({ + where: { createdAt: { gte: startOfMonth } }, + }) + metric('memento_brainstorm_sessions_this_month', 'Brainstorm sessions created this month', 'gauge', brainstormThisMonth) + + // ── Flashcards ── + const flashcardsTotal = await prisma.flashcard.count() + const flashcardsReviewedThisMonth = await prisma.flashcardReview.count({ + where: { reviewedAt: { gte: startOfMonth } }, + }) + metric('memento_flashcards_total', 'Total flashcards in the system', 'gauge', flashcardsTotal) + metric('memento_flashcard_reviews_this_month', 'Flashcard review events this month', 'gauge', flashcardsReviewedThisMonth) + } catch (err) { + console.error('[metrics] Business metrics error:', err) + metric('memento_business_metrics_error', 'Business metrics collection failed (1=error)', 'gauge', 1) + } + const body = lines.join('\n') + '\n' return new NextResponse(body, { diff --git a/monitoring/alertmanager.yml b/monitoring/alertmanager.yml index d07f3b5..f847637 100644 --- a/monitoring/alertmanager.yml +++ b/monitoring/alertmanager.yml @@ -1,11 +1,23 @@ route: - receiver: 'telegram' + receiver: 'telegram-bot' group_wait: 10s group_interval: 5m - repeat_interval: 1h + repeat_interval: 4h + routes: + - match: + severity: critical + receiver: 'telegram-bot' + repeat_interval: 1h receivers: - - name: 'telegram' + - name: 'telegram-bot' webhook_configs: - - url: 'http://alertmanager-bridge:8080/alert' + - url: 'http://alertmanager-telegram:8080/alerts' send_resolved: true + +inhibit_rules: + - source_match: + severity: critical + target_match: + severity: warning + equal: ['alertname'] diff --git a/monitoring/alerts.yml b/monitoring/alerts.yml index 7cdb566..25898d2 100644 --- a/monitoring/alerts.yml +++ b/monitoring/alerts.yml @@ -7,7 +7,8 @@ groups: labels: severity: critical annotations: - summary: "Memento app is DOWN" + summary: "🔴 Memento app is DOWN" + description: "The Next.js application has been unreachable for 2+ minutes." - alert: PostgresDown expr: up{job="postgres"} == 0 @@ -15,7 +16,8 @@ groups: labels: severity: critical annotations: - summary: "PostgreSQL is DOWN" + summary: "🔴 PostgreSQL is DOWN" + description: "Database has been unreachable for 1+ minute." - alert: RedisDown expr: up{job="redis"} == 0 @@ -23,15 +25,27 @@ groups: labels: severity: critical annotations: - summary: "Redis is DOWN" + summary: "🔴 Redis is DOWN" + description: "Redis cache/quota store has been unreachable for 1+ minute." + - name: resources + rules: - alert: DiskSpaceLow expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15 for: 5m labels: severity: warning annotations: - summary: "Disk space below 15%" + summary: "⚠️ Disk space below 15%" + description: "Only {{ humanizePercentage (div (node_filesystem_avail_bytes{mountpoint='/'}) (node_filesystem_size_bytes{mountpoint='/'})) }} disk space remaining." + + - alert: DiskSpaceCritical + expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.05 + for: 1m + labels: + severity: critical + annotations: + summary: "🔴 Disk space CRITICAL (< 5%)" - alert: HighMemoryUsage expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90 @@ -39,15 +53,25 @@ groups: labels: severity: warning annotations: - summary: "Memory usage above 90%" + summary: "⚠️ Memory usage above 90%" + - alert: HighCPUUsage + expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85 + for: 10m + labels: + severity: warning + annotations: + summary: "⚠️ CPU usage above 85% for 10 minutes" + + - name: database + rules: - alert: PostgresConnectionsHigh expr: pg_stat_activity_count > 80 for: 5m labels: severity: warning annotations: - summary: "PostgreSQL connections above 80" + summary: "⚠️ PostgreSQL connections above 80" - alert: PostgresSlowQueries expr: pg_stat_statements_mean_exec_seconds > 5 @@ -55,15 +79,33 @@ groups: labels: severity: warning annotations: - summary: "PostgreSQL slow queries detected" + summary: "⚠️ PostgreSQL slow queries detected (avg > 5s)" + - alert: RedisMemoryHigh + expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.85 + for: 5m + labels: + severity: warning + annotations: + summary: "⚠️ Redis memory above 85% of limit" + + - name: application + rules: - alert: HighErrorRate expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05 for: 3m labels: severity: warning annotations: - summary: "HTTP 5xx error rate above 5%" + summary: "⚠️ HTTP 5xx error rate above 5%" + + - alert: AppHighHeapMemory + expr: memento_process_heap_used_bytes / memento_process_heap_total_bytes > 0.90 + for: 5m + labels: + severity: warning + annotations: + summary: "⚠️ Next.js heap usage above 90%" - alert: ContainerRestarted expr: increase(container_restart_count[1h]) > 0 @@ -71,4 +113,46 @@ groups: labels: severity: warning annotations: - summary: "Container restarted in the last hour" + summary: "⚠️ Container restarted in the last hour" + description: "Container {{ $labels.name }} restarted unexpectedly." + + # ── Business Alerts ────────────────────────────────────────────────────────── + - name: business + rules: + - alert: HighChurnRate + expr: memento_churn_this_month > 10 + for: 0m + labels: + severity: warning + annotations: + summary: "📉 Churn élevé ce mois : {{ $value }} désabonnements" + description: "Plus de 10 désabonnements enregistrés ce mois. Investiguer les raisons." + + - alert: NoNewUsersLast7Days + expr: memento_new_users_7d == 0 + for: 1h + labels: + severity: warning + annotations: + summary: "📊 Aucune nouvelle inscription depuis 7 jours" + description: "Vérifier le funnel d'onboarding et les canaux d'acquisition." + + - alert: AgentRunsHighErrorRate + expr: | + memento_agent_runs_30d{status="error"} / + (memento_agent_runs_30d{status="success"} + memento_agent_runs_30d{status="error"} + 1) > 0.2 + for: 5m + labels: + severity: warning + annotations: + summary: "🤖 Taux d'erreur agents IA > 20% ce mois" + description: "{{ $value | humanizePercentage }} des runs d'agents échouent." + + - alert: BusinessMetricsCollectionFailed + expr: memento_business_metrics_error == 1 + for: 5m + labels: + severity: warning + annotations: + summary: "⚠️ Collecte métriques business en erreur" + description: "L'endpoint /api/metrics ne peut pas interroger la base pour les métriques business." diff --git a/monitoring/docker-compose.monitoring.yml b/monitoring/docker-compose.monitoring.yml index 477db31..2242e32 100644 --- a/monitoring/docker-compose.monitoring.yml +++ b/monitoring/docker-compose.monitoring.yml @@ -1,49 +1,73 @@ services: prometheus: - image: prom/prometheus:latest + image: prom/prometheus:v2.53.0 container_name: memento-prometheus restart: unless-stopped volumes: - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro - ./alerts.yml:/etc/prometheus/alerts.yml:ro + - ./metrics-token:/etc/prometheus/metrics-token:ro - prometheus-data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.retention.time=30d' + - '--web.enable-lifecycle' ports: - - "9090:9090" + - "127.0.0.1:9090:9090" networks: - memento-monitoring - memento-net grafana: - image: grafana/grafana:latest + image: grafana/grafana:11.1.0 container_name: memento-grafana restart: unless-stopped environment: GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-memento-admin} GF_USERS_ALLOW_SIGN_UP: "false" - GF_SERVER_ROOT_URL: "${GRAFANA_URL:-http://localhost:3001}" + GF_SERVER_ROOT_URL: "${GRAFANA_URL:-http://localhost:3002}" + GF_SECURITY_DISABLE_GRAVATAR: "true" + GF_ANALYTICS_REPORTING_ENABLED: "false" + GF_ANALYTICS_CHECK_FOR_UPDATES: "false" volumes: - grafana-data:/var/lib/grafana - ./grafana-provisioning:/etc/grafana/provisioning:ro - ./grafana-dashboards:/etc/grafana/dashboards:ro ports: - - "3002:3000" + - "127.0.0.1:3002:3000" networks: - memento-monitoring alertmanager: - image: prom/alertmanager:latest + image: prom/alertmanager:v0.27.0 container_name: memento-alertmanager restart: unless-stopped volumes: - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro ports: - - "9093:9093" + - "127.0.0.1:9093:9093" + networks: + - memento-monitoring + + # Real Telegram webhook bridge (replaces the fake alpine sleep) + alertmanager-telegram: + image: metalmatze/alertmanager-bot:0.4.3 + container_name: memento-alertmanager-telegram + restart: unless-stopped + environment: + TELEGRAM_TOKEN: ${TELEGRAM_BOT_TOKEN:-} + TELEGRAM_ADMIN: ${TELEGRAM_CHAT_ID:-} + ALERTMANAGER_URL: http://alertmanager:9093 + STORE: /data/bolt.db + LISTEN_ADDR: 0.0.0.0:8080 + volumes: + - alertmanager-bot-data:/data networks: - memento-monitoring node-exporter: - image: prom/node-exporter:latest + image: prom/node-exporter:v1.8.1 container_name: memento-node-exporter restart: unless-stopped pid: host @@ -57,13 +81,13 @@ services: - '--path.rootfs=/rootfs' - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' ports: - - "9100:9100" + - "127.0.0.1:9100:9100" networks: - memento-monitoring - memento-net postgres-exporter: - image: prometheuscommunity/postgres-exporter:latest + image: prometheuscommunity/postgres-exporter:v0.15.0 container_name: memento-postgres-exporter restart: unless-stopped env_file: @@ -71,25 +95,25 @@ services: environment: DATA_SOURCE_NAME: "postgresql://${POSTGRES_USER:-memento}:${POSTGRES_PASSWORD:-memento}@memento-postgres:5432/${POSTGRES_DB:-memento}?sslmode=disable" ports: - - "9187:9187" + - "127.0.0.1:9187:9187" networks: - memento-monitoring - memento-net redis-exporter: - image: oliver006/redis_exporter:latest + image: oliver006/redis_exporter:v1.62.0 container_name: memento-redis-exporter restart: unless-stopped environment: REDIS_ADDR: "redis://memento-redis:6379" ports: - - "9121:9121" + - "127.0.0.1:9121:9121" networks: - memento-monitoring - memento-net cadvisor: - image: gcr.io/cadvisor/cadvisor:latest + image: gcr.io/cadvisor/cadvisor:v0.49.1 container_name: memento-cadvisor restart: unless-stopped privileged: true @@ -102,29 +126,15 @@ services: - /sys:/sys:ro - /var/lib/docker/:/var/lib/docker:ro ports: - - "8081:8080" + - "127.0.0.1:8081:8080" networks: - memento-monitoring - memento-net - alertmanager-bridge: - image: alpine:latest - container_name: memento-alertmanager-bridge - restart: unless-stopped - entrypoint: | - sh -c ' - apk add --no-cache curl - while true; do - echo "Bridge running - configure webhook to forward to Telegram" - sleep 3600 - done - ' - networks: - - memento-monitoring - volumes: prometheus-data: grafana-data: + alertmanager-bot-data: networks: memento-monitoring: diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml index e21b2bb..26c977d 100644 --- a/monitoring/prometheus.yml +++ b/monitoring/prometheus.yml @@ -13,6 +13,8 @@ alerting: scrape_configs: - job_name: 'memento-app' metrics_path: '/api/metrics' + authorization: + credentials_file: /etc/prometheus/metrics-token static_configs: - targets: ['memento-note:3000']