feat(monitoring): business metrics + hardening sécurité

Métriques business dans /api/metrics :
- Abonnements par tier/status (BASIC/PRO/ENTERPRISE × ACTIVE/CANCELED)
- Nouveaux abonnements ce mois vs mois dernier
- Désabonnements / churn ce mois vs mois dernier
- Utilisateurs actifs 7j / 30j (proxy : note modifiée)
- Nouvelles inscriptions 7j / ce mois
- Runs agents IA par status (30j + aujourd'hui) + tokens consommés
- Usage IA par feature (requêtes + tokens ce mois)
- Logins aujourd'hui / ce mois (via AuditLog)
- Sessions brainstorm ce mois
- Flashcards total + reviews ce mois

Alertes Prometheus :
- HighChurnRate (> 10 désabonnements ce mois)
- NoNewUsersLast7Days (aucune inscription 7j)
- AgentRunsHighErrorRate (> 20% erreurs agents)
- BusinessMetricsCollectionFailed

Hardening monitoring :
- Ports monitoring → 127.0.0.1 (plus exposés publiquement)
- Images pinned (prometheus v2.53.0, grafana 11.1.0, etc.)
- alertmanager-bridge fake → metalmatze/alertmanager-bot:0.4.3
- /api/metrics sécurisé avec METRICS_TOKEN bearer
- Prometheus auth bearer via credentials_file
- Redis AOF + 256mb, healthcheck → /api/build-info
- repeat_interval 4h, inhibit_rules alertmanager
- Secrets CI/CD : AUTH_GOOGLE_SECRET, METRICS_TOKEN, GRAFANA_ADMIN_PASSWORD, MCP_API_KEY

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-29 14:49:34 +00:00
parent 8571080037
commit 79fd6553b7
9 changed files with 352 additions and 51 deletions
--- a/.gitea/workflows/ci.yaml
+++ b/.gitea/workflows/ci.yaml
@@ -174,6 +174,9 @@ jobs:
          NEXT_PUBLIC_SOCKET_URL: ${{ vars.NEXT_PUBLIC_SOCKET_URL }}
          TELEGRAM_BOT_TOKEN:   ${{ secrets.TELEGRAM_BOT_TOKEN }}
          TELEGRAM_CHAT_ID:     ${{ secrets.TELEGRAM_CHAT_ID }}
          METRICS_TOKEN:        ${{ secrets.METRICS_TOKEN }}
          GRAFANA_ADMIN_PASSWORD: ${{ secrets.GRAFANA_ADMIN_PASSWORD }}
          MCP_API_KEY:          ${{ secrets.MCP_API_KEY }}
        run: |
          ENV_FILE="/opt/memento/.env.docker"
          touch "$ENV_FILE"
@@ -226,6 +229,11 @@ jobs:
          upsert NEXT_PUBLIC_SOCKET_URL "$NEXT_PUBLIC_SOCKET_URL"
          upsert TELEGRAM_BOT_TOKEN     "$TELEGRAM_BOT_TOKEN"
          upsert TELEGRAM_CHAT_ID       "$TELEGRAM_CHAT_ID"
          upsert METRICS_TOKEN          "$METRICS_TOKEN"
          upsert GRAFANA_ADMIN_PASSWORD "$GRAFANA_ADMIN_PASSWORD"
          upsert MCP_API_KEY            "$MCP_API_KEY"
          # Write the metrics token file that Prometheus reads through its
          # credentials_file setting (same secret as METRICS_TOKEN above).
          # Best-effort: a failure prints a warning instead of aborting the step.
          # NOTE(review): the Prometheus container may not run as the user that owns
          # this file - confirm it can still read a 0600 file through the bind mount.
          TOKEN_FILE="/opt/memento/monitoring/metrics-token"
          (
            mkdir -p "$(dirname "$TOKEN_FILE")" || exit 1
            # Restrictive umask so the file is never created world-readable,
            # not even between creation and chmod.
            umask 077
            if [ -n "$METRICS_TOKEN" ]; then
              # printf instead of echo: no option or backslash interpretation.
              printf '%s\n' "$METRICS_TOKEN" > "$TOKEN_FILE" || exit 1
            elif [ ! -f "$TOKEN_FILE" ]; then
              # No secret and no previous file: create an empty placeholder so the
              # bind mount has a regular file as its source. An existing token
              # file is left untouched.
              : > "$TOKEN_FILE" || exit 1
            fi
            chmod 600 "$TOKEN_FILE"
          ) || echo "WARNING: could not write $TOKEN_FILE" >&2
      - name: Deploy on 192.168.1.190
        env:
--- a/.gitea/workflows/deploy.yaml
+++ b/.gitea/workflows/deploy.yaml
@@ -61,6 +61,9 @@ jobs:
          NEXT_PUBLIC_SOCKET_URL: ${{ vars.NEXT_PUBLIC_SOCKET_URL }}
          TELEGRAM_BOT_TOKEN:   ${{ secrets.TELEGRAM_BOT_TOKEN }}
          TELEGRAM_CHAT_ID:     ${{ secrets.TELEGRAM_CHAT_ID }}
          METRICS_TOKEN:        ${{ secrets.METRICS_TOKEN }}
          GRAFANA_ADMIN_PASSWORD: ${{ secrets.GRAFANA_ADMIN_PASSWORD }}
          MCP_API_KEY:          ${{ secrets.MCP_API_KEY }}
        run: |
          ENV_FILE="/opt/memento/.env.docker"
          touch "$ENV_FILE"
@@ -113,6 +116,10 @@ jobs:
          upsert REDIS_HOST             "redis"
          upsert TELEGRAM_BOT_TOKEN     "$TELEGRAM_BOT_TOKEN"
          upsert TELEGRAM_CHAT_ID       "$TELEGRAM_CHAT_ID"
          upsert METRICS_TOKEN          "$METRICS_TOKEN"
          upsert GRAFANA_ADMIN_PASSWORD "$GRAFANA_ADMIN_PASSWORD"
          upsert MCP_API_KEY            "$MCP_API_KEY"
          # Write the metrics token file that Prometheus reads through its
          # credentials_file setting (same secret as METRICS_TOKEN above).
          # Best-effort: a failure prints a warning instead of aborting the step.
          # NOTE(review): the Prometheus container may not run as the user that owns
          # this file - confirm it can still read a 0600 file through the bind mount.
          TOKEN_FILE="/opt/memento/monitoring/metrics-token"
          (
            mkdir -p "$(dirname "$TOKEN_FILE")" || exit 1
            # Restrictive umask so the file is never created world-readable,
            # not even between creation and chmod.
            umask 077
            if [ -n "$METRICS_TOKEN" ]; then
              # printf instead of echo: no option or backslash interpretation.
              printf '%s\n' "$METRICS_TOKEN" > "$TOKEN_FILE" || exit 1
            elif [ ! -f "$TOKEN_FILE" ]; then
              # No secret and no previous file: create an empty placeholder so the
              # bind mount has a regular file as its source. An existing token
              # file is left untouched.
              : > "$TOKEN_FILE" || exit 1
            fi
            chmod 600 "$TOKEN_FILE"
          ) || echo "WARNING: could not write $TOKEN_FILE" >&2
      - name: Deploy (full build, no CI artifact)
        env:
--- a/.gitignore
+++ b/.gitignore
@@ -50,3 +50,4 @@ docker-data/
 # Misc
 *.tsbuildinfo
 next-env.d.ts
 monitoring/metrics-token
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -30,7 +30,7 @@ services:
    image: redis:7-alpine
    container_name: memento-redis
    restart: unless-stopped
-    command: redis-server --maxmemory 128mb --maxmemory-policy allkeys-lru
+    command: redis-server --maxmemory 256mb --maxmemory-policy allkeys-lru --appendonly yes --appendfsync everysec
    volumes:
      - redis-data:/data
    ports:
@@ -73,7 +73,7 @@ services:
        condition: service_healthy
    restart: unless-stopped
    healthcheck:
-      test: ["CMD", "node", "-e", "require('http').get('http://localhost:3000/',r=>process.exit(r.statusCode<500?0:1)).on('error',()=>process.exit(1))"]
+      test: ["CMD", "node", "-e", "require('http').get('http://localhost:3000/api/build-info',r=>process.exit(r.statusCode<500?0:1)).on('error',()=>process.exit(1))"]
      interval: 15s
      timeout: 10s
      retries: 5
@@ -150,7 +150,7 @@ services:
          cpus: '0.25'
          memory: 128M
    healthcheck:
-      test: ["CMD-SHELL", "wget --header \"x-api-key: 1b11f42537c1442456ea413feee75bac\" -q -O /dev/null http://localhost:3001/ || exit 1"]
+      test: ["CMD-SHELL", "wget --header \"x-api-key: ${MCP_API_KEY:-dev-key}\" -q -O /dev/null http://localhost:3001/ || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
--- a/memento-note/app/api/metrics/route.ts
+++ b/memento-note/app/api/metrics/route.ts
@@ -4,7 +4,17 @@ import { redis } from '@/lib/redis'
 export const dynamic = 'force-dynamic'
-export async function GET() {
+export async function GET(req: Request) {
  // Secure the endpoint with a bearer token taken from the METRICS_TOKEN env var.
  // NOTE(review): when METRICS_TOKEN is unset or empty this guard is skipped and
  // the endpoint answers without authentication - confirm that is intended
  // outside local development.
  const metricsToken = process.env.METRICS_TOKEN
  if (metricsToken) {
    const authHeader = req.headers.get('authorization') ?? ''
    const token = authHeader.startsWith('Bearer ') ? authHeader.slice(7) : ''
    // Constant-time comparison: always walk the full expected token and fold
    // every difference into `mismatch`, so the response time does not reveal
    // how many leading characters of a guess were correct.
    let mismatch = metricsToken.length ^ token.length
    for (let i = 0; i < metricsToken.length; i++) {
      // charCodeAt() past the end yields NaN, which the bitwise operators treat
      // as 0; a length difference is already recorded in `mismatch` above.
      mismatch |= metricsToken.charCodeAt(i) ^ token.charCodeAt(i)
    }
    if (mismatch !== 0) {
      return new NextResponse('Unauthorized', { status: 401 })
    }
  }
  const lines: string[] = []
  const metric = (name: string, help: string, type: string, value: number | string, labels = '') => {
@@ -13,10 +23,19 @@ export async function GET() {
    lines.push(labels ? `${name}{${labels}} ${value}` : `${name} ${value}`)
  }
-  // Uptime
+  // Multiple labeled values for the same metric name
  /**
   * Appends one metric family to `lines`: a HELP line, a TYPE line, then one
   * sample line per row, all sharing the same metric name.
   *
   * @param name - Metric name used on every emitted line.
   * @param help - Text placed after the name on the HELP line.
   * @param type - Text placed after the name on the TYPE line.
   * @param rows - Samples; `labels` is inserted verbatim between the braces.
   */
  const metricLabeled = (name: string, help: string, type: string, rows: Array<{ labels: string; value: number }>) => {
    const samples = rows.map(({ labels, value }) => `${name}{${labels}} ${value}`)
    lines.push(`# HELP ${name} ${help}`, `# TYPE ${name} ${type}`, ...samples)
  }
  // ── Uptime ──────────────────────────────────────────────────────────────
  metric('memento_uptime_seconds', 'Application uptime in seconds', 'gauge', process.uptime().toFixed(2))
-  // Database
+  // ── Infrastructure ───────────────────────────────────────────────────────
  try {
    const dbStart = Date.now()
    const [noteCount, notebookCount, userCount] = await Promise.all([
@@ -34,7 +53,6 @@ export async function GET() {
    metric('memento_db_up', 'Database connectivity (1=up, 0=down)', 'gauge', 0)
  }
  // Redis
  try {
    const redisStart = Date.now()
    await redis.ping()
@@ -51,12 +69,171 @@ export async function GET() {
    metric('memento_redis_up', 'Redis connectivity (1=up, 0=down)', 'gauge', 0)
  }
  // Node.js process memory
  const mem = process.memoryUsage()
  metric('memento_process_heap_used_bytes', 'Node.js heap used in bytes', 'gauge', mem.heapUsed)
  metric('memento_process_heap_total_bytes', 'Node.js heap total in bytes', 'gauge', mem.heapTotal)
  metric('memento_process_rss_bytes', 'Node.js RSS memory in bytes', 'gauge', mem.rss)
  // ── Business metrics ─────────────────────────────────────────────────────
  // Collects the business gauges from the database. Any failure is logged and
  // reported through memento_business_metrics_error instead of failing the scrape.
  try {
    const now = new Date()
    // All boundaries are computed in the server's local time zone.
    const startOfToday = new Date(now.getFullYear(), now.getMonth(), now.getDate())
    const startOfMonth = new Date(now.getFullYear(), now.getMonth(), 1)
    const startOfLastMonth = new Date(now.getFullYear(), now.getMonth() - 1, 1)
    const last7days = new Date(now.getTime() - 7 * 24 * 60 * 60 * 1000)
    const last30days = new Date(now.getTime() - 30 * 24 * 60 * 60 * 1000)
    // "Last month" is the half-open interval [startOfLastMonth, startOfMonth):
    // an exclusive upper bound keeps events from the final second of the month.
    const lastMonthRange = { gte: startOfLastMonth, lt: startOfMonth }

    // Escapes a label value for the text exposition format (backslash, double
    // quote and line feed) so a value read from the database cannot break a line.
    const labelValue = (raw: unknown): string =>
      String(raw).replace(/\\/g, '\\\\').replace(/"/g, '\\"').replace(/\n/g, '\\n')

    // ── Subscriptions by tier and status ──
    const subsByTier = await prisma.subscription.groupBy({
      by: ['tier', 'status'],
      _count: { _all: true },
    })
    const subRows = subsByTier.map(r => ({
      labels: `tier="${labelValue(r.tier)}",status="${labelValue(r.status)}"`,
      value: r._count._all,
    }))
    metricLabeled(
      'memento_subscriptions_total',
      'Total subscriptions by tier and status',
      'gauge',
      subRows,
    )

    // Aggregated totals across all tiers
    const activeSubs = subsByTier
      .filter(r => r.status === 'ACTIVE')
      .reduce((s, r) => s + r._count._all, 0)
    const canceledSubs = subsByTier
      .filter(r => r.status === 'CANCELED')
      .reduce((s, r) => s + r._count._all, 0)
    metric('memento_subscriptions_active_total', 'Total active subscriptions (all tiers)', 'gauge', activeSubs)
    metric('memento_subscriptions_canceled_total', 'Total canceled subscriptions (all tiers)', 'gauge', canceledSubs)

    // ── New subscriptions: this month vs last month ──
    const newSubsThisMonth = await prisma.subscription.count({
      where: { createdAt: { gte: startOfMonth }, status: 'ACTIVE' },
    })
    const newSubsLastMonth = await prisma.subscription.count({
      where: { createdAt: lastMonthRange, status: 'ACTIVE' },
    })
    metric('memento_subscriptions_new_this_month', 'New active subscriptions created this month', 'gauge', newSubsThisMonth)
    metric('memento_subscriptions_new_last_month', 'New active subscriptions created last month', 'gauge', newSubsLastMonth)

    // ── Churn: canceledAt in the period, or cancelAtPeriodEnd set and the row
    //    updated in the period ──
    const churnsThisMonth = await prisma.subscription.count({
      where: {
        OR: [
          { canceledAt: { gte: startOfMonth } },
          { cancelAtPeriodEnd: true, updatedAt: { gte: startOfMonth } },
        ],
      },
    })
    const churnsLastMonth = await prisma.subscription.count({
      where: {
        OR: [
          { canceledAt: lastMonthRange },
          { cancelAtPeriodEnd: true, updatedAt: lastMonthRange },
        ],
      },
    })
    metric('memento_churn_this_month', 'Cancellations / pending cancellations this month', 'gauge', churnsThisMonth)
    metric('memento_churn_last_month', 'Cancellations / pending cancellations last month', 'gauge', churnsLastMonth)

    // ── Active users (proxy: at least one note updated in the window) ──
    const activeUsers7d = await prisma.note.groupBy({
      by: ['userId'],
      where: { updatedAt: { gte: last7days } },
    })
    const activeUsers30d = await prisma.note.groupBy({
      by: ['userId'],
      where: { updatedAt: { gte: last30days } },
    })
    metric('memento_active_users_7d', 'Users who modified at least one note in the last 7 days', 'gauge', activeUsers7d.length)
    metric('memento_active_users_30d', 'Users who modified at least one note in the last 30 days', 'gauge', activeUsers30d.length)

    // ── New users ──
    const newUsers7d = await prisma.user.count({ where: { createdAt: { gte: last7days } } })
    const newUsersThisMonth = await prisma.user.count({ where: { createdAt: { gte: startOfMonth } } })
    metric('memento_new_users_7d', 'New user registrations in the last 7 days', 'gauge', newUsers7d)
    metric('memento_new_users_this_month', 'New user registrations this month', 'gauge', newUsersThisMonth)

    // ── AI agents ──
    const agentsByStatus = await prisma.agentAction.groupBy({
      by: ['status'],
      _count: { _all: true },
      where: { createdAt: { gte: last30days } },
    })
    const agentRows = agentsByStatus.map(r => ({
      labels: `status="${labelValue(r.status)}"`,
      value: r._count._all,
    }))
    metricLabeled(
      'memento_agent_runs_30d',
      'Agent runs by status in the last 30 days',
      'gauge',
      agentRows,
    )

    const agentRunsToday = await prisma.agentAction.count({
      where: { createdAt: { gte: startOfToday } },
    })
    metric('memento_agent_runs_today', 'Agent runs triggered today', 'gauge', agentRunsToday)

    // Tokens consumed by agents this month
    const agentTokens = await prisma.agentAction.aggregate({
      _sum: { tokensUsed: true },
      where: { createdAt: { gte: startOfMonth } },
    })
    metric('memento_agent_tokens_this_month', 'Total tokens consumed by agents this month', 'gauge', agentTokens._sum.tokensUsed ?? 0)

    // ── AI usage by feature (this month) ──
    const usageByFeature = await prisma.usageLog.groupBy({
      by: ['feature'],
      _sum: { requestsCount: true, tokensUsed: true },
      where: { periodStart: { gte: startOfMonth } },
    })
    const usageRequestRows = usageByFeature.map(r => ({
      labels: `feature="${labelValue(r.feature)}"`,
      value: r._sum.requestsCount ?? 0,
    }))
    const usageTokenRows = usageByFeature.map(r => ({
      labels: `feature="${labelValue(r.feature)}"`,
      value: r._sum.tokensUsed ?? 0,
    }))
    metricLabeled('memento_ai_requests_this_month', 'AI API requests by feature this month', 'gauge', usageRequestRows)
    metricLabeled('memento_ai_tokens_this_month', 'AI tokens consumed by feature this month', 'gauge', usageTokenRows)

    // ── Logins (AuditLog rows with action LOGIN) ──
    const loginsToday = await prisma.auditLog.count({
      where: {
        action: 'LOGIN',
        createdAt: { gte: startOfToday },
      },
    })
    const loginsThisMonth = await prisma.auditLog.count({
      where: { action: 'LOGIN', createdAt: { gte: startOfMonth } },
    })
    metric('memento_logins_today', 'Login events today', 'gauge', loginsToday)
    metric('memento_logins_this_month', 'Login events this month', 'gauge', loginsThisMonth)

    // ── Brainstorm sessions ──
    const brainstormThisMonth = await prisma.brainstormSession.count({
      where: { createdAt: { gte: startOfMonth } },
    })
    metric('memento_brainstorm_sessions_this_month', 'Brainstorm sessions created this month', 'gauge', brainstormThisMonth)

    // ── Flashcards ──
    const flashcardsTotal = await prisma.flashcard.count()
    const flashcardsReviewedThisMonth = await prisma.flashcardReview.count({
      where: { reviewedAt: { gte: startOfMonth } },
    })
    metric('memento_flashcards_total', 'Total flashcards in the system', 'gauge', flashcardsTotal)
    metric('memento_flashcard_reviews_this_month', 'Flashcard review events this month', 'gauge', flashcardsReviewedThisMonth)

    // Report success explicitly so the series exists on healthy scrapes too,
    // not only when the collection fails.
    metric('memento_business_metrics_error', 'Business metrics collection failed (1=error)', 'gauge', 0)
  } catch (err) {
    // Metrics pushed before the failure stay in the output; the gauge below
    // flags the scrape as incomplete.
    console.error('[metrics] Business metrics error:', err)
    metric('memento_business_metrics_error', 'Business metrics collection failed (1=error)', 'gauge', 1)
  }
  const body = lines.join('\n') + '\n'
  return new NextResponse(body, {
--- a/monitoring/alertmanager.yml
+++ b/monitoring/alertmanager.yml
@@ -1,11 +1,23 @@
 route:
-  receiver: 'telegram'
+  receiver: 'telegram-bot'
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    - match:
        severity: critical
      receiver: 'telegram-bot'
      repeat_interval: 1h
 receivers:
-  - name: 'telegram'
+  - name: 'telegram-bot'
    webhook_configs:
-      - url: 'http://alertmanager-bridge:8080/alert'
+      - url: 'http://alertmanager-telegram:8080/alerts'
        send_resolved: true
 inhibit_rules:
  - source_match:
      severity: critical
    target_match:
      severity: warning
    equal: ['alertname']
--- a/monitoring/alerts.yml
+++ b/monitoring/alerts.yml
@@ -7,7 +7,8 @@ groups:
        labels:
          severity: critical
        annotations:
-          summary: "Memento app is DOWN"
+          summary: "🔴 Memento app is DOWN"
          description: "The Next.js application has been unreachable for 2+ minutes."
      - alert: PostgresDown
        expr: up{job="postgres"} == 0
@@ -15,7 +16,8 @@ groups:
        labels:
          severity: critical
        annotations:
-          summary: "PostgreSQL is DOWN"
+          summary: "🔴 PostgreSQL is DOWN"
          description: "Database has been unreachable for 1+ minute."
      - alert: RedisDown
        expr: up{job="redis"} == 0
@@ -23,15 +25,27 @@ groups:
        labels:
          severity: critical
        annotations:
-          summary: "Redis is DOWN"
+          summary: "🔴 Redis is DOWN"
          description: "Redis cache/quota store has been unreachable for 1+ minute."
  - name: resources
    rules:
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
-          summary: "Disk space below 15%"
+          summary: "⚠️ Disk space below 15%"
          # $value is the result of the rule expression (available / total bytes);
          # annotation templates cannot embed PromQL selectors directly.
          description: "Only {{ $value | humanizePercentage }} disk space remaining."
      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.05
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "🔴 Disk space CRITICAL (< 5%)"
      - alert: HighMemoryUsage
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
@@ -39,15 +53,25 @@ groups:
        labels:
          severity: warning
        annotations:
-          summary: "Memory usage above 90%"
+          summary: "⚠️ Memory usage above 90%"
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ CPU usage above 85% for 10 minutes"
  - name: database
    rules:
      - alert: PostgresConnectionsHigh
        expr: pg_stat_activity_count > 80
        for: 5m
        labels:
          severity: warning
        annotations:
-          summary: "PostgreSQL connections above 80"
+          summary: "⚠️ PostgreSQL connections above 80"
      - alert: PostgresSlowQueries
        expr: pg_stat_statements_mean_exec_seconds > 5
@@ -55,15 +79,33 @@ groups:
        labels:
          severity: warning
        annotations:
-          summary: "PostgreSQL slow queries detected"
+          summary: "⚠️ PostgreSQL slow queries detected (avg > 5s)"
      - alert: RedisMemoryHigh
        expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ Redis memory above 85% of limit"
  - name: application
    rules:
      - alert: HighErrorRate
        # Ratio of 5xx responses to all responses, matching the "above 5%" summary
        # (a bare rate() is requests per second, not a percentage).
        expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05
        for: 3m
        labels:
          severity: warning
        annotations:
-          summary: "HTTP 5xx error rate above 5%"
+          summary: "⚠️ HTTP 5xx error rate above 5%"
      - alert: AppHighHeapMemory
        expr: memento_process_heap_used_bytes / memento_process_heap_total_bytes > 0.90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ Next.js heap usage above 90%"
      - alert: ContainerRestarted
        expr: increase(container_restart_count[1h]) > 0
@@ -71,4 +113,46 @@ groups:
        labels:
          severity: warning
        annotations:
-          summary: "Container restarted in the last hour"
+          summary: "⚠️ Container restarted in the last hour"
          description: "Container {{ $labels.name }} restarted unexpectedly."
  # ── Business Alerts ──────────────────────────────────────────────────────────
  - name: business
    rules:
      - alert: HighChurnRate
        expr: memento_churn_this_month > 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "📉 Churn élevé ce mois : {{ $value }} désabonnements"
          description: "Plus de 10 désabonnements enregistrés ce mois. Investiguer les raisons."
      - alert: NoNewUsersLast7Days
        expr: memento_new_users_7d == 0
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "📊 Aucune nouvelle inscription depuis 7 jours"
          description: "Vérifier le funnel d'onboarding et les canaux d'acquisition."
      - alert: AgentRunsHighErrorRate
        expr: |
          memento_agent_runs_30d{status="error"} /
          (memento_agent_runs_30d{status="success"} + memento_agent_runs_30d{status="error"} + 1) > 0.2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "🤖 Taux d'erreur agents IA > 20% ce mois"
          description: "{{ $value | humanizePercentage }} des runs d'agents échouent."
      - alert: BusinessMetricsCollectionFailed
        expr: memento_business_metrics_error == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ Collecte métriques business en erreur"
          description: "L'endpoint /api/metrics ne peut pas interroger la base pour les métriques business."
--- a/monitoring/docker-compose.monitoring.yml
+++ b/monitoring/docker-compose.monitoring.yml
@@ -1,49 +1,73 @@
 services:
  prometheus:
-    image: prom/prometheus:latest
+    image: prom/prometheus:v2.53.0
    container_name: memento-prometheus
    restart: unless-stopped
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./alerts.yml:/etc/prometheus/alerts.yml:ro
      - ./metrics-token:/etc/prometheus/metrics-token:ro
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
    ports:
-      - "9090:9090"
+      - "127.0.0.1:9090:9090"
    networks:
      - memento-monitoring
      - memento-net
  grafana:
-    image: grafana/grafana:latest
+    image: grafana/grafana:11.1.0
    container_name: memento-grafana
    restart: unless-stopped
    environment:
      GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-memento-admin}
      GF_USERS_ALLOW_SIGN_UP: "false"
-      GF_SERVER_ROOT_URL: "${GRAFANA_URL:-http://localhost:3001}"
+      GF_SERVER_ROOT_URL: "${GRAFANA_URL:-http://localhost:3002}"
      GF_SECURITY_DISABLE_GRAVATAR: "true"
      GF_ANALYTICS_REPORTING_ENABLED: "false"
      GF_ANALYTICS_CHECK_FOR_UPDATES: "false"
    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana-provisioning:/etc/grafana/provisioning:ro
      - ./grafana-dashboards:/etc/grafana/dashboards:ro
    ports:
-      - "3002:3000"
+      - "127.0.0.1:3002:3000"
    networks:
      - memento-monitoring
  alertmanager:
-    image: prom/alertmanager:latest
+    image: prom/alertmanager:v0.27.0
    container_name: memento-alertmanager
    restart: unless-stopped
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    ports:
-      - "9093:9093"
+      - "127.0.0.1:9093:9093"
    networks:
      - memento-monitoring
  # Real Telegram webhook bridge (replaces the fake alpine sleep).
  # Receives Alertmanager webhooks on port 8080 and forwards them to Telegram.
  alertmanager-telegram:
    image: metalmatze/alertmanager-bot:0.4.3
    container_name: memento-alertmanager-telegram
    restart: unless-stopped
    environment:
      TELEGRAM_TOKEN: ${TELEGRAM_BOT_TOKEN:-}
      TELEGRAM_ADMIN: ${TELEGRAM_CHAT_ID:-}
      ALERTMANAGER_URL: http://alertmanager:9093
      # STORE selects the storage backend; the database file path is given
      # separately through BOLT_PATH (kept on the persistent /data volume).
      STORE: bolt
      BOLT_PATH: /data/bolt.db
      LISTEN_ADDR: 0.0.0.0:8080
    volumes:
      - alertmanager-bot-data:/data
    networks:
      - memento-monitoring
  node-exporter:
-    image: prom/node-exporter:latest
+    image: prom/node-exporter:v1.8.1
    container_name: memento-node-exporter
    restart: unless-stopped
    pid: host
@@ -57,13 +81,13 @@ services:
      - '--path.rootfs=/rootfs'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    ports:
-      - "9100:9100"
+      - "127.0.0.1:9100:9100"
    networks:
      - memento-monitoring
      - memento-net
  postgres-exporter:
-    image: prometheuscommunity/postgres-exporter:latest
+    image: prometheuscommunity/postgres-exporter:v0.15.0
    container_name: memento-postgres-exporter
    restart: unless-stopped
    env_file:
@@ -71,25 +95,25 @@ services:
    environment:
      DATA_SOURCE_NAME: "postgresql://${POSTGRES_USER:-memento}:${POSTGRES_PASSWORD:-memento}@memento-postgres:5432/${POSTGRES_DB:-memento}?sslmode=disable"
    ports:
-      - "9187:9187"
+      - "127.0.0.1:9187:9187"
    networks:
      - memento-monitoring
      - memento-net
  redis-exporter:
-    image: oliver006/redis_exporter:latest
+    image: oliver006/redis_exporter:v1.62.0
    container_name: memento-redis-exporter
    restart: unless-stopped
    environment:
      REDIS_ADDR: "redis://memento-redis:6379"
    ports:
-      - "9121:9121"
+      - "127.0.0.1:9121:9121"
    networks:
      - memento-monitoring
      - memento-net
  cadvisor:
-    image: gcr.io/cadvisor/cadvisor:latest
+    image: gcr.io/cadvisor/cadvisor:v0.49.1
    container_name: memento-cadvisor
    restart: unless-stopped
    privileged: true
@@ -102,29 +126,15 @@ services:
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
-      - "8081:8080"
+      - "127.0.0.1:8081:8080"
    networks:
      - memento-monitoring
      - memento-net
  alertmanager-bridge:
    image: alpine:latest
    container_name: memento-alertmanager-bridge
    restart: unless-stopped
    entrypoint: |
      sh -c '
        apk add --no-cache curl
        while true; do
          echo "Bridge running - configure webhook to forward to Telegram"
          sleep 3600
        done
      '
    networks:
      - memento-monitoring
 volumes:
  prometheus-data:
  grafana-data:
  alertmanager-bot-data:
 networks:
  memento-monitoring:
--- a/monitoring/prometheus.yml
+++ b/monitoring/prometheus.yml
@@ -13,6 +13,8 @@ alerting:
 scrape_configs:
  - job_name: 'memento-app'
    metrics_path: '/api/metrics'
    authorization:
      credentials_file: /etc/prometheus/metrics-token
    static_configs:
      - targets: ['memento-note:3000']