feat(monitoring): business metrics + hardening sécurité
Métriques business dans /api/metrics :
- Abonnements par tier/status (BASIC/PRO/ENTERPRISE × ACTIVE/CANCELED)
- Nouveaux abonnements ce mois vs mois dernier
- Désabonnements / churn ce mois vs mois dernier
- Utilisateurs actifs 7j / 30j (proxy : note modifiée)
- Nouvelles inscriptions 7j / ce mois
- Runs agents IA par status (30j + aujourd'hui) + tokens consommés
- Usage IA par feature (requêtes + tokens ce mois)
- Logins aujourd'hui / ce mois (via AuditLog)
- Sessions brainstorm ce mois
- Flashcards total + reviews ce mois

Alertes Prometheus :
- HighChurnRate (> 10 désabonnements ce mois)
- NoNewUsersLast7Days (aucune inscription 7j)
- AgentRunsHighErrorRate (> 20% erreurs agents)
- BusinessMetricsCollectionFailed

Hardening monitoring :
- Ports monitoring → 127.0.0.1 (plus exposés publiquement)
- Images pinned (prometheus v2.53.0, grafana 11.1.0, etc.)
- alertmanager-bridge fake → metalmatze/alertmanager-bot:0.4.3
- /api/metrics sécurisé avec METRICS_TOKEN bearer
- Prometheus auth bearer via credentials_file
- Redis AOF + 256mb, healthcheck → /api/build-info
- repeat_interval 4h, inhibit_rules alertmanager
- Secrets CI/CD : AUTH_GOOGLE_SECRET, METRICS_TOKEN, GRAFANA, MCP_API_KEY

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -1,11 +1,23 @@
|
||||
route:
|
||||
receiver: 'telegram'
|
||||
receiver: 'telegram-bot'
|
||||
group_wait: 10s
|
||||
group_interval: 5m
|
||||
repeat_interval: 1h
|
||||
repeat_interval: 4h
|
||||
routes:
|
||||
- matchers:
    # `match` is deprecated in Alertmanager; `matchers` is the supported
    # replacement and selects the same alerts (severity label == critical).
    - severity="critical"
  receiver: 'telegram-bot'
  # Critical alerts are re-sent every hour while they keep firing.
  repeat_interval: 1h
|
||||
|
||||
receivers:
|
||||
- name: 'telegram'
|
||||
- name: 'telegram-bot'
|
||||
webhook_configs:
|
||||
- url: 'http://alertmanager-bridge:8080/alert'
|
||||
- url: 'http://alertmanager-telegram:8080/alerts'
|
||||
send_resolved: true
|
||||
|
||||
# Mute warning notifications while a critical alert is firing for the same
# target, so a single outage does not notify twice.
inhibit_rules:
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    # Source and target must share the same `instance` label value. Comparing
    # on `alertname` only pairs alerts that exist at both severities under the
    # same name; NOTE(review): the rules seen next to this config use distinct
    # names per severity (e.g. DiskSpaceLow / DiskSpaceCritical), so that
    # comparison would never inhibit anything - confirm `instance` is the
    # intended grouping.
    equal: ['instance']
|
||||
|
||||
@@ -7,7 +7,8 @@ groups:
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Memento app is DOWN"
|
||||
summary: "🔴 Memento app is DOWN"
|
||||
description: "The Next.js application has been unreachable for 2+ minutes."
|
||||
|
||||
- alert: PostgresDown
|
||||
expr: up{job="postgres"} == 0
|
||||
@@ -15,7 +16,8 @@ groups:
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PostgreSQL is DOWN"
|
||||
summary: "🔴 PostgreSQL is DOWN"
|
||||
description: "Database has been unreachable for 1+ minute."
|
||||
|
||||
- alert: RedisDown
|
||||
expr: up{job="redis"} == 0
|
||||
@@ -23,15 +25,27 @@ groups:
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Redis is DOWN"
|
||||
summary: "🔴 Redis is DOWN"
|
||||
description: "Redis cache/quota store has been unreachable for 1+ minute."
|
||||
|
||||
- name: resources
|
||||
rules:
|
||||
- alert: DiskSpaceLow
|
||||
expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Disk space below 15%"
|
||||
summary: "⚠️ Disk space below 15%"
|
||||
# Annotation templates cannot contain PromQL selectors; $value already holds
# the free-space ratio computed by this rule's expr.
description: "Only {{ $value | humanizePercentage }} disk space remaining."
|
||||
|
||||
- alert: DiskSpaceCritical
  # Ratio of available to total bytes on the root mountpoint; critical once it
  # stays below 5% for one minute.
  expr: >-
    (node_filesystem_avail_bytes{mountpoint="/"}
    / node_filesystem_size_bytes{mountpoint="/"}) < 0.05
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: '🔴 Disk space CRITICAL (< 5%)'
|
||||
|
||||
- alert: HighMemoryUsage
|
||||
expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
|
||||
@@ -39,15 +53,25 @@ groups:
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Memory usage above 90%"
|
||||
summary: "⚠️ Memory usage above 90%"
|
||||
|
||||
- alert: HighCPUUsage
  # 100 minus the per-instance average idle percentage over 5-minute windows;
  # must stay above 85 for ten minutes before firing.
  expr: >-
    100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: '⚠️ CPU usage above 85% for 10 minutes'
|
||||
|
||||
- name: database
|
||||
rules:
|
||||
- alert: PostgresConnectionsHigh
|
||||
expr: pg_stat_activity_count > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "PostgreSQL connections above 80"
|
||||
summary: "⚠️ PostgreSQL connections above 80"
|
||||
|
||||
- alert: PostgresSlowQueries
|
||||
expr: pg_stat_statements_mean_exec_seconds > 5
|
||||
@@ -55,15 +79,33 @@ groups:
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "PostgreSQL slow queries detected"
|
||||
summary: "⚠️ PostgreSQL slow queries detected (avg > 5s)"
|
||||
|
||||
- alert: RedisMemoryHigh
  # Used memory as a fraction of the configured limit. The `> 0` filter drops
  # instances whose reported limit is 0 (presumably "no maxmemory configured"
  # - confirm against the exporter); dividing by 0 would yield +Inf and keep
  # this alert firing permanently.
  expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.85
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "⚠️ Redis memory above 85% of limit"
|
||||
|
||||
- name: application
|
||||
rules:
|
||||
- alert: HighErrorRate
|
||||
# Share of all requests answered with a 5xx status over the last 5 minutes.
# A bare rate() is requests/second per series, not the percentage the summary
# reports, so the 5xx rate is divided by the total request rate.
expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "HTTP 5xx error rate above 5%"
|
||||
summary: "⚠️ HTTP 5xx error rate above 5%"
|
||||
|
||||
- alert: AppHighHeapMemory
  # Heap used divided by heap total, as exported by the app's memento_* gauges;
  # warns when the ratio stays above 0.90 for five minutes.
  expr: >-
    memento_process_heap_used_bytes / memento_process_heap_total_bytes > 0.90
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: '⚠️ Next.js heap usage above 90%'
|
||||
|
||||
- alert: ContainerRestarted
|
||||
expr: increase(container_restart_count[1h]) > 0
|
||||
@@ -71,4 +113,46 @@ groups:
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Container restarted in the last hour"
|
||||
summary: "⚠️ Container restarted in the last hour"
|
||||
description: "Container {{ $labels.name }} restarted unexpectedly."
|
||||
|
||||
# ── Business Alerts ──────────────────────────────────────────────────────────
|
||||
- name: business
|
||||
rules:
|
||||
- alert: HighChurnRate
  # Fires immediately (no hold period) once the monthly churn gauge exceeds 10.
  expr: >-
    memento_churn_this_month > 10
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: '📉 Churn élevé ce mois : {{ $value }} désabonnements'
    description: 'Plus de 10 désabonnements enregistrés ce mois. Investiguer les raisons.'
|
||||
|
||||
- alert: NoNewUsersLast7Days
  # The 7-day sign-up gauge must read exactly zero for a full hour before the
  # warning is raised.
  expr: >-
    memento_new_users_7d == 0
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: '📊 Aucune nouvelle inscription depuis 7 jours'
    description: "Vérifier le funnel d'onboarding et les canaux d'acquisition."
|
||||
|
||||
- alert: AgentRunsHighErrorRate
  # Failed agent runs as a fraction of finished runs (success + error) over
  # the 30-day window exposed by the metric.
  # Each side is aggregated with `sum without (status)` so both operands carry
  # identical label sets: PromQL only combines series whose labels match, and
  # adding the status="success" series to the status="error" series directly
  # matches nothing, which would leave this alert permanently silent.
  # The `+ 1` keeps the denominator non-zero when no run has finished yet.
  expr: |
    sum without (status) (memento_agent_runs_30d{status="error"})
      /
    (sum without (status) (memento_agent_runs_30d{status=~"success|error"}) + 1) > 0.2
  for: 5m
  labels:
    severity: warning
  annotations:
    # Wording follows the metric's 30-day window rather than a calendar month.
    summary: "🤖 Taux d'erreur agents IA > 20% sur 30 jours"
    description: "{{ $value | humanizePercentage }} des runs d'agents échouent."
|
||||
|
||||
- alert: BusinessMetricsCollectionFailed
  # The error flag gauge must stay at 1 for five minutes before the warning is
  # raised.
  expr: >-
    memento_business_metrics_error == 1
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: '⚠️ Collecte métriques business en erreur'
    description: "L'endpoint /api/metrics ne peut pas interroger la base pour les métriques business."
|
||||
|
||||
@@ -1,49 +1,73 @@
|
||||
services:
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
image: prom/prometheus:v2.53.0
|
||||
container_name: memento-prometheus
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- ./alerts.yml:/etc/prometheus/alerts.yml:ro
|
||||
- ./metrics-token:/etc/prometheus/metrics-token:ro
|
||||
- prometheus-data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.retention.time=30d'
|
||||
- '--web.enable-lifecycle'
|
||||
ports:
|
||||
- "9090:9090"
|
||||
- "127.0.0.1:9090:9090"
|
||||
networks:
|
||||
- memento-monitoring
|
||||
- memento-net
|
||||
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
image: grafana/grafana:11.1.0
|
||||
container_name: memento-grafana
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-memento-admin}
|
||||
GF_USERS_ALLOW_SIGN_UP: "false"
|
||||
GF_SERVER_ROOT_URL: "${GRAFANA_URL:-http://localhost:3001}"
|
||||
GF_SERVER_ROOT_URL: "${GRAFANA_URL:-http://localhost:3002}"
|
||||
GF_SECURITY_DISABLE_GRAVATAR: "true"
|
||||
GF_ANALYTICS_REPORTING_ENABLED: "false"
|
||||
GF_ANALYTICS_CHECK_FOR_UPDATES: "false"
|
||||
volumes:
|
||||
- grafana-data:/var/lib/grafana
|
||||
- ./grafana-provisioning:/etc/grafana/provisioning:ro
|
||||
- ./grafana-dashboards:/etc/grafana/dashboards:ro
|
||||
ports:
|
||||
- "3002:3000"
|
||||
- "127.0.0.1:3002:3000"
|
||||
networks:
|
||||
- memento-monitoring
|
||||
|
||||
alertmanager:
|
||||
image: prom/alertmanager:latest
|
||||
image: prom/alertmanager:v0.27.0
|
||||
container_name: memento-alertmanager
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
|
||||
ports:
|
||||
- "9093:9093"
|
||||
- "127.0.0.1:9093:9093"
|
||||
networks:
|
||||
- memento-monitoring
|
||||
|
||||
# Real Telegram webhook bridge (replaces the fake alpine sleep).
# Receives Alertmanager webhooks on port 8080 inside the monitoring network.
alertmanager-telegram:
  image: metalmatze/alertmanager-bot:0.4.3
  container_name: memento-alertmanager-telegram
  restart: unless-stopped
  environment:
    # Both values come from the host environment and default to empty strings.
    TELEGRAM_TOKEN: ${TELEGRAM_BOT_TOKEN:-}
    TELEGRAM_ADMIN: ${TELEGRAM_CHAT_ID:-}
    ALERTMANAGER_URL: http://alertmanager:9093
    # STORE names the storage backend; the database file location is given
    # separately through BOLT_PATH (kept on the persistent /data volume).
    STORE: bolt
    BOLT_PATH: /data/bolt.db
    LISTEN_ADDR: 0.0.0.0:8080
  volumes:
    - alertmanager-bot-data:/data
  networks:
    - memento-monitoring
|
||||
|
||||
node-exporter:
|
||||
image: prom/node-exporter:latest
|
||||
image: prom/node-exporter:v1.8.1
|
||||
container_name: memento-node-exporter
|
||||
restart: unless-stopped
|
||||
pid: host
|
||||
@@ -57,13 +81,13 @@ services:
|
||||
- '--path.rootfs=/rootfs'
|
||||
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
||||
ports:
|
||||
- "9100:9100"
|
||||
- "127.0.0.1:9100:9100"
|
||||
networks:
|
||||
- memento-monitoring
|
||||
- memento-net
|
||||
|
||||
postgres-exporter:
|
||||
image: prometheuscommunity/postgres-exporter:latest
|
||||
image: prometheuscommunity/postgres-exporter:v0.15.0
|
||||
container_name: memento-postgres-exporter
|
||||
restart: unless-stopped
|
||||
env_file:
|
||||
@@ -71,25 +95,25 @@ services:
|
||||
environment:
|
||||
DATA_SOURCE_NAME: "postgresql://${POSTGRES_USER:-memento}:${POSTGRES_PASSWORD:-memento}@memento-postgres:5432/${POSTGRES_DB:-memento}?sslmode=disable"
|
||||
ports:
|
||||
- "9187:9187"
|
||||
- "127.0.0.1:9187:9187"
|
||||
networks:
|
||||
- memento-monitoring
|
||||
- memento-net
|
||||
|
||||
redis-exporter:
|
||||
image: oliver006/redis_exporter:latest
|
||||
image: oliver006/redis_exporter:v1.62.0
|
||||
container_name: memento-redis-exporter
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
REDIS_ADDR: "redis://memento-redis:6379"
|
||||
ports:
|
||||
- "9121:9121"
|
||||
- "127.0.0.1:9121:9121"
|
||||
networks:
|
||||
- memento-monitoring
|
||||
- memento-net
|
||||
|
||||
cadvisor:
|
||||
image: gcr.io/cadvisor/cadvisor:latest
|
||||
image: gcr.io/cadvisor/cadvisor:v0.49.1
|
||||
container_name: memento-cadvisor
|
||||
restart: unless-stopped
|
||||
privileged: true
|
||||
@@ -102,29 +126,15 @@ services:
|
||||
- /sys:/sys:ro
|
||||
- /var/lib/docker/:/var/lib/docker:ro
|
||||
ports:
|
||||
- "8081:8080"
|
||||
- "127.0.0.1:8081:8080"
|
||||
networks:
|
||||
- memento-monitoring
|
||||
- memento-net
|
||||
|
||||
alertmanager-bridge:
|
||||
image: alpine:latest
|
||||
container_name: memento-alertmanager-bridge
|
||||
restart: unless-stopped
|
||||
entrypoint: |
|
||||
sh -c '
|
||||
apk add --no-cache curl
|
||||
while true; do
|
||||
echo "Bridge running - configure webhook to forward to Telegram"
|
||||
sleep 3600
|
||||
done
|
||||
'
|
||||
networks:
|
||||
- memento-monitoring
|
||||
|
||||
volumes:
|
||||
prometheus-data:
|
||||
grafana-data:
|
||||
alertmanager-bot-data:
|
||||
|
||||
networks:
|
||||
memento-monitoring:
|
||||
|
||||
@@ -13,6 +13,8 @@ alerting:
|
||||
scrape_configs:
|
||||
- job_name: 'memento-app'
|
||||
metrics_path: '/api/metrics'
|
||||
authorization:
|
||||
credentials_file: /etc/prometheus/metrics-token
|
||||
static_configs:
|
||||
- targets: ['memento-note:3000']
|
||||
|
||||
|
||||
Reference in New Issue
Block a user