Métriques business dans /api/metrics : - Abonnements par tier/status (BASIC/PRO/ENTERPRISE × ACTIVE/CANCELED) - Nouveaux abonnements ce mois vs mois dernier - Désabonnements / churn ce mois vs mois dernier - Utilisateurs actifs 7j / 30j (proxy : note modifiée) - Nouvelles inscriptions 7j / ce mois - Runs agents IA par status (30j + aujourd'hui) + tokens consommés - Usage IA par feature (requêtes + tokens ce mois) - Logins aujourd'hui / ce mois (via AuditLog) - Sessions brainstorm ce mois - Flashcards total + reviews ce mois Alertes Prometheus : - HighChurnRate (> 10 désabonnements ce mois) - NoNewUsersLast7Days (aucune inscription 7j) - AgentRunsHighErrorRate (> 20% erreurs agents) - BusinessMetricsCollectionFailed Hardening monitoring : - Ports monitoring → 127.0.0.1 (plus exposés publiquement) - Images pinned (prometheus v2.53.0, grafana 11.1.0, etc.) - alertmanager-bridge fake → metalmatze/alertmanager-bot:0.4.3 - /api/metrics sécurisé avec METRICS_TOKEN bearer - Prometheus auth bearer via credentials_file - Redis AOF + 256mb, healthcheck → /api/build-info - repeat_interval 4h, inhibit_rules alertmanager - Secrets CI/CD : AUTH_GOOGLE_SECRET, METRICS_TOKEN, GRAFANA, MCP_API_KEY Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
159 lines
5.3 KiB
YAML
159 lines
5.3 KiB
YAML
groups:
# Availability alerts: each rule fires when the `up` series of the named
# scrape job has been 0 for the whole `for` window.
- name: critical
  rules:
  # Scrape target with job="memento-app" down for 2 minutes.
  - alert: MementoAppDown
    expr: 'up{job="memento-app"} == 0'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: '🔴 Memento app is DOWN'
      description: 'The Next.js application has been unreachable for 2+ minutes.'

  # Scrape target with job="postgres" down for 1 minute.
  - alert: PostgresDown
    expr: 'up{job="postgres"} == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: '🔴 PostgreSQL is DOWN'
      description: 'Database has been unreachable for 1+ minute.'

  # Scrape target with job="redis" down for 1 minute.
  - alert: RedisDown
    expr: 'up{job="redis"} == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: '🔴 Redis is DOWN'
      description: 'Redis cache/quota store has been unreachable for 1+ minute.'
# Host resource alerts: disk, memory and CPU thresholds built on node_* series.
- name: resources
  rules:
  # Less than 15% of the root filesystem available for 5 minutes.
  - alert: DiskSpaceLow
    expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "⚠️ Disk space below 15%"
      # $value is the available/size ratio from `expr`. Annotation templates
      # cannot embed raw PromQL selectors and have no `div` function, so the
      # ratio is taken from the alert sample instead of being recomputed.
      description: "Only {{ $value | humanizePercentage }} disk space remaining."

  # Less than 5% of the root filesystem available for 1 minute.
  - alert: DiskSpaceCritical
    expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.05
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "🔴 Disk space CRITICAL (< 5%)"

  # Used memory (1 - MemAvailable / MemTotal) above 90% for 5 minutes.
  - alert: HighMemoryUsage
    expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "⚠️ Memory usage above 90%"

  # 100 minus the per-instance average idle CPU percentage (5m rate) above 85
  # for 10 minutes.
  - alert: HighCPUUsage
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "⚠️ CPU usage above 85% for 10 minutes"
# Datastore alerts for PostgreSQL and Redis.
- name: database
  rules:
  # Total connection count per instance above 80 for 5 minutes.
  # The series is summed so the threshold applies to the total rather than to
  # each individual series.
  # NOTE(review): assumes pg_stat_activity_count is split by database/state
  # as in postgres_exporter - confirm against the deployed exporter.
  - alert: PostgresConnectionsHigh
    expr: sum by (instance) (pg_stat_activity_count) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "⚠️ PostgreSQL connections above 80"

  # Mean statement execution time above 5 seconds for 5 minutes.
  # NOTE(review): pg_stat_statements_mean_exec_seconds is presumably produced
  # by a custom exporter query - confirm the metric exists on the target.
  - alert: PostgresSlowQueries
    expr: pg_stat_statements_mean_exec_seconds > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "⚠️ PostgreSQL slow queries detected (avg > 5s)"

  # Redis memory above 85% of its configured limit for 5 minutes.
  # The `and ... > 0` guard skips instances with no limit set: a max of 0
  # would make the division +Inf and fire the alert permanently.
  - alert: RedisMemoryHigh
    expr: (redis_memory_used_bytes / redis_memory_max_bytes > 0.85) and (redis_memory_max_bytes > 0)
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "⚠️ Redis memory above 85% of limit"
# Application-level alerts: HTTP errors, Node.js heap and container restarts.
- name: application
  rules:
  # Share of HTTP responses with a 5xx status above 5% for 3 minutes.
  # Computed as 5xx rate / total rate per job, so the threshold is a ratio
  # (as the summary states) and not an absolute requests-per-second value.
  - alert: HighErrorRate
    expr: |
      sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
      /
      sum by (job) (rate(http_requests_total[5m]))
      > 0.05
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "⚠️ HTTP 5xx error rate above 5%"

  # Heap used / heap total above 90% for 5 minutes.
  - alert: AppHighHeapMemory
    expr: memento_process_heap_used_bytes / memento_process_heap_total_bytes > 0.90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "⚠️ Next.js heap usage above 90%"

  # Restart counter increased within the last hour.
  # NOTE(review): container_restart_count is not a metric name I can confirm
  # from this file - verify which exporter provides it and that it carries a
  # `name` label, which the description relies on.
  - alert: ContainerRestarted
    expr: increase(container_restart_count[1h]) > 0
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "⚠️ Container restarted in the last hour"
      description: "Container {{ $labels.name }} restarted unexpectedly."
# ── Business Alerts ──────────────────────────────────────────────────────────
# Alerts built on the memento_* business gauges.
- name: business
  rules:
  # More than 10 cancellations this month; fires immediately (for: 0m).
  - alert: HighChurnRate
    expr: memento_churn_this_month > 10
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "📉 Churn élevé ce mois : {{ $value }} désabonnements"
      description: "Plus de 10 désabonnements enregistrés ce mois. Investiguer les raisons."

  # Seven-day sign-up gauge at zero for one hour.
  - alert: NoNewUsersLast7Days
    expr: memento_new_users_7d == 0
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: "📊 Aucune nouvelle inscription depuis 7 jours"
      description: "Vérifier le funnel d'onboarding et les canaux d'acquisition."

  # Agent run error ratio above 20% for 5 minutes:
  #   error / (success + error + 1)
  # The `status` label is aggregated away on both sides. Series with
  # different `status` values never pair up under one-to-one vector
  # matching, so without the aggregation the expression is always empty.
  # The `+ 1` keeps the denominator non-zero when there are no runs.
  - alert: AgentRunsHighErrorRate
    expr: |
      sum without (status) (memento_agent_runs_30d{status="error"})
      /
      (sum without (status) (memento_agent_runs_30d{status=~"success|error"}) + 1)
      > 0.2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "🤖 Taux d'erreur agents IA > 20% ce mois"
      description: "{{ $value | humanizePercentage }} des runs d'agents échouent."

  # Business-metrics error flag equal to 1 for 5 minutes.
  - alert: BusinessMetricsCollectionFailed
    expr: memento_business_metrics_error == 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "⚠️ Collecte métriques business en erreur"
      description: "L'endpoint /api/metrics ne peut pas interroger la base pour les métriques business."