# Prometheus alerting rules: availability ("critical") and host resources.
groups:
  # Availability of the core services, based on the scrape `up` metric.
  - name: critical
    rules:
      # The memento-app scrape target has reported up == 0 for 2 minutes.
      - alert: MementoAppDown
        expr: up{job="memento-app"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "🔴 Memento app is DOWN"
          description: "The Next.js application has been unreachable for 2+ minutes."

      # The postgres scrape target has reported up == 0 for 1 minute.
      - alert: PostgresDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "🔴 PostgreSQL is DOWN"
          description: "Database has been unreachable for 1+ minute."

      # The redis scrape target has reported up == 0 for 1 minute.
      - alert: RedisDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "🔴 Redis is DOWN"
          description: "Redis cache/quota store has been unreachable for 1+ minute."

  # Host-level resource thresholds on node_* metrics.
  - name: resources
    rules:
      # Less than 15% of the root filesystem has been available for 5 minutes.
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ Disk space below 15%"
          # $value is the available/size ratio computed by `expr`; annotation
          # templates cannot evaluate PromQL selectors themselves.
          description: "Only {{ $value | humanizePercentage }} disk space remaining."
# Remaining "resources" rules, followed by the "database" and "application" groups.
      # Less than 5% of the root filesystem has been available for 1 minute.
      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.05
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "🔴 Disk space CRITICAL (< 5%)"

      # Used memory (1 - MemAvailable / MemTotal) above 90% for 5 minutes.
      - alert: HighMemoryUsage
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ Memory usage above 90%"

      # Non-idle CPU share, averaged per instance over 5m, above 85% for 10 minutes.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ CPU usage above 85% for 10 minutes"

  # Thresholds on PostgreSQL and Redis exporter metrics.
  - name: database
    rules:
      # Any pg_stat_activity_count series above 80 for 5 minutes.
      - alert: PostgresConnectionsHigh
        expr: pg_stat_activity_count > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ PostgreSQL connections above 80"

      # Mean statement execution time above 5 seconds for 5 minutes.
      - alert: PostgresSlowQueries
        expr: pg_stat_statements_mean_exec_seconds > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ PostgreSQL slow queries detected (avg > 5s)"

      # Used memory above 85% of the configured limit for 5 minutes.
      # The `> 0` filter drops series whose limit is 0: dividing by zero
      # yields +Inf, which would otherwise always exceed the threshold.
      - alert: RedisMemoryHigh
        expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ Redis memory above 85% of limit"

  # Application-level signals (HTTP errors, heap usage, container restarts).
  - name: application
    rules:
      # Share of 5xx responses among all requests, per job/instance, above 5%
      # for 3 minutes. A ratio is required here: a bare rate() is requests per
      # second for a single series, not a percentage.
      - alert: HighErrorRate
        expr: |
          sum by (job, instance) (rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum by (job, instance) (rate(http_requests_total[5m]))
            > 0.05
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ HTTP 5xx error rate above 5%"

      # Heap used / heap total above 90% for 5 minutes.
      - alert: AppHighHeapMemory
        expr: memento_process_heap_used_bytes / memento_process_heap_total_bytes > 0.90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ Next.js heap usage above 90%"

      # The restart counter increased at least once within the last hour.
      - alert: ContainerRestarted
        expr: increase(container_restart_count[1h]) > 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ Container restarted in the last hour"
          description: "Container {{ $labels.name }} restarted unexpectedly."
# ── Business Alerts ──────────────────────────────────────────────────────────
  # Alerts on product/business gauges exported as memento_* metrics.
  - name: business
    rules:
      # More than 10 cancellations this month; fires immediately (for: 0m).
      - alert: HighChurnRate
        expr: memento_churn_this_month > 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "📉 Churn élevé ce mois : {{ $value }} désabonnements"
          description: "Plus de 10 désabonnements enregistrés ce mois. Investiguer les raisons."

      # The 7-day new-user gauge has been 0 for 1 hour.
      - alert: NoNewUsersLast7Days
        expr: memento_new_users_7d == 0
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "📊 Aucune nouvelle inscription depuis 7 jours"
          description: "Vérifier le funnel d'onboarding et les canaux d'acquisition."

      # Agent-run error ratio above 20% for 5 minutes.
      # Both sides aggregate away `status`: series with different `status`
      # values never match in a plain binary operation, which would leave the
      # expression permanently empty. The `+ 1` in the denominator avoids a
      # division by zero when there are no runs.
      - alert: AgentRunsHighErrorRate
        expr: |
          sum without (status) (memento_agent_runs_30d{status="error"})
            /
          (sum without (status) (memento_agent_runs_30d{status=~"success|error"}) + 1)
            > 0.2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "🤖 Taux d'erreur agents IA > 20% ce mois"
          description: "{{ $value | humanizePercentage }} des runs d'agents échouent."

      # The business-metrics error flag has been 1 for 5 minutes.
      - alert: BusinessMetricsCollectionFailed
        expr: memento_business_metrics_error == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ Collecte métriques business en erreur"
          description: "L'endpoint /api/metrics ne peut pas interroger la base pour les métriques business."