---
# Prometheus alerting rules: scrape-target availability, host disk and
# memory, PostgreSQL health, HTTP 5xx ratio and container restarts.
#
# NOTE(review): the group is named "critical" but also contains
# severity=warning rules. The name is kept unchanged here; consider
# splitting by severity if routing or dashboards key off the group name.
groups:
  - name: critical
    rules:
      # ---- Availability -------------------------------------------------
      # Each rule fires when the matching scrape job reports up == 0 for
      # the whole "for" window.
      # NOTE(review): "up == 0" yields no series if the target vanishes
      # from service discovery entirely -- confirm whether an absent()
      # guard is wanted.
      - alert: MementoAppDown
        expr: up{job="memento-app"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Memento app is DOWN"

      - alert: PostgresDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL is DOWN"

      - alert: RedisDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis is DOWN"

      # ---- Host resources -----------------------------------------------
      # Fraction of the root filesystem still available drops below 0.15.
      - alert: DiskSpaceLow
        expr: |-
          (
            node_filesystem_avail_bytes{mountpoint="/"}
              /
            node_filesystem_size_bytes{mountpoint="/"}
          ) < 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space below 15%"

      # Used-memory fraction (1 - available / total) exceeds 0.90.
      - alert: HighMemoryUsage
        expr: |-
          (
            1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes
          ) > 0.90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Memory usage above 90%"

      # ---- PostgreSQL ---------------------------------------------------
      # NOTE(review): the threshold is applied to each
      # pg_stat_activity_count series separately. If the exporter splits
      # this metric by labels (e.g. database/state), a sum() may be needed
      # to match "connections above 80" -- confirm against the exporter.
      - alert: PostgresConnectionsHigh
        expr: pg_stat_activity_count > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL connections above 80"

      # Fires when pg_stat_statements_mean_exec_seconds stays above 5.
      - alert: PostgresSlowQueries
        expr: pg_stat_statements_mean_exec_seconds > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL slow queries detected"

      # ---- HTTP ---------------------------------------------------------
      # Ratio of 5xx responses to all responses, per job, over 5 minutes.
      # The previous expression compared the raw per-series 5xx rate
      # (requests/second) with 0.05, which is not a percentage and did not
      # match the "above 5%" summary.
      - alert: HighErrorRate
        expr: |-
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum by (job) (rate(http_requests_total[5m]))
            > 0.05
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "HTTP 5xx error rate above 5%"

      # ---- Containers ---------------------------------------------------
      # Any increase of the restart counter within the last hour.
      # NOTE(review): confirm that "container_restart_count" is the metric
      # name actually exported in this environment.
      - alert: ContainerRestarted
        expr: increase(container_restart_count[1h]) > 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Container restarted in the last hour"