Files
Momento/monitoring/alerts.yml
Antigravity 8950e83db5
Some checks failed
CI / Deploy production (on server) (push) Has been cancelled
CI / Lint, Test & Build (push) Has been cancelled
feat: P0 backup system (WAL+snapshot+restore+verify), monitoring stack, admin health API
2026-05-17 14:13:01 +00:00

75 lines
2.0 KiB
YAML

---
# Prometheus alerting rules: availability checks for the app, PostgreSQL and
# Redis, plus host resource, database, HTTP error-rate and container alerts.
#
# NOTE(review): the group is named `critical` but also holds rules labelled
# `severity: warning`; the name is kept unchanged so existing references to
# the group keep working.
groups:
  - name: critical
    rules:
      # --- Availability: the scrape target reports down (`up == 0`) ---
      - alert: MementoAppDown
        expr: up{job="memento-app"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Memento app is DOWN"

      - alert: PostgresDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL is DOWN"

      - alert: RedisDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis is DOWN"

      # --- Host resources ---
      # Fires when the available/size ratio of the root filesystem stays
      # below 0.15 for 5 minutes.
      - alert: DiskSpaceLow
        expr: >-
          (node_filesystem_avail_bytes{mountpoint="/"}
          / node_filesystem_size_bytes{mountpoint="/"})
          < 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space below 15%"

      # Used fraction is computed as 1 - MemAvailable / MemTotal.
      - alert: HighMemoryUsage
        expr: >-
          (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
          > 0.90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Memory usage above 90%"

      # --- PostgreSQL ---
      # NOTE(review): the comparison is applied to each series of
      # pg_stat_activity_count separately. If the exporter splits this metric
      # by database/state, the total connection count is never checked;
      # confirm whether an aggregation (e.g. sum by (instance)) is intended.
      - alert: PostgresConnectionsHigh
        expr: pg_stat_activity_count > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL connections above 80"

      # NOTE(review): pg_stat_statements_mean_exec_seconds looks like a
      # custom exporter query; verify the metric name and that its unit is
      # seconds.
      - alert: PostgresSlowQueries
        expr: pg_stat_statements_mean_exec_seconds > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL slow queries detected"

      # --- HTTP ---
      # Ratio of 5xx responses to all responses, per job. The earlier form
      # compared the raw 5xx requests-per-second rate of each series with
      # 0.05, which is a throughput threshold rather than the 5% share the
      # summary describes.
      - alert: HighErrorRate
        expr: >-
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
          / sum by (job) (rate(http_requests_total[5m]))
          > 0.05
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "HTTP 5xx error rate above 5%"

      # --- Containers ---
      # NOTE(review): confirm that container_restart_count is actually
      # exported by the scraped targets; if the metric does not exist this
      # rule can never fire.
      - alert: ContainerRestarted
        expr: increase(container_restart_count[1h]) > 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Container restarted in the last hour"