---
# Wordly.art - Prometheus Alert Rules
#
# Three rule groups are defined below:
#   wordly_app    - backend availability, HTTP error ratio, translation
#                   latency and translation queue depth
#   wordly_system - host memory, root-filesystem space and CPU
#   wordly_docker - container restarts and OOM kills
#
# Every rule carries a `severity` label (`warning` or `critical`) and
# `summary` / `description` annotations.

groups:
  # Application alerts
  - name: wordly_app
    rules:
      # The scrape target for job "wordly-backend" has reported up == 0
      # continuously for 2 minutes.
      - alert: BackendDown
        expr: up{job="wordly-backend"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Wordly backend is down"
          description: "Backend has been down for more than 2 minutes."

      # Overall share of 5xx responses across all series.
      # Both sides are wrapped in sum(): without aggregation the division
      # matches series on identical label sets, so each 5xx series would be
      # divided by itself and the ratio would be 1 whenever any 5xx occurs.
      - alert: HighErrorRate
        expr: |-
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum(rate(http_requests_total[5m]))
          > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "More than 10% of requests are returning 5xx errors."

      # 95th percentile of translation_duration_seconds over a 5m rate
      # window, compared against 120 (seconds, per the metric name).
      - alert: SlowTranslations
        expr: histogram_quantile(0.95, rate(translation_duration_seconds_bucket[5m])) > 120
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Translations are slow"
          description: "95th percentile translation time is over 120 seconds."

      # Queue gauge has stayed above 20 for 5 minutes.
      - alert: HighTranslationQueue
        expr: translation_queue_size > 20
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Translation queue is backing up"
          description: "More than 20 translations queued."

  # System alerts
  - name: wordly_system
    rules:
      # Used memory fraction = (total - available) / total.
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Server memory usage is above 90%."

      # Free-space fraction on the "/" mountpoint below 15%.
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Low disk space"
          description: "Less than 15% disk space remaining on /."

      # Same ratio as DiskSpaceLow with a tighter threshold (5%) and a
      # shorter hold time; both alerts fire together below 5%.
      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Critical disk space"
          description: "Less than 5% disk space remaining on /."

      # CPU busy percentage per instance: 100 minus the average idle rate
      # (as a percentage) across that instance's idle-mode series.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage"
          description: "CPU usage is above 85% for 10 minutes."

  # Docker alerts
  - name: wordly_docker
    rules:
      # More than 2 restarts counted within the trailing hour.
      # NOTE(review): confirm the container exporter in use actually exposes
      # `container_restart_count` and a `name` label - not verifiable here.
      - alert: ContainerRestarted
        expr: increase(container_restart_count[1h]) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container restarting"
          description: "Container {{ $labels.name }} has restarted more than 2 times in the last hour."

      # Any increase of the OOM event counter within the trailing hour.
      - alert: ContainerOOM
        expr: increase(container_oom_events_total[1h]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Container OOM killed"
          description: "Container {{ $labels.name }} was OOM killed."