Files
office_translator/docker/prometheus/alerts.yml
sepehr ce8e150a61 feat: homelab deployment - NPM + IONOS DNS + monitoring + NAS backup
- Restructured docker-compose for Nginx Proxy Manager (no custom nginx)
- Added domain wordly.art configuration
- Added Prometheus + Grafana monitoring stack with pre-configured dashboards
- Added PostgreSQL backup script to NAS (daily/weekly/monthly rotation)
- Added alert rules for backend, system, and Docker metrics
- Updated deployment guide for NPM + IONOS DNS homelab setup
- Added marketing plan document
- PDF translator and watermark support
- Enhanced middleware, routes, and translator modules

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-10 11:43:28 +02:00

102 lines
3.2 KiB
YAML

# Wordly.art - Prometheus Alert Rules
groups:
# Application alerts: backend availability, HTTP error ratio, translation
# latency and translation queue depth.
- name: wordly_app
  rules:
  # The `up` series for the wordly-backend scrape job has been 0 for 2 minutes.
  - alert: BackendDown
    expr: up{job="wordly-backend"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Wordly backend is down'
      description: 'Backend has been down for more than 2 minutes.'
  # Share of requests answered with a 5xx status over the last 5 minutes.
  # Both sides are aggregated before dividing: without aggregation each 5xx
  # series is matched only against the identically-labelled series on the
  # right-hand side, so the ratio is always 1 and the threshold is meaningless.
  - alert: HighErrorRate
    expr: >-
      sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
      / sum by (job) (rate(http_requests_total[5m])) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High error rate detected'
      description: 'More than 10% of requests are returning 5xx errors.'
  # 95th percentile of the translation duration histogram, evaluated over a
  # 5-minute rate window, stays above 120 seconds for 10 minutes.
  # NOTE(review): no aggregation is applied, so the quantile is computed per
  # label set of the histogram - confirm that is the intended granularity.
  - alert: SlowTranslations
    expr: >-
      histogram_quantile(0.95,
      rate(translation_duration_seconds_bucket[5m])) > 120
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Translations are slow'
      description: '95th percentile translation time is over 120 seconds.'
  # The translation_queue_size gauge stays above 20 for 5 minutes.
  - alert: HighTranslationQueue
    expr: translation_queue_size > 20
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Translation queue is backing up'
      description: 'More than 20 translations queued.'
# System alerts: host memory, root filesystem space and CPU, built on
# node_* metrics.
- name: wordly_system
  rules:
  # Used memory (total minus available) exceeds 90% of total for 5 minutes.
  - alert: HighMemoryUsage
    expr: >-
      (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
      / node_memory_MemTotal_bytes > 0.9
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High memory usage'
      description: 'Server memory usage is above 90%.'
  # Available space on the "/" mountpoint is below 15% for 10 minutes.
  # This condition is also true when DiskSpaceCritical (< 5%) is true, so
  # both alerts can be active at the same time.
  - alert: DiskSpaceLow
    expr: >-
      (node_filesystem_avail_bytes{mountpoint="/"}
      / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Low disk space'
      description: 'Less than 15% disk space remaining on /.'
  # Available space on the "/" mountpoint is below 5% for 5 minutes.
  - alert: DiskSpaceCritical
    expr: >-
      (node_filesystem_avail_bytes{mountpoint="/"}
      / node_filesystem_size_bytes{mountpoint="/"}) < 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Critical disk space'
      description: 'Less than 5% disk space remaining on /.'
  # 100 minus the per-instance average idle-CPU percentage (5-minute rate)
  # stays above 85 for 10 minutes.
  - alert: HighCPUUsage
    expr: >-
      100 - (avg by(instance)
      (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU usage'
      description: 'CPU usage is above 85% for 10 minutes.'
# Docker alerts: container restarts and OOM events, built on container_*
# metrics.
- name: wordly_docker
  rules:
  # The restart counter grew by more than 2 within the last hour, and that
  # has held for 5 minutes.
  # NOTE(review): confirm that the container exporter in use actually exposes
  # `container_restart_count`; if the metric is absent this rule never fires.
  - alert: ContainerRestarted
    expr: increase(container_restart_count[1h]) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Container restarting'
      description: 'Container {{ $labels.name }} has restarted more than 2 times in the last hour.'
  # At least one OOM event was counted within the last hour, held for 1 minute.
  - alert: ContainerOOM
    expr: increase(container_oom_events_total[1h]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'Container OOM killed'
      description: 'Container {{ $labels.name }} was OOM killed.'