- Restructured docker-compose for Nginx Proxy Manager (no custom nginx) - Added domain wordly.art configuration - Added Prometheus + Grafana monitoring stack with pre-configured dashboards - Added PostgreSQL backup script to NAS (daily/weekly/monthly rotation) - Added alert rules for backend, system, and Docker metrics - Updated deployment guide for NPM + IONOS DNS homelab setup - Added marketing plan document - PDF translator and watermark support - Enhanced middleware, routes, and translator modules Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
102 lines
3.2 KiB
YAML
102 lines
3.2 KiB
YAML
# Wordly.art - Prometheus alert rules
#
# Rule groups defined in this file:
#   wordly_app    - backend availability, HTTP error rate, translation latency/queue
#   wordly_system - host memory, disk and CPU
#   wordly_docker - container restarts and OOM kills

groups:
# Application alerts
- name: wordly_app
  rules:
  # The backend scrape target has reported up == 0 for 2 minutes.
  - alert: BackendDown
    expr: up{job="wordly-backend"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Wordly backend is down'
      description: 'Backend has been down for more than 2 minutes.'

  # Share of 5xx responses among all requests, per job.
  # Both sides are summed before dividing: without aggregation the division
  # matches series on every label (including `status`), so each 5xx series is
  # only ever divided by itself and the ratio is 1 regardless of real traffic.
  - alert: HighErrorRate
    expr: >-
      sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
      /
      sum by (job) (rate(http_requests_total[5m]))
      > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High error rate detected'
      description: 'More than 10% of requests are returning 5xx errors.'

  # 95th percentile of translation_duration_seconds (5m rate) above 120 s,
  # sustained for 10 minutes.
  - alert: SlowTranslations
    expr: histogram_quantile(0.95, rate(translation_duration_seconds_bucket[5m])) > 120
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Translations are slow'
      description: '95th percentile translation time is over 120 seconds.'

  # translation_queue_size has stayed above 20 for 5 minutes.
  - alert: HighTranslationQueue
    expr: translation_queue_size > 20
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Translation queue is backing up'
      description: 'More than 20 translations queued.'

# System alerts
- name: wordly_system
  rules:
  # Used memory (MemTotal - MemAvailable) exceeds 90% of MemTotal for 5 minutes.
  - alert: HighMemoryUsage
    expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High memory usage'
      description: 'Server memory usage is above 90%.'

  # Available space on the "/" mountpoint is below 15% of its size for 10 minutes.
  - alert: DiskSpaceLow
    expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Low disk space'
      description: 'Less than 15% disk space remaining on /.'

  # Same ratio as DiskSpaceLow with a 5% threshold, a 5 minute hold and
  # critical severity.
  - alert: DiskSpaceCritical
    expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Critical disk space'
      description: 'Less than 5% disk space remaining on /.'

  # 100 minus the per-instance average idle CPU percentage (5m rate) is
  # above 85 for 10 minutes.
  - alert: HighCPUUsage
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU usage'
      description: 'CPU usage is above 85% for 10 minutes.'

# Docker alerts
- name: wordly_docker
  rules:
  # container_restart_count grew by more than 2 within the last hour, and the
  # condition has held for 5 minutes.
  # NOTE(review): confirm the container exporter being scraped actually
  # exposes container_restart_count; if it does not, this rule never fires.
  - alert: ContainerRestarted
    expr: increase(container_restart_count[1h]) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Container restarting'
      description: 'Container {{ $labels.name }} has restarted more than 2 times in the last hour.'

  # container_oom_events_total increased within the last hour, and the
  # condition has held for 1 minute.
  - alert: ContainerOOM
    expr: increase(container_oom_events_total[1h]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'Container OOM killed'
      description: 'Container {{ $labels.name }} was OOM killed.'