feat(monitoring): business metrics + hardening sécurité
All checks were successful
CI / Lint, Unit Tests & Build (push) Successful in 5m21s
CI / Deploy production (on server) (push) Successful in 3m43s

Métriques business dans /api/metrics :
- Abonnements par tier/status (BASIC/PRO/ENTERPRISE × ACTIVE/CANCELED)
- Nouveaux abonnements ce mois vs mois dernier
- Désabonnements / churn ce mois vs mois dernier
- Utilisateurs actifs 7j / 30j (proxy : note modifiée)
- Nouvelles inscriptions 7j / ce mois
- Runs agents IA par status (30j + aujourd'hui) + tokens consommés
- Usage IA par feature (requêtes + tokens ce mois)
- Logins aujourd'hui / ce mois (via AuditLog)
- Sessions brainstorm ce mois
- Flashcards total + reviews ce mois

Alertes Prometheus :
- HighChurnRate (> 10 désabonnements ce mois)
- NoNewUsersLast7Days (aucune inscription 7j)
- AgentRunsHighErrorRate (> 20% erreurs agents)
- BusinessMetricsCollectionFailed

Hardening monitoring :
- Ports monitoring → 127.0.0.1 (plus exposés publiquement)
- Images pinned (prometheus v2.53.0, grafana 11.1.0, etc.)
- alertmanager-bridge fake → metalmatze/alertmanager-bot:0.4.3
- /api/metrics sécurisé avec METRICS_TOKEN bearer
- Prometheus auth bearer via credentials_file
- Redis AOF + 256mb, healthcheck → /api/build-info
- repeat_interval 4h, inhibit_rules alertmanager
- Secrets CI/CD : AUTH_GOOGLE_SECRET, METRICS_TOKEN, GRAFANA_ADMIN_PASSWORD, MCP_API_KEY

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Antigravity
2026-05-29 14:49:34 +00:00
parent 8571080037
commit 79fd6553b7
9 changed files with 352 additions and 51 deletions

View File

@@ -174,6 +174,9 @@ jobs:
NEXT_PUBLIC_SOCKET_URL: ${{ vars.NEXT_PUBLIC_SOCKET_URL }} NEXT_PUBLIC_SOCKET_URL: ${{ vars.NEXT_PUBLIC_SOCKET_URL }}
TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }} TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }} TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }}
METRICS_TOKEN: ${{ secrets.METRICS_TOKEN }}
GRAFANA_ADMIN_PASSWORD: ${{ secrets.GRAFANA_ADMIN_PASSWORD }}
MCP_API_KEY: ${{ secrets.MCP_API_KEY }}
run: | run: |
ENV_FILE="/opt/memento/.env.docker" ENV_FILE="/opt/memento/.env.docker"
touch "$ENV_FILE" touch "$ENV_FILE"
@@ -226,6 +229,11 @@ jobs:
upsert NEXT_PUBLIC_SOCKET_URL "$NEXT_PUBLIC_SOCKET_URL" upsert NEXT_PUBLIC_SOCKET_URL "$NEXT_PUBLIC_SOCKET_URL"
upsert TELEGRAM_BOT_TOKEN "$TELEGRAM_BOT_TOKEN" upsert TELEGRAM_BOT_TOKEN "$TELEGRAM_BOT_TOKEN"
upsert TELEGRAM_CHAT_ID "$TELEGRAM_CHAT_ID" upsert TELEGRAM_CHAT_ID "$TELEGRAM_CHAT_ID"
upsert METRICS_TOKEN "$METRICS_TOKEN"
upsert GRAFANA_ADMIN_PASSWORD "$GRAFANA_ADMIN_PASSWORD"
upsert MCP_API_KEY "$MCP_API_KEY"
# Write the metrics token file that Prometheus reads via credentials_file (same secret as METRICS_TOKEN).
# The file is created under umask 077 so the token is never readable by other users, even briefly,
# and a failed write is no longer masked by a trailing `|| true`.
# NOTE(review): the Prometheus container may run as a non-root user — confirm it can read a 0600 file owned by the deploy user.
if [ -n "$METRICS_TOKEN" ]; then
  mkdir -p /opt/memento/monitoring
  ( umask 077 && printf '%s\n' "$METRICS_TOKEN" > /opt/memento/monitoring/metrics-token )
  # Also tighten permissions when the file already existed with a wider mode.
  chmod 600 /opt/memento/monitoring/metrics-token
fi
- name: Deploy on 192.168.1.190 - name: Deploy on 192.168.1.190
env: env:

View File

@@ -61,6 +61,9 @@ jobs:
NEXT_PUBLIC_SOCKET_URL: ${{ vars.NEXT_PUBLIC_SOCKET_URL }} NEXT_PUBLIC_SOCKET_URL: ${{ vars.NEXT_PUBLIC_SOCKET_URL }}
TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }} TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }} TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }}
METRICS_TOKEN: ${{ secrets.METRICS_TOKEN }}
GRAFANA_ADMIN_PASSWORD: ${{ secrets.GRAFANA_ADMIN_PASSWORD }}
MCP_API_KEY: ${{ secrets.MCP_API_KEY }}
run: | run: |
ENV_FILE="/opt/memento/.env.docker" ENV_FILE="/opt/memento/.env.docker"
touch "$ENV_FILE" touch "$ENV_FILE"
@@ -113,6 +116,10 @@ jobs:
upsert REDIS_HOST "redis" upsert REDIS_HOST "redis"
upsert TELEGRAM_BOT_TOKEN "$TELEGRAM_BOT_TOKEN" upsert TELEGRAM_BOT_TOKEN "$TELEGRAM_BOT_TOKEN"
upsert TELEGRAM_CHAT_ID "$TELEGRAM_CHAT_ID" upsert TELEGRAM_CHAT_ID "$TELEGRAM_CHAT_ID"
upsert METRICS_TOKEN "$METRICS_TOKEN"
upsert GRAFANA_ADMIN_PASSWORD "$GRAFANA_ADMIN_PASSWORD"
upsert MCP_API_KEY "$MCP_API_KEY"
# Write the metrics token file that Prometheus reads via credentials_file.
# Created under umask 077 so the secret is never world-readable; write errors are not swallowed.
# NOTE(review): confirm the Prometheus container user can read a 0600 file owned by the deploy user.
if [ -n "$METRICS_TOKEN" ]; then
  mkdir -p /opt/memento/monitoring
  ( umask 077 && printf '%s\n' "$METRICS_TOKEN" > /opt/memento/monitoring/metrics-token )
  # Also tighten permissions when the file already existed with a wider mode.
  chmod 600 /opt/memento/monitoring/metrics-token
fi
- name: Deploy (full build, no CI artifact) - name: Deploy (full build, no CI artifact)
env: env:

1
.gitignore vendored
View File

@@ -50,3 +50,4 @@ docker-data/
# Misc # Misc
*.tsbuildinfo *.tsbuildinfo
next-env.d.ts next-env.d.ts
monitoring/metrics-token

View File

@@ -30,7 +30,7 @@ services:
image: redis:7-alpine image: redis:7-alpine
container_name: memento-redis container_name: memento-redis
restart: unless-stopped restart: unless-stopped
command: redis-server --maxmemory 128mb --maxmemory-policy allkeys-lru command: redis-server --maxmemory 256mb --maxmemory-policy allkeys-lru --appendonly yes --appendfsync everysec
volumes: volumes:
- redis-data:/data - redis-data:/data
ports: ports:
@@ -73,7 +73,7 @@ services:
condition: service_healthy condition: service_healthy
restart: unless-stopped restart: unless-stopped
healthcheck: healthcheck:
test: ["CMD", "node", "-e", "require('http').get('http://localhost:3000/',r=>process.exit(r.statusCode<500?0:1)).on('error',()=>process.exit(1))"] test: ["CMD", "node", "-e", "require('http').get('http://localhost:3000/api/build-info',r=>process.exit(r.statusCode<500?0:1)).on('error',()=>process.exit(1))"]
interval: 15s interval: 15s
timeout: 10s timeout: 10s
retries: 5 retries: 5
@@ -150,7 +150,7 @@ services:
cpus: '0.25' cpus: '0.25'
memory: 128M memory: 128M
healthcheck: healthcheck:
test: ["CMD-SHELL", "wget --header \"x-api-key: 1b11f42537c1442456ea413feee75bac\" -q -O /dev/null http://localhost:3001/ || exit 1"] test: ["CMD-SHELL", "wget --header \"x-api-key: ${MCP_API_KEY:-dev-key}\" -q -O /dev/null http://localhost:3001/ || exit 1"]
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 3 retries: 3

View File

@@ -4,7 +4,17 @@ import { redis } from '@/lib/redis'
export const dynamic = 'force-dynamic' export const dynamic = 'force-dynamic'
export async function GET() { export async function GET(req: Request) {
// Secure endpoint with bearer token (METRICS_TOKEN env var).
// When METRICS_TOKEN is unset the check is skipped and the endpoint stays open.
const metricsToken = process.env.METRICS_TOKEN
if (metricsToken) {
  const authHeader = req.headers.get('authorization') ?? ''
  const token = authHeader.startsWith('Bearer ') ? authHeader.slice(7) : ''
  // Compare SHA-256 digests with timingSafeEqual: both buffers always have the
  // same length, and the comparison time does not depend on how many leading
  // characters of the presented token are correct.
  const { createHash, timingSafeEqual } = await import('node:crypto')
  const digest = (value: string) => createHash('sha256').update(value).digest()
  if (!timingSafeEqual(digest(token), digest(metricsToken))) {
    return new NextResponse('Unauthorized', { status: 401 })
  }
}
const lines: string[] = [] const lines: string[] = []
const metric = (name: string, help: string, type: string, value: number | string, labels = '') => { const metric = (name: string, help: string, type: string, value: number | string, labels = '') => {
@@ -13,10 +23,19 @@ export async function GET() {
lines.push(labels ? `${name}{${labels}} ${value}` : `${name} ${value}`) lines.push(labels ? `${name}{${labels}} ${value}` : `${name} ${value}`)
} }
// Uptime // Multiple labeled values for the same metric name
// Emits one HELP/TYPE header for `name`, then one sample line per row,
// each row supplying its own pre-formatted label string and numeric value.
const metricLabeled = (name: string, help: string, type: string, rows: Array<{ labels: string; value: number }>) => {
  const header = [`# HELP ${name} ${help}`, `# TYPE ${name} ${type}`]
  const samples = rows.map(({ labels, value }) => `${name}{${labels}} ${value}`)
  lines.push(...header, ...samples)
}
// ── Uptime ──────────────────────────────────────────────────────────────
metric('memento_uptime_seconds', 'Application uptime in seconds', 'gauge', process.uptime().toFixed(2)) metric('memento_uptime_seconds', 'Application uptime in seconds', 'gauge', process.uptime().toFixed(2))
// Database // ── Infrastructure ───────────────────────────────────────────────────────
try { try {
const dbStart = Date.now() const dbStart = Date.now()
const [noteCount, notebookCount, userCount] = await Promise.all([ const [noteCount, notebookCount, userCount] = await Promise.all([
@@ -34,7 +53,6 @@ export async function GET() {
metric('memento_db_up', 'Database connectivity (1=up, 0=down)', 'gauge', 0) metric('memento_db_up', 'Database connectivity (1=up, 0=down)', 'gauge', 0)
} }
// Redis
try { try {
const redisStart = Date.now() const redisStart = Date.now()
await redis.ping() await redis.ping()
@@ -51,12 +69,171 @@ export async function GET() {
metric('memento_redis_up', 'Redis connectivity (1=up, 0=down)', 'gauge', 0) metric('memento_redis_up', 'Redis connectivity (1=up, 0=down)', 'gauge', 0)
} }
// Node.js process memory
const mem = process.memoryUsage() const mem = process.memoryUsage()
metric('memento_process_heap_used_bytes', 'Node.js heap used in bytes', 'gauge', mem.heapUsed) metric('memento_process_heap_used_bytes', 'Node.js heap used in bytes', 'gauge', mem.heapUsed)
metric('memento_process_heap_total_bytes', 'Node.js heap total in bytes', 'gauge', mem.heapTotal) metric('memento_process_heap_total_bytes', 'Node.js heap total in bytes', 'gauge', mem.heapTotal)
metric('memento_process_rss_bytes', 'Node.js RSS memory in bytes', 'gauge', mem.rss) metric('memento_process_rss_bytes', 'Node.js RSS memory in bytes', 'gauge', mem.rss)
// ── Business metrics ─────────────────────────────────────────────────────
try {
// Time boundaries for the business metrics, computed in the server's local timezone.
const now = new Date()
const startOfMonth = new Date(now.getFullYear(), now.getMonth(), 1)
const startOfLastMonth = new Date(now.getFullYear(), now.getMonth() - 1, 1)
// Last representable millisecond of the previous month, so inclusive `lte`
// filters cover the whole month (23:59:59.000 would drop the final 999 ms).
const endOfLastMonth = new Date(startOfMonth.getTime() - 1)
// Rolling windows measured back from the current instant.
const last7days = new Date(now.getTime() - 7 * 24 * 60 * 60 * 1000)
const last30days = new Date(now.getTime() - 30 * 24 * 60 * 60 * 1000)
// ── Subscriptions par tier ──
const subsByTier = await prisma.subscription.groupBy({
by: ['tier', 'status'],
_count: { _all: true },
})
const subRows = subsByTier.map(r => ({
labels: `tier="${r.tier}",status="${r.status}"`,
value: r._count._all,
}))
metricLabeled(
'memento_subscriptions_total',
'Total subscriptions by tier and status',
'gauge',
subRows,
)
// Totaux agrégés utiles
const activeSubs = subsByTier
.filter(r => r.status === 'ACTIVE')
.reduce((s, r) => s + r._count._all, 0)
const canceledSubs = subsByTier
.filter(r => r.status === 'CANCELED')
.reduce((s, r) => s + r._count._all, 0)
metric('memento_subscriptions_active_total', 'Total active subscriptions (all tiers)', 'gauge', activeSubs)
metric('memento_subscriptions_canceled_total', 'Total canceled subscriptions (all tiers)', 'gauge', canceledSubs)
// ── Nouveaux abonnements ce mois ──
const newSubsThisMonth = await prisma.subscription.count({
where: { createdAt: { gte: startOfMonth }, status: 'ACTIVE' },
})
const newSubsLastMonth = await prisma.subscription.count({
where: { createdAt: { gte: startOfLastMonth, lte: endOfLastMonth }, status: 'ACTIVE' },
})
metric('memento_subscriptions_new_this_month', 'New active subscriptions created this month', 'gauge', newSubsThisMonth)
metric('memento_subscriptions_new_last_month', 'New active subscriptions created last month', 'gauge', newSubsLastMonth)
// ── Désabonnements (cancelAtPeriodEnd ou canceledAt ce mois) ──
const churnsThisMonth = await prisma.subscription.count({
where: {
OR: [
{ canceledAt: { gte: startOfMonth } },
{ cancelAtPeriodEnd: true, updatedAt: { gte: startOfMonth } },
],
},
})
const churnsLastMonth = await prisma.subscription.count({
where: {
OR: [
{ canceledAt: { gte: startOfLastMonth, lte: endOfLastMonth } },
{ cancelAtPeriodEnd: true, updatedAt: { gte: startOfLastMonth, lte: endOfLastMonth } },
],
},
})
metric('memento_churn_this_month', 'Cancellations / pending cancellations this month', 'gauge', churnsThisMonth)
metric('memento_churn_last_month', 'Cancellations / pending cancellations last month', 'gauge', churnsLastMonth)
// ── Utilisateurs actifs ──
const activeUsers7d = await prisma.note.groupBy({
by: ['userId'],
where: { updatedAt: { gte: last7days } },
})
const activeUsers30d = await prisma.note.groupBy({
by: ['userId'],
where: { updatedAt: { gte: last30days } },
})
metric('memento_active_users_7d', 'Users who modified at least one note in the last 7 days', 'gauge', activeUsers7d.length)
metric('memento_active_users_30d', 'Users who modified at least one note in the last 30 days', 'gauge', activeUsers30d.length)
// Nouveaux utilisateurs
const newUsers7d = await prisma.user.count({ where: { createdAt: { gte: last7days } } })
const newUsersThisMonth = await prisma.user.count({ where: { createdAt: { gte: startOfMonth } } })
metric('memento_new_users_7d', 'New user registrations in the last 7 days', 'gauge', newUsers7d)
metric('memento_new_users_this_month', 'New user registrations this month', 'gauge', newUsersThisMonth)
// ── Agents IA ──
const agentsByStatus = await prisma.agentAction.groupBy({
by: ['status'],
_count: { _all: true },
where: { createdAt: { gte: last30days } },
})
const agentRows = agentsByStatus.map(r => ({
labels: `status="${r.status}"`,
value: r._count._all,
}))
metricLabeled(
'memento_agent_runs_30d',
'Agent runs by status in the last 30 days',
'gauge',
agentRows,
)
const agentRunsToday = await prisma.agentAction.count({
where: { createdAt: { gte: new Date(now.getFullYear(), now.getMonth(), now.getDate()) } },
})
metric('memento_agent_runs_today', 'Agent runs triggered today', 'gauge', agentRunsToday)
// Tokens consommés par les agents
const agentTokens = await prisma.agentAction.aggregate({
_sum: { tokensUsed: true },
where: { createdAt: { gte: startOfMonth } },
})
metric('memento_agent_tokens_this_month', 'Total tokens consumed by agents this month', 'gauge', agentTokens._sum.tokensUsed ?? 0)
// ── Usage IA par feature (ce mois) ──
const usageByFeature = await prisma.usageLog.groupBy({
by: ['feature'],
_sum: { requestsCount: true, tokensUsed: true },
where: { periodStart: { gte: startOfMonth } },
})
const usageRequestRows = usageByFeature.map(r => ({
labels: `feature="${r.feature}"`,
value: r._sum.requestsCount ?? 0,
}))
const usageTokenRows = usageByFeature.map(r => ({
labels: `feature="${r.feature}"`,
value: r._sum.tokensUsed ?? 0,
}))
metricLabeled('memento_ai_requests_this_month', 'AI API requests by feature this month', 'gauge', usageRequestRows)
metricLabeled('memento_ai_tokens_this_month', 'AI tokens consumed by feature this month', 'gauge', usageTokenRows)
// ── Logins (AuditLog) ──
const loginsToday = await prisma.auditLog.count({
where: {
action: 'LOGIN',
createdAt: { gte: new Date(now.getFullYear(), now.getMonth(), now.getDate()) },
},
})
const loginsThisMonth = await prisma.auditLog.count({
where: { action: 'LOGIN', createdAt: { gte: startOfMonth } },
})
metric('memento_logins_today', 'Login events today', 'gauge', loginsToday)
metric('memento_logins_this_month', 'Login events this month', 'gauge', loginsThisMonth)
// ── Brainstorm sessions ──
const brainstormThisMonth = await prisma.brainstormSession.count({
where: { createdAt: { gte: startOfMonth } },
})
metric('memento_brainstorm_sessions_this_month', 'Brainstorm sessions created this month', 'gauge', brainstormThisMonth)
// ── Flashcards ──
const flashcardsTotal = await prisma.flashcard.count()
const flashcardsReviewedThisMonth = await prisma.flashcardReview.count({
where: { reviewedAt: { gte: startOfMonth } },
})
metric('memento_flashcards_total', 'Total flashcards in the system', 'gauge', flashcardsTotal)
metric('memento_flashcard_reviews_this_month', 'Flashcard review events this month', 'gauge', flashcardsReviewedThisMonth)
} catch (err) {
console.error('[metrics] Business metrics error:', err)
metric('memento_business_metrics_error', 'Business metrics collection failed (1=error)', 'gauge', 1)
}
const body = lines.join('\n') + '\n' const body = lines.join('\n') + '\n'
return new NextResponse(body, { return new NextResponse(body, {

View File

@@ -1,11 +1,23 @@
route: route:
receiver: 'telegram' receiver: 'telegram-bot'
group_wait: 10s group_wait: 10s
group_interval: 5m group_interval: 5m
repeat_interval: 4h
routes:
- match:
severity: critical
receiver: 'telegram-bot'
repeat_interval: 1h repeat_interval: 1h
receivers: receivers:
- name: 'telegram' - name: 'telegram-bot'
webhook_configs: webhook_configs:
- url: 'http://alertmanager-bridge:8080/alert' - url: 'http://alertmanager-telegram:8080/alerts'
send_resolved: true send_resolved: true
inhibit_rules:
- source_match:
severity: critical
target_match:
severity: warning
equal: ['alertname']

View File

@@ -7,7 +7,8 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "Memento app is DOWN" summary: "🔴 Memento app is DOWN"
description: "The Next.js application has been unreachable for 2+ minutes."
- alert: PostgresDown - alert: PostgresDown
expr: up{job="postgres"} == 0 expr: up{job="postgres"} == 0
@@ -15,7 +16,8 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "PostgreSQL is DOWN" summary: "🔴 PostgreSQL is DOWN"
description: "Database has been unreachable for 1+ minute."
- alert: RedisDown - alert: RedisDown
expr: up{job="redis"} == 0 expr: up{job="redis"} == 0
@@ -23,15 +25,27 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "Redis is DOWN" summary: "🔴 Redis is DOWN"
description: "Redis cache/quota store has been unreachable for 1+ minute."
- name: resources
rules:
- alert: DiskSpaceLow - alert: DiskSpaceLow
expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15 expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Disk space below 15%" summary: "⚠️ Disk space below 15%"
# $value is the available/size ratio produced by this rule's expr; annotation
# templates cannot embed PromQL selectors, so the value is formatted directly.
description: "Only {{ $value | humanizePercentage }} disk space remaining."
- alert: DiskSpaceCritical
expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.05
for: 1m
labels:
severity: critical
annotations:
summary: "🔴 Disk space CRITICAL (< 5%)"
- alert: HighMemoryUsage - alert: HighMemoryUsage
expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90 expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
@@ -39,15 +53,25 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Memory usage above 90%" summary: "⚠️ Memory usage above 90%"
- alert: HighCPUUsage
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
for: 10m
labels:
severity: warning
annotations:
summary: "⚠️ CPU usage above 85% for 10 minutes"
- name: database
rules:
- alert: PostgresConnectionsHigh - alert: PostgresConnectionsHigh
expr: pg_stat_activity_count > 80 expr: pg_stat_activity_count > 80
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "PostgreSQL connections above 80" summary: "⚠️ PostgreSQL connections above 80"
- alert: PostgresSlowQueries - alert: PostgresSlowQueries
expr: pg_stat_statements_mean_exec_seconds > 5 expr: pg_stat_statements_mean_exec_seconds > 5
@@ -55,15 +79,33 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "PostgreSQL slow queries detected" summary: "⚠️ PostgreSQL slow queries detected (avg > 5s)"
- alert: RedisMemoryHigh
expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.85
for: 5m
labels:
severity: warning
annotations:
summary: "⚠️ Redis memory above 85% of limit"
- name: application
rules:
- alert: HighErrorRate - alert: HighErrorRate
expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05 expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
for: 3m for: 3m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "HTTP 5xx error rate above 5%" summary: "⚠️ HTTP 5xx error rate above 5%"
- alert: AppHighHeapMemory
expr: memento_process_heap_used_bytes / memento_process_heap_total_bytes > 0.90
for: 5m
labels:
severity: warning
annotations:
summary: "⚠️ Next.js heap usage above 90%"
- alert: ContainerRestarted - alert: ContainerRestarted
expr: increase(container_restart_count[1h]) > 0 expr: increase(container_restart_count[1h]) > 0
@@ -71,4 +113,46 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Container restarted in the last hour" summary: "⚠️ Container restarted in the last hour"
description: "Container {{ $labels.name }} restarted unexpectedly."
# ── Business Alerts ──────────────────────────────────────────────────────────
- name: business
rules:
- alert: HighChurnRate
expr: memento_churn_this_month > 10
for: 0m
labels:
severity: warning
annotations:
summary: "📉 Churn élevé ce mois : {{ $value }} désabonnements"
description: "Plus de 10 désabonnements enregistrés ce mois. Investiguer les raisons."
- alert: NoNewUsersLast7Days
expr: memento_new_users_7d == 0
for: 1h
labels:
severity: warning
annotations:
summary: "📊 Aucune nouvelle inscription depuis 7 jours"
description: "Vérifier le funnel d'onboarding et les canaux d'acquisition."
# Fires when more than 20% of agent runs (success + error) ended in error.
# The `status` label is aggregated away on both sides before dividing: series
# that differ in `status` never match one-to-one, so dividing the raw
# selectors yields an empty result and the alert can never fire.
# The `+ 1` keeps the denominator non-zero.
# NOTE(review): confirm the exact `status` label values exported by /api/metrics.
- alert: AgentRunsHighErrorRate
  expr: |
    sum without (status) (memento_agent_runs_30d{status="error"})
      /
    (sum without (status) (memento_agent_runs_30d{status=~"success|error"}) + 1)
      > 0.2
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "🤖 Taux d'erreur agents IA > 20% ce mois"
    description: "{{ $value | humanizePercentage }} des runs d'agents échouent."
- alert: BusinessMetricsCollectionFailed
expr: memento_business_metrics_error == 1
for: 5m
labels:
severity: warning
annotations:
summary: "⚠️ Collecte métriques business en erreur"
description: "L'endpoint /api/metrics ne peut pas interroger la base pour les métriques business."

View File

@@ -1,49 +1,73 @@
services: services:
prometheus: prometheus:
image: prom/prometheus:latest image: prom/prometheus:v2.53.0
container_name: memento-prometheus container_name: memento-prometheus
restart: unless-stopped restart: unless-stopped
volumes: volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./alerts.yml:/etc/prometheus/alerts.yml:ro - ./alerts.yml:/etc/prometheus/alerts.yml:ro
- ./metrics-token:/etc/prometheus/metrics-token:ro
- prometheus-data:/prometheus - prometheus-data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.retention.time=30d'
- '--web.enable-lifecycle'
ports: ports:
- "9090:9090" - "127.0.0.1:9090:9090"
networks: networks:
- memento-monitoring - memento-monitoring
- memento-net - memento-net
grafana: grafana:
image: grafana/grafana:latest image: grafana/grafana:11.1.0
container_name: memento-grafana container_name: memento-grafana
restart: unless-stopped restart: unless-stopped
environment: environment:
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-memento-admin} GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-memento-admin}
GF_USERS_ALLOW_SIGN_UP: "false" GF_USERS_ALLOW_SIGN_UP: "false"
GF_SERVER_ROOT_URL: "${GRAFANA_URL:-http://localhost:3001}" GF_SERVER_ROOT_URL: "${GRAFANA_URL:-http://localhost:3002}"
GF_SECURITY_DISABLE_GRAVATAR: "true"
GF_ANALYTICS_REPORTING_ENABLED: "false"
GF_ANALYTICS_CHECK_FOR_UPDATES: "false"
volumes: volumes:
- grafana-data:/var/lib/grafana - grafana-data:/var/lib/grafana
- ./grafana-provisioning:/etc/grafana/provisioning:ro - ./grafana-provisioning:/etc/grafana/provisioning:ro
- ./grafana-dashboards:/etc/grafana/dashboards:ro - ./grafana-dashboards:/etc/grafana/dashboards:ro
ports: ports:
- "3002:3000" - "127.0.0.1:3002:3000"
networks: networks:
- memento-monitoring - memento-monitoring
alertmanager: alertmanager:
image: prom/alertmanager:latest image: prom/alertmanager:v0.27.0
container_name: memento-alertmanager container_name: memento-alertmanager
restart: unless-stopped restart: unless-stopped
volumes: volumes:
- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
ports: ports:
- "9093:9093" - "127.0.0.1:9093:9093"
networks:
- memento-monitoring
# Telegram bot that receives Alertmanager webhooks (replaces the placeholder alpine sleep loop).
alertmanager-telegram:
  image: metalmatze/alertmanager-bot:0.4.3
  container_name: memento-alertmanager-telegram
  restart: unless-stopped
  environment:
    TELEGRAM_TOKEN: "${TELEGRAM_BOT_TOKEN:-}"
    TELEGRAM_ADMIN: "${TELEGRAM_CHAT_ID:-}"
    ALERTMANAGER_URL: "http://alertmanager:9093"
    # STORE names the storage backend; the database file path goes in BOLT_PATH.
    STORE: "bolt"
    BOLT_PATH: "/data/bolt.db"
    LISTEN_ADDR: "0.0.0.0:8080"
  volumes:
    - alertmanager-bot-data:/data
  networks:
    - memento-monitoring
node-exporter: node-exporter:
image: prom/node-exporter:latest image: prom/node-exporter:v1.8.1
container_name: memento-node-exporter container_name: memento-node-exporter
restart: unless-stopped restart: unless-stopped
pid: host pid: host
@@ -57,13 +81,13 @@ services:
- '--path.rootfs=/rootfs' - '--path.rootfs=/rootfs'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
ports: ports:
- "9100:9100" - "127.0.0.1:9100:9100"
networks: networks:
- memento-monitoring - memento-monitoring
- memento-net - memento-net
postgres-exporter: postgres-exporter:
image: prometheuscommunity/postgres-exporter:latest image: prometheuscommunity/postgres-exporter:v0.15.0
container_name: memento-postgres-exporter container_name: memento-postgres-exporter
restart: unless-stopped restart: unless-stopped
env_file: env_file:
@@ -71,25 +95,25 @@ services:
environment: environment:
DATA_SOURCE_NAME: "postgresql://${POSTGRES_USER:-memento}:${POSTGRES_PASSWORD:-memento}@memento-postgres:5432/${POSTGRES_DB:-memento}?sslmode=disable" DATA_SOURCE_NAME: "postgresql://${POSTGRES_USER:-memento}:${POSTGRES_PASSWORD:-memento}@memento-postgres:5432/${POSTGRES_DB:-memento}?sslmode=disable"
ports: ports:
- "9187:9187" - "127.0.0.1:9187:9187"
networks: networks:
- memento-monitoring - memento-monitoring
- memento-net - memento-net
redis-exporter: redis-exporter:
image: oliver006/redis_exporter:latest image: oliver006/redis_exporter:v1.62.0
container_name: memento-redis-exporter container_name: memento-redis-exporter
restart: unless-stopped restart: unless-stopped
environment: environment:
REDIS_ADDR: "redis://memento-redis:6379" REDIS_ADDR: "redis://memento-redis:6379"
ports: ports:
- "9121:9121" - "127.0.0.1:9121:9121"
networks: networks:
- memento-monitoring - memento-monitoring
- memento-net - memento-net
cadvisor: cadvisor:
image: gcr.io/cadvisor/cadvisor:latest image: gcr.io/cadvisor/cadvisor:v0.49.1
container_name: memento-cadvisor container_name: memento-cadvisor
restart: unless-stopped restart: unless-stopped
privileged: true privileged: true
@@ -102,29 +126,15 @@ services:
- /sys:/sys:ro - /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro - /var/lib/docker/:/var/lib/docker:ro
ports: ports:
- "8081:8080" - "127.0.0.1:8081:8080"
networks: networks:
- memento-monitoring - memento-monitoring
- memento-net - memento-net
alertmanager-bridge:
image: alpine:latest
container_name: memento-alertmanager-bridge
restart: unless-stopped
entrypoint: |
sh -c '
apk add --no-cache curl
while true; do
echo "Bridge running - configure webhook to forward to Telegram"
sleep 3600
done
'
networks:
- memento-monitoring
volumes: volumes:
prometheus-data: prometheus-data:
grafana-data: grafana-data:
alertmanager-bot-data:
networks: networks:
memento-monitoring: memento-monitoring:

View File

@@ -13,6 +13,8 @@ alerting:
scrape_configs: scrape_configs:
- job_name: 'memento-app' - job_name: 'memento-app'
metrics_path: '/api/metrics' metrics_path: '/api/metrics'
authorization:
credentials_file: /etc/prometheus/metrics-token
static_configs: static_configs:
- targets: ['memento-note:3000'] - targets: ['memento-note:3000']