feat: P0 backup system (WAL+snapshot+restore+verify), monitoring stack, admin health API
This commit is contained in:
@@ -1,29 +1,20 @@
|
||||
{
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/0c6fb2d9-1b82-4ca3-b0f4-f8373a62faca/0c6fb2d9-1b82-4ca3-b0f4-f8373a62faca.jsonl": 1778182618469,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/137b1f4b-59d9-4ce6-8d74-01f7cbae2ba7/137b1f4b-59d9-4ce6-8d74-01f7cbae2ba7.jsonl": 1778966645519,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/2e0ce74c-a31e-49d8-a0d0-a8b224813533/2e0ce74c-a31e-49d8-a0d0-a8b224813533.jsonl": 1778188935902,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/38000361-5c66-4032-8e1e-ef405e843de0/38000361-5c66-4032-8e1e-ef405e843de0.jsonl": 1778968570815,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/394af47d-c5cd-4cef-bef2-2192717439f8/394af47d-c5cd-4cef-bef2-2192717439f8.jsonl": 1778951280378,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/394af47d-c5cd-4cef-bef2-2192717439f8/subagents/0927d889-66b3-4007-87b4-15f8ad9e01f0.jsonl": 1778951401282,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/394af47d-c5cd-4cef-bef2-2192717439f8/subagents/0ddd911c-403c-4d90-a189-069679758338.jsonl": 1778951533153,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/394af47d-c5cd-4cef-bef2-2192717439f8/subagents/59f0c95a-415f-440a-bae2-96020aca9033.jsonl": 1778951400523,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/394af47d-c5cd-4cef-bef2-2192717439f8/subagents/dc63a53e-55bc-4175-b49e-637b408138ac.jsonl": 1778951399831,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/394af47d-c5cd-4cef-bef2-2192717439f8/subagents/f0ad176d-04d7-4d9a-82b8-65273acd313a.jsonl": 1778946728971,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/5039e847-3035-4f43-b184-46aeceb06764/5039e847-3035-4f43-b184-46aeceb06764.jsonl": 1778838518325,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/5039e847-3035-4f43-b184-46aeceb06764/subagents/e13034a9-05cf-47e3-afa0-f6b142866ab1.jsonl": 1778837589740,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/5923e37e-370d-4867-95d0-751622982859/5923e37e-370d-4867-95d0-751622982859.jsonl": 1778968000388,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/5ac57758-0a3c-4502-9473-b63413a39013/5ac57758-0a3c-4502-9473-b63413a39013.jsonl": 1778921288478,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/5ac57758-0a3c-4502-9473-b63413a39013/subagents/b2833767-42d4-4d3f-952e-b961ea5538d3.jsonl": 1778917054076,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/65570f8a-5cd2-4573-b2d9-0983f2922d1f/65570f8a-5cd2-4573-b2d9-0983f2922d1f.jsonl": 1778231172346,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/65570f8a-5cd2-4573-b2d9-0983f2922d1f/subagents/b9a447c6-5a63-4882-b878-5aee9756ce25.jsonl": 1778227602626,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/65570f8a-5cd2-4573-b2d9-0983f2922d1f/subagents/e2881041-49a0-4dca-8df1-614a7a070038.jsonl": 1778226771429,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/7b6c0ed0-caad-4157-b048-535452685b73/7b6c0ed0-caad-4157-b048-535452685b73.jsonl": 1778852401511,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/8c2fc9f5-c359-4c67-a0f5-325ee44cebc9/8c2fc9f5-c359-4c67-a0f5-325ee44cebc9.jsonl": 1778751052502,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/92d73875-5939-48fb-9f68-86c88b0f2ff7/92d73875-5939-48fb-9f68-86c88b0f2ff7.jsonl": 1778964103281,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/92d73875-5939-48fb-9f68-86c88b0f2ff7/subagents/401ab052-4346-4e0d-8ca9-108c0a5b1a61.jsonl": 1778964141896,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/92d73875-5939-48fb-9f68-86c88b0f2ff7/92d73875-5939-48fb-9f68-86c88b0f2ff7.jsonl": 1778966017038,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/9902a438-467f-4d57-8f43-28e7d579a95f/9902a438-467f-4d57-8f43-28e7d579a95f.jsonl": 1778839341001,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/a64d78ce-86d3-4ec8-8f79-7589ad05a62c/a64d78ce-86d3-4ec8-8f79-7589ad05a62c.jsonl": 1778846298067,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/b85430f3-4520-47fd-9b4b-5200ca340a36/b85430f3-4520-47fd-9b4b-5200ca340a36.jsonl": 1779026409041,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/ca85061e-6af9-4250-8dc7-9c3bb4839c48/ca85061e-6af9-4250-8dc7-9c3bb4839c48.jsonl": 1778849848444,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/ca85061e-6af9-4250-8dc7-9c3bb4839c48/subagents/3bbaec3b-7dce-4eee-916e-7673710c1e13.jsonl": 1778848753214,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/d92dfb04-c148-4a14-a48a-39d4c634caee/d92dfb04-c148-4a14-a48a-39d4c634caee.jsonl": 1778861502433,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/e3745f62-c3b9-4a21-8942-71bc6f603f77/e3745f62-c3b9-4a21-8942-71bc6f603f77.jsonl": 1778018654221,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/e3745f62-c3b9-4a21-8942-71bc6f603f77/subagents/f028b51a-8a84-4a45-8866-95cb05ca9727.jsonl": 1778014992372,
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/f0ad176d-04d7-4d9a-82b8-65273acd313a/subagents/96507ccc-6150-4260-a55c-94abd2b57441.jsonl": 1778946698447
|
||||
"/home/devparsa/.cursor/projects/home-devparsa-dev-Momento/agent-transcripts/e3745f62-c3b9-4a21-8942-71bc6f603f77/e3745f62-c3b9-4a21-8942-71bc6f603f77.jsonl": 1778018654221
|
||||
}
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
{
|
||||
"version": 1,
|
||||
"lastRunAtMs": 1778964093127,
|
||||
"turnsSinceLastRun": 0,
|
||||
"lastTranscriptMtimeMs": 1778964092911.085,
|
||||
"lastProcessedGenerationId": "370e16e5-9bc9-4e07-a658-507f07456acf",
|
||||
"lastRunAtMs": 1779026397247,
|
||||
"turnsSinceLastRun": 2,
|
||||
"lastTranscriptMtimeMs": 1779026397153.5342,
|
||||
"lastProcessedGenerationId": "ebfbb794-0fc5-4c91-bdd5-581bc2a276c0",
|
||||
"trialStartedAtMs": null
|
||||
}
|
||||
|
||||
12
AGENTS.md
12
AGENTS.md
@@ -2,7 +2,7 @@
|
||||
|
||||
## Learned User Preferences
|
||||
|
||||
- Préfère les échanges et le travail guidé en français.
|
||||
- Préfère les échanges en français, avec des explications détaillées et claires (éviter le jargon flou).
|
||||
- Interface : tout libellé via i18n dans les 15 fichiers `memento-note/locales/*.json` (FR et EN comme références de contenu) ; éviter le texte en dur ; traductions **contextuelles** (sens produit, pas mot à mot — ex. « connecter votre propre fournisseur ») ; lors d'une traduction complète, mettre à jour toutes les locales concernées ; si l'utilisateur demande seulement les **clés i18n**, ajouter les clés (souvent EN/FR) sans remplir les 15 locales — il traduit souvent avec un autre modèle.
|
||||
- Base de données : **INTERDIT TOTALEMENT** de lancer `prisma db push --force-reset`, `prisma migrate reset`, `DROP TABLE`, `TRUNCATE`, `pg_restore` avec clean, ou TOUTE commande qui vide/supprime des données — MÊME SI l'utilisateur est d'accord — sans avoir d'abord : (1) dumpé la base avec `bash /home/devparsa/dev/Momento/dump-db.sh`, (2) vérifié le dump fait au moins 1Mo, (3) obtenu un "OUI" explicite de l'utilisateur. **4 incidents de perte de données documentés (14/05, 15/05 x2, 16/05). NE JAMAIS REFAIRE ÇA.**
|
||||
- Design produit : migration depuis les gabarits `architectural-grid1` (base cible) et `architectural-grid` ; avancer pas à pas avec validation ; respecter la logique liste / carte de notes puis contenu au clic comme dans la référence.
|
||||
@@ -12,15 +12,19 @@
|
||||
- Flux Excalidraw / diagrammes générés : accès via notification en plus d'une simple redirection ; priorité à la mise en page et au texte contenu dans les formes ; proposer des modes visuels (ex. coloré vs plus austère) tout en visant un rendu proche du style Excalidraw (polices, look).
|
||||
- L'admin doit être intégré au nouveau design (éviter l'ancienne topbar isolée).
|
||||
- Ne pas supposer les réglages utilisateur (modes, options) sans preuve dans l'UI ou les données.
|
||||
- **Interdiction d'écrire des tests** sauf demande explicite ; ne jamais générer de code inutile ou superflu — économiser les tokens au maximum.
|
||||
- **Interdiction d'écrire des tests** sauf demande explicite ; en CI, seul `npm run test:unit` (`tests/unit/**`) — pas `tests/migration/` ; ne jamais générer de code superflu.
|
||||
- Déploiement : privilégier le chemin rapide (artifact Next.js en CI + `Dockerfile.prebuilt`) ; éviter les rebuild Docker complets inutiles (~15 min par itération).
|
||||
|
||||
## Learned Workspace Facts
|
||||
|
||||
- Application Next.js principalement sous `memento-note/`.
|
||||
- Référentiels design du workspace : `architectural-grid1/` et `architectural-grid/` à la racine du repo Momento.
|
||||
- i18n : 15 fichiers sous `memento-note/locales/` (de, en, es, fr, it, pt, nl, pl, ru, zh, ja, ko, ar, fa, hi) ; logique sous `memento-note/lib/i18n/`.
|
||||
- i18n : 15 fichiers sous `memento-note/locales/` (de, en, es, fr, it, pt, nl, pl, ru, zh, ja, ko, ar, fa, hi) ; logique sous `memento-note/lib/i18n/` ; référence `en.json` (~2218 clés) ; auditer les « non traduits » par flatten EN vs locale (souvent valeurs identiques à l'EN).
|
||||
- Workflow BMad : stories sous `docs/` (ex. `3-4-host-pays-session-logic.md`), suivi sprint dans `docs/sprint-status.yaml` ; skills sous `.claude/skills/bmad-*` ; `_bmad-output/planning-artifacts` souvent vide — planification de référence dans `docs/`.
|
||||
- PostgreSQL Docker (`memento-postgres`) sur le port 5433 ; Redis Docker (`memento-redis`) sur le port 6379 (voir règles projet).
|
||||
- Règles opérationnelles Prisma et sécurité base de données décrites dans `CLAUDE.md` à la racine du repo.
|
||||
- i18n : référence `memento-note/locales/en.json` (~2218 clés) ; des textes « non traduits » sont souvent des valeurs **identiques à l'anglais** dans une locale, pas des clés absentes — auditer avec comparaison flatten EN vs locale.
|
||||
- Production : dépôt `/opt/memento` sur `192.168.1.190`, conteneur `memento-note` sur le port **3000**, URL publique **https://note.parsanet.org** (nginx + Cloudflare) ; ne pas recréer Postgres en prod (`deploy.yaml` / `deploy-prod.sh`).
|
||||
- CI/CD Gitea : `.gitea/workflows/ci.yaml` — CI sur `ubuntu-24.04`, deploy sur runner **`docker-host`** (sur le serveur) ; deploy manuel via `.gitea/workflows/deploy.yaml` ou `bash scripts/deploy-prod.sh`.
|
||||
- Migrations dans l'image prebuilt : `docker compose exec memento-note node ./node_modules/prisma/build/index.js migrate deploy` (pas `npx prisma` dans le PATH) ; helper `scripts/migrate-docker.sh`.
|
||||
- Vérification deploy : `GET /api/build-info` (SHA Git) ; comparer `127.0.0.1:3000` et le domaine Cloudflare — purger le cache si versions divergent ; 403 sur `/api/manifest` côté domaine = souvent Cloudflare, pas l'app.
|
||||
- Guide utilisateur illustré : `docs/guide-utilisateur/README.md`, captures dans `docs/guide-utilisateur/screenshots/` ; régénération via `docs/guide-utilisateur/capture-screenshots.mjs` lancé depuis `memento-note/` (Playwright) ; URL lue depuis `NEXTAUTH_URL` ou `MOMENTO_DOC_BASE_URL`.
|
||||
|
||||
91
memento-note/app/api/admin/health/route.ts
Normal file
91
memento-note/app/api/admin/health/route.ts
Normal file
@@ -0,0 +1,91 @@
|
||||
import { NextResponse } from 'next/server'
|
||||
import { getPrisma } from '@/lib/prisma'
|
||||
import { redis } from '@/lib/redis'
|
||||
|
||||
export const dynamic = 'force-dynamic'
|
||||
|
||||
/**
 * Admin health endpoint: probes the database, Redis, the configured AI
 * providers and disk usage, then returns an aggregated JSON report.
 *
 * Overall status:
 *   - `unhealthy` (HTTP 503) when at least one component reports `unhealthy`;
 *   - `degraded`  (HTTP 200) when nothing is unhealthy but a component reports
 *     `warning` or `unknown`;
 *   - `healthy`   (HTTP 200) otherwise.
 *
 * NOTE(review): no authentication check is visible in this handler although it
 * exposes row counts and raw error messages — confirm that `/api/admin/*` is
 * protected elsewhere (e.g. middleware).
 */
export async function GET() {
  const start = Date.now()
  const checks: Record<string, { status: string; latency?: string; error?: string; [k: string]: unknown }> = {}

  // Database check: three cheap COUNT queries double as a connectivity probe.
  try {
    const dbStart = Date.now()
    const prisma = getPrisma()
    const [noteCount, notebookCount, userCount] = await Promise.all([
      prisma.note.count(),
      prisma.notebook.count(),
      prisma.user.count(),
    ])
    checks.database = {
      status: 'healthy',
      latency: `${Date.now() - dbStart}ms`,
      notes: noteCount,
      notebooks: notebookCount,
      users: userCount,
    }
  } catch (e) {
    checks.database = { status: 'unhealthy', error: e instanceof Error ? e.message : 'Unknown error' }
  }

  // Redis check: PING for liveness, then memory usage and key count.
  try {
    const redisStart = Date.now()
    await redis.ping()
    const info = await redis.info('memory')
    const dbSize = await redis.dbsize()
    const memMatch = info.match(/used_memory_human:(\S+)/)
    checks.redis = {
      status: 'healthy',
      latency: `${Date.now() - redisStart}ms`,
      keys: dbSize,
      memory: memMatch ? memMatch[1] : 'unknown',
    }
  } catch (e) {
    checks.redis = { status: 'unhealthy', error: e instanceof Error ? e.message : 'Unknown error' }
  }

  // AI providers check: only verifies that the registry resolves providers.
  try {
    const { getAIProvider, getChatProvider } = await import('@/lib/ai/providers/registry')
    const embeddingProvider = getAIProvider()
    const chatProvider = getChatProvider()
    checks.ai = {
      status: 'configured',
      embedding: { provider: embeddingProvider },
      chat: { provider: chatProvider },
    }
  } catch (e) {
    checks.ai = { status: 'unhealthy', error: e instanceof Error ? e.message : 'Unknown error' }
  }

  // Disk check
  try {
    const { execSync } = await import('child_process')
    const diskInfo = execSync("df -h /opt/memento | awk 'NR==2{print $2,$3,$4,$5}'").toString().trim()
    const [total, used, available, percent] = diskInfo.split(/\s+/)
    const usage = Number.parseInt(percent ?? '', 10)
    if (Number.isNaN(usage)) {
      // When `df` fails (e.g. the path does not exist) the pipeline still exits 0
      // because of `awk`, so execSync does not throw and the output is empty.
      // Report "unknown" instead of a bogus "healthy" with undefined fields.
      checks.storage = { status: 'unknown' }
    } else {
      checks.storage = {
        status: usage > 90 ? 'warning' : 'healthy',
        total,
        used,
        available,
        usagePercent: percent,
      }
    }
  } catch {
    checks.storage = { status: 'unknown' }
  }

  const statuses = Object.values(checks).map(c => c.status)
  // Only a hard failure makes the endpoint fail; warnings and unmeasurable
  // components degrade the report without turning it into a 503.
  const hasUnhealthy = statuses.some(s => s === 'unhealthy')
  const hasDegraded = statuses.some(s => s === 'warning' || s === 'unknown')

  return NextResponse.json({
    status: hasUnhealthy ? 'unhealthy' : (hasDegraded ? 'degraded' : 'healthy'),
    uptime: process.uptime(),
    version: process.env.npm_package_version || '0.2.0',
    timestamp: new Date().toISOString(),
    responseTime: `${Date.now() - start}ms`,
    components: checks,
  }, {
    status: hasUnhealthy ? 503 : 200,
  })
}
|
||||
11
monitoring/alertmanager.yml
Normal file
11
monitoring/alertmanager.yml
Normal file
@@ -0,0 +1,11 @@
|
||||
# Alertmanager routing: every alert is sent to the single "telegram" receiver,
# which posts it to the alertmanager-bridge webhook.
route:
  receiver: "telegram"
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 1h

receivers:
  - name: "telegram"
    webhook_configs:
      - url: "http://alertmanager-bridge:8080/alert"
        # Also notify when an alert stops firing.
        send_resolved: true
|
||||
74
monitoring/alerts.yml
Normal file
74
monitoring/alerts.yml
Normal file
@@ -0,0 +1,74 @@
|
||||
# Prometheus alerting rules for the Memento stack.
groups:
  - name: critical
    rules:
      - alert: MementoAppDown
        expr: up{job="memento-app"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Memento app is DOWN"

      - alert: PostgresDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL is DOWN"

      - alert: RedisDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis is DOWN"

      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space below 15%"

      - alert: HighMemoryUsage
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Memory usage above 90%"

      - alert: PostgresConnectionsHigh
        # pg_stat_activity_count is exported per database/state; sum it so the
        # threshold applies to the total number of connections.
        expr: sum(pg_stat_activity_count) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL connections above 80"

      - alert: PostgresSlowQueries
        # NOTE(review): presumably needs a custom postgres-exporter query on
        # pg_stat_statements; confirm this metric is actually exported.
        expr: pg_stat_statements_mean_exec_seconds > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL slow queries detected"

      - alert: HighErrorRate
        # Ratio of 5xx responses to all responses, matching the "5%" summary
        # (a bare rate() would be an absolute number of errors per second).
        expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "HTTP 5xx error rate above 5%"

      - alert: ContainerRestarted
        # NOTE(review): container_restart_count does not look like a cAdvisor
        # metric; confirm which exporter provides it, otherwise this never fires.
        expr: increase(container_restart_count[1h]) > 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Container restarted in the last hour"
|
||||
119
monitoring/docker-compose.monitoring.yml
Normal file
119
monitoring/docker-compose.monitoring.yml
Normal file
@@ -0,0 +1,119 @@
|
||||
# Monitoring stack for Memento: Prometheus, Grafana, Alertmanager and exporters.
#
# NOTE(review): the bind mounts below use ./monitoring/... paths, which Compose
# resolves against the project directory — presumably this file is meant to be
# run with the repository root as project directory; confirm.
services:
  prometheus:
    image: prom/prometheus:latest
    container_name: memento-prometheus
    restart: unless-stopped
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./monitoring/alerts.yml:/etc/prometheus/alerts.yml:ro
      - prometheus-data:/prometheus
    ports:
      - "9090:9090"
    networks:
      - memento-monitoring
      # Needed to reach the "memento-app" scrape target (memento-note:3000),
      # which is not part of the monitoring network. Assumes the app container
      # is attached to memento_default like memento-postgres / memento-redis.
      - memento_default

  grafana:
    image: grafana/grafana:latest
    container_name: memento-grafana
    restart: unless-stopped
    environment:
      # NOTE(review): falls back to a well-known password when
      # GRAFANA_ADMIN_PASSWORD is unset — set it explicitly in production.
      GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-memento-admin}
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_SERVER_ROOT_URL: "${GRAFANA_URL:-http://localhost:3001}"
    volumes:
      - grafana-data:/var/lib/grafana
    ports:
      - "3001:3000"
    networks:
      - memento-monitoring

  alertmanager:
    image: prom/alertmanager:latest
    container_name: memento-alertmanager
    restart: unless-stopped
    volumes:
      - ./monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    ports:
      - "9093:9093"
    networks:
      - memento-monitoring

  node-exporter:
    image: prom/node-exporter:latest
    container_name: memento-node-exporter
    restart: unless-stopped
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
    ports:
      - "9100:9100"
    networks:
      - memento-monitoring

  postgres-exporter:
    image: prometheuscommunity/postgres-exporter:latest
    container_name: memento-postgres-exporter
    restart: unless-stopped
    environment:
      DATA_SOURCE_NAME: "postgresql://${POSTGRES_USER:-memento}:${POSTGRES_PASSWORD}@memento-postgres:5432/${POSTGRES_DB:-memento}?sslmode=disable"
    ports:
      - "9187:9187"
    networks:
      - memento-monitoring
      - memento_default

  redis-exporter:
    image: oliver006/redis_exporter:latest
    container_name: memento-redis-exporter
    restart: unless-stopped
    environment:
      REDIS_ADDR: "redis://memento-redis:6379"
    ports:
      - "9121:9121"
    networks:
      - memento-monitoring
      - memento_default

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: memento-cadvisor
    restart: unless-stopped
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - "8080:8080"
    networks:
      - memento-monitoring

  # Placeholder only: this container installs curl and sleeps. Nothing listens
  # on port 8080 yet, so a webhook pointed at this service cannot be delivered
  # until a real forwarder is implemented here.
  alertmanager-bridge:
    image: alpine:latest
    container_name: memento-alertmanager-bridge
    restart: unless-stopped
    entrypoint: |
      sh -c '
      apk add --no-cache curl
      while true; do
      echo "Bridge running - configure webhook to forward to Telegram"
      sleep 3600
      done
      '
    networks:
      - memento-monitoring

volumes:
  prometheus-data:
  grafana-data:

networks:
  memento-monitoring:
    driver: bridge
  memento_default:
    external: true
|
||||
33
monitoring/prometheus.yml
Normal file
33
monitoring/prometheus.yml
Normal file
@@ -0,0 +1,33 @@
|
||||
# Prometheus configuration: scrape and evaluate rules every 15 seconds.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Alerting rules loaded from this file.
rule_files:
  - "/etc/prometheus/alerts.yml"

# Firing alerts are pushed to Alertmanager.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"

scrape_configs:
  # Application metrics are scraped from /api/metrics.
  - job_name: "memento-app"
    metrics_path: "/api/metrics"
    static_configs:
      - targets:
          - "memento-note:3000"

  - job_name: "node-exporter"
    static_configs:
      - targets:
          - "node-exporter:9100"

  - job_name: "postgres"
    static_configs:
      - targets:
          - "postgres-exporter:9187"

  - job_name: "redis"
    static_configs:
      - targets:
          - "redis-exporter:9121"

  - job_name: "cadvisor"
    static_configs:
      - targets:
          - "cadvisor:8080"
|
||||
60
scripts/backup/backup.sh
Executable file
60
scripts/backup/backup.sh
Executable file
@@ -0,0 +1,60 @@
|
||||
#!/bin/bash
# Memento PostgreSQL snapshot backup.
#
# Dumps the database from the PostgreSQL container into a gzip-compressed
# custom-format archive under $SNAPSHOT_DIR, validates it (minimum size and
# gzip integrity) and then prunes old snapshots.
#
# Retention: every snapshot from the last $RETENTION_DAYS days is kept; for the
# $RETENTION_WEEKS weeks before that, only the newest snapshot of each ISO week
# is kept; anything older is deleted.
#
# Environment: POSTGRES_USER, POSTGRES_DB (both default to "memento").
# Requires bash >= 4 and GNU find/sort/date/stat.
set -euo pipefail

BACKUP_DIR="/opt/memento/backups"
WAL_DIR="$BACKUP_DIR/wal"
SNAPSHOT_DIR="$BACKUP_DIR/snapshots"
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
RETENTION_DAYS=7
RETENTION_WEEKS=4
PG_CONTAINER="memento-postgres"
PG_USER="${POSTGRES_USER:-memento}"
PG_DB="${POSTGRES_DB:-memento}"
LOG_FILE="$BACKUP_DIR/backup.log"

mkdir -p "$WAL_DIR" "$SNAPSHOT_DIR"

# Print a timestamped message to stdout and append it to $LOG_FILE.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

# Apply the retention policy to $SNAPSHOT_DIR, based on file modification time.
prune_snapshots() {
  local now daily_cutoff weekly_cutoff entry mtime file week
  local deleted=0
  local -A kept_weeks=()

  now=$(date +%s)
  daily_cutoff=$(( now - RETENTION_DAYS * 86400 ))
  weekly_cutoff=$(( daily_cutoff - RETENTION_WEEKS * 7 * 86400 ))

  # Entries are "<mtime> <path>", NUL-delimited and sorted newest first, so the
  # first snapshot seen for a given week is the one that survives.
  while IFS= read -r -d '' entry; do
    mtime=${entry%% *}
    mtime=${mtime%%.*}
    file=${entry#* }

    if (( mtime >= daily_cutoff )); then
      continue
    fi

    if (( mtime >= weekly_cutoff )); then
      week=$(date -d "@$mtime" +%G-%V)
      if [[ -z "${kept_weeks[$week]:-}" ]]; then
        kept_weeks[$week]=1
        continue
      fi
    fi

    rm -f -- "$file"
    deleted=$(( deleted + 1 ))
  done < <(find "$SNAPSHOT_DIR" -maxdepth 1 -type f -name '*.sql.gz' -printf '%T@ %p\0' | sort -z -rn)

  log "Removed $deleted old snapshot(s)"
}

log "=== Starting backup ==="

SNAPSHOT_FILE="$SNAPSHOT_DIR/memento-$TIMESTAMP.sql.gz"
# The dump is written under a temporary name and only renamed once validated,
# so a failed or truncated dump never shows up as a "*.sql.gz" snapshot.
PARTIAL_FILE="$SNAPSHOT_FILE.partial"
trap 'rm -f -- "$PARTIAL_FILE"' EXIT

log "Creating snapshot: $SNAPSHOT_FILE"
if ! docker exec "$PG_CONTAINER" pg_dump -U "$PG_USER" -d "$PG_DB" --format=custom | gzip > "$PARTIAL_FILE"; then
  log "ERROR: pg_dump failed!"
  exit 1
fi

SNAPSHOT_SIZE=$(stat -c%s "$PARTIAL_FILE" 2>/dev/null || echo 0)
if [ "$SNAPSHOT_SIZE" -lt 1048576 ]; then
  log "ERROR: Snapshot is only $SNAPSHOT_SIZE bytes (< 1MB). Possible failure!"
  exit 1
fi

log "Snapshot OK: $(( SNAPSHOT_SIZE / 1024 ))KB"

log "Verifying snapshot integrity..."
if gzip -t "$PARTIAL_FILE" 2>/dev/null; then
  log "Integrity check: PASSED"
else
  log "ERROR: Snapshot integrity check FAILED!"
  exit 1
fi

mv -- "$PARTIAL_FILE" "$SNAPSHOT_FILE"

log "Cleaning old snapshots (keeping $RETENTION_DAYS days + $RETENTION_WEEKS weekly)..."
prune_snapshots

SNAPSHOT_COUNT=$(find "$SNAPSHOT_DIR" -maxdepth 1 -type f -name '*.sql.gz' | wc -l)
log "Cleanup done. $SNAPSHOT_COUNT snapshots remaining"

log "=== Backup complete ==="
|
||||
25
scripts/backup/install-crontab.sh
Executable file
25
scripts/backup/install-crontab.sh
Executable file
@@ -0,0 +1,25 @@
|
||||
#!/bin/bash
# Generate the crontab file for the automated Memento backups.
# Usage: sudo bash install-crontab.sh
#
# This script only writes the crontab file; it prints the command needed to
# actually install it.
#
# NOTE(review): cron starts jobs with a minimal environment; presumably the
# POSTGRES_*, TELEGRAM_* and BACKUP_REMOTE_* variables read by the backup
# scripts must be provided some other way — confirm.
set -euo pipefail

CRON_FILE="/opt/memento/backups/crontab"

mkdir -p "$(dirname "$CRON_FILE")"

cat > "$CRON_FILE" << 'EOF'
# Memento Backup Schedule
# The backup scripts already append their own messages to backup.log, so the
# raw job output goes to cron.log to avoid duplicated log lines.

# Snapshots every 6 hours
0 */6 * * * /opt/memento/scripts/backup/backup.sh >> /opt/memento/backups/cron.log 2>&1

# Backup verification every 6 hours (offset by 30min)
30 */6 * * * /opt/memento/scripts/backup/verify-backups.sh >> /opt/memento/backups/cron.log 2>&1

# Offsite sync daily at 03:00
0 3 * * * /opt/memento/scripts/backup/offsite-sync.sh >> /opt/memento/backups/cron.log 2>&1
EOF

echo "Crontab file created at $CRON_FILE"
echo ""
echo "To install, run as the memento user:"
echo "  crontab $CRON_FILE"
echo ""
echo "To verify:"
echo "  crontab -l"
|
||||
47
scripts/backup/offsite-sync.sh
Executable file
47
scripts/backup/offsite-sync.sh
Executable file
@@ -0,0 +1,47 @@
|
||||
#!/bin/bash
# Copy the local snapshot directory to a remote host with rsync.
#
# Environment:
#   BACKUP_REMOTE_HOST  rsync destination host (user@server); sync is skipped when empty
#   BACKUP_REMOTE_PATH  destination directory (default /opt/memento-offsite-backups)
#   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID  optional; enable failure alerts
set -euo pipefail

SNAPSHOT_DIR="/opt/memento/backups/snapshots"
REMOTE_HOST="${BACKUP_REMOTE_HOST:-}"
REMOTE_PATH="${BACKUP_REMOTE_PATH:-/opt/memento-offsite-backups}"
LOG_FILE="/opt/memento/backups/backup.log"

TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"

mkdir -p "$(dirname "$LOG_FILE")"

# Print a timestamped message to stdout and append it to $LOG_FILE.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

# Send $1 to Telegram when credentials are configured. Best effort: a failed
# notification must never abort the script.
send_alert() {
  if [ -n "$TELEGRAM_BOT_TOKEN" ] && [ -n "$TELEGRAM_CHAT_ID" ]; then
    curl -s --max-time 15 -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
      --data-urlencode "chat_id=$TELEGRAM_CHAT_ID" \
      --data-urlencode "text=$1" >/dev/null 2>&1 || true
  fi
}

# Print the most recently modified *.sql.gz in $SNAPSHOT_DIR (nothing if none).
# A glob loop is used instead of `ls | head`, which fails under pipefail when
# no snapshot exists and would abort before the error can be reported.
latest_snapshot() {
  local newest="" candidate
  for candidate in "$SNAPSHOT_DIR"/*.sql.gz; do
    [[ -f "$candidate" ]] || continue
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done
  printf '%s' "$newest"
}

log "=== Offsite backup sync ==="

if [ -z "$REMOTE_HOST" ]; then
  log "WARNING: No BACKUP_REMOTE_HOST configured. Skipping offsite sync."
  log "Set BACKUP_REMOTE_HOST=user@server in /opt/memento/.env.docker for offsite backups"
  exit 0
fi

LATEST=$(latest_snapshot)
if [ -z "$LATEST" ]; then
  log "ERROR: No snapshots to sync"
  send_alert "🚨 MEMENTO: Offsite backup failed - no snapshots found!"
  exit 1
fi

log "Syncing $SNAPSHOT_DIR (latest snapshot: $LATEST) to $REMOTE_HOST:$REMOTE_PATH"

# The pipeline is tested directly: with pipefail its status reflects rsync, and
# inside `if` a failure does not trigger `set -e` before the alert is sent.
if rsync -az --chmod=F600 "$SNAPSHOT_DIR/" "${REMOTE_HOST}:${REMOTE_PATH}/" 2>&1 | tee -a "$LOG_FILE"; then
  log "Offsite sync complete"
else
  log "ERROR: Offsite sync failed!"
  send_alert "🚨 MEMENTO: Offsite backup sync FAILED! Check rsync connectivity."
  exit 1
fi
|
||||
114
scripts/backup/restore.sh
Executable file
114
scripts/backup/restore.sh
Executable file
@@ -0,0 +1,114 @@
|
||||
#!/bin/bash
# Memento database restore.
#
# Usage:
#   restore.sh snapshot [snapshot_file.sql.gz]   restore a snapshot (default: newest)
#   restore.sh pitr "2026-05-17 14:30:00"        stop services and print manual PITR steps
#
# Snapshot restore sequence:
#   1. dump the current database (best effort) to $BACKUP_DIR/pre-restore-*.sql.gz
#   2. stop the app container
#   3. restore the snapshot into the scratch database "<db>_restore"
#   4. verify the scratch database contains notes BEFORE touching the live one
#   5. swap: "<db>" -> "<db>_old", "<db>_restore" -> "<db>"
#   6. start the app, run the Prisma migrations, wait for the app to answer
#
# If the script aborts after stopping the app, the app container is restarted.
#
# Environment: POSTGRES_USER, POSTGRES_DB (default "memento"), APP_CONTAINER.
set -euo pipefail

BACKUP_DIR="/opt/memento/backups"
SNAPSHOT_DIR="$BACKUP_DIR/snapshots"
PG_CONTAINER="memento-postgres"
PG_USER="${POSTGRES_USER:-memento}"
PG_DB="${POSTGRES_DB:-memento}"
LOG_FILE="$BACKUP_DIR/backup.log"
# NOTE(review): confirm this default matches the real app container name.
APP_CONTAINER="${APP_CONTAINER:-memento-web}"

RESTORE_DB="${PG_DB}_restore"
OLD_DB="${PG_DB}_old"
APP_STOPPED=false

mkdir -p "$BACKUP_DIR"

# Print a timestamped message to stdout and append it to $LOG_FILE.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

usage() {
  echo "Usage: $0 pitr \"2026-05-17 14:30:00\""
  echo "       $0 snapshot [snapshot_file.sql.gz]"
}

# Run one SQL statement against the maintenance database "postgres".
psql_admin() {
  docker exec "$PG_CONTAINER" psql -v ON_ERROR_STOP=1 -U "$PG_USER" -d postgres -c "$1"
}

# Print the row count of table $2 in database $1, or nothing if the query fails.
count_rows() {
  local db=$1 table=$2 out
  out=$(docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$db" -At -c "SELECT COUNT(*) FROM \"$table\";" 2>/dev/null) || out=""
  printf '%s' "${out//[[:space:]]/}"
}

# Print the most recently modified *.sql.gz in $SNAPSHOT_DIR (nothing if none).
newest_snapshot() {
  local newest="" candidate
  for candidate in "$SNAPSHOT_DIR"/*.sql.gz; do
    [[ -f "$candidate" ]] || continue
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done
  printf '%s' "$newest"
}

# EXIT handler: never leave the app down because the restore aborted.
restart_app_on_failure() {
  local rc=$?
  if (( rc != 0 )) && [[ "$APP_STOPPED" == true ]]; then
    log "Restore aborted (exit code $rc). Restarting app container: $APP_CONTAINER" || true
    docker start "$APP_CONTAINER" >/dev/null 2>&1 || true
  fi
}
trap restart_app_on_failure EXIT

RESTORE_TYPE="${1:-snapshot}"
PITR_TIME="${2:-}"

case "$RESTORE_TYPE" in
  snapshot)
    ;;
  pitr)
    if [[ -z "$PITR_TIME" ]]; then
      usage
      exit 1
    fi
    ;;
  *)
    usage
    exit 1
    ;;
esac

# Resolve and validate the snapshot before anything is stopped or modified.
SNAPSHOT_FILE=""
if [[ "$RESTORE_TYPE" == "snapshot" ]]; then
  SNAPSHOT_FILE="${2:-$(newest_snapshot)}"

  # Accept a bare file name relative to the snapshot directory.
  if [[ -n "$SNAPSHOT_FILE" && ! -f "$SNAPSHOT_FILE" && -f "$SNAPSHOT_DIR/$SNAPSHOT_FILE" ]]; then
    SNAPSHOT_FILE="$SNAPSHOT_DIR/$SNAPSHOT_FILE"
  fi

  if [[ -z "$SNAPSHOT_FILE" || ! -f "$SNAPSHOT_FILE" ]]; then
    log "ERROR: No snapshot file found!"
    exit 1
  fi

  if ! gzip -t "$SNAPSHOT_FILE" 2>/dev/null; then
    log "ERROR: Snapshot $SNAPSHOT_FILE is not a valid gzip archive!"
    exit 1
  fi
fi

log "=== EMERGENCY PRE-RESTORE: Dumping current state ==="
PRE_RESTORE="$BACKUP_DIR/pre-restore-$(date +%Y%m%d-%H%M%S).sql.gz"
# Best effort on purpose: the database may be too damaged to dump.
if docker exec "$PG_CONTAINER" pg_dump -U "$PG_USER" -d "$PG_DB" --format=custom 2>/dev/null | gzip > "$PRE_RESTORE"; then
  log "Pre-restore dump saved: $PRE_RESTORE"
else
  log "WARNING: Pre-restore dump failed (database unreachable?). Continuing without it."
  rm -f -- "$PRE_RESTORE"
fi

log "Stopping app container: $APP_CONTAINER"
if docker stop "$APP_CONTAINER" >/dev/null 2>&1; then
  APP_STOPPED=true
else
  log "WARNING: Could not stop $APP_CONTAINER (wrong container name?)"
fi

if [[ "$RESTORE_TYPE" == "pitr" ]]; then
  log "PITR restore to: $PITR_TIME"
  log "Stopping PostgreSQL..."
  docker stop "$PG_CONTAINER"

  log "WARNING: PITR restore requires manual intervention."
  log "Steps:"
  log "  1. Copy latest base backup to PGDATA"
  log "  2. Create recovery.signal in PGDATA"
  log "  3. Set restore_command and recovery_target_time in postgresql.conf"
  log "  4. Start PostgreSQL"
  log "  5. App will reconnect automatically"
  log ""
  log "Recovery target: $PITR_TIME"
  log "WAL files location: $BACKUP_DIR/wal/"
  exit 0
fi

log "Restoring from snapshot: $SNAPSHOT_FILE"
FILE_SIZE=$(stat -c%s "$SNAPSHOT_FILE")
log "File size: $(( FILE_SIZE / 1024 ))KB"

log "Recreating database..."
psql_admin "DROP DATABASE IF EXISTS \"$RESTORE_DB\";"
psql_admin "CREATE DATABASE \"$RESTORE_DB\";"

log "Restoring data..."
# pg_restore may exit non-zero for harmless warnings, so its status alone does
# not decide; its messages are logged and the restored data is verified below.
if ! gunzip -c "$SNAPSHOT_FILE" | docker exec -i "$PG_CONTAINER" pg_restore -U "$PG_USER" -d "$RESTORE_DB" --no-owner --no-privileges 2>>"$LOG_FILE"; then
  log "WARNING: pg_restore reported errors (see $LOG_FILE); verifying restored data before going further"
fi

log "Verifying restore..."
NOTE_COUNT=$(count_rows "$RESTORE_DB" "Note")
NOTEBOOK_COUNT=$(count_rows "$RESTORE_DB" "Notebook")
USER_COUNT=$(count_rows "$RESTORE_DB" "User")

log "After restore: ${NOTE_COUNT:-?} notes, ${NOTEBOOK_COUNT:-?} notebooks, ${USER_COUNT:-?} users"

# Checked BEFORE the swap so that a bad snapshot never replaces the live data.
if [[ ! "$NOTE_COUNT" =~ ^[0-9]+$ ]] || (( NOTE_COUNT == 0 )); then
  log "ERROR: 0 notes after restore! Something went wrong."
  log "Live database $PG_DB was NOT modified; $RESTORE_DB is left in place for inspection."
  exit 1
fi

log "Dropping existing connections..."
psql_admin "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='$PG_DB' AND pid <> pg_backend_pid();" >/dev/null 2>&1 || true

log "Swapping databases..."
psql_admin "DROP DATABASE IF EXISTS \"$OLD_DB\";"

LIVE_EXISTS=$(docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d postgres -At -c "SELECT 1 FROM pg_database WHERE datname='$PG_DB';" 2>/dev/null) || LIVE_EXISTS=""
if [[ "$LIVE_EXISTS" == "1" ]]; then
  psql_admin "ALTER DATABASE \"$PG_DB\" RENAME TO \"$OLD_DB\";"
fi

if ! psql_admin "ALTER DATABASE \"$RESTORE_DB\" RENAME TO \"$PG_DB\";"; then
  log "ERROR: Could not promote $RESTORE_DB; rolling back to the previous database"
  if [[ "$LIVE_EXISTS" == "1" ]]; then
    psql_admin "ALTER DATABASE \"$OLD_DB\" RENAME TO \"$PG_DB\";" \
      || log "ERROR: Rollback failed! The previous data is in database $OLD_DB"
  fi
  exit 1
fi

log "Starting app container..."
docker start "$APP_CONTAINER" >/dev/null 2>&1 || log "WARNING: Could not start $APP_CONTAINER"
APP_STOPPED=false

# Migrations run through `docker exec`, which needs a running container, hence
# after the start above.
log "Running migrations..."
if ! docker exec "$APP_CONTAINER" node ./node_modules/prisma/build/index.js migrate deploy >>"$LOG_FILE" 2>&1; then
  log "WARNING: prisma migrate deploy failed; check $LOG_FILE"
fi

# Poll the app for up to 36 x 5 seconds; any HTTP status below 500 counts as up.
HEALTHY=false
for (( attempt = 1; attempt <= 36; attempt++ )); do
  CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "http://localhost:3000/api/build-info" 2>/dev/null || echo "000")
  if [ "$CODE" != "000" ] && [ "$CODE" -lt 500 ]; then
    HEALTHY=true
    break
  fi
  sleep 5
done

if [ "$HEALTHY" = true ]; then
  log "=== RESTORE SUCCESSFUL === App is healthy"
else
  log "WARNING: App not healthy after restore. Check manually."
fi
|
||||
35
scripts/backup/setup-wal.sh
Executable file
35
scripts/backup/setup-wal.sh
Executable file
@@ -0,0 +1,35 @@
|
||||
#!/bin/bash
# Configure WAL archiving on the PostgreSQL container.
#
# Writes wal_level, archive_mode, archive_command, max_wal_senders and
# wal_keep_size via ALTER SYSTEM, reloads the configuration, and reports
# whether archiving is actually active. wal_level and archive_mode only take
# effect after a server restart; this script does not restart the server.
#
# Environment: POSTGRES_USER, POSTGRES_DB (default "memento"),
#              POSTGRES_PASSWORD (read but not used by this script).
set -euo pipefail

BACKUP_DIR="/opt/memento/backups"
WAL_DIR="$BACKUP_DIR/wal"
PG_CONTAINER="memento-postgres"
PG_USER="${POSTGRES_USER:-memento}"
PG_DB="${POSTGRES_DB:-memento}"
PG_PASSWORD="${POSTGRES_PASSWORD:-}"
LOG_FILE="$BACKUP_DIR/backup.log"
# Archive directory as seen from inside the container.
# NOTE(review): presumably a bind mount of $WAL_DIR - confirm in the compose file.
CONTAINER_WAL_DIR="/var/lib/postgresql/backups/wal"

mkdir -p "$WAL_DIR"

# Print a timestamped message and append it to $LOG_FILE.
# A logging failure (e.g. unwritable log file) must not abort the script.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${1-}" | tee -a "$LOG_FILE" || true
}

log "=== Setting up WAL archiving ==="

# Create the archive directory before archive_command can reference it.
# NOTE(review): docker exec runs as the container's default user; confirm the
# PostgreSQL server user can write to this directory.
docker exec "$PG_CONTAINER" mkdir -p "$CONTAINER_WAL_DIR"

# One -c per statement: a single -c string with several statements is sent as
# one request and runs in an implicit transaction, which ALTER SYSTEM rejects.
# archive_command refuses to overwrite an existing segment ("test ! -f"), as
# recommended by the PostgreSQL continuous-archiving documentation.
docker exec "$PG_CONTAINER" psql -v ON_ERROR_STOP=1 -U "$PG_USER" -d "$PG_DB" \
  -c "ALTER SYSTEM SET wal_level = replica;" \
  -c "ALTER SYSTEM SET archive_mode = on;" \
  -c "ALTER SYSTEM SET archive_command = 'test ! -f ${CONTAINER_WAL_DIR}/%f && cp %p ${CONTAINER_WAL_DIR}/%f';" \
  -c "ALTER SYSTEM SET max_wal_senders = 3;" \
  -c "ALTER SYSTEM SET wal_keep_size = '1GB';"

docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -c "SELECT pg_reload_conf();"

# archive_mode cannot change without a restart, so the running value tells us
# whether archiving is really active or only configured.
ARCHIVE_MODE=$(docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -At -c "SHOW archive_mode;" | tr -d '[:space:]')
case "$ARCHIVE_MODE" in
  on | always)
    log "WAL archiving enabled. Archives stored in ${CONTAINER_WAL_DIR}/"
    ;;
  *)
    log "WAL archiving configured but NOT active yet (archive_mode=$ARCHIVE_MODE). Restart PostgreSQL to apply wal_level/archive_mode: docker restart $PG_CONTAINER"
    ;;
esac

WAL_COUNT=$(docker exec "$PG_CONTAINER" ls "${CONTAINER_WAL_DIR}/" 2>/dev/null | wc -l)
log "Existing WAL segments: $WAL_COUNT"
|
||||
98
scripts/backup/verify-backups.sh
Executable file
98
scripts/backup/verify-backups.sh
Executable file
@@ -0,0 +1,98 @@
|
||||
#!/bin/bash
# Backup verification settings: fixed host paths, database connection values
# and optional Telegram credentials. Anything written as ${VAR:-default} can
# be overridden from the environment.
set -o errexit -o nounset -o pipefail

# Host locations of the backups and the shared log file.
BACKUP_DIR="/opt/memento/backups"
SNAPSHOT_DIR="${BACKUP_DIR}/snapshots"
LOG_FILE="${BACKUP_DIR}/backup.log"

# PostgreSQL container and credentials used for the live-database checks.
PG_CONTAINER="memento-postgres"
PG_USER="${POSTGRES_USER:-memento}"
PG_DB="${POSTGRES_DB:-memento}"

# Telegram alerting is skipped when either value is empty.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"
|
||||
|
||||
# Print a timestamped message to stdout and append it to $LOG_FILE.
# Arguments: $1 - message text (an omitted argument logs an empty message).
# Returns:   always 0. Under "set -euo pipefail" a failing tee (missing or
#            unwritable log file) would otherwise abort the whole script
#            before any alert could be sent.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${1-}" | tee -a "$LOG_FILE" || true
}
|
||||
|
||||
# Send an alert message to Telegram; a no-op when credentials are not set.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text (sent with parse_mode=HTML)
# Returns:   always 0; delivery is best effort and must never fail the caller.
send_alert() {
  if [ -z "${TELEGRAM_BOT_TOKEN:-}" ] || [ -z "${TELEGRAM_CHAT_ID:-}" ]; then
    return 0
  fi
  # --data-urlencode keeps characters such as '%', '&' and '+' in the message
  # intact; plain -d would send them raw and corrupt the form body.
  # --max-time stops a hung connection from stalling the calling script.
  curl -s --max-time 10 -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${1-}" \
    --data-urlencode "parse_mode=HTML" >/dev/null 2>&1 || true
}
|
||||
|
||||
# Number of failed checks; the script exits non-zero when this is > 0.
ISSUES=0

log "=== Backup verification ==="

# Find the newest snapshot by modification time using a glob.
# Piping "ls" into "head" would abort the script under "set -e -o pipefail"
# when no snapshot exists, before the alert below could be sent.
LATEST_SNAPSHOT=""
for candidate in "$SNAPSHOT_DIR"/*.sql.gz; do
  [ -e "$candidate" ] || continue # unmatched glob stays literal
  if [ -z "$LATEST_SNAPSHOT" ] || [ "$candidate" -nt "$LATEST_SNAPSHOT" ]; then
    LATEST_SNAPSHOT="$candidate"
  fi
done

if [ -z "$LATEST_SNAPSHOT" ]; then
  log "CRITICAL: No snapshots found!"
  send_alert "🚨 MEMENTO: No database backups found! Data at risk!"
  exit 1
fi

# Age in whole hours (rounded down).
SNAPSHOT_AGE=$(( ( $(date +%s) - $(stat -c%Y "$LATEST_SNAPSHOT") ) / 3600 ))
if [ "$SNAPSHOT_AGE" -gt 8 ]; then
  log "CRITICAL: Latest snapshot is ${SNAPSHOT_AGE}h old (> 8h threshold)"
  send_alert "🚨 MEMENTO: Latest DB backup is ${SNAPSHOT_AGE}h old! Backup cron may be broken!"
  ISSUES=$((ISSUES + 1))
else
  log "OK: Latest snapshot is ${SNAPSHOT_AGE}h old"
fi

# Anything under 1 MiB (1048576 bytes) is treated as a failed dump.
SNAPSHOT_SIZE=$(stat -c%s "$LATEST_SNAPSHOT" 2>/dev/null || echo 0)
if [ "$SNAPSHOT_SIZE" -lt 1048576 ]; then
  log "CRITICAL: Snapshot is only $(( SNAPSHOT_SIZE / 1024 ))KB (< 1MB)"
  send_alert "🚨 MEMENTO: DB backup is suspiciously small ($(( SNAPSHOT_SIZE / 1024 ))KB)! Possible failure!"
  ISSUES=$((ISSUES + 1))
else
  log "OK: Snapshot size $(( SNAPSHOT_SIZE / 1024 ))KB"
fi

# gzip -t checks the compressed stream only, not the SQL inside it.
if ! gzip -t "$LATEST_SNAPSHOT" 2>/dev/null; then
  log "CRITICAL: Snapshot integrity check FAILED"
  send_alert "🚨 MEMENTO: DB backup integrity check FAILED! Backup is corrupted!"
  ISSUES=$((ISSUES + 1))
else
  log "OK: Snapshot integrity check passed"
fi
|
||||
|
||||
# Row counts from the live database. The fallback is applied outside the
# command substitution so "ERROR" replaces the value instead of being appended
# to partial output.
DB_NOTES=$(docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -t -c "SELECT COUNT(*) FROM \"Note\";" 2>/dev/null | tr -d '[:space:]') || DB_NOTES="ERROR"
DB_USERS=$(docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -t -c "SELECT COUNT(*) FROM \"User\";" 2>/dev/null | tr -d '[:space:]') || DB_USERS="ERROR"

# Anything that is not a plain non-negative integer counts as a failed query.
case "$DB_NOTES" in '' | *[!0-9]*) DB_NOTES="ERROR" ;; esac
case "$DB_USERS" in '' | *[!0-9]*) DB_USERS="ERROR" ;; esac

if [ "$DB_NOTES" = "ERROR" ] || [ "$DB_NOTES" = "0" ]; then
  log "CRITICAL: DB query returned $DB_NOTES notes"
  send_alert "🚨 MEMENTO: Database appears empty or unreachable! Notes count: $DB_NOTES"
  ISSUES=$((ISSUES + 1))
else
  log "OK: Database has $DB_NOTES notes, $DB_USERS users"
fi
|
||||
|
||||
# HTTP health probe. curl already prints "000" through -w when no response was
# received, so the fallback must replace the value: echoing a second "000"
# inside the substitution yields "000000", which is neither equal to "000" nor
# >= 500 and would be reported as healthy.
APP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "http://localhost:3000/api/build-info" 2>/dev/null) || APP_CODE="000"

case "$APP_CODE" in
  [1-4][0-9][0-9])
    log "OK: App is healthy (HTTP $APP_CODE)"
    ;;
  *)
    # No response (000), 5xx, or anything that is not a valid status code.
    log "CRITICAL: App health check failed (HTTP $APP_CODE)"
    send_alert "🚨 MEMENTO: App is down! Health check returned HTTP $APP_CODE"
    ISSUES=$((ISSUES + 1))
    ;;
esac
|
||||
|
||||
# Disk usage of the filesystem holding /opt/memento, as an integer percentage.
# df -P guarantees one output line per filesystem (df -h may wrap long device
# names onto a second line, shifting the columns).
DISK_PCT=$(df -P /opt/memento | awk 'NR==2 {gsub(/%/, "", $5); print $5}') || DISK_PCT=""

case "$DISK_PCT" in
  '' | *[!0-9]*)
    # A failed or unparsable df must be reported, not silently treated as OK.
    log "WARNING: Could not determine disk usage for /opt/memento"
    ISSUES=$((ISSUES + 1))
    ;;
  *)
    if [ "$DISK_PCT" -gt 85 ]; then
      log "WARNING: Disk usage at ${DISK_PCT}%"
      send_alert "⚠️ MEMENTO: Disk usage at ${DISK_PCT}%! Consider cleanup."
      ISSUES=$((ISSUES + 1))
    else
      log "OK: Disk usage at ${DISK_PCT}%"
    fi
    ;;
esac
|
||||
|
||||
# Final verdict: a non-zero exit tells the caller (e.g. cron) that at least
# one check failed.
if [ "$ISSUES" -ne 0 ]; then
  log "=== $ISSUES issue(s) found ==="
  exit 1
fi

log "=== All checks PASSED ==="
|
||||
Reference in New Issue
Block a user