Files
Momento/memento-note/app/actions/scrape.ts
sepehr 99d0583871
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 44s
feat: IA Note — rename panel, add Resource tab + chat hover-actions
- Renamed 'AI Copilot' / 'Assistant IA' → 'IA Note' everywhere in UI
- Added 3rd 'Ressource' tab in IA Note panel with:
  * Optional URL field that scrapes page text (via new scrapePageText action)
  * Textarea for paste (markdown, HTML, plain text)
  * 3 integration modes: Remplacer / Compléter (AI) / Fusionner (AI)
  * Dual-format preview: Rendu + Markdown brut before applying
- Added hover-actions on assistant chat messages:
  * Remplacer / Compléter / Fusionner appear on hover
  * Triggers same preview/apply flow via resource tab
- New API route: POST /api/ai/enrich-from-resource
  * Supports complete and merge modes with language-aware prompts
- Extended scrape.ts with scrapePageText() (full content extraction)
2026-05-02 21:06:25 +02:00

204 lines
7.3 KiB
TypeScript

'use server'
import * as cheerio from 'cheerio';
/**
 * Link-preview data returned by `fetchLinkMetadata`.
 */
export interface LinkMetadata {
/** The URL that was fetched (an `https://` scheme is prepended when the input had none). */
url: string;
/** Page title; `fetchLinkMetadata` truncates it to 100 characters. */
title?: string;
/** Page description; `fetchLinkMetadata` truncates it to 200 characters. */
description?: string;
/** Preview image URL; relative values are resolved against the fetched URL. */
imageUrl?: string;
/** Value of the `og:site_name` meta tag, or an empty string when absent. */
siteName?: string;
}
/** Cheerio document handle, derived from `cheerio.load` so no extra type import is needed. */
type CheerioRoot = ReturnType<typeof cheerio.load>;

/** Outcome of a guarded download performed by `fetchPage`. */
interface FetchedPage {
  /** True when the final response status is in the 2xx range. */
  ok: boolean;
  /** HTTP status of the final (non-redirect) response. */
  status: number;
  /** True when the content type is HTML or XHTML. */
  isHtml: boolean;
  /** Response body; empty when the body was not read. */
  body: string;
}

/** Maximum number of redirect hops followed before giving up. */
const MAX_REDIRECTS = 5;

/** Upper bound on scraped text length, to avoid token overflows downstream. */
const MAX_TEXT_LENGTH = 50000;

/** Hostname suffixes that are only meaningful inside a private network. */
const INTERNAL_HOST_SUFFIXES = ['.localhost', '.local', '.internal'];

const BROWSER_USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

/** Elements removed before text extraction (navigation, chrome, ads, form controls...). */
const NOISE_SELECTOR =
  'script, style, noscript, nav, header, footer, aside, iframe, img, svg, figure, form, button, input, select, textarea, [role="navigation"], [role="banner"], [role="complementary"], .ads, .advertisement, .cookie-banner, .popup, .modal';

/** Candidate main-content containers, in priority order. */
const MAIN_CONTENT_SELECTORS = [
  'main',
  'article',
  '[role="main"]',
  '.content',
  '.post-content',
  '.article-body',
  '.entry-content',
  '#content',
  '#main',
];

/** Block-level elements whose text is emitted as one output line each. */
const TEXT_BLOCK_SELECTOR = 'h1, h2, h3, h4, h5, h6, p, li, blockquote, pre, td, th';

/** Prepends `https://` when the input carries no http(s) scheme (case-insensitive). */
function withDefaultScheme(url: string): string {
  const trimmed = url.trim();
  return /^https?:\/\//i.test(trimmed) ? trimmed : 'https://' + trimmed;
}

/** True when an IPv4 address (first two octets given) is loopback, private, link-local or otherwise non-public. */
function isPrivateIPv4(a: number, b: number): boolean {
  return (
    a === 0 || // "this network", includes 0.0.0.0
    a === 10 || // 10.0.0.0/8
    a === 127 || // loopback, whole /8
    (a === 100 && b >= 64 && b <= 127) || // 100.64.0.0/10 carrier-grade NAT
    (a === 169 && b === 254) || // link-local, includes cloud metadata 169.254.169.254
    (a === 172 && b >= 16 && b <= 31) || // 172.16.0.0/12 only, the rest of 172/8 is public
    (a === 192 && b === 168) // 192.168.0.0/16
  );
}

/** True when an unbracketed, lower-case IPv6 literal is loopback, unique-local, link-local or maps to a private IPv4. */
function isPrivateIPv6(address: string): boolean {
  if (address === '::' || address === '::1') return true;
  if (/^f[cd][0-9a-f]{2}:/.test(address)) return true; // fc00::/7 unique-local
  if (/^fe[89ab][0-9a-f]:/.test(address)) return true; // fe80::/10 link-local

  // IPv4-mapped addresses; the URL parser normally serializes them in hex form.
  const mappedHex = /^::ffff:([0-9a-f]{1,4}):[0-9a-f]{1,4}$/.exec(address);
  if (mappedHex) {
    const high = parseInt(mappedHex[1] ?? '0', 16);
    return isPrivateIPv4(high >> 8, high & 0xff);
  }
  const mappedDotted = /^::ffff:(\d{1,3})\.(\d{1,3})\.\d{1,3}\.\d{1,3}$/.exec(address);
  if (mappedDotted) {
    return isPrivateIPv4(Number(mappedDotted[1]), Number(mappedDotted[2]));
  }
  return false;
}

/**
 * Decides whether a `URL.hostname` value must not be fetched.
 * IPv6 literals arrive bracketed (`[::1]`), so they are unwrapped before matching;
 * prefix rules are applied to IP literals only, never to ordinary domain names.
 */
function isBlockedHostname(hostname: string): boolean {
  const host = hostname.toLowerCase().replace(/\.$/, '');
  if (host === '') return true;
  if (host.startsWith('[')) return isPrivateIPv6(host.slice(1, -1));

  const quad = /^(\d{1,3})\.(\d{1,3})\.\d{1,3}\.\d{1,3}$/.exec(host);
  if (quad) return isPrivateIPv4(Number(quad[1]), Number(quad[2]));

  return host === 'localhost' || INTERNAL_HOST_SUFFIXES.some((suffix) => host.endsWith(suffix));
}

/** Parses `raw` and returns it only if it is an http(s) URL whose host is not blocked. */
function parsePublicHttpUrl(raw: string): URL | null {
  let parsed: URL;
  try {
    parsed = new URL(raw);
  } catch {
    return null;
  }
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return null;
  return isBlockedHostname(parsed.hostname) ? null : parsed;
}

/** Releases an unread response body; failures are irrelevant to the caller. */
async function discardBody(response: Response): Promise<void> {
  try {
    await response.body?.cancel();
  } catch {
    // Nothing useful to do: the body is being thrown away anyway.
  }
}

/** Extracts the `name` of a thrown value without assuming it is an `Error`. */
function errorName(error: unknown): string {
  return typeof error === 'object' && error !== null && 'name' in error
    ? String((error as { name: unknown }).name)
    : '';
}

/** Extracts a printable message from a thrown value. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Downloads `target` with a single timeout covering headers AND body.
 * Redirects are followed manually so every hop is re-validated against the
 * SSRF rules (an automatic follow would let a public URL bounce to an internal one).
 *
 * NOTE(review): relies on the server runtime exposing the 3xx status and
 * `Location` header for `redirect: 'manual'`; if it does not, the response is
 * simply reported as not ok. Hostnames that *resolve* to private addresses via
 * DNS are not detected here.
 *
 * @returns the final response summary, or `null` when a redirect target is
 *          blocked/invalid or the hop limit is exceeded.
 * @throws whatever `fetch` throws (network failure, `AbortError` on timeout).
 */
async function fetchPage(
  target: URL,
  headers: Record<string, string>,
  timeoutMs: number,
  readNonHtmlBody: boolean,
): Promise<FetchedPage | null> {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
  try {
    let current = target;
    for (let hop = 0; hop <= MAX_REDIRECTS; hop++) {
      const response = await fetch(current.href, {
        headers,
        signal: controller.signal,
        redirect: 'manual',
      });

      const isRedirect = response.status >= 300 && response.status < 400;
      const location = isRedirect ? response.headers.get('location') : null;

      if (!location) {
        const contentType = (response.headers.get('content-type') || '').toLowerCase();
        const isHtml = contentType.includes('text/html') || contentType.includes('application/xhtml');
        let body = '';
        if (response.ok && (isHtml || readNonHtmlBody)) {
          body = await response.text();
        } else {
          await discardBody(response);
        }
        return { ok: response.ok, status: response.status, isHtml, body };
      }

      await discardBody(response);
      let next: URL | null;
      try {
        next = parsePublicHttpUrl(new URL(location, current).href);
      } catch {
        next = null;
      }
      if (!next) return null;
      current = next;
    }
    return null; // too many redirects
  } finally {
    clearTimeout(timeoutId);
  }
}

/** Reads a `<meta>` tag's content by `property` first, then by `name`. */
function readMeta($: CheerioRoot, prop: string): string | undefined {
  return $(`meta[property="${prop}"]`).attr('content') || $(`meta[name="${prop}"]`).attr('content');
}

/** Text of the first `<title>` element (later ones are typically inline-SVG titles). */
function readDocumentTitle($: CheerioRoot): string {
  return $('title').first().text().trim();
}

/**
 * Strips noise elements from the document (mutating it) and renders the main
 * content's block elements as markdown-flavoured lines.
 */
function extractReadableText($: CheerioRoot): string {
  $(NOISE_SELECTOR).remove();

  const mainSelector = MAIN_CONTENT_SELECTORS.find((selector) => $(selector).length > 0);
  const container = mainSelector ? $(mainSelector).first() : $('body');

  const lines: string[] = [];
  container.find(TEXT_BLOCK_SELECTOR).each((_, el) => {
    const tag = (el as { tagName?: string }).tagName?.toLowerCase() ?? '';
    const text = $(el).text().trim();
    if (text.length < 3) return; // skip empty / trivially short fragments

    switch (tag) {
      case 'h1':
      case 'h2':
      case 'h3':
        lines.push(`\n## ${text}`);
        break;
      case 'h4':
      case 'h5':
      case 'h6':
        lines.push(`\n### ${text}`);
        break;
      case 'li':
        lines.push(`- ${text}`);
        break;
      case 'blockquote':
        lines.push(`> ${text}`);
        break;
      case 'pre':
        lines.push(`\`\`\`\n${text}\n\`\`\``);
        break;
      default:
        lines.push(text);
    }
  });

  return lines.join('\n').replace(/\n{3,}/g, '\n\n').trim();
}

/**
 * Fetches a page and extracts link-preview metadata (Open Graph, Twitter card,
 * `<title>`).
 *
 * @param url Target address; `https://` is assumed when no http(s) scheme is given.
 * @returns The metadata, `{ url, title }` only for non-HTML resources, or `null`
 *          when the URL is invalid or points at a non-public host, the request
 *          fails or times out (10s), or the server answers with a non-2xx status.
 */
export async function fetchLinkMetadata(url: string): Promise<LinkMetadata | null> {
  try {
    const targetUrl = withDefaultScheme(url);

    // SSRF protection: refuse internal/private hosts before any request is made.
    const target = parsePublicHttpUrl(targetUrl);
    if (!target) return null;

    let page: FetchedPage | null;
    try {
      page = await fetchPage(
        target,
        {
          'User-Agent': BROWSER_USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.5',
        },
        10000,
        false,
      );
    } catch (fetchError: unknown) {
      if (errorName(fetchError) === 'AbortError') {
        console.error(`[Scrape] Timeout fetching ${url} (10s)`);
      } else {
        console.error(`[Scrape] Network error fetching ${url}:`, errorMessage(fetchError));
      }
      return null;
    }

    if (!page) {
      console.error(`[Scrape] Blocked or excessive redirect for ${url}`);
      return null;
    }
    if (!page.ok) {
      console.error(`[Scrape] HTTP ${page.status} for ${url}`);
      return null;
    }
    if (!page.isHtml) {
      // Not HTML — return basic metadata
      return { url: targetUrl, title: targetUrl };
    }

    const $ = cheerio.load(page.body);

    // `||` (not `??`) on purpose: empty meta values must fall through to the next source.
    const title = readMeta($, 'og:title') || readDocumentTitle($) || readMeta($, 'twitter:title') || url;
    const description =
      readMeta($, 'og:description') || readMeta($, 'description') || readMeta($, 'twitter:description') || '';
    let imageUrl = readMeta($, 'og:image') || readMeta($, 'twitter:image') || $('link[rel="image_src"]').attr('href');

    // Resolve relative image URLs
    if (imageUrl && !imageUrl.startsWith('http')) {
      try {
        imageUrl = new URL(imageUrl, targetUrl).href;
      } catch {
        imageUrl = undefined;
      }
    }

    const siteName = readMeta($, 'og:site_name') || '';

    return {
      url: targetUrl,
      title: title.substring(0, 100),
      description: description.substring(0, 200),
      imageUrl,
      siteName,
    };
  } catch (error) {
    console.error(`[Scrape] Error fetching ${url}:`, error);
    return null;
  }
}

/**
 * Scrape full readable text content from a URL.
 * Removes nav, header, footer, scripts, and ads — keeps main content only.
 * Returns markdown-structured plain text (preserves paragraph/heading structure).
 *
 * @param url Target address; `https://` is assumed when no http(s) scheme is given.
 * @returns `{ text, title }` with `text` capped at 50,000 characters (raw body
 *          for non-HTML responses, with `title` set to the input `url`), or
 *          `null` when the URL is invalid or non-public, the request fails or
 *          times out (15s), or the server answers with a non-2xx status.
 */
export async function scrapePageText(url: string): Promise<{ text: string; title: string } | null> {
  try {
    const targetUrl = withDefaultScheme(url);

    // SSRF protection
    const target = parsePublicHttpUrl(targetUrl);
    if (!target) return null;

    let page: FetchedPage | null;
    try {
      page = await fetchPage(
        target,
        {
          'User-Agent': BROWSER_USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Language': 'fr,en;q=0.8',
        },
        15000,
        true,
      );
    } catch (fetchError: unknown) {
      console.error(`[ScrapeText] Fetch error for ${url}:`, errorMessage(fetchError));
      return null;
    }

    if (!page || !page.ok) return null;

    if (!page.isHtml) {
      // Plain text or other — return raw text
      return { text: page.body.slice(0, MAX_TEXT_LENGTH), title: url };
    }

    const $ = cheerio.load(page.body);

    // Title must be read before noise removal strips parts of the document.
    const title = readMeta($, 'og:title') || readDocumentTitle($) || url;
    const text = extractReadableText($);

    // Limit to ~50k characters to avoid token overflows
    return { text: text.slice(0, MAX_TEXT_LENGTH), title: title.trim() };
  } catch (error) {
    console.error(`[ScrapeText] Error for ${url}:`, error);
    return null;
  }
}