All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 44s
- Renamed 'AI Copilot' / 'Assistant IA' → 'IA Note' everywhere in UI - Added 3rd 'Ressource' tab in IA Note panel with: * Optional URL field that scrapes page text (via new scrapePageText action) * Textarea for paste (markdown, HTML, plain text) * 3 integration modes: Remplacer / Compléter (AI) / Fusionner (AI) * Dual-format preview: Rendu + Markdown brut before applying - Added hover-actions on assistant chat messages: * Remplacer / Compléter / Fusionner appear on hover * Triggers same preview/apply flow via resource tab - New API route: POST /api/ai/enrich-from-resource * Supports complete and merge modes with language-aware prompts - Extended scrape.ts with scrapePageText() (full content extraction)
204 lines
7.3 KiB
TypeScript
204 lines
7.3 KiB
TypeScript
'use server'
|
|
|
|
import * as cheerio from 'cheerio';
|
|
|
|
/**
 * Preview metadata for a link, as produced by `fetchLinkMetadata`.
 *
 * Only `url` is always present; the other fields depend on what the target
 * page exposes.
 */
export interface LinkMetadata {
  /** The requested URL, with `https://` prepended when the input had no http(s) scheme. */
  url: string;
  /** Page title (Open Graph title, `<title>` text or Twitter title); truncated to 100 characters. */
  title?: string;
  /** Page description taken from meta tags; truncated to 200 characters, empty when none is found. */
  description?: string;
  /** Absolute URL of the preview image, when the page declares one. */
  imageUrl?: string;
  /** Value of the `og:site_name` meta tag, or an empty string when the page has none. */
  siteName?: string;
}
/** Browser-like User-Agent sent with every scrape request. */
const BROWSER_USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

/** Overall time budget (request, redirects and body download) for `fetchLinkMetadata`. */
const METADATA_TIMEOUT_MS = 10000;

/** Overall time budget (request, redirects and body download) for `scrapePageText`. */
const PAGE_TEXT_TIMEOUT_MS = 15000;

/** Maximum number of redirects followed before a request is abandoned. */
const MAX_REDIRECTS = 5;

/** Upper bound on returned page text, to avoid token overflows downstream. */
const MAX_TEXT_LENGTH = 50000;

/** HTTP statuses treated as redirects when they carry a `Location` header. */
const REDIRECT_STATUSES = new Set([301, 302, 303, 307, 308]);

/** Elements removed from the document before text extraction. */
const NOISE_SELECTOR =
  'script, style, noscript, nav, header, footer, aside, iframe, img, svg, figure, form, button, input, select, textarea, [role="navigation"], [role="banner"], [role="complementary"], .ads, .advertisement, .cookie-banner, .popup, .modal';

/** Candidate main-content containers, tried in order; the first selector that matches wins. */
const MAIN_CONTENT_SELECTORS = [
  'main',
  'article',
  '[role="main"]',
  '.content',
  '.post-content',
  '.article-body',
  '.entry-content',
  '#content',
  '#main',
];

/** Elements whose text is emitted as one output line each. */
const TEXT_BLOCK_SELECTOR = 'h1, h2, h3, h4, h5, h6, p, li, blockquote, pre, td, th';

/** A response obtained through the redirect guard, with the URL that finally produced it. */
interface GuardedFetchResult {
  response: Response;
  finalUrl: string;
}

/** Prepends `https://` unless the (trimmed) input already has an http(s) scheme, in any letter case. */
function withHttpScheme(rawUrl: string): string {
  const trimmed = rawUrl.trim();
  return /^https?:\/\//i.test(trimmed) ? trimmed : 'https://' + trimmed;
}

/** Parses a dotted-decimal IPv4 literal into its four octets, or returns null for anything else. */
function parseIpv4(host: string): number[] | null {
  const match = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(host);
  if (!match) return null;
  const octets = match.slice(1).map(Number);
  return octets.every((octet) => octet <= 255) ? octets : null;
}

/** True for loopback, private, link-local, shared (CGNAT) and "this network" IPv4 ranges. */
function isBlockedIpv4(octets: readonly number[]): boolean {
  const [a = 0, b = 0] = octets;
  return (
    a === 0 || // 0.0.0.0/8
    a === 10 || // 10.0.0.0/8
    a === 127 || // 127.0.0.0/8 loopback
    (a === 100 && b >= 64 && b <= 127) || // 100.64.0.0/10 shared address space
    (a === 169 && b === 254) || // 169.254.0.0/16 link-local, incl. cloud metadata
    (a === 172 && b >= 16 && b <= 31) || // 172.16.0.0/12
    (a === 192 && b === 168) // 192.168.0.0/16
  );
}

/**
 * True for loopback, unspecified, unique-local (fc00::/7) and link-local
 * (fe80::/10) IPv6 addresses, and for IPv4-mapped addresses whose embedded
 * IPv4 address is blocked. Expects the bracket-less, lowercase form that
 * `URL.hostname` produces.
 */
function isBlockedIpv6(address: string): boolean {
  if (address === '::' || address === '::1') return true;

  // The URL parser serialises ::ffff:a.b.c.d as ::ffff:XXXX:YYYY.
  const mapped = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(address);
  if (mapped) {
    const high = Number.parseInt(mapped[1] ?? '0', 16);
    const low = Number.parseInt(mapped[2] ?? '0', 16);
    return isBlockedIpv4([high >> 8, high & 0xff, low >> 8, low & 0xff]);
  }

  const firstGroup = Number.parseInt(address.split(':')[0] ?? '', 16) || 0;
  return (firstGroup & 0xfe00) === 0xfc00 || (firstGroup & 0xffc0) === 0xfe80;
}

/**
 * Decides whether a hostname (as returned by `URL.hostname`) must not be
 * fetched. Only IP literals and `localhost` names are classified; ordinary
 * DNS names are allowed, so a public name that resolves to a private address
 * is NOT caught here.
 */
function isBlockedHostname(hostname: string): boolean {
  const host = hostname.toLowerCase().replace(/\.$/, '');
  if (host === '' || host === 'localhost' || host.endsWith('.localhost')) return true;
  // URL.hostname keeps the square brackets around IPv6 literals.
  if (host.startsWith('[') && host.endsWith(']')) return isBlockedIpv6(host.slice(1, -1));
  const octets = parseIpv4(host);
  return octets !== null && isBlockedIpv4(octets);
}

/** True when the URL uses http(s) and does not point at a blocked host. */
function isPublicHttpUrl(candidate: URL): boolean {
  if (candidate.protocol !== 'http:' && candidate.protocol !== 'https:') return false;
  return !isBlockedHostname(candidate.hostname);
}

/** Releases a response body that will not be read; failures are irrelevant here. */
function discardBody(response: Response): void {
  void response.body?.cancel().catch(() => undefined);
}

/** True when the thrown value is the abort signalled by an AbortController. */
function isAbortError(error: unknown): boolean {
  return (
    typeof error === 'object' &&
    error !== null &&
    'name' in error &&
    (error as { name: unknown }).name === 'AbortError'
  );
}

/** Best-effort message extraction from an unknown thrown value. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Fetches `startUrl`, following redirects manually so that every hop is
 * re-validated against the SSRF rules (automatic redirect following would let
 * a public URL bounce the request to an internal host).
 *
 * @returns the final response and its URL, or null when a hop targets a
 *          blocked host / non-http(s) scheme or the redirect limit is exceeded.
 * @throws whatever `fetch` throws (network failure, abort) and `TypeError`
 *         for an unparsable `Location` header.
 */
async function fetchFollowingSafeRedirects(
  startUrl: URL,
  headers: Record<string, string>,
  signal: AbortSignal,
): Promise<GuardedFetchResult | null> {
  let current = startUrl;
  for (let hop = 0; hop <= MAX_REDIRECTS; hop++) {
    if (!isPublicHttpUrl(current)) return null;

    const response = await fetch(current.href, { headers, signal, redirect: 'manual' });
    const location = REDIRECT_STATUSES.has(response.status) ? response.headers.get('location') : null;
    if (!location) return { response, finalUrl: current.href };

    discardBody(response);
    current = new URL(location, current);
  }
  return null;
}

/**
 * Fetches a page and extracts link-preview metadata (title, description,
 * image, site name) from its Open Graph / Twitter / standard meta tags.
 *
 * Never throws: every failure is logged and reported as `null`.
 *
 * @param url Target URL; `https://` is prepended when no http(s) scheme is present.
 * @returns the metadata, `{ url, title }` only for non-HTML responses, or
 *          `null` when the URL is invalid, points at a blocked (internal)
 *          host, times out (10s), or answers with a non-2xx status.
 */
export async function fetchLinkMetadata(url: string): Promise<LinkMetadata | null> {
  const controller = new AbortController();
  // The timer stays armed until the body has been read (see finally), so a
  // stalled download cannot hang the call.
  const timeoutId = setTimeout(() => controller.abort(), METADATA_TIMEOUT_MS);

  try {
    const targetUrl = withHttpScheme(url);
    const startUrl = new URL(targetUrl);

    // SSRF protection: block internal/private hosts before any request is made.
    if (!isPublicHttpUrl(startUrl)) return null;

    let page: GuardedFetchResult | null;
    try {
      page = await fetchFollowingSafeRedirects(
        startUrl,
        {
          'User-Agent': BROWSER_USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.5',
        },
        controller.signal,
      );
    } catch (fetchError: unknown) {
      if (isAbortError(fetchError)) {
        console.error(`[Scrape] Timeout fetching ${url} (10s)`);
      } else {
        console.error(`[Scrape] Network error fetching ${url}:`, describeError(fetchError));
      }
      return null;
    }

    if (!page) {
      console.error(`[Scrape] Redirect blocked or redirect limit exceeded for ${url}`);
      return null;
    }

    const { response, finalUrl } = page;

    if (!response.ok) {
      console.error(`[Scrape] HTTP ${response.status} for ${url}`);
      discardBody(response);
      return null;
    }

    const contentType = (response.headers.get('content-type') ?? '').toLowerCase();
    if (!contentType.includes('text/html') && !contentType.includes('application/xhtml')) {
      // Not HTML — return basic metadata without downloading the body.
      discardBody(response);
      return { url: targetUrl, title: targetUrl };
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    const getMeta = (prop: string) =>
      $(`meta[property="${prop}"]`).attr('content') ||
      $(`meta[name="${prop}"]`).attr('content');

    // .first() keeps <title> elements nested in inline SVGs out of the result.
    const title =
      getMeta('og:title') || $('title').first().text().trim() || getMeta('twitter:title') || url;
    const description =
      getMeta('og:description') || getMeta('description') || getMeta('twitter:description') || '';
    let imageUrl = getMeta('og:image') || getMeta('twitter:image') || $('link[rel="image_src"]').attr('href');

    // Resolve relative image URLs against the URL that actually served the page.
    if (imageUrl && !/^https?:\/\//i.test(imageUrl)) {
      try {
        imageUrl = new URL(imageUrl, finalUrl).href;
      } catch {
        imageUrl = undefined;
      }
    }

    const siteName = getMeta('og:site_name') || '';

    return {
      url: targetUrl,
      title: title.substring(0, 100),
      description: description.substring(0, 200),
      imageUrl,
      siteName,
    };
  } catch (error) {
    console.error(`[Scrape] Error fetching ${url}:`, error);
    return null;
  } finally {
    clearTimeout(timeoutId);
  }
}

/**
 * Scrape full readable text content from a URL.
 *
 * Removes nav, header, footer, scripts, forms and ad containers, then reads
 * the first main-content container that exists (falling back to `<body>`).
 * Headings, list items, block quotes and code blocks are rendered with
 * markdown markers; other blocks are emitted as plain lines.
 *
 * Never throws: every failure is logged and reported as `null`.
 *
 * @param url Target URL; `https://` is prepended when no http(s) scheme is present.
 * @returns the extracted text (at most 50,000 characters) and page title, or
 *          `null` when the URL is invalid, points at a blocked (internal)
 *          host, times out (15s), answers with a non-2xx status, or serves a
 *          non-textual payload.
 */
export async function scrapePageText(url: string): Promise<{ text: string; title: string } | null> {
  const controller = new AbortController();
  // The timer stays armed until the body has been read (see finally).
  const timeoutId = setTimeout(() => controller.abort(), PAGE_TEXT_TIMEOUT_MS);

  try {
    const startUrl = new URL(withHttpScheme(url));

    // SSRF protection: block internal/private hosts before any request is made.
    if (!isPublicHttpUrl(startUrl)) return null;

    let page: GuardedFetchResult | null;
    try {
      page = await fetchFollowingSafeRedirects(
        startUrl,
        {
          'User-Agent': BROWSER_USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Language': 'fr,en;q=0.8',
        },
        controller.signal,
      );
    } catch (fetchError: unknown) {
      console.error(`[ScrapeText] Fetch error for ${url}:`, describeError(fetchError));
      return null;
    }

    if (!page) {
      console.error(`[ScrapeText] Redirect blocked or redirect limit exceeded for ${url}`);
      return null;
    }

    const { response } = page;

    if (!response.ok) {
      discardBody(response);
      return null;
    }

    const contentType = (response.headers.get('content-type') ?? '').toLowerCase();
    if (!contentType.includes('text/html') && !contentType.includes('application/xhtml')) {
      // Plain text or another textual format — return the raw text. Binary
      // payloads (PDF, images, archives…) would only decode to garbage.
      const isTextual =
        contentType === '' || contentType.startsWith('text/') || /json|xml|javascript/.test(contentType);
      if (!isTextual) {
        discardBody(response);
        return null;
      }
      const rawText = await response.text();
      return { text: rawText.slice(0, MAX_TEXT_LENGTH), title: url };
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // Extract title
    const getMeta = (prop: string) =>
      $(`meta[property="${prop}"]`).attr('content') ||
      $(`meta[name="${prop}"]`).attr('content');
    // .first() keeps <title> elements nested in inline SVGs out of the result.
    const title = getMeta('og:title') || $('title').first().text().trim() || url;

    // Remove noise elements
    $(NOISE_SELECTOR).remove();

    // Try to find main content container
    const mainSelector = MAIN_CONTENT_SELECTORS.find((selector) => $(selector).length > 0);
    const container = mainSelector ? $(mainSelector).first() : $('body');

    // Extract text preserving paragraph/heading structure as markdown.
    // NOTE(review): nested matches (e.g. a <p> inside a <li>) are emitted once
    // per matching element, so their text can appear twice.
    const lines: string[] = [];
    container.find(TEXT_BLOCK_SELECTOR).each((_, el) => {
      const tag = 'tagName' in el && typeof el.tagName === 'string' ? el.tagName.toLowerCase() : '';
      const blockText = $(el).text().trim();
      if (blockText.length < 3) return;

      switch (tag) {
        case 'h1':
        case 'h2':
        case 'h3':
          lines.push(`\n## ${blockText}`);
          break;
        case 'h4':
        case 'h5':
        case 'h6':
          lines.push(`\n### ${blockText}`);
          break;
        case 'li':
          lines.push(`- ${blockText}`);
          break;
        case 'blockquote':
          lines.push(`> ${blockText}`);
          break;
        case 'pre':
          lines.push(`\`\`\`\n${blockText}\n\`\`\``);
          break;
        default:
          lines.push(blockText);
      }
    });

    const text = lines.join('\n').replace(/\n{3,}/g, '\n\n').trim();

    // Limit to ~50k characters to avoid token overflows
    return { text: text.slice(0, MAX_TEXT_LENGTH), title: title.trim() };
  } catch (error) {
    console.error(`[ScrapeText] Error for ${url}:`, error);
    return null;
  } finally {
    clearTimeout(timeoutId);
  }
}