'use server';

import * as cheerio from 'cheerio';

/** Preview metadata extracted from a page's meta tags. */
export interface LinkMetadata {
  url: string;
  title?: string;
  description?: string;
  imageUrl?: string;
  siteName?: string;
}

/** Document handle returned by `cheerio.load` (derived so it works across cheerio versions). */
type LoadedDocument = ReturnType<typeof cheerio.load>;

const BROWSER_USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

/** Maximum number of redirect hops followed before giving up. */
const MAX_REDIRECTS = 5;

/** HTTP statuses that are treated as a redirect when a Location header is present. */
const REDIRECT_STATUSES = new Set([301, 302, 303, 307, 308]);

/** Upper bound on returned page text, to avoid token overflows downstream. */
const MAX_TEXT_LENGTH = 50000;

/** Elements stripped before text extraction (navigation, chrome, ads, media, forms). */
const NOISE_SELECTOR = [
  'script', 'style', 'noscript', 'nav', 'header', 'footer',
  'aside', 'iframe', 'img', 'svg', 'figure', 'form', 'button', 'input', 'select', 'textarea',
  '[role="navigation"]', '[role="banner"]', '[role="complementary"]',
  '.ads', '.advertisement', '.cookie-banner', '.popup', '.modal',
].join(', ');

/** Candidate containers for the main content, tried in order. */
const MAIN_CONTENT_SELECTORS = [
  'main', 'article', '[role="main"]', '.content', '.post-content',
  '.article-body', '.entry-content', '#content', '#main',
];

/** Prepends `https://` when the input has no http(s) scheme (scheme match is case-insensitive). */
function ensureProtocol(url: string): string {
  const trimmed = url.trim();
  return /^https?:\/\//i.test(trimmed) ? trimmed : `https://${trimmed}`;
}

/** True when the first two octets place an IPv4 address in a loopback, private or link-local range. */
function isNonPublicIPv4(a: number, b: number): boolean {
  return (
    a === 0 || // 0.0.0.0/8 "this network"
    a === 10 || // 10.0.0.0/8 private
    a === 127 || // 127.0.0.0/8 loopback
    (a === 100 && b >= 64 && b <= 127) || // 100.64.0.0/10 carrier-grade NAT
    (a === 169 && b === 254) || // 169.254.0.0/16 link-local (cloud metadata endpoints)
    (a === 172 && b >= 16 && b <= 31) || // 172.16.0.0/12 private
    (a === 192 && b === 168) // 192.168.0.0/16 private
  );
}

/** True when an IPv6 address (without brackets, as serialized by `URL`) is not publicly routable. */
function isNonPublicIPv6(address: string): boolean {
  if (address === '::' || address === '::1') return true;

  // IPv4-mapped addresses: `URL` serializes them in hex form, e.g. ::ffff:7f00:1.
  const mappedHex = /^::ffff:([0-9a-f]{1,4}):[0-9a-f]{1,4}$/.exec(address);
  if (mappedHex) {
    const high = parseInt(mappedHex[1] ?? '0', 16);
    return isNonPublicIPv4(high >> 8, high & 0xff);
  }
  const mappedDotted = /^::ffff:(\d{1,3})\.(\d{1,3})\.\d{1,3}\.\d{1,3}$/.exec(address);
  if (mappedDotted) {
    return isNonPublicIPv4(Number(mappedDotted[1]), Number(mappedDotted[2]));
  }

  const firstHextet = parseInt(address.split(':')[0] || '0', 16);
  return (
    (firstHextet & 0xfe00) === 0xfc00 || // fc00::/7 unique local
    (firstHextet & 0xffc0) === 0xfe80 // fe80::/10 link-local
  );
}

/**
 * SSRF guard: true when the hostname is localhost or an IP literal in a
 * loopback/private/link-local range.
 *
 * Only IP literals are range-checked, so ordinary domains that merely start
 * with "fc", "fd" or "10." are not rejected. IPv6 literals are expected in the
 * bracketed form that `URL.hostname` produces.
 *
 * NOTE(review): a public hostname that *resolves* to a private address is not
 * detected here; that requires checking the resolved IP at connect time.
 */
function isBlockedHostname(hostname: string): boolean {
  const host = hostname.toLowerCase().replace(/\.$/, '');
  if (host === 'localhost' || host.endsWith('.localhost')) return true;

  const ipv4 = /^(\d{1,3})\.(\d{1,3})\.\d{1,3}\.\d{1,3}$/.exec(host);
  if (ipv4) return isNonPublicIPv4(Number(ipv4[1]), Number(ipv4[2]));

  if (host.startsWith('[') && host.endsWith(']')) {
    return isNonPublicIPv6(host.slice(1, -1));
  }
  return false;
}

/** True when the URL uses http(s) and does not point at a blocked host. */
function isAllowedTarget(target: URL): boolean {
  if (target.protocol !== 'http:' && target.protocol !== 'https:') return false;
  return !isBlockedHostname(target.hostname);
}

/** Releases the connection of a response whose body will not be read. */
function discardBody(response: Response): void {
  void response.body?.cancel().catch(() => undefined);
}

/**
 * Fetches `start`, following redirects manually so every hop is re-validated
 * against the SSRF guard (automatic following would let a public host bounce
 * the request to an internal address).
 *
 * Assumes the runtime exposes 3xx responses for `redirect: 'manual'`, as the
 * Node.js fetch implementation does.
 *
 * @returns The final response, or `null` when a redirect target is invalid or
 *   blocked, or when more than `MAX_REDIRECTS` hops are needed.
 * @throws Whatever `fetch` rejects with (network failure, abort).
 */
async function fetchWithSafeRedirects(
  start: URL,
  headers: Record<string, string>,
  signal: AbortSignal,
): Promise<Response | null> {
  let current = start;
  for (let hop = 0; hop <= MAX_REDIRECTS; hop += 1) {
    const response = await fetch(current.href, { headers, signal, redirect: 'manual' });
    const location = response.headers.get('location');
    if (!REDIRECT_STATUSES.has(response.status) || !location) return response;

    discardBody(response);
    let next: URL;
    try {
      next = new URL(location, current);
    } catch {
      return null;
    }
    if (!isAllowedTarget(next)) return null;
    current = next;
  }
  return null;
}

/** Reads a meta tag's content by `property`, falling back to `name`. */
function readMeta($: LoadedDocument, prop: string): string | undefined {
  return $(`meta[property="${prop}"]`).attr('content') || $(`meta[name="${prop}"]`).attr('content');
}

/** Extracts a name and message from an unknown thrown value. */
function describeError(error: unknown): { name: string; message: string } {
  if (error instanceof Error) return { name: error.name, message: error.message };
  return { name: '', message: String(error) };
}

/** True when the Content-Type header denotes an HTML or XHTML document. */
function isHtmlContentType(contentType: string): boolean {
  return contentType.includes('text/html') || contentType.includes('application/xhtml');
}

/**
 * Fetches a URL and extracts link-preview metadata (Open Graph, Twitter card,
 * `<title>` and description tags).
 *
 * @param url - Target URL; `https://` is prepended when no http(s) scheme is given.
 * @returns The metadata (title capped at 100 characters, description at 200),
 *   a minimal `{ url, title }` record for non-HTML responses, or `null` when
 *   the target is blocked, unreachable, times out (10s) or responds with a
 *   non-2xx status. Never throws; failures are logged.
 */
export async function fetchLinkMetadata(url: string): Promise<LinkMetadata | null> {
  const controller = new AbortController();
  // The timer stays armed until the body has been read, not just the headers.
  const timeoutId = setTimeout(() => controller.abort(), 10000);

  try {
    const targetUrl = ensureProtocol(url);
    const target = new URL(targetUrl);
    if (!isAllowedTarget(target)) return null;

    let response: Response | null;
    try {
      response = await fetchWithSafeRedirects(
        target,
        {
          'User-Agent': BROWSER_USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.5',
        },
        controller.signal,
      );
    } catch (fetchError: unknown) {
      const { name, message } = describeError(fetchError);
      if (name === 'AbortError') {
        console.error(`[Scrape] Timeout fetching ${url} (10s)`);
      } else {
        console.error(`[Scrape] Network error fetching ${url}:`, message);
      }
      return null;
    }

    if (!response) {
      console.error(`[Scrape] Blocked or excessive redirect for ${url}`);
      return null;
    }
    if (!response.ok) {
      console.error(`[Scrape] HTTP ${response.status} for ${url}`);
      discardBody(response);
      return null;
    }

    const contentType = response.headers.get('content-type') || '';
    if (!isHtmlContentType(contentType)) {
      // Not HTML — return basic metadata without downloading the body.
      discardBody(response);
      return { url: targetUrl, title: targetUrl };
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // `.first()` avoids concatenating <title> elements of inline SVGs.
    const title =
      readMeta($, 'og:title') ||
      $('title').first().text().trim() ||
      readMeta($, 'twitter:title') ||
      url;
    const description =
      readMeta($, 'og:description') ||
      readMeta($, 'description') ||
      readMeta($, 'twitter:description') ||
      '';
    const siteName = readMeta($, 'og:site_name') || '';

    let imageUrl =
      readMeta($, 'og:image') ||
      readMeta($, 'twitter:image') ||
      $('link[rel="image_src"]').attr('href');
    // Resolve relative (and protocol-relative) image URLs against the page URL.
    if (imageUrl && !imageUrl.startsWith('http')) {
      try {
        imageUrl = new URL(imageUrl, targetUrl).href;
      } catch {
        imageUrl = undefined;
      }
    }

    return {
      url: targetUrl,
      title: title.substring(0, 100),
      description: description.substring(0, 200),
      imageUrl,
      siteName,
    };
  } catch (error) {
    console.error(`[Scrape] Error fetching ${url}:`, error);
    return null;
  } finally {
    clearTimeout(timeoutId);
  }
}

/**
 * Scrape full readable text content from a URL.
 * Removes nav, header, footer, scripts, and ads — keeps main content only.
 * Returns markdown-structured plain text (preserves paragraph/heading structure).
 *
 * @param url - Target URL; `https://` is prepended when no http(s) scheme is given.
 * @returns The extracted text (capped at 50,000 characters) and page title.
 *   Non-HTML responses are returned as raw text with the input URL as title.
 *   `null` when the target is blocked, unreachable, times out (15s) or
 *   responds with a non-2xx status. Never throws; failures are logged.
 */
export async function scrapePageText(url: string): Promise<{ text: string; title: string } | null> {
  const controller = new AbortController();
  // The timer stays armed until the body has been read, not just the headers.
  const timeoutId = setTimeout(() => controller.abort(), 15000);

  try {
    const targetUrl = ensureProtocol(url);
    const target = new URL(targetUrl);
    if (!isAllowedTarget(target)) return null;

    let response: Response | null;
    try {
      response = await fetchWithSafeRedirects(
        target,
        {
          'User-Agent': BROWSER_USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Language': 'fr,en;q=0.8',
        },
        controller.signal,
      );
    } catch (fetchError: unknown) {
      console.error(`[ScrapeText] Fetch error for ${url}:`, describeError(fetchError).message);
      return null;
    }

    if (!response) return null;
    if (!response.ok) {
      discardBody(response);
      return null;
    }

    const contentType = response.headers.get('content-type') || '';
    if (!isHtmlContentType(contentType)) {
      // Plain text or other — return raw text
      const rawText = await response.text();
      return { text: rawText.slice(0, MAX_TEXT_LENGTH), title: url };
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // Read the title before noise removal strips parts of the document.
    const title = readMeta($, 'og:title') || $('title').first().text().trim() || url;

    $(NOISE_SELECTOR).remove();

    // Use the first main-content container that exists, else the whole body.
    let container = $('body');
    for (const selector of MAIN_CONTENT_SELECTORS) {
      const match = $(selector).first();
      if (match.length > 0) {
        container = match;
        break;
      }
    }

    // Extract text preserving paragraph/heading structure as markdown.
    // NOTE(review): nested matches (e.g. <li><p>…</p></li>) are emitted once
    // per matching element, so their text can appear twice.
    const lines: string[] = [];
    container.find('h1, h2, h3, h4, h5, h6, p, li, blockquote, pre, td, th').each((_, el) => {
      const tag = 'tagName' in el && typeof el.tagName === 'string' ? el.tagName.toLowerCase() : '';
      const text = $(el).text().trim();
      if (text.length < 3) return;

      switch (tag) {
        case 'h1':
        case 'h2':
        case 'h3':
          lines.push(`\n## ${text}`);
          break;
        case 'h4':
        case 'h5':
        case 'h6':
          lines.push(`\n### ${text}`);
          break;
        case 'li':
          lines.push(`- ${text}`);
          break;
        case 'blockquote':
          lines.push(`> ${text}`);
          break;
        case 'pre':
          lines.push(`\`\`\`\n${text}\n\`\`\``);
          break;
        default:
          lines.push(text);
      }
    });

    const text = lines.join('\n').replace(/\n{3,}/g, '\n\n').trim();

    // Limit to ~50k characters to avoid token overflows
    return { text: text.slice(0, MAX_TEXT_LENGTH), title: title.trim() };
  } catch (error) {
    console.error(`[ScrapeText] Error for ${url}:`, error);
    return null;
  } finally {
    clearTimeout(timeoutId);
  }
}