// Momento/memento-note/app/actions/scrape.ts

'use server'

import * as cheerio from 'cheerio';

/**
 * Preview metadata extracted for a link by `fetchLinkMetadata`.
 */
export interface LinkMetadata {
  /** The URL that was fetched (the caller's input with a protocol prepended when it had none). */
  url: string;
  /** Page title taken from Open Graph / `<title>` / Twitter tags, falling back to the URL. */
  title?: string;
  /** Short page description from Open Graph / meta description / Twitter tags; may be empty. */
  description?: string;
  /** Preview image URL, resolved against the page URL when the page gives a relative one. */
  imageUrl?: string;
  /** Site name from the `og:site_name` meta tag; may be empty. */
  siteName?: string;
}

/** Browser-like User-Agent header sent with every scrape request. */
const SCRAPE_USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

/** Overall time budget (connect + headers + body) for a metadata fetch. */
const METADATA_TIMEOUT_MS = 10000;

/** Overall time budget (connect + headers + body) for a full-text scrape. */
const PAGE_TEXT_TIMEOUT_MS = 15000;

/** Upper bound on the text returned by `scrapePageText`, to avoid token overflows downstream. */
const MAX_PAGE_TEXT_CHARS = 50000;

/** Block-level elements whose text is emitted as separate lines by `scrapePageText`. */
const TEXT_BLOCK_SELECTOR = 'h1, h2, h3, h4, h5, h6, p, li, blockquote, pre, td, th';

/** Candidate containers for a page's main content, tried in order. */
const MAIN_CONTENT_SELECTORS = ['main', 'article', '[role="main"]', '.content', '.post-content', '.article-body', '.entry-content', '#content', '#main'];

/** Elements stripped before text extraction (scripts, chrome, media, forms, ads, overlays). */
const NOISE_SELECTOR =
  'script, style, noscript, nav, header, footer, aside, iframe, img, svg, figure, form, button, input, select, textarea, [role="navigation"], [role="banner"], [role="complementary"], .ads, .advertisement, .cookie-banner, .popup, .modal';

/**
 * Returns true when the first two octets of an IPv4 address place it in a
 * range that must never be fetched on behalf of a user: "this network"
 * (0/8), private (10/8, 172.16/12, 192.168/16), loopback (127/8),
 * carrier-grade NAT (100.64/10) and link-local / cloud metadata (169.254/16).
 */
function isPrivateIPv4(first: number, second: number): boolean {
  return (
    first === 0 ||
    first === 10 ||
    first === 127 ||
    (first === 100 && second >= 64 && second <= 127) ||
    (first === 169 && second === 254) ||
    (first === 172 && second >= 16 && second <= 31) ||
    (first === 192 && second === 168)
  );
}

/**
 * Returns true when `hostname` (in the form produced by `URL.hostname`) is a
 * loopback / private / link-local IP literal or a `localhost` name.
 *
 * Only IP *literals* are range-checked, so ordinary domain names that merely
 * start with "10.", "172.", "fc" or "fd" are not rejected.
 */
function isBlockedHostname(hostname: string): boolean {
  // A single trailing dot denotes the same host ("localhost." === "localhost").
  const host = hostname.toLowerCase().replace(/\.$/, '');
  if (host === 'localhost' || host.endsWith('.localhost')) return true;

  // The URL parser normalises every numeric IPv4 spelling to dotted decimal.
  const ipv4 = /^(\d{1,3})\.(\d{1,3})\.\d{1,3}\.\d{1,3}$/.exec(host);
  if (ipv4) return isPrivateIPv4(Number(ipv4[1]), Number(ipv4[2]));

  // IPv6 literals are serialised in brackets, compressed and lower-cased.
  if (host.startsWith('[') && host.endsWith(']')) {
    const ipv6 = host.slice(1, -1);
    if (ipv6 === '::' || ipv6 === '::1') return true; // unspecified / loopback
    if (/^f[cd][0-9a-f]{2}:/.test(ipv6)) return true; // fc00::/7 unique local
    if (/^fe[89ab][0-9a-f]:/.test(ipv6)) return true; // fe80::/10 link-local

    // IPv4-mapped addresses are serialised as ::ffff:HHHH:LLLL.
    const mapped = /^::ffff:([0-9a-f]{1,4}):[0-9a-f]{1,4}$/.exec(ipv6);
    if (mapped) {
      const high = parseInt(mapped[1] ?? '', 16);
      return isPrivateIPv4(high >> 8, high & 0xff);
    }
  }

  return false;
}

/**
 * Normalises user input into a URL string that is safe to fetch.
 *
 * Prepends `https://` when the input carries no scheme, then rejects anything
 * that is not http(s) or that targets a blocked host.
 *
 * NOTE: this validates the requested URL only. Redirect targets and DNS names
 * that resolve to private addresses are not re-checked here.
 *
 * @returns the URL string to fetch, or `null` when the target is not allowed.
 * @throws TypeError when the input cannot be parsed as a URL.
 */
function resolveFetchableUrl(rawUrl: string): string | null {
  const trimmed = rawUrl.trim();
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed);
  const targetUrl = hasScheme ? trimmed : 'https://' + trimmed;

  const parsed = new URL(targetUrl);
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return null;
  if (isBlockedHostname(parsed.hostname)) return null;
  return targetUrl;
}

/** True when `error` is the rejection produced by an aborted fetch. */
function isAbortError(error: unknown): boolean {
  return typeof error === 'object' && error !== null && (error as { name?: unknown }).name === 'AbortError';
}

/** Best-effort human-readable message for an unknown thrown value. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** Releases an unread response body so the underlying connection is not held open. */
async function discardBody(response: Response): Promise<void> {
  try {
    await response.body?.cancel();
  } catch {
    // Body already consumed, locked or closed — nothing left to release.
  }
}

/** True when a Content-Type header value denotes an HTML / XHTML document. */
function isHtmlContentType(contentType: string): boolean {
  const normalized = contentType.toLowerCase();
  return normalized.includes('text/html') || normalized.includes('application/xhtml');
}

/**
 * Fetches a page and extracts link-preview metadata (title, description,
 * image, site name) from its Open Graph / Twitter / standard meta tags.
 *
 * @param url - Address to preview; `https://` is assumed when no scheme is given.
 * @returns The extracted metadata; `{ url, title: url }` for non-HTML
 *   resources; `null` when the target is blocked, unreachable, times out
 *   (10s, including the body download) or answers with a non-2xx status.
 */
export async function fetchLinkMetadata(url: string): Promise<LinkMetadata | null> {
  let timeoutId: ReturnType<typeof setTimeout> | undefined;
  try {
    const targetUrl = resolveFetchableUrl(url);
    if (targetUrl === null) return null;

    // The timer stays armed until the body has been read (see `finally`), so a
    // server that sends headers and then stalls cannot hang the action.
    const controller = new AbortController();
    timeoutId = setTimeout(() => controller.abort(), METADATA_TIMEOUT_MS);

    let response: Response;
    try {
      response = await fetch(targetUrl, {
        headers: {
          'User-Agent': SCRAPE_USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.5',
        },
        signal: controller.signal,
        redirect: 'follow',
      });
    } catch (fetchError: unknown) {
      if (isAbortError(fetchError)) {
        console.error(`[Scrape] Timeout fetching ${url} (10s)`);
      } else {
        console.error(`[Scrape] Network error fetching ${url}:`, errorMessage(fetchError));
      }
      return null;
    }

    if (!response.ok) {
      console.error(`[Scrape] HTTP ${response.status} for ${url}`);
      await discardBody(response);
      return null;
    }

    if (!isHtmlContentType(response.headers.get('content-type') ?? '')) {
      // Not HTML — return basic metadata without downloading the body.
      await discardBody(response);
      return { url: targetUrl, title: targetUrl };
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    const getMeta = (prop: string) =>
      $(`meta[property="${prop}"]`).attr('content') ||
      $(`meta[name="${prop}"]`).attr('content');

    // `.first()` keeps <title> elements of inline SVGs out of the page title.
    const title = getMeta('og:title') || $('title').first().text().trim() || getMeta('twitter:title') || url;
    const description = getMeta('og:description') || getMeta('description') || getMeta('twitter:description') || '';
    let imageUrl = getMeta('og:image') || getMeta('twitter:image') || $('link[rel="image_src"]').attr('href');

    // Resolve relative image URLs against the page URL.
    if (imageUrl && !imageUrl.startsWith('http')) {
      try {
        imageUrl = new URL(imageUrl, targetUrl).href;
      } catch {
        imageUrl = undefined;
      }
    }

    const siteName = getMeta('og:site_name') || '';

    return {
      url: targetUrl,
      title: title.substring(0, 100),
      description: description.substring(0, 200),
      imageUrl,
      siteName,
    };
  } catch (error) {
    console.error(`[Scrape] Error fetching ${url}:`, error);
    return null;
  } finally {
    clearTimeout(timeoutId);
  }
}

/**
 * Scrape full readable text content from a URL.
 * Removes nav, header, footer, scripts, and ads — keeps main content only.
 * Returns markdown-structured plain text (preserves paragraph/heading structure).
 *
 * @param url - Address to scrape; `https://` is assumed when no scheme is given.
 * @returns The page title and up to 50,000 characters of text; the raw body
 *   for non-HTML resources; `null` when the target is blocked, unreachable,
 *   times out (15s, including the body download) or answers non-2xx.
 */
export async function scrapePageText(url: string): Promise<{ text: string; title: string } | null> {
  let timeoutId: ReturnType<typeof setTimeout> | undefined;
  try {
    const targetUrl = resolveFetchableUrl(url);
    if (targetUrl === null) return null;

    // The timer stays armed until the body has been read (see `finally`).
    const controller = new AbortController();
    timeoutId = setTimeout(() => controller.abort(), PAGE_TEXT_TIMEOUT_MS);

    let response: Response;
    try {
      response = await fetch(targetUrl, {
        headers: {
          'User-Agent': SCRAPE_USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Language': 'fr,en;q=0.8',
        },
        signal: controller.signal,
        redirect: 'follow',
      });
    } catch (fetchError: unknown) {
      console.error(`[ScrapeText] Fetch error for ${url}:`, errorMessage(fetchError));
      return null;
    }

    if (!response.ok) {
      await discardBody(response);
      return null;
    }

    if (!isHtmlContentType(response.headers.get('content-type') ?? '')) {
      // Plain text or other — return raw text
      const rawText = await response.text();
      return { text: rawText.slice(0, MAX_PAGE_TEXT_CHARS), title: url };
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // Extract the title before noise removal strips anything from the document.
    const getMeta = (prop: string) =>
      $(`meta[property="${prop}"]`).attr('content') ||
      $(`meta[name="${prop}"]`).attr('content');
    // `.first()` keeps <title> elements of inline SVGs out of the page title.
    const title = getMeta('og:title') || $('title').first().text().trim() || url;

    // Remove noise elements
    $(NOISE_SELECTOR).remove();

    // Use the first main-content container that exists, else the whole body.
    const mainSelector = MAIN_CONTENT_SELECTORS.find((selector) => $(selector).length > 0);
    const container = mainSelector ? $(mainSelector).first() : $('body');

    // Extract text preserving paragraph/heading structure as markdown
    const lines: string[] = [];
    container.find(TEXT_BLOCK_SELECTOR).each((_, el) => {
      const node = $(el);

      // Nested blocks (li > ul > li, blockquote > p, td > p, …) are visited on
      // their own, so an outer block contributes only its own text; otherwise
      // the same content would be emitted once per ancestor.
      const hasNestedBlocks = node.find(TEXT_BLOCK_SELECTOR).length > 0;
      const text = (
        hasNestedBlocks
          ? node.clone().find(TEXT_BLOCK_SELECTOR).remove().end().text()
          : node.text()
      ).trim();
      if (!text || text.length < 3) return;

      if (node.is('h1, h2, h3')) {
        lines.push(`\n## ${text}`);
      } else if (node.is('h4, h5, h6')) {
        lines.push(`\n### ${text}`);
      } else if (node.is('li')) {
        lines.push(`- ${text}`);
      } else if (node.is('blockquote')) {
        lines.push(`> ${text}`);
      } else if (node.is('pre')) {
        lines.push(`\`\`\`\n${text}\n\`\`\``);
      } else {
        lines.push(text);
      }
    });

    const text = lines.join('\n').replace(/\n{3,}/g, '\n\n').trim();

    // Limit to ~50k characters to avoid token overflows
    return { text: text.slice(0, MAX_PAGE_TEXT_CHARS), title: title.trim() };
  } catch (error) {
    console.error(`[ScrapeText] Error for ${url}:`, error);
    return null;
  } finally {
    clearTimeout(timeoutId);
  }
}