// Momento/memento-note/app/actions/scrape.ts

'use server'

import * as cheerio from 'cheerio';

/**
 * Preview metadata scraped from a web page, as returned by `fetchLinkMetadata`.
 */
export interface LinkMetadata {
  /** The URL that was requested (with `https://` prepended when the caller omitted a scheme). */
  url: string;
  /** Page title; `fetchLinkMetadata` truncates it to 100 characters. */
  title?: string;
  /** Page description; `fetchLinkMetadata` truncates it to 200 characters. */
  description?: string;
  /** Preview image taken from `og:image`, `twitter:image` or `<link rel="image_src">`, when present. */
  imageUrl?: string;
  /** Value of the `og:site_name` meta tag; `fetchLinkMetadata` uses an empty string when it is absent. */
  siteName?: string;
}

/** Upper bound for the whole request (headers and body) so a stalled host cannot hang the action. */
const FETCH_TIMEOUT_MS = 10000;

/**
 * Returns true when an IPv4 address (identified by its first two octets) lies in a
 * loopback, private, link-local, carrier-grade NAT, multicast or reserved range.
 */
function isNonPublicIpv4(first: number, second: number): boolean {
  return (
    first === 0 || // 0.0.0.0/8 "this network"
    first === 10 || // 10.0.0.0/8 private
    first === 127 || // 127.0.0.0/8 loopback
    (first === 100 && second >= 64 && second <= 127) || // 100.64.0.0/10 carrier-grade NAT
    (first === 169 && second === 254) || // 169.254.0.0/16 link-local (cloud metadata)
    (first === 172 && second >= 16 && second <= 31) || // 172.16.0.0/12 private
    (first === 192 && second === 168) || // 192.168.0.0/16 private
    first >= 224 // multicast and reserved
  );
}

/**
 * Returns true when a `URL.hostname` value names a host that must never be fetched
 * (localhost, or a literal IP address in a non-public range).
 */
function isBlockedHostname(hostname: string): boolean {
  let host = hostname.toLowerCase();
  // URL.hostname keeps the square brackets around IPv6 literals, e.g. "[::1]".
  if (host.startsWith('[') && host.endsWith(']')) host = host.slice(1, -1);
  // "localhost." (fully-qualified form) is the same host as "localhost".
  host = host.replace(/\.$/, '');

  if (host === '') return true;
  if (host === 'localhost' || host.endsWith('.localhost')) return true;

  // The URL parser normalises decimal/hex/octal IPv4 notations to a dotted quad.
  const v4 = /^(\d{1,3})\.(\d{1,3})\.\d{1,3}\.\d{1,3}$/.exec(host);
  if (v4) return isNonPublicIpv4(Number(v4[1]), Number(v4[2]));

  // After bracket stripping a colon can only appear in an IPv6 literal.
  if (host.includes(':')) {
    if (host === '::' || host === '::1') return true; // unspecified / loopback
    // IPv4-mapped addresses are serialised as "::ffff:7f00:1".
    const mapped = /^::ffff:([0-9a-f]{1,4}):[0-9a-f]{1,4}$/.exec(host);
    if (mapped) {
      const high = Number.parseInt(mapped[1] ?? '0', 16);
      return isNonPublicIpv4(high >> 8, high & 0xff);
    }
    // fc00::/7 unique-local and fe80::/10 link-local.
    return /^f[cd][0-9a-f]{2}:/.test(host) || /^fe[89ab][0-9a-f]:/.test(host);
  }

  return false;
}

/** Returns true when the URL uses http(s) and does not point at a blocked host. */
function isAllowedTarget(target: URL): boolean {
  if (target.protocol !== 'http:' && target.protocol !== 'https:') return false;
  return !isBlockedHostname(target.hostname);
}

/** Resolves a possibly relative URL against `base`; returns undefined when missing or unparsable. */
function resolveAgainst(raw: string | undefined, base: string): string | undefined {
  if (!raw) return undefined;
  try {
    return new URL(raw, base).href;
  } catch {
    return undefined;
  }
}

/**
 * Downloads the page body. Resolves to null for non-2xx responses and for responses
 * whose final (post-redirect) URL points at a blocked host.
 */
async function fetchHtml(targetUrl: string): Promise<{ html: string; finalUrl: string } | null> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
  try {
    const response = await fetch(targetUrl, {
      headers: {
        // Use a real browser User-Agent to avoid 403 Forbidden from strict sites
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5'
      },
      signal: controller.signal,
      next: { revalidate: 3600 } // Cache for 1 hour
    });

    // Redirects are followed automatically, so re-validate where we ended up before
    // reading anything. NOTE: this stops internal content from being returned, but the
    // redirected request itself has already been sent, and DNS names that resolve to
    // private addresses are not detected here.
    if (response.url && !isAllowedTarget(new URL(response.url))) return null;

    if (!response.ok) return null;

    return { html: await response.text(), finalUrl: response.url || targetUrl };
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Fetches a web page and extracts link-preview metadata (Open Graph, Twitter card and
 * plain HTML fallbacks).
 *
 * @param url - Address to scrape; `https://` is assumed when no scheme is given.
 * @returns The extracted metadata, or `null` when the input is empty or not a string,
 *   the target is not http(s), the host is local/private, the response is not OK, or
 *   any error occurs while fetching or parsing.
 */
export async function fetchLinkMetadata(url: string): Promise<LinkMetadata | null> {
  // Server actions can be invoked with arbitrary payloads, so check the type at runtime.
  if (typeof url !== 'string') return null;
  const input = url.trim();
  if (input === '') return null;

  try {
    // Add protocol if missing. Any existing scheme is kept (case-insensitively) so that
    // non-http(s) schemes are rejected below rather than mangled into a hostname.
    const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(input);
    const targetUrl = hasScheme ? input : 'https://' + input;

    // SSRF protection: block non-http(s) schemes and internal/private hosts
    if (!isAllowedTarget(new URL(targetUrl))) return null;

    const page = await fetchHtml(targetUrl);
    if (page === null) return null;

    const $ = cheerio.load(page.html);

    // `||` (not `??`) is deliberate: an empty content attribute should fall through.
    const getMeta = (prop: string) =>
      $(`meta[property="${prop}"]`).attr('content') ||
      $(`meta[name="${prop}"]`).attr('content');

    // Robust extraction with fallbacks. Only the first <title> is used so that
    // <title> elements inside inline SVGs are not concatenated into the result.
    const title =
      getMeta('og:title') || $('title').first().text().trim() || getMeta('twitter:title') || input;
    const description =
      getMeta('og:description') || getMeta('description') || getMeta('twitter:description') || '';
    const rawImage =
      getMeta('og:image') || getMeta('twitter:image') || $('link[rel="image_src"]').attr('href');
    const siteName = getMeta('og:site_name') || '';

    return {
      url: targetUrl,
      title: title.substring(0, 100),
      description: description.substring(0, 200),
      // Pages frequently use relative image paths; make them absolute for consumers.
      imageUrl: resolveAgainst(rawImage, page.finalUrl),
      siteName
    };
  } catch (error) {
    console.error(`[Scrape] Error fetching ${url}:`, error);
    return null;
  }
}