import { Readability } from '@mozilla/readability'
import { JSDOM } from 'jsdom'
import DOMPurify from 'isomorphic-dompurify'
import {
  applyRtlToHtmlBlocks,
  readPageLocaleFromHtml,
  resolveClipLocale,
  wrapClipArticleHtml,
  type ClipLocaleHint,
} from '@/lib/clip/rtl-content'

/** Article fields produced by {@link extractArticleFromHtml}. */
export interface ExtractedArticle {
  /** Readability title, trimmed (empty string when absent). */
  title: string
  /** Sanitized article HTML after RTL block processing and wrapping. */
  content: string
  /** Readability plain-text content, trimmed. */
  textContent: string
  /** Readability excerpt, trimmed. */
  excerpt: string
  /** Direction/language hint merged from Readability, the page HTML and `resolveClipLocale`. */
  locale: ClipLocaleHint
}

/**
 * Runs Readability over `html` inside a throwaway JSDOM window.
 *
 * The window is always closed afterwards so repeated extractions do not keep
 * DOM windows alive; callers only consume the string fields of the result,
 * which stay valid after the window is closed.
 */
function parseReadableArticle(html: string, pageUrl: string) {
  const dom = new JSDOM(html, { url: pageUrl })
  try {
    return new Readability(dom.window.document).parse()
  } finally {
    dom.window.close()
  }
}

/**
 * Extracts the readable article from a page's HTML.
 *
 * @param html - Raw HTML of the page.
 * @param pageUrl - URL of the page; passed to JSDOM as the document URL and to `resolveClipLocale`.
 * @returns The extracted article, or `null` when Readability yields no result.
 */
export function extractArticleFromHtml(html: string, pageUrl: string): ExtractedArticle | null {
  const article = parseReadableArticle(html, pageUrl)
  if (!article) return null

  const pageLocale = readPageLocaleFromHtml(html)
  const readabilityDir = article.dir?.toLowerCase() === 'rtl' ? 'rtl' : 'ltr'
  // Primary language subtag only, e.g. "fa-IR" -> "fa".
  const readabilityLang = article.lang?.split('-')[0]?.toLowerCase()
  const locale = resolveClipLocale(pageUrl, article.title || '', article.textContent || '')

  // RTL wins as soon as any one of the three sources reports it.
  const isRtl =
    readabilityDir === 'rtl' || pageLocale.direction === 'rtl' || locale.direction === 'rtl'

  // Readability's language is only trusted when it is fa, ar or he;
  // otherwise fall back to the resolved locale, then to the page locale.
  const trustedReadabilityLang =
    readabilityLang === 'fa' || readabilityLang === 'ar' || readabilityLang === 'he'
      ? readabilityLang
      : undefined

  const mergedLocale: ClipLocaleHint = {
    direction: isRtl ? 'rtl' : 'ltr',
    lang: trustedReadabilityLang || locale.lang || pageLocale.lang,
  }

  // Sanitize first, then apply RTL processing and wrapping to the sanitized markup.
  const sanitized = DOMPurify.sanitize(article.content || '')
  const rtlBlocks = applyRtlToHtmlBlocks(sanitized, mergedLocale)
  const content = wrapClipArticleHtml(rtlBlocks, mergedLocale)

  return {
    title: (article.title || '').trim(),
    content,
    textContent: (article.textContent || '').trim(),
    excerpt: (article.excerpt || '').trim(),
    locale: mergedLocale,
  }
}

/**
 * Maps a language code to the locale tag used for the clip footer.
 *
 * @param lang - Language code; `fa`, `ar` and `he` are recognised.
 * @returns `fa-IR`, `ar` or `he-IL` for the recognised codes, otherwise `fr-FR`.
 */
export function clipFooterLocaleTag(lang?: string): string {
  if (lang === 'fa') return 'fa-IR'
  if (lang === 'ar') return 'ar'
  if (lang === 'he') return 'he-IL'
  return 'fr-FR'
}

/**
 * Builds the "source" footer line for a clipped article.
 *
 * The label is Persian for `fa*` tags, Arabic for `ar*` tags and French for
 * every other tag (including `he*`). The date is formatted with
 * `toLocaleDateString` using `localeTag`.
 *
 * @param domain - Source domain shown in the label.
 * @param date - Date shown in the label.
 * @param localeTag - Locale tag driving label language and date format; defaults to `fr-FR`.
 * @returns The sanitized label surrounded by newlines.
 */
export function buildClipSourceFooter(domain: string, date: Date, localeTag = 'fr-FR'): string {
  const formatted = date.toLocaleDateString(localeTag, {
    day: 'numeric',
    month: 'long',
    year: 'numeric',
  })
  const isRtl =
    localeTag.startsWith('fa') || localeTag.startsWith('ar') || localeTag.startsWith('he')
  const label = localeTag.startsWith('fa')
    ? `برگرفته از ${domain} — ${formatted}`
    : localeTag.startsWith('ar')
      ? `مقتبس من ${domain} — ${formatted}`
      : `Extrait de ${domain} le ${formatted}`
  // NOTE(review): dirAttr is computed but not interpolated into the returned
  // string as visible here — confirm whether a wrapping element carrying this
  // attribute was intended.
  const dirAttr = isRtl ? ' dir="rtl"' : ''
  return `
${DOMPurify.sanitize(label)}
`
}