Files
Momento/memento-note/lib/clip/extract-article.ts
Antigravity e881004c77
Some checks failed
CI / Lint, Test & Build (push) Failing after 1m7s
CI / Deploy production (on server) (push) Has been skipped
feat(insights): fix DBSCAN, Persian embeddings crash, D3 physics layouts, and D3 node not found runtime error
2026-05-24 18:57:33 +00:00

79 lines
2.6 KiB
TypeScript

import { Readability } from '@mozilla/readability'
import { JSDOM } from 'jsdom'
import DOMPurify from 'isomorphic-dompurify'
import {
applyRtlToHtmlBlocks,
readPageLocaleFromHtml,
resolveClipLocale,
wrapClipArticleHtml,
type ClipLocaleHint,
} from '@/lib/clip/rtl-content'
/**
 * Result of a successful article extraction, as returned by
 * `extractArticleFromHtml` in this module.
 */
export interface ExtractedArticle {
/** Article title reported by Readability, trimmed ('' when absent). */
title: string
/**
 * Article HTML after DOMPurify sanitization, then `applyRtlToHtmlBlocks`
 * and `wrapClipArticleHtml` applied with the merged locale.
 */
content: string
/** Plain-text body reported by Readability, trimmed ('' when absent). */
textContent: string
/** Excerpt reported by Readability, trimmed ('' when absent). */
excerpt: string
/** Direction/language hint merged from Readability, the page HTML and `resolveClipLocale`. */
locale: ClipLocaleHint
}
/**
 * Parses a raw HTML page with Readability and returns the clipped article
 * with sanitized markup and a merged direction/language hint.
 *
 * The locale hint combines three sources: the `dir`/`lang` reported by
 * Readability, the result of `readPageLocaleFromHtml(html)`, and the result
 * of `resolveClipLocale(pageUrl, title, textContent)`. The direction is
 * 'rtl' as soon as any one of them says so. For the language, a Readability
 * primary subtag of 'fa', 'ar' or 'he' takes precedence, then the
 * `resolveClipLocale` language, then the page language.
 *
 * @param html - Full HTML source of the page.
 * @param pageUrl - URL of the page; passed to JSDOM as the document URL and
 *   to `resolveClipLocale`.
 * @returns The extracted article, or `null` when Readability's `parse()`
 *   yields nothing.
 */
export function extractArticleFromHtml(html: string, pageUrl: string): ExtractedArticle | null {
  const { document } = new JSDOM(html, { url: pageUrl }).window
  const parsed = new Readability(document).parse()
  if (!parsed) {
    return null
  }

  const rawTitle = parsed.title || ''
  const rawText = parsed.textContent || ''

  const fromPage = readPageLocaleFromHtml(html)

  // Signals reported by Readability itself.
  const readabilitySaysRtl = parsed.dir?.toLowerCase() === 'rtl'
  const primaryTag = parsed.lang?.split('-')[0]?.toLowerCase()
  // Only an RTL language reported by Readability is allowed to win outright.
  const rtlLangFromReadability =
    primaryTag === 'fa' || primaryTag === 'ar' || primaryTag === 'he' ? primaryTag : undefined

  const fromText = resolveClipLocale(pageUrl, rawTitle, rawText)

  const anyRtl =
    readabilitySaysRtl || fromPage.direction === 'rtl' || fromText.direction === 'rtl'

  const mergedLocale: ClipLocaleHint = {
    direction: anyRtl ? 'rtl' : 'ltr',
    lang: rtlLangFromReadability || fromText.lang || fromPage.lang,
  }

  // Sanitize first, then apply direction handling and the outer wrapper.
  const content = wrapClipArticleHtml(
    applyRtlToHtmlBlocks(DOMPurify.sanitize(parsed.content || ''), mergedLocale),
    mergedLocale,
  )

  return {
    title: rawTitle.trim(),
    content,
    textContent: rawText.trim(),
    excerpt: (parsed.excerpt || '').trim(),
    locale: mergedLocale,
  }
}
/**
 * Maps a clip language code to the locale tag used for the source footer.
 *
 * @param lang - Language code; 'fa', 'ar' and 'he' are recognized.
 * @returns 'fa-IR', 'ar' or 'he-IL' for the recognized codes, and 'fr-FR'
 *   for anything else (including `undefined`).
 */
export function clipFooterLocaleTag(lang?: string): string {
  switch (lang) {
    case 'fa':
      return 'fa-IR'
    case 'ar':
      return 'ar'
    case 'he':
      return 'he-IL'
    default:
      return 'fr-FR'
  }
}
/**
 * Builds the HTML footer that credits the source of a clipped article:
 * a horizontal rule followed by a small "taken from <domain> on <date>" line.
 *
 * The label language is chosen from the prefix of `localeTag`: Persian for
 * 'fa…', Arabic for 'ar…', Hebrew for 'he…', French otherwise. The paragraph
 * gets `dir="rtl"` for the three right-to-left languages. The label text is
 * passed through `DOMPurify.sanitize` before being embedded.
 *
 * @param domain - Source domain shown in the label.
 * @param date - Clip date; formatted with `toLocaleDateString(localeTag, …)`
 *   as numeric day, long month, numeric year.
 * @param localeTag - Locale tag for date formatting and label language.
 *   Defaults to 'fr-FR'.
 * @returns The footer HTML string.
 */
export function buildClipSourceFooter(domain: string, date: Date, localeTag = 'fr-FR'): string {
  const formatted = date.toLocaleDateString(localeTag, { day: 'numeric', month: 'long', year: 'numeric' })

  let label: string
  let isRtl = true
  if (localeTag.startsWith('fa')) {
    // A connector word keeps the domain from running straight into the date.
    label = `برگرفته از ${domain} در ${formatted}`
  } else if (localeTag.startsWith('ar')) {
    label = `مقتبس من ${domain} بتاريخ ${formatted}`
  } else if (localeTag.startsWith('he')) {
    // Hebrew gets its own label so the text matches the RTL paragraph and
    // the Hebrew-formatted date instead of falling back to French.
    label = `מתוך ${domain}, ${formatted}`
  } else {
    label = `Extrait de ${domain} le ${formatted}`
    isRtl = false
  }

  const dirAttr = isRtl ? ' dir="rtl"' : ''
  return `<hr/><p${dirAttr}><small>${DOMPurify.sanitize(label)}</small></p>`
}