Files
Momento/memento-note/lib/clip/rtl-content.ts
Antigravity e881004c77
Some checks failed
CI / Lint, Test & Build (push) Failing after 1m7s
CI / Deploy production (on server) (push) Has been skipped
feat(insights): fix DBSCAN, Persian embeddings crash, D3 physics layouts, and D3 node not found runtime error
2026-05-24 18:57:33 +00:00

113 lines
4.1 KiB
TypeScript

/** RTL detection and HTML wrapping for clipped content (Persian, Arabic, Hebrew). */
// Matches a single character from the Hebrew (U+0590–05FF), Arabic (U+0600–06FF),
// Syriac (U+0700–074F), Arabic Supplement (U+0750–077F), Arabic Extended-A
// (U+08A0–08FF) or Arabic Presentation Forms A/B (U+FB50–FDFF, U+FE70–FEFF) ranges.
// No `g` flag, so `.test()` carries no `lastIndex` state between calls.
const RTL_CHAR = /[\u0590-\u05FF\u0600-\u06FF\u0700-\u074F\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]/
// Matches a single ASCII letter or digit; detectTextDirection counts these as
// left-to-right evidence.
const LTR_CHAR = /[A-Za-z0-9]/
/** Text direction assigned to clipped content. */
export type ClipTextDirection = 'rtl' | 'ltr'
/** Direction plus an optional language code describing a piece of clipped content. */
export interface ClipLocaleHint {
/** Direction the content should be rendered in. */
direction: ClipTextDirection
/** Language code, restricted to Persian ('fa'), Arabic ('ar') or Hebrew ('he'); absent when not determined. */
lang?: 'fa' | 'ar' | 'he'
}
/**
 * Guesses the content language from markers in a URL.
 *
 * The URL is lower-cased, then tested against the Persian, Arabic and Hebrew
 * patterns in that order (path segments such as `/fa`, `/arabic`, or a
 * `lang=xx` query fragment). The first pattern that matches wins.
 *
 * @param url - URL of the clipped page.
 * @returns `'fa'`, `'ar'` or `'he'`, or `undefined` when no marker is found.
 */
export function inferLangFromUrl(url: string): ClipLocaleHint['lang'] | undefined {
  const normalized = url.toLowerCase()
  // Order matters: earlier rules take precedence when several markers are present.
  const rules: ReadonlyArray<readonly [RegExp, NonNullable<ClipLocaleHint['lang']>]> = [
    [/\/persian\b|\/fa\b|lang=fa|bbc\.com\/persian/, 'fa'],
    [/\/arabic\b|\/ar\b|lang=ar/, 'ar'],
    [/\/hebrew\b|\/he\b|lang=he/, 'he'],
  ]
  const hit = rules.find(([pattern]) => pattern.test(normalized))
  return hit?.[1]
}
/**
 * Decides whether a text should be treated as right-to-left.
 *
 * Whitespace is removed and at most the first 4000 remaining characters are
 * inspected. Characters matching `RTL_CHAR` and characters matching `LTR_CHAR`
 * are counted; anything else (punctuation, other scripts) is ignored.
 *
 * @param text - Text to classify.
 * @returns `'rtl'` when at least one RTL character is present and RTL
 *   characters are at least as numerous as ASCII letters/digits, else `'ltr'`
 *   (including for empty or whitespace-only input).
 */
export function detectTextDirection(text: string): ClipTextDirection {
  const visible = text.replace(/\s+/g, '').slice(0, 4000)
  if (visible.length === 0) return 'ltr'

  let rtlCount = 0
  let latinCount = 0
  // Array.from walks the string by code point, like for...of.
  Array.from(visible).forEach((char) => {
    if (RTL_CHAR.test(char)) {
      rtlCount += 1
      return
    }
    if (LTR_CHAR.test(char)) latinCount += 1
  })

  // Ties go to RTL, but only if RTL characters were seen at all.
  return rtlCount > 0 && rtlCount >= latinCount ? 'rtl' : 'ltr'
}
/**
 * Direction of the note title (avoids dir="auto", which breaks Persian digits).
 *
 * A source URL that carries a recognised language marker forces `'rtl'`;
 * otherwise the direction is detected from the title text itself.
 *
 * @param title - Note title.
 * @param sourceUrl - URL the note was clipped from, if any.
 * @returns The direction to apply to the title.
 */
export function resolveTitleDirection(title: string, sourceUrl?: string | null): ClipTextDirection {
  const urlLang = sourceUrl ? inferLangFromUrl(sourceUrl) : undefined
  return urlLang ? 'rtl' : detectTextDirection(title)
}
/**
 * Resolves the language code to attach to a note title.
 *
 * A language inferred from the source URL takes precedence. Without one, a
 * language is derived from the title only when the title is detected as
 * right-to-left.
 *
 * @param title - Note title.
 * @param sourceUrl - URL the note was clipped from, if any.
 * @returns `'fa'`, `'ar'`, `'he'`, or `undefined` for titles not detected as RTL
 *   with no URL language marker.
 */
export function resolveTitleLang(
  title: string,
  sourceUrl?: string | null,
): ClipLocaleHint['lang'] | undefined {
  if (sourceUrl) {
    const fromUrl = inferLangFromUrl(sourceUrl)
    if (fromUrl) return fromUrl
  }
  const titleIsRtl = detectTextDirection(title) === 'rtl'
  return titleIsRtl ? resolveClipLocale(sourceUrl || '', title).lang : undefined
}
/**
 * Builds a locale hint (direction and language) for clipped content.
 *
 * The direction comes from the combined, non-empty texts. The language comes
 * from the URL when it has a marker; otherwise, for RTL content only, it is
 * guessed from the characters present.
 *
 * @param url - URL of the clipped page.
 * @param texts - Text fragments to analyse; empty strings are skipped.
 * @returns The detected direction and, when determinable, the language.
 */
export function resolveClipLocale(url: string, ...texts: string[]): ClipLocaleHint {
  const combined = texts.filter((fragment) => Boolean(fragment)).join('\n')
  const direction = detectTextDirection(combined)
  const fromUrl = inferLangFromUrl(url)

  // The URL wins; without it, a language is only guessed for RTL content.
  if (fromUrl || direction !== 'rtl') return { direction, lang: fromUrl }

  // Persian-specific letters (gaf, Farsi yeh, keheh, peh, tcheh, jeh) or ZWNJ.
  const hasPersianMarkers = /[\u06AF\u06CC\u06A9\u067E\u0686\u0698\u200C]/.test(combined)
  if (hasPersianMarkers) return { direction, lang: 'fa' }

  // Any Hebrew-block character means Hebrew; everything else falls back to Arabic.
  const hasHebrew = /[\u0590-\u05FF]/.test(combined)
  return { direction, lang: hasHebrew ? 'he' : 'ar' }
}
/**
 * Applies dir/lang to extracted HTML blocks (Readability does not always
 * preserve them).
 *
 * Opening tags of `p`, `h1`–`h4`, `li`, `ul`, `ol`, `blockquote` and
 * `figcaption` receive `dir="rtl"` plus `lang` when the hint has one. A tag
 * that already has a `dir` attribute is left untouched, and an existing `lang`
 * attribute is never duplicated. Rewritten tag names are emitted in lower case.
 *
 * @param html - HTML fragment to annotate.
 * @param hint - Locale hint; `html` is returned unchanged unless
 *   `hint.direction` is `'rtl'`.
 * @returns The HTML with direction/language attributes added to block tags.
 */
export function applyRtlToHtmlBlocks(html: string, hint: ClipLocaleHint): string {
  if (hint.direction !== 'rtl') return html
  const langAttr = hint.lang ? ` lang="${hint.lang}"` : ''
  const blockTags = ['p', 'h1', 'h2', 'h3', 'h4', 'li', 'ul', 'ol', 'blockquote', 'figcaption']
  // Single pass over every block tag. The name must be followed by whitespace
  // or `>`, so `<pre>` / `<link>` are not mistaken for `<p>` / `<li>`.
  const openingTag = new RegExp(`<(${blockTags.join('|')})(\\s[^>]*)?>`, 'gi')
  // Require whitespace before the attribute name so `data-dir=` / `hreflang=`
  // do not count, and ignore case so `DIR=` / `LANG=` do.
  const hasDirAttr = /\sdir\s*=/i
  const hasLangAttr = /\slang\s*=/i
  // A trailing `/` after whitespace or a closing quote marks a self-closing tag.
  const selfClosingTail = /["'\s]\/$/

  return html.replace(openingTag, (match: string, tag: string, attrs = '') => {
    if (hasDirAttr.test(attrs)) return match
    const selfClosing = selfClosingTail.test(attrs)
    // Keep the new attributes in front of the self-closing slash.
    const kept = selfClosing ? attrs.slice(0, -1).trimEnd() : attrs
    const lang = hasLangAttr.test(kept) ? '' : langAttr
    return `<${tag.toLowerCase()}${kept} dir="rtl"${lang}${selfClosing ? ' /' : ''}>`
  })
}
/**
 * Wraps clipped article HTML in an RTL container when the hint calls for it.
 *
 * @param innerHtml - Article HTML; inserted verbatim, not escaped.
 * @param hint - Locale hint for the article.
 * @returns `innerHtml` unchanged for non-RTL hints; otherwise a
 *   `div.clip-article.clip-article--rtl` with `dir="rtl"` (and `lang` when the
 *   hint has one) around it.
 */
export function wrapClipArticleHtml(innerHtml: string, hint: ClipLocaleHint): string {
  const { direction, lang } = hint
  if (direction === 'rtl') {
    const openingTag = lang
      ? `<div class="clip-article clip-article--rtl" dir="rtl" lang="${lang}">`
      : `<div class="clip-article clip-article--rtl" dir="rtl">`
    return openingTag + innerHtml + '</div>'
  }
  return innerHtml
}
/**
 * Turns plain text into a single HTML paragraph.
 *
 * `&`, `<` and `>` are escaped. `dir="rtl"` is added for RTL hints, and `lang`
 * is added whenever the hint carries a language, regardless of direction.
 *
 * @param text - Plain text to wrap.
 * @param hint - Locale hint for the text.
 * @returns A `<p>` element containing the escaped text.
 */
export function wrapClipPlainParagraph(text: string, hint: ClipLocaleHint): string {
  // One pass is equivalent to chained replaces: each source character is
  // escaped once and the produced entities are never re-scanned.
  const safeText = text.replace(/[&<>]/g, (ch) => (ch === '&' ? '&amp;' : ch === '<' ? '&lt;' : '&gt;'))

  let attributes = ''
  if (hint.direction === 'rtl') attributes += ' dir="rtl"'
  if (hint.lang) attributes += ` lang="${hint.lang}"`

  return `<p${attributes}>${safeText}</p>`
}
/**
 * Reads the declared direction and language from a page's markup.
 *
 * Quoted `dir` and `lang` attributes are looked up on the `<html>` tag first,
 * then on `<body>`. Only the primary subtag of `lang` (the part before the
 * first `-`) is considered, and only `fa`, `ar` and `he` are kept.
 *
 * @param html - Raw page HTML.
 * @returns `direction` (`'ltr'` unless `dir="rtl"` is declared) and `lang`
 *   (`undefined` when missing or not one of the supported codes).
 */
export function readPageLocaleFromHtml(html: string): Pick<ClipLocaleHint, 'direction' | 'lang'> {
  // First capture group of the first pattern that matches, if any.
  const firstCapture = (patterns: readonly RegExp[]): string | undefined => {
    for (const pattern of patterns) {
      const found = html.match(pattern)
      if (found) return found[1]
    }
    return undefined
  }

  const declaredDir = firstCapture([
    /<html[^>]*\sdir=["'](rtl|ltr)["']/i,
    /<body[^>]*\sdir=["'](rtl|ltr)["']/i,
  ])
  const declaredLang = firstCapture([
    /<html[^>]*\slang=["']([^"']+)["']/i,
    /<body[^>]*\slang=["']([^"']+)["']/i,
  ])

  const direction: ClipTextDirection = declaredDir?.toLowerCase() === 'rtl' ? 'rtl' : 'ltr'
  const primarySubtag = declaredLang?.split('-')[0]?.toLowerCase()

  switch (primarySubtag) {
    case 'fa':
    case 'ar':
    case 'he':
      return { direction, lang: primarySubtag }
    default:
      return { direction, lang: undefined }
  }
}