/** RTL detection and HTML wrapping for clipped content (Persian, Arabic, Hebrew). */
|
|
|
|
/**
 * Matches one character from the right-to-left Unicode ranges handled by this module:
 * Hebrew (U+0590–05FF), Arabic (U+0600–06FF), Syriac (U+0700–074F),
 * Arabic Supplement (U+0750–077F), Arabic Extended-A (U+08A0–08FF),
 * Arabic Presentation Forms-A (U+FB50–FDFF) and Forms-B (U+FE70–FEFF).
 * Deliberately has no `g` flag so repeated `.test()` calls carry no `lastIndex` state.
 */
const RTL_CHAR = /[\u0590-\u05FF\u0600-\u06FF\u0700-\u074F\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]/

/** Matches one ASCII letter or digit; these are the only characters counted as left-to-right. */
const LTR_CHAR = /[A-Za-z0-9]/
|
|
|
|
/** Text direction of clipped content: right-to-left or left-to-right. */
export type ClipTextDirection = 'rtl' | 'ltr'
|
|
|
|
/** Direction and optional language tag to apply to a piece of clipped content. */
export interface ClipLocaleHint {
  /** Direction the content should be rendered in. */
  direction: ClipTextDirection
  /** Language code (`fa`, `ar` or `he`); left out when the language is unknown. */
  lang?: 'fa' | 'ar' | 'he'
}
|
|
|
|
/**
 * Guesses the content language from markers in a URL.
 *
 * The URL is lower-cased and tested for Persian, then Arabic, then Hebrew markers:
 * a `/<name>` or `/<code>` path fragment ending on a word boundary (so `/fa/` and
 * `/fa-ir` match but `/farm` does not), or a `lang=<code>` substring. Persian is
 * additionally recognised by `bbc.com/persian`.
 *
 * @param url - URL of the clipped page.
 * @returns `'fa'`, `'ar'` or `'he'` for the first language whose markers match,
 *   otherwise `undefined`.
 */
export function inferLangFromUrl(url: string): ClipLocaleHint['lang'] | undefined {
  const lower = url.toLowerCase()
  if (/\/persian\b|\/fa\b|lang=fa|bbc\.com\/persian/.test(lower)) return 'fa'
  if (/\/arabic\b|\/ar\b|lang=ar/.test(lower)) return 'ar'
  if (/\/hebrew\b|\/he\b|lang=he/.test(lower)) return 'he'
  return undefined
}
|
|
|
|
/**
 * Classifies text as right-to-left or left-to-right by counting characters.
 *
 * All whitespace is removed and only the first 4000 remaining UTF-16 code units are
 * examined. Each character is counted as RTL if it matches `RTL_CHAR`, otherwise as LTR
 * if it matches `LTR_CHAR`; anything else (punctuation, symbols, other scripts) is ignored.
 *
 * @param text - Text to inspect.
 * @returns `'ltr'` for empty/whitespace-only text or text with no RTL character;
 *   otherwise `'rtl'` when RTL characters are at least as numerous as LTR ones.
 */
export function detectTextDirection(text: string): ClipTextDirection {
  const sample = text.replace(/\s+/g, '').slice(0, 4000)
  if (!sample) return 'ltr'

  let rtl = 0
  let ltr = 0
  for (const ch of sample) {
    if (RTL_CHAR.test(ch)) rtl++
    else if (LTR_CHAR.test(ch)) ltr++
  }

  if (rtl === 0) return 'ltr'
  // A tie is resolved in favour of RTL.
  return rtl >= ltr ? 'rtl' : 'ltr'
}
|
|
|
|
/**
 * Direction to use for a note title (avoids `dir="auto"`, which breaks Persian digits).
 *
 * @param title - Title text, inspected only when the URL gives no language hint.
 * @param sourceUrl - URL the note was clipped from, if any.
 * @returns `'rtl'` when the URL identifies one of the supported languages; otherwise
 *   the direction detected from the title's characters.
 */
export function resolveTitleDirection(title: string, sourceUrl?: string | null): ClipTextDirection {
  // A language recognised in the URL takes precedence over the title's own characters.
  if (sourceUrl && inferLangFromUrl(sourceUrl)) return 'rtl'
  return detectTextDirection(title)
}
|
|
|
|
/**
 * Language tag to use for a note title.
 *
 * @param title - Title text.
 * @param sourceUrl - URL the note was clipped from, if any.
 * @returns The language inferred from the URL when there is one. Otherwise `undefined`
 *   unless the title is detected as RTL, in which case the language is guessed from the
 *   title's characters via `resolveClipLocale`.
 */
export function resolveTitleLang(
  title: string,
  sourceUrl?: string | null,
): ClipLocaleHint['lang'] | undefined {
  const urlLang = sourceUrl ? inferLangFromUrl(sourceUrl) : undefined
  if (urlLang) return urlLang
  if (detectTextDirection(title) !== 'rtl') return undefined
  return resolveClipLocale(sourceUrl ?? '', title).lang
}
|
|
|
|
/**
 * Builds the direction/language hint for a clip from its URL and text fragments.
 *
 * Direction always comes from the combined text. Language comes from the URL when it
 * carries a marker; otherwise, for RTL text only, it is guessed from the characters present.
 *
 * @param url - URL of the clipped page (may be empty).
 * @param texts - Text fragments to analyse; empty strings are skipped.
 * @returns The detected direction, plus `lang` when one could be determined.
 */
export function resolveClipLocale(url: string, ...texts: string[]): ClipLocaleHint {
  const combined = texts.filter(Boolean).join('\n')
  const direction = detectTextDirection(combined)
  const urlLang = inferLangFromUrl(url)
  let lang = urlLang

  if (!lang && direction === 'rtl') {
    // Persian-specific letters (gaf, Farsi yeh, keheh, peh, tcheh, jeh) or a zero-width non-joiner.
    if (/[\u06AF\u06CC\u06A9\u067E\u0686\u0698\u200C]/.test(combined)) lang = 'fa'
    // Any character from the Hebrew block.
    else if (/[\u0590-\u05FF]/.test(combined)) lang = 'he'
    // Remaining RTL text is treated as Arabic.
    else lang = 'ar'
  }

  return { direction, lang }
}
|
|
|
|
/**
 * Applies `dir`/`lang` to the block-level opening tags of extracted HTML
 * (Readability does not always preserve them).
 *
 * Affected tags: p, h1–h4, li, ul, ol, blockquote, figcaption. A tag that already has a
 * `dir` attribute is left untouched; a tag that already has `lang` only receives `dir`,
 * so no duplicate attribute is produced. Attribute detection is case-insensitive and
 * anchored on whitespace, so `data-dir="…"` does not count as `dir`.
 * Rewritten tag names are emitted in lower case.
 *
 * @param html - HTML fragment to annotate.
 * @param hint - Locale hint; nothing is changed unless `hint.direction` is `'rtl'`.
 * @returns The HTML with `dir="rtl"` (and `lang`, when known) added to matching tags.
 */
export function applyRtlToHtmlBlocks(html: string, hint: ClipLocaleHint): string {
  if (hint.direction !== 'rtl') return html

  const langAttr = hint.lang ? ` lang="${hint.lang}"` : ''
  const blockTags = ['p', 'h1', 'h2', 'h3', 'h4', 'li', 'ul', 'ol', 'blockquote', 'figcaption']
  // Single pass over the markup. Requiring whitespace or `>` right after the tag name
  // keeps `p` from matching `<pre>` and `li` from matching `<link>`.
  const openTag = new RegExp(`<(${blockTags.join('|')})(\\s[^>]*)?>`, 'gi')

  return html.replace(openTag, (match: string, tag: string, attrs: string = '') => {
    if (/(?:^|\s)dir\s*=/i.test(attrs)) return match
    const lang = /(?:^|\s)lang\s*=/i.test(attrs) ? '' : langAttr
    return `<${tag.toLowerCase()}${attrs} dir="rtl"${lang}>`
  })
}
|
|
|
|
/**
 * Wraps article HTML in an RTL container when the hint calls for it.
 *
 * @param innerHtml - Article markup, inserted as-is (not escaped).
 * @param hint - Locale hint for the article.
 * @returns `innerHtml` unchanged for non-RTL hints; otherwise the markup wrapped in
 *   `<div class="clip-article clip-article--rtl" dir="rtl">`, with `lang` added when known.
 */
export function wrapClipArticleHtml(innerHtml: string, hint: ClipLocaleHint): string {
  if (hint.direction !== 'rtl') return innerHtml
  const langAttr = hint.lang ? ` lang="${hint.lang}"` : ''
  return `<div class="clip-article clip-article--rtl" dir="rtl"${langAttr}>${innerHtml}</div>`
}
|
|
|
|
/**
 * Renders plain text as a single HTML paragraph carrying the hint's `dir`/`lang`.
 *
 * The text is HTML-escaped (`&`, `<`, `>`) so it is shown literally and cannot inject
 * markup. `&` is escaped first so the entities produced for `<` and `>` are not
 * escaped a second time.
 *
 * @param text - Plain text to wrap.
 * @param hint - Locale hint; `dir="rtl"` is added only for RTL, `lang` whenever it is set.
 * @returns A `<p>` element containing the escaped text.
 */
export function wrapClipPlainParagraph(text: string, hint: ClipLocaleHint): string {
  const escaped = text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
  const langAttr = hint.lang ? ` lang="${hint.lang}"` : ''
  const dirAttr = hint.direction === 'rtl' ? ' dir="rtl"' : ''
  return `<p${dirAttr}${langAttr}>${escaped}</p>`
}
|
|
|
|
/**
 * Reads the page-level direction and language declared on `<html>` or `<body>`.
 *
 * The `<html>` tag is consulted first, then `<body>`. Attribute values may be double-quoted,
 * single-quoted or unquoted, with optional whitespace around `=`. Only the primary
 * language subtag is used (`fa-IR` and `fa_IR` both give `fa`).
 *
 * @param html - Full page markup.
 * @returns `direction` is `'rtl'` only when an explicit `dir="rtl"` is found, otherwise
 *   `'ltr'`; `lang` is set only for `fa`, `ar` or `he`.
 */
export function readPageLocaleFromHtml(html: string): Pick<ClipLocaleHint, 'direction' | 'lang'> {
  // `\s` before the attribute name keeps `data-dir` / `xml:lang` from matching.
  const dirMatch =
    html.match(/<html[^>]*\sdir\s*=\s*["']?(rtl|ltr)(?![\w-])/i) ||
    html.match(/<body[^>]*\sdir\s*=\s*["']?(rtl|ltr)(?![\w-])/i)
  const langMatch =
    html.match(/<html[^>]*\slang\s*=\s*["']?([^"'\s>]+)/i) ||
    html.match(/<body[^>]*\slang\s*=\s*["']?([^"'\s>]+)/i)

  const direction: ClipTextDirection = dirMatch?.[1]?.toLowerCase() === 'rtl' ? 'rtl' : 'ltr'
  const rawLang = langMatch?.[1]?.split(/[-_]/)[0]?.toLowerCase()
  const lang = rawLang === 'fa' || rawLang === 'ar' || rawLang === 'he' ? rawLang : undefined
  return { direction, lang }
}
|