/**
 * Web Scrape Tool
 *
 * Uses Jina Reader API (r.jina.ai) to scrape a URL into markdown.
 * Falls back to basic fetch on error.
 * Supports RSS/Atom feeds: parses the feed and scrapes top articles.
 */

import { tool } from 'ai'
import { z } from 'zod'
import { toolRegistry } from './registry'
import { rssService } from '../services/rss.service'

// Per-article character cap when assembling multi-article feed output.
const MAX_ARTICLE_CONTENT = 4000
// Overall character budget for any returned content payload.
const MAX_TOTAL_CONTENT = 15000
// Only the newest N entries of a feed are scraped.
const MAX_ARTICLES_FROM_FEED = 5

async function scrapeSingleUrl(url: string, jinaKey?: string): Promise<{ content: string; url: string }> {
|
|
const headers: Record<string, string> = { 'Accept': 'text/markdown' }
|
|
if (jinaKey) {
|
|
headers['Authorization'] = `Bearer ${jinaKey}`
|
|
}
|
|
|
|
const response = await fetch(`https://r.jina.ai/${url}`, { headers })
|
|
|
|
if (!response.ok) {
|
|
const fallback = await fetch(url)
|
|
if (!fallback.ok) return { content: `Failed to fetch ${url}: ${fallback.status}`, url }
|
|
const text = await fallback.text()
|
|
return { content: text.substring(0, 10000), url }
|
|
}
|
|
|
|
const markdown = await response.text()
|
|
return { content: markdown.substring(0, MAX_TOTAL_CONTENT), url }
|
|
}
|
|
|
|
toolRegistry.register({
|
|
name: 'web_scrape',
|
|
description: 'Scrape a web page and return its content as markdown. Supports RSS/Atom feeds — will automatically parse feeds and scrape individual articles.',
|
|
isInternal: false,
|
|
buildTool: (ctx) =>
|
|
tool({
|
|
description: 'Scrape a web page URL and return its content as clean markdown text. If the URL is an RSS/Atom feed, it will parse the feed and scrape the latest articles automatically.',
|
|
inputSchema: z.object({
|
|
url: z.string().describe('The URL to scrape. Can be a regular web page or an RSS/Atom feed URL.'),
|
|
}),
|
|
execute: async ({ url }) => {
|
|
try {
|
|
// Try RSS feed detection first
|
|
if (rssService.isFeedUrl(url)) {
|
|
const feed = await rssService.parseFeed(url)
|
|
if (feed && feed.articles.length > 0) {
|
|
const jinaKey = ctx.config.JINA_API_KEY
|
|
const articlesToScrape = feed.articles.slice(0, MAX_ARTICLES_FROM_FEED)
|
|
|
|
const results = await Promise.allSettled(
|
|
articlesToScrape.map(article => scrapeSingleUrl(article.link, jinaKey))
|
|
)
|
|
|
|
const parts: string[] = []
|
|
parts.push(`# ${feed.title}\n_Flux RSS: ${url} — ${feed.articles.length} articles disponibles, ${articlesToScrape.length} scrapés_\n`)
|
|
|
|
let totalLen = 0
|
|
for (let i = 0; i < results.length; i++) {
|
|
const r = results[i]
|
|
if (r.status === 'fulfilled' && r.value.content) {
|
|
const article = articlesToScrape[i]
|
|
const header = `\n---\n\n## ${article.title}\n_Source: ${article.link}_${article.pubDate ? ` — ${new Date(article.pubDate).toISOString().split('T')[0]}` : ''}\n\n`
|
|
const content = r.value.content.substring(0, MAX_ARTICLE_CONTENT)
|
|
if (totalLen + header.length + content.length > MAX_TOTAL_CONTENT) break
|
|
parts.push(header + content)
|
|
totalLen += header.length + content.length
|
|
}
|
|
}
|
|
|
|
return { content: parts.join(''), url, feedTitle: feed.title, articlesScraped: articlesToScrape.length }
|
|
}
|
|
// If feed parsing failed, fall through to normal scraping
|
|
}
|
|
|
|
// Normal web page scraping
|
|
const result = await scrapeSingleUrl(url, ctx.config.JINA_API_KEY)
|
|
return result
|
|
} catch (e: any) {
|
|
return { error: `Scrape failed: ${e.message}` }
|
|
}
|
|
},
|
|
}),
|
|
})
|