/**
 * Web Scrape Tool
 * Uses Jina Reader API (r.jina.ai) to scrape a URL into markdown.
 * Falls back to basic fetch on error.
 * Supports RSS/Atom feeds: parses the feed and scrapes top articles.
 */

import { tool } from 'ai'
import { z } from 'zod'
import { toolRegistry } from './registry'
import { rssService } from '../services/rss.service'

/** Maximum number of characters kept per article when building a feed digest. */
const MAX_ARTICLE_CONTENT = 4000
/** Character budget for a single scraped page, and for the article sections of a feed digest. */
const MAX_TOTAL_CONTENT = 15000
/** Maximum number of feed entries that are scraped. */
const MAX_ARTICLES_FROM_FEED = 5
/** Maximum number of characters kept when the raw (non-Jina) fallback fetch is used. */
const MAX_FALLBACK_CONTENT = 10000

/**
 * Scrape one URL through the Jina Reader proxy, falling back to a direct fetch.
 *
 * The fallback is used both when Jina answers with a non-OK status and when the
 * request to Jina itself rejects (e.g. a network failure).
 *
 * @param url - Page to scrape.
 * @param jinaKey - Optional Jina API key, sent as a Bearer token when present.
 * @returns The (truncated) page content together with the requested URL. If the
 *   fallback fetch answers with a non-OK status, `content` holds a
 *   "Failed to fetch" message instead of page text.
 * @throws Whatever the fallback `fetch` (or reading a response body) throws.
 */
async function scrapeSingleUrl(url: string, jinaKey?: string): Promise<{ content: string; url: string }> {
  const headers: Record<string, string> = { 'Accept': 'text/markdown' }
  if (jinaKey) {
    headers['Authorization'] = `Bearer ${jinaKey}`
  }

  // fetch() rejects on network-level failures; treat that like a non-OK
  // response so the direct-fetch fallback still gets a chance.
  let response: Response | undefined
  try {
    response = await fetch(`https://r.jina.ai/${url}`, { headers })
  } catch {
    response = undefined
  }

  if (response?.ok) {
    const markdown = await response.text()
    return { content: markdown.substring(0, MAX_TOTAL_CONTENT), url }
  }

  const fallback = await fetch(url)
  if (!fallback.ok) {
    return { content: `Failed to fetch ${url}: ${fallback.status}`, url }
  }
  const text = await fallback.text()
  return { content: text.substring(0, MAX_FALLBACK_CONTENT), url }
}

/**
 * Build the " — YYYY-MM-DD" suffix shown after an article's source link.
 *
 * @param pubDate - Publication date as provided by the feed parser.
 * @returns The suffix, or an empty string when the date is missing or cannot
 *   be parsed (toISOString() would throw a RangeError on an invalid Date).
 */
function formatPubDateSuffix(pubDate: string | number | Date | null | undefined): string {
  if (!pubDate) return ''
  const parsed = new Date(pubDate)
  if (Number.isNaN(parsed.getTime())) return ''
  return ` — ${parsed.toISOString().split('T')[0]}`
}

toolRegistry.register({
  name: 'web_scrape',
  description: 'Scrape a web page and return its content as markdown. Supports RSS/Atom feeds — will automatically parse feeds and scrape individual articles.',
  isInternal: false,
  buildTool: (ctx) => tool({
    description: 'Scrape a web page URL and return its content as clean markdown text. If the URL is an RSS/Atom feed, it will parse the feed and scrape the latest articles automatically.',
    inputSchema: z.object({
      url: z.string().describe('The URL to scrape. Can be a regular web page or an RSS/Atom feed URL.'),
    }),
    execute: async ({ url }) => {
      try {
        // Try RSS feed detection first
        if (rssService.isFeedUrl(url)) {
          const feed = await rssService.parseFeed(url)
          if (feed && feed.articles.length > 0) {
            const jinaKey = ctx.config.JINA_API_KEY
            const articlesToScrape = feed.articles.slice(0, MAX_ARTICLES_FROM_FEED)

            // Scrape in parallel; keep each article paired with its own result
            // so no positional lookup is needed afterwards.
            const settled = await Promise.allSettled(
              articlesToScrape.map(async (article) => ({
                article,
                scraped: await scrapeSingleUrl(article.link, jinaKey),
              })),
            )

            const sections: string[] = []
            let totalLen = 0
            for (const outcome of settled) {
              // Articles whose scrape rejected or came back empty are left out.
              if (outcome.status !== 'fulfilled' || !outcome.value.scraped.content) continue

              const { article, scraped } = outcome.value
              const header = `\n---\n\n## ${article.title}\n_Source: ${article.link}_${formatPubDateSuffix(article.pubDate)}\n\n`
              const content = scraped.content.substring(0, MAX_ARTICLE_CONTENT)

              // Stop at the first article that would exceed the overall budget.
              if (totalLen + header.length + content.length > MAX_TOTAL_CONTENT) break
              sections.push(header + content)
              totalLen += header.length + content.length
            }

            // Report the number of articles actually included, not merely attempted.
            const intro = `# ${feed.title}\n_Flux RSS: ${url} — ${feed.articles.length} articles disponibles, ${sections.length} scrapés_\n`
            return {
              content: intro + sections.join(''),
              url,
              feedTitle: feed.title,
              articlesScraped: sections.length,
            }
          }
          // If feed parsing failed, fall through to normal scraping
        }

        // Normal web page scraping
        return await scrapeSingleUrl(url, ctx.config.JINA_API_KEY)
      } catch (e: unknown) {
        const message = e instanceof Error ? e.message : String(e)
        return { error: `Scrape failed: ${message}` }
      }
    },
  }),
})