diff --git a/src/tools/descriptions/web_extract.txt b/src/tools/descriptions/web_extract.txt new file mode 100644 index 0000000..e56af1a --- /dev/null +++ b/src/tools/descriptions/web_extract.txt @@ -0,0 +1,19 @@ +从指定的网页 URL 提取内容。使用 Tavily Extract API 智能解析网页,返回结构化的文本内容。 + +适用场景: +- 获取网页文章的完整内容 +- 提取文档页面的详细信息 +- 抓取多个页面进行对比分析 +- 获取网页中的图片列表 +- 深度提取包含表格、嵌入内容的页面 + +参数说明: +- urls: URL 列表(必填,最多 20 个,也可传单个 URL 字符串) +- extract_depth: "basic" 快速提取 / "advanced" 深度提取(含表格等) +- format: "markdown" / "text" 输出格式,默认 markdown +- include_images: 是否包含图片列表,默认 false + +返回内容: +- 每个 URL 的提取内容 +- 图片列表(如果启用) +- 失败的 URL 及错误信息 diff --git a/src/tools/index.ts b/src/tools/index.ts index 0ee4e29..0a6f9f0 100644 --- a/src/tools/index.ts +++ b/src/tools/index.ts @@ -24,7 +24,7 @@ import { } from './filesystem/index.js'; // Web 工具 -import { webSearchTool } from './web/index.js'; +import { webSearchTool, webExtractTool } from './web/index.js'; // 所有工具列表(用于注册) const allToolsWithMetadata: ToolWithMetadata[] = [ @@ -47,8 +47,9 @@ const allToolsWithMetadata: ToolWithMetadata[] = [ copyFileTool, deleteFileTool, - // Web 工具 (deferLoading: true) + // Web 工具 (deferLoading: false) webSearchTool, + webExtractTool, ]; // 注册所有工具到 registry diff --git a/src/tools/web/index.ts b/src/tools/web/index.ts index 70771e6..cbf9346 100644 --- a/src/tools/web/index.ts +++ b/src/tools/web/index.ts @@ -1 +1,2 @@ export { webSearchTool } from './web_search.js'; +export { webExtractTool } from './web_extract.js'; diff --git a/src/tools/web/web_extract.ts b/src/tools/web/web_extract.ts new file mode 100644 index 0000000..d123c96 --- /dev/null +++ b/src/tools/web/web_extract.ts @@ -0,0 +1,159 @@ +import { tavily } from '@tavily/core'; +import type { ToolResult } from '../../types/index.js'; +import type { ToolWithMetadata } from '../types.js'; +import { loadDescription } from '../load_description.js'; +import { getConfig } from '../../utils/config.js'; +import { getPermissionManager } from '../../permission/index.js'; + +export 
const webExtractTool: ToolWithMetadata = {
  /*
   * `web_extract` tool: sends one or more URLs to the Tavily client's
   * `extract` call and renders the response as a Markdown report
   * (extracted content, optional image lists, failed URLs, elapsed time).
   */
  name: 'web_extract',
  description: loadDescription('web_extract'),
  metadata: {
    name: 'web_extract',
    category: 'web',
    description: '从网页URL提取内容',
    keywords: ['extract', 'url', 'web', 'content', 'scrape', 'fetch', '提取', '网页', '抓取', '内容'],
    deferLoading: false,
  },
  parameters: {
    urls: {
      type: 'array',
      description: '要提取内容的 URL 列表(最多 20 个)',
      required: true,
    },
    extract_depth: {
      type: 'string',
      description: '提取深度: "basic" 快速提取,"advanced" 深度提取(包含表格、嵌入内容等,默认 basic)',
      required: false,
    },
    format: {
      type: 'string',
      description: '输出格式: "markdown" 或 "text"(默认 markdown)',
      required: false,
    },
    include_images: {
      type: 'boolean',
      description: '是否包含提取的图片列表(默认 false)',
      required: false,
    },
  },
  /**
   * Runs the extraction and formats the result.
   *
   * @param params - Raw tool arguments: `urls` (string or string[]),
   *   `extract_depth`, `format`, `include_images`.
   * @returns A `ToolResult`. Invalid `urls`, a denied permission check, a
   *   missing API key and errors thrown by the Tavily call are all reported
   *   as `success: false` with an `error` message.
   */
  execute: async (params: Record<string, unknown>): Promise<ToolResult> => {
    const MAX_URLS = 20;
    const MAX_CONTENT_CHARS = 5000;
    const MAX_IMAGES_LISTED = 10;

    // Accept a single URL string or an array. Anything that is not a
    // non-empty string is discarded so that bad input produces an error
    // result instead of a TypeError thrown outside the try block below.
    const rawUrls: unknown = params.urls;
    const candidates: unknown[] =
      typeof rawUrls === 'string' ? [rawUrls] : Array.isArray(rawUrls) ? rawUrls : [];
    const validUrls = candidates
      .filter((item): item is string => typeof item === 'string')
      .map((item) => item.trim())
      .filter((item) => item.length > 0);

    if (validUrls.length === 0) {
      return {
        success: false,
        output: '',
        error: '参数 urls 无效: 请提供至少一个 URL(字符串或字符串数组)',
      };
    }

    // Only the first MAX_URLS entries are processed; the number dropped is
    // reported in the output so the truncation is not silent.
    const urls = validUrls.slice(0, MAX_URLS);
    const skippedCount = validUrls.length - urls.length;

    // Unrecognised option values fall back to the documented defaults
    // rather than being passed on unchecked.
    const extractDepth = params.extract_depth === 'advanced' ? 'advanced' : 'basic';
    const format = params.format === 'text' ? 'text' : 'markdown';
    // Strict comparison: a plain truthiness test would treat the string
    // "false" as enabled.
    const includeImages = params.include_images === true || params.include_images === 'true';

    // Permission check
    const permissionManager = getPermissionManager();
    const permResult = await permissionManager.checkWebPermission({
      query: `extract: ${urls.join(', ')}`,
      searchDepth: extractDepth,
    });

    if (!permResult.allowed) {
      if (permResult.needsConfirmation) {
        return {
          success: false,
          output: '',
          error: `需要用户确认网页提取: ${urls.join(', ')}\n原因: ${permResult.reason || '需要权限确认'}`,
        };
      }

      return {
        success: false,
        output: '',
        error: `网页提取权限被拒绝: ${permResult.reason || '提取不被允许'}`,
      };
    }

    // Resolve the Tavily API key; the environment variable takes precedence
    // over the config file value.
    const config = getConfig();
    const apiKey = process.env.TAVILY_API_KEY || config.tavilyApiKey;

    if (!apiKey) {
      return {
        success: false,
        output: '',
        error: '未配置 Tavily API Key。请设置环境变量 TAVILY_API_KEY 或在配置文件中添加 tavilyApiKey。',
      };
    }

    try {
      const client = tavily({ apiKey });
      const response = await client.extract(urls, {
        extractDepth,
        includeImages,
      });

      let output = `## 网页内容提取\n\n`;

      if (skippedCount > 0) {
        output += `> 注意: 最多支持 ${MAX_URLS} 个 URL,已忽略多余的 ${skippedCount} 个。\n\n`;
      }

      // Successfully extracted pages
      if (response.results && response.results.length > 0) {
        output += `### 提取成功 (${response.results.length} 个)\n\n`;

        for (const result of response.results) {
          output += `---\n`;
          output += `**URL:** ${result.url}\n\n`;

          let content = result.rawContent ?? '';
          if (format === 'text' && content) {
            // Crude Markdown removal: drops every '#', '*' and '`' character.
            // NOTE(review): the Tavily extract API appears to offer its own
            // `format` option; passing it through may be cleaner — confirm
            // against the installed SDK version.
            content = content.replace(/[#*`]/g, '');
          }
          // Cap each page so one long document cannot dominate the output.
          if (content.length > MAX_CONTENT_CHARS) {
            content = content.substring(0, MAX_CONTENT_CHARS) + '\n\n... (内容已截断)';
          }
          output += `${content}\n\n`;

          // Image list: at most MAX_IMAGES_LISTED entries, then a count of the rest.
          if (includeImages && result.images && result.images.length > 0) {
            output += `**图片 (${result.images.length} 张):**\n`;
            for (const img of result.images.slice(0, MAX_IMAGES_LISTED)) {
              output += `- ${img}\n`;
            }
            if (result.images.length > MAX_IMAGES_LISTED) {
              output += `- ... 还有 ${result.images.length - MAX_IMAGES_LISTED} 张图片\n`;
            }
            output += '\n';
          }
        }
      }

      // URLs the service could not extract
      if (response.failedResults && response.failedResults.length > 0) {
        output += `### 提取失败 (${response.failedResults.length} 个)\n\n`;
        for (const failed of response.failedResults) {
          output += `- ${failed.url}: ${failed.error || '未知错误'}\n`;
        }
        output += '\n';
      }

      // Elapsed time as reported in the response
      if (response.responseTime) {
        output += `_提取耗时: ${response.responseTime.toFixed(2)} 秒_\n`;
      }

      return {
        success: true,
        output,
      };
    } catch (error) {
      return {
        success: false,
        output: '',
        error: `提取失败: ${error instanceof Error ? error.message : String(error)}`,
      };
    }
  },
};