feat: 添加 web_extract 网页内容提取工具

- 使用 Tavily Extract API 从 URL 提取网页内容
- 支持批量提取（最多 20 个 URL）
- 支持 basic/advanced 提取深度
- 支持 markdown/text 输出格式
- 可选提取图片列表
- 复用 WebPermissionChecker 进行权限管控
2025-12-11 10:09:19 +08:00
parent 3c922fe16c
commit 43cfe483fb
4 changed files with 182 additions and 2 deletions
@@ -0,0 +1,19 @@
+从指定的网页 URL 提取内容。使用 Tavily Extract API 智能解析网页，返回结构化的文本内容。
+
+适用场景：
+- 获取网页文章的完整内容
+- 提取文档页面的详细信息
+- 抓取多个页面进行对比分析
+- 获取网页中的图片列表
+- 深度提取包含表格、嵌入内容的页面
+
+参数说明：
+- urls: URL 列表（必填，最多 20 个，超出部分会被忽略；也可传单个 URL 字符串）
+- extract_depth: "basic" 快速提取 / "advanced" 深度提取（含表格等），默认 basic
+- format: "markdown" / "text" 输出格式，默认 markdown
+- include_images: 是否包含图片列表，默认 false
+
+返回内容：
+- 每个 URL 的提取内容（单个页面超过 5000 字符时会被截断）
+- 图片列表（如果启用，每个页面最多列出 10 张）
+- 失败的 URL 及错误信息
@@ -24,7 +24,7 @@ import {
 } from './filesystem/index.js';

 // Web 工具
-import { webSearchTool } from './web/index.js';
+import { webSearchTool, webExtractTool } from './web/index.js';

 // 所有工具列表（用于注册）
 const allToolsWithMetadata: ToolWithMetadata[] = [
@@ -47,8 +47,9 @@ const allToolsWithMetadata: ToolWithMetadata[] = [
  copyFileTool,
  deleteFileTool,

-  // Web 工具 (deferLoading: true)
+  // Web 工具 (deferLoading: false)
  webSearchTool,
+  webExtractTool,
 ];

 // 注册所有工具到 registry
@@ -1 +1,2 @@
 export { webSearchTool } from './web_search.js';
+export { webExtractTool } from './web_extract.js';
@@ -0,0 +1,159 @@
+import { tavily } from '@tavily/core';
+import type { ToolResult } from '../../types/index.js';
+import type { ToolWithMetadata } from '../types.js';
+import { loadDescription } from '../load_description.js';
+import { getConfig } from '../../utils/config.js';
+import { getPermissionManager } from '../../permission/index.js';
+
+/** Maximum number of URLs forwarded to a single extract call; extra entries are dropped. */
+const MAX_EXTRACT_URLS = 20;
+/** Maximum number of characters of page content kept per URL in the formatted output. */
+const MAX_CONTENT_LENGTH = 5000;
+/** Maximum number of image URLs listed per extracted page. */
+const MAX_LISTED_IMAGES = 10;
+
+/**
+ * Normalizes the raw `urls` tool parameter into a list of URL strings.
+ *
+ * Accepts either a single string or an array. Non-string and blank entries are
+ * discarded, surrounding whitespace is trimmed, and the result is capped at
+ * `MAX_EXTRACT_URLS`. Any other input (undefined, null, number, object) yields
+ * an empty list so the caller can report a parameter error instead of throwing.
+ */
+function normalizeUrls(raw: unknown): string[] {
+  const candidates: unknown[] = typeof raw === 'string' ? [raw] : Array.isArray(raw) ? raw : [];
+  return candidates
+    .filter((item): item is string => typeof item === 'string')
+    .map((item) => item.trim())
+    .filter((item) => item.length > 0)
+    .slice(0, MAX_EXTRACT_URLS);
+}
+
+/**
+ * `web_extract` tool: extracts page content for one or more URLs through the
+ * Tavily SDK and renders the outcome as a Markdown report.
+ *
+ * `execute` always resolves to a `ToolResult`:
+ * - `success: false` when `urls` is missing/empty, when the web permission
+ *   check does not allow the call, when no Tavily API key is configured, or
+ *   when the SDK call throws;
+ * - `success: true` with the formatted report otherwise (the report itself may
+ *   list per-URL failures returned by the API).
+ */
+export const webExtractTool: ToolWithMetadata = {
+  name: 'web_extract',
+  description: loadDescription('web_extract'),
+  metadata: {
+    name: 'web_extract',
+    category: 'web',
+    description: '从网页URL提取内容',
+    keywords: ['extract', 'url', 'web', 'content', 'scrape', 'fetch', '提取', '网页', '抓取', '内容'],
+    deferLoading: false,
+  },
+  parameters: {
+    urls: {
+      type: 'array',
+      description: '要提取内容的 URL 列表（最多 20 个）',
+      required: true,
+    },
+    extract_depth: {
+      type: 'string',
+      description: '提取深度: "basic" 快速提取，"advanced" 深度提取（包含表格、嵌入内容等，默认 basic）',
+      required: false,
+    },
+    format: {
+      type: 'string',
+      description: '输出格式: "markdown" 或 "text"（默认 markdown）',
+      required: false,
+    },
+    include_images: {
+      type: 'boolean',
+      description: '是否包含提取的图片列表（默认 false）',
+      required: false,
+    },
+  },
+  execute: async (params: Record<string, unknown>): Promise<ToolResult> => {
+    // Validate `urls` up front: the parameter arrives untyped, so a missing or
+    // malformed value must become an error result rather than a thrown TypeError.
+    const urls = normalizeUrls(params.urls);
+    if (urls.length === 0) {
+      return {
+        success: false,
+        output: '',
+        error: '缺少有效的 urls 参数：请提供至少一个 URL（字符串或字符串数组）。',
+      };
+    }
+
+    // Unrecognised values fall back to the documented defaults instead of being
+    // forwarded to the API unchecked.
+    const extractDepth: 'basic' | 'advanced' = params.extract_depth === 'advanced' ? 'advanced' : 'basic';
+    const format: 'markdown' | 'text' = params.format === 'text' ? 'text' : 'markdown';
+    const includeImages = Boolean(params.include_images);
+
+    // Permission check (goes through the shared web permission entry point).
+    const permissionManager = getPermissionManager();
+    const permResult = await permissionManager.checkWebPermission({
+      query: `extract: ${urls.join(', ')}`,
+      searchDepth: extractDepth,
+    });
+
+    if (!permResult.allowed) {
+      if (permResult.needsConfirmation) {
+        return {
+          success: false,
+          output: '',
+          error: `需要用户确认网页提取: ${urls.join(', ')}\n原因: ${permResult.reason || '需要权限确认'}`,
+        };
+      }
+
+      return {
+        success: false,
+        output: '',
+        error: `网页提取权限被拒绝: ${permResult.reason || '提取不被允许'}`,
+      };
+    }
+
+    // Resolve the Tavily API key: the environment variable wins over the config file.
+    const config = getConfig();
+    const apiKey = process.env.TAVILY_API_KEY || config.tavilyApiKey;
+
+    if (!apiKey) {
+      return {
+        success: false,
+        output: '',
+        error: '未配置 Tavily API Key。请设置环境变量 TAVILY_API_KEY 或在配置文件中添加 tavilyApiKey。',
+      };
+    }
+
+    try {
+      // Call the Tavily SDK.
+      // NOTE(review): `format` is not forwarded to the API; "text" output is
+      // approximated below by stripping a few Markdown characters. If the
+      // installed SDK version supports a `format` option, consider passing it
+      // through instead — confirm against the SDK docs.
+      const client = tavily({ apiKey });
+      const response = await client.extract(urls, {
+        extractDepth,
+        includeImages,
+      });
+
+      // Build the Markdown report.
+      let output = `## 网页内容提取\n\n`;
+
+      // Successfully extracted pages.
+      if (response.results && response.results.length > 0) {
+        output += `### 提取成功 (${response.results.length} 个)\n\n`;
+
+        for (const result of response.results) {
+          output += `---\n`;
+          output += `**URL:** ${result.url}\n\n`;
+
+          let content = result.rawContent || '';
+          if (format === 'text' && content) {
+            // Crude Markdown removal: drops heading, emphasis and code markers only.
+            content = content.replace(/[#*`]/g, '');
+          }
+          // Truncate long pages so a single URL cannot dominate the tool output.
+          if (content.length > MAX_CONTENT_LENGTH) {
+            content = content.substring(0, MAX_CONTENT_LENGTH) + '\n\n... (内容已截断)';
+          }
+          output += `${content}\n\n`;
+
+          // Image list: show the first few and summarise the remainder.
+          if (includeImages && result.images && result.images.length > 0) {
+            output += `**图片 (${result.images.length} 张):**\n`;
+            for (const img of result.images.slice(0, MAX_LISTED_IMAGES)) {
+              output += `- ${img}\n`;
+            }
+            if (result.images.length > MAX_LISTED_IMAGES) {
+              output += `- ... 还有 ${result.images.length - MAX_LISTED_IMAGES} 张图片\n`;
+            }
+            output += '\n';
+          }
+        }
+      }
+
+      // URLs the API reported as failed.
+      if (response.failedResults && response.failedResults.length > 0) {
+        output += `### 提取失败 (${response.failedResults.length} 个)\n\n`;
+        for (const failed of response.failedResults) {
+          output += `- ${failed.url}: ${failed.error || '未知错误'}\n`;
+        }
+        output += '\n';
+      }
+
+      // Response time as reported by the API (omitted when absent or zero).
+      if (response.responseTime) {
+        output += `_提取耗时: ${response.responseTime.toFixed(2)} 秒_\n`;
+      }
+
+      return {
+        success: true,
+        output,
+      };
+    } catch (error) {
+      // SDK/network failures are reported as a failed ToolResult, never rethrown.
+      return {
+        success: false,
+        output: '',
+        error: `提取失败: ${error instanceof Error ? error.message : String(error)}`,
+      };
+    }
+  },
+};