feat: 添加 web_extract 网页内容提取工具

- 使用 Tavily Extract API 从 URL 提取网页内容
- 支持批量提取（最多 20 个 URL）
- 支持 basic/advanced 提取深度
- 支持 markdown/text 输出格式
- 可选提取图片列表
- 复用 WebPermissionChecker 进行权限管控
2025-12-11 10:09:19 +08:00
parent 3c922fe16c
commit 43cfe483fb
4 changed files with 182 additions and 2 deletions
@@ -0,0 +1,19 @@
 从指定的网页 URL 提取内容。使用 Tavily Extract API 智能解析网页，返回结构化的文本内容。
 适用场景：
 - 获取网页文章的完整内容
 - 提取文档页面的详细信息
 - 抓取多个页面进行对比分析
 - 获取网页中的图片列表
 - 深度提取包含表格、嵌入内容的页面
 参数说明：
 - urls: URL 列表（必填，最多 20 个，也可传单个 URL 字符串）
 - extract_depth: "basic" 快速提取 / "advanced" 深度提取（含表格等），默认 basic
 - format: "markdown" / "text" 输出格式，默认 markdown
 - include_images: 是否包含图片列表，默认 false
 返回内容：
 - 每个 URL 的提取内容
 - 图片列表（如果启用）
 - 失败的 URL 及错误信息
@@ -24,7 +24,7 @@ import {
 } from './filesystem/index.js';
 // Web 工具
-import { webSearchTool } from './web/index.js';
+import { webSearchTool, webExtractTool } from './web/index.js';
 // 所有工具列表（用于注册）
 const allToolsWithMetadata: ToolWithMetadata[] = [
@@ -47,8 +47,9 @@ const allToolsWithMetadata: ToolWithMetadata[] = [
  copyFileTool,
  deleteFileTool,
-  // Web 工具 (deferLoading: true)
+  // Web 工具 (deferLoading: false)
  webSearchTool,
  webExtractTool,
 ];
 // 注册所有工具到 registry
@@ -1 +1,2 @@
 export { webSearchTool } from './web_search.js';
 export { webExtractTool } from './web_extract.js';
@@ -0,0 +1,159 @@
 import { tavily } from '@tavily/core';
 import type { ToolResult } from '../../types/index.js';
 import type { ToolWithMetadata } from '../types.js';
 import { loadDescription } from '../load_description.js';
 import { getConfig } from '../../utils/config.js';
 import { getPermissionManager } from '../../permission/index.js';
 /**
  * `web_extract` tool: extracts the content of one or more web pages through
  * the Tavily Extract API and renders the result as a Markdown report.
  *
  * Parameters (all read from the untyped `params` record):
  * - `urls`: a single URL string or an array of URL strings (required).
  *   Entries are trimmed, blank/non-string entries are dropped, duplicates are
  *   removed, and at most 20 URLs are sent; any surplus is reported in the output.
  * - `extract_depth`: `"basic"` (default) or `"advanced"`.
  * - `format`: `"markdown"` (default) or `"text"`; forwarded to Tavily.
  * - `include_images`: when true, up to 10 image URLs per page are listed.
  *
  * Returns a `ToolResult`; every failure path (invalid arguments, permission
  * denied, missing API key, API error) is reported via `success: false` and
  * `error` rather than by throwing.
  */
 export const webExtractTool: ToolWithMetadata = {
  name: 'web_extract',
  description: loadDescription('web_extract'),
  metadata: {
    name: 'web_extract',
    category: 'web',
    description: '从网页URL提取内容',
    keywords: ['extract', 'url', 'web', 'content', 'scrape', 'fetch', '提取', '网页', '抓取', '内容'],
    deferLoading: false,
  },
  parameters: {
    urls: {
      type: 'array',
      description: '要提取内容的 URL 列表（最多 20 个）',
      required: true,
    },
    extract_depth: {
      type: 'string',
      description: '提取深度: "basic" 快速提取，"advanced" 深度提取（包含表格、嵌入内容等，默认 basic）',
      required: false,
    },
    format: {
      type: 'string',
      description: '输出格式: "markdown" 或 "text"（默认 markdown）',
      required: false,
    },
    include_images: {
      type: 'boolean',
      description: '是否包含提取的图片列表（默认 false）',
      required: false,
    },
  },
  execute: async (params: Record<string, unknown>): Promise<ToolResult> => {
    const MAX_URLS = 20;
    const MAX_CONTENT_CHARS = 5000;
    const MAX_IMAGES = 10;

    // Accept a single URL string or an array. Anything else (undefined, number,
    // object...) becomes an empty list and is rejected below instead of throwing.
    const rawUrls: unknown[] =
      typeof params.urls === 'string' ? [params.urls] : Array.isArray(params.urls) ? params.urls : [];
    // Keep only non-blank strings and drop duplicates so the same page is not fetched twice.
    const requestedUrls = [
      ...new Set(
        rawUrls
          .filter((entry): entry is string => typeof entry === 'string')
          .map((entry) => entry.trim())
          .filter((entry) => entry.length > 0),
      ),
    ];
    if (requestedUrls.length === 0) {
      return {
        success: false,
        output: '',
        error: '参数 urls 无效: 请提供至少一个 URL（字符串或字符串数组）。',
      };
    }
    // Cap the batch at MAX_URLS; the number of dropped URLs is reported in the output.
    const urls = requestedUrls.slice(0, MAX_URLS);
    const skippedCount = requestedUrls.length - urls.length;

    // Missing/empty values fall back to the default; any other unknown value is
    // rejected here with a clear message instead of being forwarded to the API.
    const depthParam = params.extract_depth;
    let extractDepth: 'basic' | 'advanced' = 'basic';
    if (depthParam === 'basic' || depthParam === 'advanced') {
      extractDepth = depthParam;
    } else if (depthParam !== undefined && depthParam !== null && depthParam !== '') {
      return {
        success: false,
        output: '',
        error: `参数 extract_depth 无效: ${String(depthParam)}（可选值: "basic"、"advanced"）`,
      };
    }

    const formatParam = params.format;
    let format: 'markdown' | 'text' = 'markdown';
    if (formatParam === 'markdown' || formatParam === 'text') {
      format = formatParam;
    } else if (formatParam !== undefined && formatParam !== null && formatParam !== '') {
      return {
        success: false,
        output: '',
        error: `参数 format 无效: ${String(formatParam)}（可选值: "markdown"、"text"）`,
      };
    }

    // Only an explicit true (or the string "true") enables images, so that a
    // stringified "false" is not treated as truthy.
    const includeImages = params.include_images === true || params.include_images === 'true';

    // Permission check (reuses the web permission flow, keyed by the URL list).
    const permissionManager = getPermissionManager();
    const permResult = await permissionManager.checkWebPermission({
      query: `extract: ${urls.join(', ')}`,
      searchDepth: extractDepth,
    });
    if (!permResult.allowed) {
      if (permResult.needsConfirmation) {
        return {
          success: false,
          output: '',
          error: `需要用户确认网页提取: ${urls.join(', ')}\n原因: ${permResult.reason || '需要权限确认'}`,
        };
      }
      return {
        success: false,
        output: '',
        error: `网页提取权限被拒绝: ${permResult.reason || '提取不被允许'}`,
      };
    }

    // Resolve the Tavily API key; the environment variable takes precedence over the config file.
    const config = getConfig();
    const apiKey = process.env.TAVILY_API_KEY || config.tavilyApiKey;
    if (!apiKey) {
      return {
        success: false,
        output: '',
        error: '未配置 Tavily API Key。请设置环境变量 TAVILY_API_KEY 或在配置文件中添加 tavilyApiKey。',
      };
    }

    try {
      const client = tavily({ apiKey });
      // The output format is delegated to Tavily rather than stripping markdown
      // characters locally, which mangled legitimate text (e.g. "C#", "a * b").
      // NOTE(review): relies on the Extract API `format` option; confirm the
      // installed @tavily/core version forwards it.
      const response = await client.extract(urls, {
        extractDepth,
        includeImages,
        format,
      });

      // Build the Markdown report.
      let output = `## 网页内容提取\n\n`;
      if (skippedCount > 0) {
        output += `> 注意: 最多支持 ${MAX_URLS} 个 URL，已忽略多余的 ${skippedCount} 个。\n\n`;
      }

      // Successfully extracted pages.
      if (response.results && response.results.length > 0) {
        output += `### 提取成功 (${response.results.length} 个)\n\n`;
        for (const result of response.results) {
          output += `---\n`;
          output += `**URL:** ${result.url}\n\n`;

          // Truncate very long pages to keep the tool output bounded.
          let content = result.rawContent || '';
          if (content.length > MAX_CONTENT_CHARS) {
            content = content.substring(0, MAX_CONTENT_CHARS) + '\n\n... (内容已截断)';
          }
          output += `${content}\n\n`;

          // Image list: the first MAX_IMAGES entries plus a count of the remainder.
          if (includeImages && result.images && result.images.length > 0) {
            output += `**图片 (${result.images.length} 张):**\n`;
            for (const img of result.images.slice(0, MAX_IMAGES)) {
              output += `- ${img}\n`;
            }
            if (result.images.length > MAX_IMAGES) {
              output += `- ... 还有 ${result.images.length - MAX_IMAGES} 张图片\n`;
            }
            output += '\n';
          }
        }
      }

      // URLs the API could not extract.
      if (response.failedResults && response.failedResults.length > 0) {
        output += `### 提取失败 (${response.failedResults.length} 个)\n\n`;
        for (const failed of response.failedResults) {
          output += `- ${failed.url}: ${failed.error || '未知错误'}\n`;
        }
        output += '\n';
      }

      // Response time as reported by the API.
      if (response.responseTime) {
        output += `_提取耗时: ${response.responseTime.toFixed(2)} 秒_\n`;
      }

      return {
        success: true,
        output,
      };
    } catch (error) {
      return {
        success: false,
        output: '',
        error: `提取失败: ${error instanceof Error ? error.message : String(error)}`,
      };
    }
  },
 };
`@@ -1 +1,2 @@`
	`export { webSearchTool } from './web_search.js';`	`export { webSearchTool } from './web_search.js';`
		`export { webExtractTool } from './web_extract.js';`