feat: 添加 web_extract 网页内容提取工具
- 使用 Tavily Extract API 从 URL 提取网页内容
- 支持批量提取(最多 20 个 URL)
- 支持 basic/advanced 提取深度
- 支持 markdown/text 输出格式
- 可选提取图片列表
- 复用 WebPermissionChecker 进行权限管控
This commit is contained in:
@@ -0,0 +1,19 @@
|
||||
从指定的网页 URL 提取内容。使用 Tavily Extract API 智能解析网页,返回结构化的文本内容。
|
||||
|
||||
适用场景:
|
||||
- 获取网页文章的完整内容
|
||||
- 提取文档页面的详细信息
|
||||
- 抓取多个页面进行对比分析
|
||||
- 获取网页中的图片列表
|
||||
- 深度提取包含表格、嵌入内容的页面
|
||||
|
||||
参数说明:
|
||||
- urls: URL 列表(必填,最多 20 个,超出的部分会被忽略;也可传单个 URL 字符串)
|
||||
- extract_depth: "basic" 快速提取 / "advanced" 深度提取(含表格、嵌入内容等),默认 basic
|
||||
- format: "markdown" / "text" 输出格式,默认 markdown
|
||||
- include_images: 是否包含图片列表,默认 false
|
||||
|
||||
返回内容:
|
||||
- 每个 URL 的提取内容
|
||||
- 图片列表(如果启用)
|
||||
- 失败的 URL 及错误信息
|
||||
+3
-2
@@ -24,7 +24,7 @@ import {
|
||||
} from './filesystem/index.js';
|
||||
|
||||
// Web 工具
|
||||
import { webSearchTool } from './web/index.js';
|
||||
import { webSearchTool, webExtractTool } from './web/index.js';
|
||||
|
||||
// 所有工具列表(用于注册)
|
||||
const allToolsWithMetadata: ToolWithMetadata[] = [
|
||||
@@ -47,8 +47,9 @@ const allToolsWithMetadata: ToolWithMetadata[] = [
|
||||
copyFileTool,
|
||||
deleteFileTool,
|
||||
|
||||
// Web 工具 (deferLoading: true)
|
||||
// Web 工具 (deferLoading: false)
|
||||
webSearchTool,
|
||||
webExtractTool,
|
||||
];
|
||||
|
||||
// 注册所有工具到 registry
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
export { webSearchTool } from './web_search.js';
|
||||
export { webExtractTool } from './web_extract.js';
|
||||
|
||||
@@ -0,0 +1,159 @@
|
||||
import { tavily } from '@tavily/core';
|
||||
import type { ToolResult } from '../../types/index.js';
|
||||
import type { ToolWithMetadata } from '../types.js';
|
||||
import { loadDescription } from '../load_description.js';
|
||||
import { getConfig } from '../../utils/config.js';
|
||||
import { getPermissionManager } from '../../permission/index.js';
|
||||
|
||||
/** Maximum number of URLs forwarded to Tavily in a single call. */
const WEB_EXTRACT_MAX_URLS = 20;

/** Per-page content budget, in UTF-16 code units, before the text is truncated. */
const WEB_EXTRACT_MAX_CONTENT_LENGTH = 5000;

/** Maximum number of image URLs listed for one page. */
const WEB_EXTRACT_MAX_IMAGES_LISTED = 10;

type WebExtractDepth = 'basic' | 'advanced';
type WebExtractFormat = 'markdown' | 'text';

/** Minimal structural view of one successfully extracted page, as read by the formatter. */
interface WebExtractPage {
  readonly url: string;
  readonly rawContent?: string | null;
  readonly images?: readonly string[] | null;
}

/** Minimal structural view of one failed extraction, as read by the formatter. */
interface WebExtractFailure {
  readonly url: string;
  readonly error?: string | null;
}

/** Minimal structural view of the extract response, as read by the formatter. */
interface WebExtractResponse {
  readonly results?: readonly WebExtractPage[] | null;
  readonly failedResults?: readonly WebExtractFailure[] | null;
  readonly responseTime?: number | null;
}

/**
 * Normalizes the raw `urls` parameter into a list of non-empty, trimmed strings.
 * Accepts a single string or an array; anything else yields an empty list.
 */
function normalizeWebExtractUrls(raw: unknown): string[] {
  let candidates: readonly unknown[] = [];
  if (typeof raw === 'string') {
    candidates = [raw];
  } else if (Array.isArray(raw)) {
    candidates = raw;
  }

  return candidates
    .filter((item): item is string => typeof item === 'string')
    .map((item) => item.trim())
    .filter((item) => item.length > 0);
}

/** Interprets a loosely-typed flag: `true` or the string "true" enable it, everything else disables it. */
function parseWebExtractFlag(raw: unknown): boolean {
  if (typeof raw === 'boolean') {
    return raw;
  }
  return typeof raw === 'string' && raw.trim().toLowerCase() === 'true';
}

/** Cuts overly long content to the budget without splitting a surrogate pair, and appends the truncation marker. */
function truncateWebExtractContent(content: string): string {
  if (content.length <= WEB_EXTRACT_MAX_CONTENT_LENGTH) {
    return content;
  }

  let end = WEB_EXTRACT_MAX_CONTENT_LENGTH;
  const lastUnit = content.charCodeAt(end - 1);
  // A trailing high surrogate would leave a broken character at the cut point.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return content.substring(0, end) + '\n\n... (内容已截断)';
}

/** Renders the extract response (successes, failures, timing) as the tool's markdown output. */
function formatWebExtractOutput(
  response: WebExtractResponse,
  format: WebExtractFormat,
  includeImages: boolean,
  ignoredUrlCount: number,
): string {
  const parts: string[] = ['## 网页内容提取\n\n'];

  // Make the URL cap visible instead of silently dropping input.
  if (ignoredUrlCount > 0) {
    parts.push(
      `> 注意: 仅处理前 ${WEB_EXTRACT_MAX_URLS} 个 URL,已忽略其余 ${ignoredUrlCount} 个。\n\n`,
    );
  }

  const pages = response.results ?? [];
  if (pages.length > 0) {
    parts.push(`### 提取成功 (${pages.length} 个)\n\n`);

    for (const page of pages) {
      parts.push('---\n', `**URL:** ${page.url}\n\n`);

      let content = page.rawContent || '';
      if (format === 'text' && content) {
        // Crude markdown stripping: removes heading, emphasis and code markers only.
        // NOTE(review): the Tavily Extract API documents its own output-format option;
        // confirm SDK support before delegating the conversion to it.
        content = content.replace(/[#*`]/g, '');
      }
      parts.push(`${truncateWebExtractContent(content)}\n\n`);

      const images = page.images ?? [];
      if (includeImages && images.length > 0) {
        parts.push(`**图片 (${images.length} 张):**\n`);
        for (const img of images.slice(0, WEB_EXTRACT_MAX_IMAGES_LISTED)) {
          parts.push(`- ${img}\n`);
        }
        if (images.length > WEB_EXTRACT_MAX_IMAGES_LISTED) {
          parts.push(`- ... 还有 ${images.length - WEB_EXTRACT_MAX_IMAGES_LISTED} 张图片\n`);
        }
        parts.push('\n');
      }
    }
  }

  const failures = response.failedResults ?? [];
  if (failures.length > 0) {
    parts.push(`### 提取失败 (${failures.length} 个)\n\n`);
    for (const failed of failures) {
      parts.push(`- ${failed.url}: ${failed.error || '未知错误'}\n`);
    }
    parts.push('\n');
  }

  const elapsed = response.responseTime;
  if (typeof elapsed === 'number' && elapsed > 0) {
    parts.push(`_提取耗时: ${elapsed.toFixed(2)} 秒_\n`);
  }

  return parts.join('');
}

/**
 * `web_extract` tool: extracts page content for one or more URLs through the Tavily SDK.
 *
 * Parameters read by `execute`:
 * - `urls`: a URL string or an array of URL strings (required). Only the first
 *   {@link WEB_EXTRACT_MAX_URLS} entries are processed; the rest are reported as ignored.
 * - `extract_depth`: `"advanced"` selects advanced extraction; any other value means `"basic"`.
 * - `format`: `"text"` strips basic markdown markers locally; any other value means `"markdown"`.
 * - `include_images`: `true` (or the string `"true"`) adds each page's image list to the output.
 *
 * `execute` resolves to a `ToolResult`: `success: false` with an `error` message for invalid
 * input, a denied or unconfirmed permission check, a missing API key, or a failed Tavily call;
 * otherwise `success: true` with the formatted markdown report in `output`.
 */
export const webExtractTool: ToolWithMetadata = {
  name: 'web_extract',
  description: loadDescription('web_extract'),
  metadata: {
    name: 'web_extract',
    category: 'web',
    description: '从网页URL提取内容',
    keywords: ['extract', 'url', 'web', 'content', 'scrape', 'fetch', '提取', '网页', '抓取', '内容'],
    deferLoading: false,
  },
  parameters: {
    urls: {
      type: 'array',
      description: '要提取内容的 URL 列表(最多 20 个)',
      required: true,
    },
    extract_depth: {
      type: 'string',
      description: '提取深度: "basic" 快速提取,"advanced" 深度提取(包含表格、嵌入内容等,默认 basic)',
      required: false,
    },
    format: {
      type: 'string',
      description: '输出格式: "markdown" 或 "text"(默认 markdown)',
      required: false,
    },
    include_images: {
      type: 'boolean',
      description: '是否包含提取的图片列表(默认 false)',
      required: false,
    },
  },
  execute: async (params: Record<string, unknown>): Promise<ToolResult> => {
    // Validate before touching the permission manager or the network: the
    // parameters come from the caller untyped, so a cast alone is not safe.
    const requestedUrls = normalizeWebExtractUrls(params.urls);
    if (requestedUrls.length === 0) {
      return {
        success: false,
        output: '',
        error: '参数 urls 无效: 请提供至少一个 URL(字符串或字符串数组)。',
      };
    }

    const urls = requestedUrls.slice(0, WEB_EXTRACT_MAX_URLS);
    const ignoredUrlCount = requestedUrls.length - urls.length;

    // Unknown values fall back to the documented defaults instead of reaching the API.
    const extractDepth: WebExtractDepth = params.extract_depth === 'advanced' ? 'advanced' : 'basic';
    const format: WebExtractFormat = params.format === 'text' ? 'text' : 'markdown';
    const includeImages = parseWebExtractFlag(params.include_images);

    // Permission check (reuses the web permission entry point with a synthetic query).
    const joinedUrls = urls.join(', ');
    const permResult = await getPermissionManager().checkWebPermission({
      query: `extract: ${joinedUrls}`,
      searchDepth: extractDepth,
    });

    if (!permResult.allowed) {
      const error = permResult.needsConfirmation
        ? `需要用户确认网页提取: ${joinedUrls}\n原因: ${permResult.reason || '需要权限确认'}`
        : `网页提取权限被拒绝: ${permResult.reason || '提取不被允许'}`;
      return { success: false, output: '', error };
    }

    // The environment variable takes precedence over the config file.
    const apiKey = process.env.TAVILY_API_KEY || getConfig().tavilyApiKey;
    if (!apiKey) {
      return {
        success: false,
        output: '',
        error: '未配置 Tavily API Key。请设置环境变量 TAVILY_API_KEY 或在配置文件中添加 tavilyApiKey。',
      };
    }

    try {
      const client = tavily({ apiKey });
      const response = await client.extract(urls, {
        extractDepth,
        includeImages,
      });

      return {
        success: true,
        output: formatWebExtractOutput(response, format, includeImages, ignoredUrlCount),
      };
    } catch (error) {
      return {
        success: false,
        output: '',
        error: `提取失败: ${error instanceof Error ? error.message : String(error)}`,
      };
    }
  },
};
|
||||
Reference in New Issue
Block a user