feat: 重构 Vision 处理架构,支持自动委托 Vision Agent

- 主 Agent 收到图片后自动检测是否支持 vision,不支持时委托 Vision Agent 处理
- 添加 qwen-ai-provider-v5 支持百炼/DashScope API
- Task 工具支持 images 参数,可传递图片给子 Agent
- Vision Agent 使用独立的 VisionConfig 配置
- 移除 UI 层的 vision fallback 逻辑,统一在 Agent 层处理
- 删除废弃的 src/utils/vision.ts(原生 fetch 实现)
This commit is contained in:
2025-12-11 18:21:36 +08:00
parent 32fdb244f0
commit abbb03bf50
10 changed files with 289 additions and 308 deletions
+17
View File
@@ -19,6 +19,7 @@
"inquirer": "^12.0.0", "inquirer": "^12.0.0",
"js-yaml": "^4.1.1", "js-yaml": "^4.1.1",
"ora": "^8.1.0", "ora": "^8.1.0",
"qwen-ai-provider-v5": "^1.0.2",
"tree-sitter-bash": "^0.25.1", "tree-sitter-bash": "^0.25.1",
"uuid": "^13.0.0", "uuid": "^13.0.0",
"vscode-jsonrpc": "^8.2.1", "vscode-jsonrpc": "^8.2.1",
@@ -2594,6 +2595,22 @@
"integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==", "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/qwen-ai-provider-v5": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/qwen-ai-provider-v5/-/qwen-ai-provider-v5-1.0.2.tgz",
"integrity": "sha512-IMweAFhHxM2OZzeZKyDUfcxCQCLkFioQv9TkprAXttV6XeTBTSjjUc17S9dUW4rOgtWLsCXoAkaAPUHj1jQYtg==",
"license": "Apache-2.0",
"dependencies": {
"@ai-sdk/provider": "^2.0.0",
"@ai-sdk/provider-utils": "^3.0.0"
},
"engines": {
"node": ">=18.0.0"
},
"peerDependencies": {
"zod": "^3.25.76 || ^4.1.8"
}
},
"node_modules/resolve-pkg-maps": { "node_modules/resolve-pkg-maps": {
"version": "1.0.0", "version": "1.0.0",
"resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz", "resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz",
+1
View File
@@ -36,6 +36,7 @@
"inquirer": "^12.0.0", "inquirer": "^12.0.0",
"js-yaml": "^4.1.1", "js-yaml": "^4.1.1",
"ora": "^8.1.0", "ora": "^8.1.0",
"qwen-ai-provider-v5": "^1.0.2",
"tree-sitter-bash": "^0.25.1", "tree-sitter-bash": "^0.25.1",
"uuid": "^13.0.0", "uuid": "^13.0.0",
"vscode-jsonrpc": "^8.2.1", "vscode-jsonrpc": "^8.2.1",
+56 -3
View File
@@ -1,6 +1,7 @@
import { createAnthropic } from '@ai-sdk/anthropic'; import { createAnthropic } from '@ai-sdk/anthropic';
import { createDeepSeek } from '@ai-sdk/deepseek'; import { createDeepSeek } from '@ai-sdk/deepseek';
import { createOpenAI } from '@ai-sdk/openai'; import { createOpenAI } from '@ai-sdk/openai';
import { createQwen } from 'qwen-ai-provider-v5';
import { import {
generateText, generateText,
streamText, streamText,
@@ -9,13 +10,14 @@ import {
type Tool as AITool, type Tool as AITool,
type LanguageModel, type LanguageModel,
} from 'ai'; } from 'ai';
import type { Tool, ToolResult, ProviderType, AgentConfig } from '../types/index.js'; import type { Tool, ToolResult, ProviderType, AgentConfig, ContentBlock } from '../types/index.js';
import { buildZodSchema } from '../types/index.js'; import { buildZodSchema } from '../types/index.js';
import { ToolRegistry } from '../tools/registry.js'; import { ToolRegistry } from '../tools/registry.js';
import type { import type {
AgentInfo, AgentInfo,
AgentExecutionContext, AgentExecutionContext,
AgentExecutionResult, AgentExecutionResult,
ImageData,
} from './types.js'; } from './types.js';
import { checkBashPermission } from './permission-merger.js'; import { checkBashPermission } from './permission-merger.js';
@@ -28,6 +30,14 @@ interface ProviderOptions {
// Provider 工厂函数类型 // Provider 工厂函数类型
type ProviderFactory = (options: ProviderOptions) => (model: string) => LanguageModel; type ProviderFactory = (options: ProviderOptions) => (model: string) => LanguageModel;
/**
 * Reports whether `baseUrl` points at Alibaba Cloud Bailian / DashScope.
 *
 * The check is a substring match on "dashscope". It is done case-insensitively
 * because the host part of a URL is case-insensitive, so a base URL written as
 * `https://DashScope.aliyuncs.com/...` must be recognised as well.
 *
 * @param baseUrl - Configured API base URL; may be undefined or empty.
 * @returns `true` when the URL mentions DashScope, otherwise `false`.
 */
function isDashScopeUrl(baseUrl?: string): boolean {
  if (!baseUrl) return false;
  return baseUrl.toLowerCase().includes('dashscope');
}
// Provider 注册表 // Provider 注册表
const providers: Record<ProviderType, ProviderFactory> = { const providers: Record<ProviderType, ProviderFactory> = {
anthropic: ({ apiKey, baseUrl }) => { anthropic: ({ apiKey, baseUrl }) => {
@@ -39,6 +49,11 @@ const providers: Record<ProviderType, ProviderFactory> = {
return (model) => client(model); return (model) => client(model);
}, },
openai: ({ apiKey, baseUrl }) => { openai: ({ apiKey, baseUrl }) => {
// 如果是百炼的 URL,使用 qwen provider
if (isDashScopeUrl(baseUrl)) {
const client = createQwen({ apiKey, baseURL: baseUrl });
return (model) => client(model);
}
const client = createOpenAI({ apiKey, baseURL: baseUrl }); const client = createOpenAI({ apiKey, baseURL: baseUrl });
return (model) => client(model); return (model) => client(model);
}, },
@@ -79,7 +94,7 @@ export class AgentExecutor {
prompt: string, prompt: string,
context: AgentExecutionContext context: AgentExecutionContext
): Promise<AgentExecutionResult> { ): Promise<AgentExecutionResult> {
const { onStream, onToolCall, onToolResult } = context; const { onStream, onToolCall, onToolResult, images } = context;
// 获取过滤后的工具 // 获取过滤后的工具
const tools = this.getFilteredTools(); const tools = this.getFilteredTools();
@@ -93,11 +108,14 @@ export class AgentExecutor {
const maxSteps = this.agentInfo.maxSteps ?? 10; const maxSteps = this.agentInfo.maxSteps ?? 10;
const maxTokens = this.agentInfo.model?.maxTokens ?? this.baseConfig.maxTokens; const maxTokens = this.agentInfo.model?.maxTokens ?? this.baseConfig.maxTokens;
// 构建消息内容(支持图片)
const messageContent = this.buildMessageContent(prompt, images);
// 构建初始消息 // 构建初始消息
const messages: ModelMessage[] = [ const messages: ModelMessage[] = [
{ {
role: 'user', role: 'user',
content: prompt, content: messageContent,
}, },
]; ];
@@ -308,4 +326,39 @@ export class AgentExecutor {
// 否则使用基础配置的 systemPrompt // 否则使用基础配置的 systemPrompt
return this.baseConfig.systemPrompt; return this.baseConfig.systemPrompt;
} }
/**
 * Assemble the content of the initial user message, optionally multimodal.
 *
 * @param prompt - Text prompt for the agent.
 * @param images - Optional images to send together with the prompt.
 * @returns The bare prompt string when no images are supplied; otherwise a
 *          list of content blocks with every image first, followed by a text
 *          block when the prompt is non-empty.
 */
private buildMessageContent(
  prompt: string,
  images?: ImageData[]
): string | ContentBlock[] {
  const attachments = images ?? [];

  // Text-only input keeps the plain string form.
  if (attachments.length === 0) {
    return prompt;
  }

  // Images go first, in the order they were supplied.
  const content: ContentBlock[] = attachments.map(
    (attachment): ContentBlock => ({
      type: 'image',
      image: attachment.data,
      mimeType: attachment.mimeType,
    })
  );

  // The text block is appended last, and only when there is text to send.
  if (prompt) {
    content.push({ type: 'text', text: prompt });
  }

  return content;
}
} }
+3 -1
View File
@@ -4,6 +4,7 @@ import { exploreAgent } from './explore.js';
import { codeReviewerAgent } from './code-reviewer.js'; import { codeReviewerAgent } from './code-reviewer.js';
import { buildAgent } from './build.js'; import { buildAgent } from './build.js';
import { planAgent } from './plan.js'; import { planAgent } from './plan.js';
import { visionAgent } from './vision.js';
/** /**
* 预设 Agent 集合 * 预设 Agent 集合
@@ -14,6 +15,7 @@ export const presetAgents: Record<string, Omit<AgentInfo, 'name'>> = {
'code-reviewer': codeReviewerAgent, 'code-reviewer': codeReviewerAgent,
build: buildAgent, build: buildAgent,
plan: planAgent, plan: planAgent,
vision: visionAgent,
}; };
/** /**
@@ -30,4 +32,4 @@ export function isPresetAgent(name: string): boolean {
return name in presetAgents; return name in presetAgents;
} }
export { generalAgent, exploreAgent, codeReviewerAgent, buildAgent, planAgent }; export { generalAgent, exploreAgent, codeReviewerAgent, buildAgent, planAgent, visionAgent };
+52
View File
@@ -0,0 +1,52 @@
import type { AgentInfo } from '../types.js';
/**
 * Vision Agent
 * Image-understanding specialist that analyses image content with a
 * multimodal model. The definition omits `name` (see the `Omit` type).
 */
export const visionAgent: Omit<AgentInfo, 'name'> = {
description: '图片理解专家,分析截图、设计稿、架构图等',
mode: 'subagent',
// System prompt (runtime text, kept verbatim): instructs the model what to
// analyse in an image and how to format the answer.
prompt: `你是一个专业的图片分析专家。你的任务是详细描述和分析用户提供的图片内容。
分析要点:
1. **整体概述**:图片的类型(截图、设计稿、图表、照片等)和主要内容
2. **布局结构**:页面/图片的整体布局、区域划分
3. **文字内容**:提取图片中的所有可见文字(完整、准确)
4. **UI 元素**:按钮、输入框、菜单、图标等元素及其状态
5. **视觉细节**:颜色、字体、间距、对齐等设计细节
6. **交互状态**:hover、选中、禁用等状态指示
7. **潜在问题**:如果用户询问问题,指出可能的问题或改进点
输出格式:
- 使用清晰的 Markdown 格式
- 先给出整体概述,再逐一分析细节
- 如果是 UI 截图,按区域从上到下、从左到右描述
- 提取的文字用引号标注
注意事项:
- 描述要准确、具体,避免模糊表述
- 如果某些内容不清晰,明确说明
- 根据用户的问题重点分析相关部分`,
// No tools are enabled for this agent.
// NOTE(review): `noTask` presumably also withholds the Task tool so this agent
// cannot spawn further sub-agents; confirm against the AgentInfo definition.
tools: {
enabled: [],
noTask: true,
},
// Every file and git permission is 'deny' and bash is disabled.
permission: {
file: {
read: 'deny',
write: 'deny',
edit: 'deny',
delete: 'deny',
},
bash: {
enabled: false,
},
git: {
read: 'deny',
write: 'deny',
dangerous: 'deny',
},
},
// Step budget of one for an execution of this agent.
maxSteps: 1,
};
+14
View File
@@ -131,6 +131,18 @@ export interface AgentConfigFile {
agents?: Record<string, Omit<AgentInfo, 'name'>>; agents?: Record<string, Omit<AgentInfo, 'name'>>;
} }
/**
 * Image data (used in the Agent execution context)
 */
export interface ImageData {
/** Base64-encoded image data */
data: string;
/** MIME type */
mimeType: string;
/** File name (optional) */
filename?: string;
}
/** /**
* Agent 执行上下文 * Agent 执行上下文
*/ */
@@ -139,6 +151,8 @@ export interface AgentExecutionContext {
parentSessionId?: string; parentSessionId?: string;
/** 工作目录 */ /** 工作目录 */
workdir: string; workdir: string;
/** 图片数据(用于支持多模态输入) */
images?: ImageData[];
/** 回调:输出流 */ /** 回调:输出流 */
onStream?: (text: string) => void; onStream?: (text: string) => void;
/** 回调:工具调用 */ /** 回调:工具调用 */
+113 -7
View File
@@ -1,6 +1,7 @@
import { createAnthropic } from '@ai-sdk/anthropic'; import { createAnthropic } from '@ai-sdk/anthropic';
import { createDeepSeek } from '@ai-sdk/deepseek'; import { createDeepSeek } from '@ai-sdk/deepseek';
import { createOpenAI } from '@ai-sdk/openai'; import { createOpenAI } from '@ai-sdk/openai';
import { createQwen } from 'qwen-ai-provider-v5';
import { import {
generateText, generateText,
streamText, streamText,
@@ -18,7 +19,9 @@ import {
type TokenUsage, type TokenUsage,
type CompressionConfig, type CompressionConfig,
} from '../context/index.js'; } from '../context/index.js';
import type { AgentInfo } from '../agent/types.js'; import type { AgentInfo, ImageData } from '../agent/types.js';
import { agentRegistry, AgentExecutor } from '../agent/index.js';
import { loadVisionConfig } from '../utils/config.js';
// Provider 配置 // Provider 配置
interface ProviderOptions { interface ProviderOptions {
@@ -29,6 +32,14 @@ interface ProviderOptions {
// Provider 工厂函数类型 // Provider 工厂函数类型
type ProviderFactory = (options: ProviderOptions) => (model: string) => LanguageModel; type ProviderFactory = (options: ProviderOptions) => (model: string) => LanguageModel;
/**
 * Reports whether `baseUrl` points at Alibaba Cloud Bailian / DashScope.
 *
 * The check is a substring match on "dashscope". It is done case-insensitively
 * because the host part of a URL is case-insensitive, so a base URL written as
 * `https://DashScope.aliyuncs.com/...` must be recognised as well.
 *
 * @param baseUrl - Configured API base URL; may be undefined or empty.
 * @returns `true` when the URL mentions DashScope, otherwise `false`.
 */
function isDashScopeUrl(baseUrl?: string): boolean {
  if (!baseUrl) return false;
  return baseUrl.toLowerCase().includes('dashscope');
}
// Provider 注册表 // Provider 注册表
const providers: Record<ProviderType, ProviderFactory> = { const providers: Record<ProviderType, ProviderFactory> = {
anthropic: ({ apiKey, baseUrl }) => { anthropic: ({ apiKey, baseUrl }) => {
@@ -40,6 +51,11 @@ const providers: Record<ProviderType, ProviderFactory> = {
return (model) => client(model); return (model) => client(model);
}, },
openai: ({ apiKey, baseUrl }) => { openai: ({ apiKey, baseUrl }) => {
// 如果是百炼的 URL,使用 qwen provider
if (isDashScopeUrl(baseUrl)) {
const client = createQwen({ apiKey, baseURL: baseUrl });
return (model) => client(model);
}
const client = createOpenAI({ apiKey, baseURL: baseUrl }); const client = createOpenAI({ apiKey, baseURL: baseUrl });
return (model) => client(model); return (model) => client(model);
}, },
@@ -236,19 +252,42 @@ export class Agent {
* @param onStream 流式输出回调 * @param onStream 流式输出回调
*/ */
async chat(userMessage: string | UserInput, onStream?: (text: string) => void): Promise<string> { async chat(userMessage: string | UserInput, onStream?: (text: string) => void): Promise<string> {
// 处理带图片的消息
let processedMessage = userMessage;
if (typeof userMessage !== 'string' && userMessage.images && userMessage.images.length > 0) {
// 检查当前模型是否支持 vision
if (!this.supportsVision()) {
// 不支持 vision,尝试使用 Vision Agent 处理图片
const visionResult = await this.processImagesWithVisionAgent(
userMessage.images,
userMessage.text,
onStream
);
if (visionResult) {
// 成功,将图片分析结果转换为文本消息
processedMessage = visionResult;
} else {
// 失败,返回错误信息
return '无法处理图片:当前模型不支持图片理解,且 Vision 服务未配置或调用失败。';
}
}
}
// 构建消息内容 // 构建消息内容
let messageContent: string | ContentBlock[]; let messageContent: string | ContentBlock[];
if (typeof userMessage === 'string') { if (typeof processedMessage === 'string') {
// 纯文本消息 // 纯文本消息
messageContent = userMessage; messageContent = processedMessage;
} else { } else {
// 带图片的消息 // 带图片的消息
const blocks: ContentBlock[] = []; const blocks: ContentBlock[] = [];
// 添加图片 // 添加图片
if (userMessage.images && userMessage.images.length > 0) { if (processedMessage.images && processedMessage.images.length > 0) {
for (const img of userMessage.images) { for (const img of processedMessage.images) {
blocks.push({ blocks.push({
type: 'image', type: 'image',
image: img.data, image: img.data,
@@ -258,10 +297,10 @@ export class Agent {
} }
// 添加文本 // 添加文本
if (userMessage.text) { if (processedMessage.text) {
blocks.push({ blocks.push({
type: 'text', type: 'text',
text: userMessage.text, text: processedMessage.text,
}); });
} }
@@ -484,6 +523,73 @@ export class Agent {
return this.currentAgentMode?.name ?? 'default'; return this.currentAgentMode?.name ?? 'default';
} }
/**
 * Process images with the Vision Agent.
 *
 * Used when the main model does not support vision: the images are handed to
 * the Vision Agent, and its analysis is folded into a plain-text message.
 *
 * @param images - Images attached to the user's message.
 * @param userText - Optional text that accompanied the images; it is used as
 *                   the analysis prompt and echoed in the returned message.
 * @param onStream - Optional callback that receives progress and warning notices.
 * @returns A text message containing the image analysis, or `null` on failure
 *          (Vision config missing, Vision Agent not registered, tool registry
 *          missing, or the analysis failed or threw).
 */
private async processImagesWithVisionAgent(
  images: ImageData[],
  userText?: string,
  onStream?: (text: string) => void
): Promise<string | null> {
  // The Vision configuration must be available.
  const visionConfig = loadVisionConfig();
  if (!visionConfig) {
    onStream?.('\n⚠ Vision 服务未配置,无法处理图片\n');
    return null;
  }

  // Look up the Vision Agent definition.
  const visionAgent = agentRegistry.get('vision');
  if (!visionAgent) {
    onStream?.('\n⚠ Vision Agent 未注册\n');
    return null;
  }

  // The executor needs a tool registry.
  if (!this.registry) {
    onStream?.('\n⚠ 工具注册表未初始化\n');
    return null;
  }

  onStream?.(`\n[委托 Vision Agent (${visionConfig.model}) 分析图片...]\n`);

  // Start from the main config and override the model-related fields with the Vision settings.
  const visionAgentConfig: AgentConfig = {
    ...this.config,
    provider: visionConfig.provider,
    apiKey: visionConfig.apiKey,
    model: visionConfig.model,
    baseUrl: visionConfig.baseUrl,
  };

  // Fall back to a generic "describe this image" prompt when the user sent no text.
  const prompt = userText || '请详细描述这张图片的内容';

  // Building or running the executor may throw. This method's contract is to
  // signal failure with `null`, so convert exceptions into the same failure
  // notice used for an unsuccessful result instead of letting them escape.
  let result: Awaited<ReturnType<AgentExecutor['execute']>>;
  try {
    const executor = new AgentExecutor(visionAgent, visionAgentConfig, this.registry);
    result = await executor.execute(prompt, {
      workdir: process.cwd(),
      images,
      onStream: undefined, // The Vision Agent does not stream its output
    });
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    onStream?.(`\n⚠ Vision 分析失败: ${reason}\n`);
    return null;
  }

  if (!result.success) {
    onStream?.(`\n⚠ Vision 分析失败: ${result.error}\n`);
    return null;
  }

  onStream?.('\n[Vision 分析完成]\n');

  // Build the text message that carries the analysis result plus the user's question.
  const combinedText = `[图片分析结果 - 由 ${visionConfig.model} 提供]\n${result.text}\n\n用户问题: ${userText || '(无附加问题)'}`;

  return combinedText;
}
/** /**
* 检查当前模型是否支持 vision(图片理解) * 检查当前模型是否支持 vision(图片理解)
*/ */
+31 -1
View File
@@ -1,9 +1,11 @@
import type { ToolWithMetadata } from '../types.js'; import type { ToolWithMetadata } from '../types.js';
import type { AgentConfig } from '../../types/index.js'; import type { AgentConfig } from '../../types/index.js';
import type { ImageData } from '../../agent/types.js';
import { agentRegistry, AgentExecutor } from '../../agent/index.js'; import { agentRegistry, AgentExecutor } from '../../agent/index.js';
import { toolRegistry } from '../registry.js'; import { toolRegistry } from '../registry.js';
import { SessionManager } from '../../session/index.js'; import { SessionManager } from '../../session/index.js';
import { getAgentManager } from '../../agent/manager.js'; import { getAgentManager } from '../../agent/manager.js';
import { loadVisionConfig } from '../../utils/config.js';
/** /**
* 模型预设映射 * 模型预设映射
@@ -95,6 +97,11 @@ export const taskTool: ToolWithMetadata = {
description: '是否后台运行。后台运行时立即返回 agentId,使用 agent_output 工具获取结果', description: '是否后台运行。后台运行时立即返回 agentId,使用 agent_output 工具获取结果',
required: false, required: false,
}, },
images: {
type: 'array',
description: '图片数据数组(用于 vision 相关任务),每个图片包含 data(base64)、mimeType、filename(可选)',
required: false,
},
}, },
metadata: { metadata: {
name: 'task', name: 'task',
@@ -110,12 +117,14 @@ export const taskTool: ToolWithMetadata = {
subagent_type, subagent_type,
model, model,
run_in_background, run_in_background,
images,
} = params as { } = params as {
description: string; description: string;
prompt: string; prompt: string;
subagent_type: string; subagent_type: string;
model?: string; model?: string;
run_in_background?: boolean; run_in_background?: boolean;
images?: ImageData[];
}; };
// 检查上下文是否已初始化 // 检查上下文是否已初始化
@@ -151,7 +160,26 @@ export const taskTool: ToolWithMetadata = {
// 2. 处理模型选择 // 2. 处理模型选择
let effectiveConfig = baseConfig; let effectiveConfig = baseConfig;
if (model) {
// Vision Agent 特殊处理:使用 VisionConfig 配置
if (subagent_type === 'vision') {
const visionConfig = loadVisionConfig();
if (!visionConfig) {
return {
success: false,
output: '',
error: 'Vision Agent 需要配置 Vision 服务。请在配置文件中设置 visionProvider、visionApiKey 等参数。',
};
}
// 使用 Vision 配置覆盖 baseConfig
effectiveConfig = {
...baseConfig,
provider: visionConfig.provider,
apiKey: visionConfig.apiKey,
model: visionConfig.model,
baseUrl: visionConfig.baseUrl,
};
} else if (model) {
const modelName = MODEL_PRESETS[model]; const modelName = MODEL_PRESETS[model];
if (!modelName) { if (!modelName) {
return { return {
@@ -180,6 +208,7 @@ export const taskTool: ToolWithMetadata = {
{ {
parentSessionId, parentSessionId,
workdir: process.cwd(), workdir: process.cwd(),
images,
} }
); );
@@ -209,6 +238,7 @@ export const taskTool: ToolWithMetadata = {
const result = await executor.execute(prompt, { const result = await executor.execute(prompt, {
parentSessionId, parentSessionId,
workdir: process.cwd(), workdir: process.cwd(),
images,
onStream: undefined, // 子任务不使用流式输出 onStream: undefined, // 子任务不使用流式输出
}); });
+2 -79
View File
@@ -12,11 +12,6 @@ import {
loadImages, loadImages,
formatFileSize, formatFileSize,
} from '../utils/image.js'; } from '../utils/image.js';
import {
analyzeImages,
isVisionAvailable,
getVisionInfo,
} from '../utils/vision.js';
import type { UserInput } from '../types/index.js'; import type { UserInput } from '../types/index.js';
export class TerminalUI { export class TerminalUI {
@@ -340,67 +335,6 @@ export class TerminalUI {
}; };
} }
/**
 * Fallback flow for image input when the current model cannot understand images.
 *
 * Asks the user whether to run the images through the Vision service and, if
 * so, turns the analysis into a plain-text message.
 *
 * @param userInput - The user's input, including any attached images.
 * @returns Text combining the Vision analysis with the user's original
 *          question, or `null` when the Vision service is unavailable, the
 *          user declines, there are no images, or the analysis fails.
 */
private async handleNoVisionSupport(
  userInput: UserInput
): Promise<string | null> {
  // Nothing to fall back to when no Vision service is configured.
  if (!isVisionAvailable()) {
    console.log(chalk.yellow('\n⚠ 当前模型不支持图片理解,且未配置 Vision 服务'));
    console.log(chalk.gray('请在配置文件中设置 visionProvider、visionApiKey 等参数'));
    console.log(chalk.gray('或切换到支持图片理解的模型(如 Claude、GPT-4o\n'));
    return null;
  }

  const { model: visionModel } = getVisionInfo();

  // Offer the choice: analyse with the Vision service, or cancel.
  const menu = [
    chalk.yellow('\n⚠ 当前模型不支持图片理解'),
    chalk.gray('请选择处理方式:'),
    chalk.white(` 1. 使用 Vision 服务 (${visionModel}) 分析图片后继续对话`),
    chalk.white(' 2. 取消本次输入'),
  ];
  for (const line of menu) {
    console.log(line);
  }

  const answer = await new Promise<string>((resolve) => {
    this.rl.question(chalk.green('选择 (1/2): '), resolve);
  });

  // Anything other than "1" counts as a cancellation.
  if (answer.trim() !== '1') {
    console.log(chalk.gray('已取消\n'));
    return null;
  }

  console.log(chalk.cyan(`\n正在使用 ${visionModel} 分析图片...`));

  const attachments = userInput.images || [];
  if (attachments.length === 0) {
    console.log(chalk.red('没有图片需要分析\n'));
    return null;
  }

  // Send only the fields the Vision API helper needs.
  const payload = attachments.map(({ data, mimeType, filename }) => ({
    data,
    mimeType,
    filename,
  }));
  const analysis = await analyzeImages(payload, userInput.text || undefined);

  if (!analysis.success) {
    console.log(chalk.red(`\n图片分析失败: ${analysis.error}\n`));
    return null;
  }

  console.log(chalk.green('✓ 图片分析完成\n'));

  // Text message carrying the image description plus the original question.
  return `[Vision 服务分析结果]\n${analysis.description}\n\n用户原始问题: ${userInput.text}`;
}
// 提问并获取用户输入 // 提问并获取用户输入
private prompt(): Promise<string> { private prompt(): Promise<string> {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
@@ -507,20 +441,9 @@ export class TerminalUI {
continue; continue;
} }
let { userInput, hasImages } = processed; const { userInput, hasImages } = processed;
// 如果有图片且当前模型不支持 vision // 发送给 AI如果模型不支持 visionAgent 会自动委托 Vision Agent 处理)
if (hasImages && !this.agent.supportsVision()) {
const fallbackText = await this.handleNoVisionSupport(userInput);
if (!fallbackText) {
continue;
}
// 使用 Vision 分析结果替代图片
userInput = { text: fallbackText };
hasImages = false;
}
// 发送给 AI
process.stdout.write(chalk.gray('思考中...')); process.stdout.write(chalk.gray('思考中...'));
try { try {
-217
View File
@@ -1,217 +0,0 @@
import { loadVisionConfig, type VisionConfig } from './config.js';
/**
* Vision 服务 - 用于图片理解
* 当主模型不支持 vision 时,使用独立的 Vision 服务分析图片
* 使用原生 fetch 调用 OpenAI 兼容接口,以确保与百炼等服务兼容
*/
/** Image payload passed to the Vision service. */
export interface ImageData {
/** Base64-encoded image data */
data: string;
/** MIME type */
mimeType: string;
/** File name (optional) */
filename?: string;
}
/** Outcome of a Vision analysis request. */
export interface VisionAnalysisResult {
/** Whether the analysis succeeded */
success: boolean;
/** Image description */
description: string;
/** Error message (present on failure) */
error?: string;
}
/**
 * Analyse a single image with the configured Vision service.
 *
 * @param image - The image to analyse.
 * @param prompt - Optional prompt forwarded to the Vision API.
 * @returns A successful result carrying the description, or a failed result
 *          with an error message when the Vision config is missing or the
 *          API call throws.
 */
export async function analyzeImage(
  image: ImageData,
  prompt?: string
): Promise<VisionAnalysisResult> {
  const visionConfig = loadVisionConfig();
  if (!visionConfig) {
    return {
      success: false,
      description: '',
      error: '未配置 Vision 服务。请在配置文件中设置 visionProvider、visionApiKey 等参数。',
    };
  }

  let description: string;
  try {
    description = await callVisionAPI(visionConfig, [image], prompt);
  } catch (err) {
    // Report the failure in the result instead of propagating the exception.
    const message = err instanceof Error ? err.message : String(err);
    return { success: false, description: '', error: message };
  }

  return { success: true, description };
}
/**
 * Analyse a batch of images with the configured Vision service in one request.
 *
 * @param images - Images to analyse; must contain at least one entry.
 * @param prompt - Optional prompt forwarded to the Vision API.
 * @returns A successful result carrying the description, or a failed result
 *          with an error message when the Vision config is missing, no
 *          images were given, or the API call throws.
 */
export async function analyzeImages(
  images: ImageData[],
  prompt?: string
): Promise<VisionAnalysisResult> {
  const visionConfig = loadVisionConfig();

  // The configuration check comes first, then the empty-input check.
  if (!visionConfig) {
    return {
      success: false,
      description: '',
      error: '未配置 Vision 服务。请在配置文件中设置 visionProvider、visionApiKey 等参数。',
    };
  }
  if (images.length === 0) {
    return { success: false, description: '', error: '没有提供图片' };
  }

  let description: string;
  try {
    description = await callVisionAPI(visionConfig, images, prompt);
  } catch (err) {
    // Report the failure in the result instead of propagating the exception.
    const message = err instanceof Error ? err.message : String(err);
    return { success: false, description: '', error: message };
  }

  return { success: true, description };
}
/**
 * Call the Vision API.
 *
 * Issues a raw `fetch` POST to `<baseUrl>/chat/completions` with the images
 * encoded as data URLs followed by a text prompt.
 *
 * @param config - Vision service configuration (provider, model, apiKey, baseUrl).
 * @param images - Images to include in the request.
 * @param userPrompt - Optional prompt; a default description prompt is used when empty.
 * @returns The text content of the first choice in the response.
 * @throws Error when the provider is not 'openai', the HTTP response is not
 *         OK, or the response carries no text.
 */
async function callVisionAPI(
  config: VisionConfig,
  images: ImageData[],
  userPrompt?: string
): Promise<string> {
  // Only the 'openai' provider value is accepted here.
  if (config.provider !== 'openai') {
    throw new Error(`暂不支持 ${config.provider} 的 Vision 服务`);
  }

  type TextPart = { type: 'text'; text: string };
  type ImagePart = { type: 'image_url'; image_url: { url: string } };

  // Images are sent first, each as a data URL.
  const imageParts = images.map(
    (img): ImagePart => ({
      type: 'image_url',
      image_url: {
        url: `data:${img.mimeType};base64,${img.data}`,
      },
    })
  );

  // The default prompt wording depends on whether one or several images are sent.
  let fallbackPrompt: string;
  if (images.length === 1) {
    fallbackPrompt = '请详细描述这张图片的内容,包括主要元素、文字、颜色、布局等信息。';
  } else {
    fallbackPrompt = `请详细描述这 ${images.length} 张图片的内容,包括主要元素、文字、颜色、布局等信息。`;
  }
  const textPart: TextPart = { type: 'text', text: userPrompt || fallbackPrompt };

  const content: Array<TextPart | ImagePart> = [...imageParts, textPart];

  const requestBody = {
    model: config.model,
    messages: [{ role: 'user', content }],
    max_tokens: 2000,
  };

  // Strip one trailing slash from the base URL before appending the path.
  const root = (config.baseUrl || 'https://api.openai.com/v1').replace(/\/$/, '');
  const endpoint = `${root}/chat/completions`;

  const response = await fetch(endpoint, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${config.apiKey}`,
    },
    body: JSON.stringify(requestBody),
  });

  if (!response.ok) {
    const rawBody = await response.text();
    let message = `API 请求失败: ${response.status} ${response.statusText}`;
    try {
      // Prefer the server-provided error message when the body is JSON.
      const parsed = JSON.parse(rawBody);
      if (parsed.error?.message) {
        message = parsed.error.message;
      }
    } catch {
      // Otherwise append the raw body, if any, to the status line.
      if (rawBody) {
        message += ` - ${rawBody}`;
      }
    }
    throw new Error(message);
  }

  const parsedResponse = (await response.json()) as {
    choices?: Array<{
      message?: {
        content?: string;
      };
    }>;
  };

  const text = parsedResponse.choices?.[0]?.message?.content;
  if (!text) {
    throw new Error('API 返回了空响应');
  }
  return text;
}
/**
 * Tell whether a Vision service is available, i.e. whether
 * `loadVisionConfig()` yields something other than `null`.
 *
 * @returns `true` when a Vision configuration was loaded.
 */
export function isVisionAvailable(): boolean {
  return loadVisionConfig() !== null;
}
/**
 * Summarise the Vision configuration for display purposes.
 *
 * @returns `{ available: false }` when no configuration is loaded; otherwise
 *          `available: true` together with the configured provider and model.
 */
export function getVisionInfo(): { available: boolean; provider?: string; model?: string } {
  const visionConfig = loadVisionConfig();
  return visionConfig
    ? {
        available: true,
        provider: visionConfig.provider,
        model: visionConfig.model,
      }
    : { available: false };
}