From 60a31b7a55f067cf592045e2911af7dae53bc789 Mon Sep 17 00:00:00 2001 From: lupengfei-johnson <330774162@qq.com> Date: Sun, 21 Sep 2025 22:20:41 +0800 Subject: [PATCH] Add adaptation for the Doubao-Seed-1.6-vision model --- .gitignore | 8 + docker/docker-compose.yml | 2 + packages/bytebot-agent/.prettierrc | 5 +- .../bytebot-agent/src/agent/agent.module.ts | 2 + .../src/agent/agent.processor.ts | 3 + .../bytebot-agent/src/agent/agent.types.ts | 2 +- .../src/doubao/doubao.constants.ts | 12 + .../bytebot-agent/src/doubao/doubao.module.ts | 10 + .../src/doubao/doubao.service.ts | 438 ++++++++++++++++++ .../bytebot-agent/src/doubao/doubao.tools.ts | 69 +++ .../src/tasks/tasks.controller.ts | 3 + 11 files changed, 551 insertions(+), 3 deletions(-) create mode 100644 packages/bytebot-agent/src/doubao/doubao.constants.ts create mode 100644 packages/bytebot-agent/src/doubao/doubao.module.ts create mode 100644 packages/bytebot-agent/src/doubao/doubao.service.ts create mode 100644 packages/bytebot-agent/src/doubao/doubao.tools.ts diff --git a/.gitignore b/.gitignore index c072cfec..518f2fe0 100644 --- a/.gitignore +++ b/.gitignore @@ -197,3 +197,11 @@ report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json # QEMU *.qcow2 + +docker/start-proxy.bat +docker/start.bat +docker/stop-proxy.bat +docker/stop.bat +packages/bytebot-agent-build.bat +packages/Dockerfile +.prettierrc diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 7d4c1e1d..5ae1ed74 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -51,6 +51,8 @@ services: - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} - OPENAI_API_KEY=${OPENAI_API_KEY} - GEMINI_API_KEY=${GEMINI_API_KEY} + - DOUBAO_API_KEY=${DOUBAO_API_KEY} + - DOUBAO_BASE_URL=${DOUBAO_BASE_URL} depends_on: - postgres networks: diff --git a/packages/bytebot-agent/.prettierrc b/packages/bytebot-agent/.prettierrc index dcb72794..4510dc5c 100644 --- a/packages/bytebot-agent/.prettierrc +++ b/packages/bytebot-agent/.prettierrc @@ -1,4 +1,5 @@ { 
"singleQuote": true, - "trailingComma": "all" -} \ No newline at end of file + "trailingComma": "all", + "endOfLine": "auto" +} diff --git a/packages/bytebot-agent/src/agent/agent.module.ts b/packages/bytebot-agent/src/agent/agent.module.ts index 40e651ab..6e6ef718 100644 --- a/packages/bytebot-agent/src/agent/agent.module.ts +++ b/packages/bytebot-agent/src/agent/agent.module.ts @@ -11,6 +11,7 @@ import { GoogleModule } from '../google/google.module'; import { SummariesModule } from 'src/summaries/summaries.modue'; import { AgentAnalyticsService } from './agent.analytics'; import { ProxyModule } from 'src/proxy/proxy.module'; +import { DoubaoModule } from '../doubao/doubao.module'; @Module({ imports: [ @@ -22,6 +23,7 @@ import { ProxyModule } from 'src/proxy/proxy.module'; OpenAIModule, GoogleModule, ProxyModule, + DoubaoModule, ], providers: [ AgentProcessor, diff --git a/packages/bytebot-agent/src/agent/agent.processor.ts b/packages/bytebot-agent/src/agent/agent.processor.ts index c48912fa..6ceca340 100644 --- a/packages/bytebot-agent/src/agent/agent.processor.ts +++ b/packages/bytebot-agent/src/agent/agent.processor.ts @@ -39,6 +39,7 @@ import { import { SummariesService } from '../summaries/summaries.service'; import { handleComputerToolUse } from './agent.computer-use'; import { ProxyService } from '../proxy/proxy.service'; +import { DoubaoService } from '../doubao/doubao.service'; @Injectable() export class AgentProcessor { @@ -57,12 +58,14 @@ export class AgentProcessor { private readonly googleService: GoogleService, private readonly proxyService: ProxyService, private readonly inputCaptureService: InputCaptureService, + private readonly doubaoService: DoubaoService, ) { this.services = { anthropic: this.anthropicService, openai: this.openaiService, google: this.googleService, proxy: this.proxyService, + doubao: this.doubaoService, }; this.logger.log('AgentProcessor initialized'); } diff --git a/packages/bytebot-agent/src/agent/agent.types.ts 
b/packages/bytebot-agent/src/agent/agent.types.ts
index 981ee0eb..0eb02d31 100644
--- a/packages/bytebot-agent/src/agent/agent.types.ts
+++ b/packages/bytebot-agent/src/agent/agent.types.ts
@@ -21,7 +21,7 @@ export interface BytebotAgentService {
 }
 
 export interface BytebotAgentModel {
-  provider: 'anthropic' | 'openai' | 'google' | 'proxy';
+  provider: 'anthropic' | 'openai' | 'google' | 'proxy' | 'doubao';
   name: string;
   title: string;
   contextWindow?: number;
diff --git a/packages/bytebot-agent/src/doubao/doubao.constants.ts b/packages/bytebot-agent/src/doubao/doubao.constants.ts
new file mode 100644
index 00000000..a670a778
--- /dev/null
+++ b/packages/bytebot-agent/src/doubao/doubao.constants.ts
@@ -0,0 +1,18 @@
+import { BytebotAgentModel } from 'src/agent/agent.types';
+
+/**
+ * Doubao models selectable in Bytebot.
+ */
+export const DOUBAO_MODELS: BytebotAgentModel[] = [
+  {
+    provider: 'doubao',
+    name: 'doubao-seed-1-6-vision-250815',
+    title: 'doubao-seed-1-6-vision',
+    // Doubao-Seed-1.6 models document a 256k-token context window; the
+    // previous value (1047576) overstated it.
+    contextWindow: 256000,
+  },
+];
+
+/** First (and currently only) Doubao model, exported as the default. */
+export const DEFAULT_MODEL = DOUBAO_MODELS[0];
diff --git a/packages/bytebot-agent/src/doubao/doubao.module.ts b/packages/bytebot-agent/src/doubao/doubao.module.ts
new file mode 100644
index 00000000..ac3b2b2b
--- /dev/null
+++ b/packages/bytebot-agent/src/doubao/doubao.module.ts
@@ -0,0 +1,14 @@
+import { Module } from '@nestjs/common';
+import { ConfigModule } from '@nestjs/config';
+import { DoubaoService } from './doubao.service';
+
+/**
+ * Provides and exports DoubaoService; imports ConfigModule for the
+ * ConfigService the service injects.
+ */
+@Module({
+  imports: [ConfigModule],
+  providers: [DoubaoService],
+  exports: [DoubaoService],
+})
+export class DoubaoModule {}
diff --git a/packages/bytebot-agent/src/doubao/doubao.service.ts b/packages/bytebot-agent/src/doubao/doubao.service.ts
new file mode 100644
index 00000000..64899206
--- /dev/null
+++ b/packages/bytebot-agent/src/doubao/doubao.service.ts
@@ -0,0 +1,484 @@
+import { Injectable, Logger } from '@nestjs/common';
+import { ConfigService } from '@nestjs/config';
+import OpenAI, { APIUserAbortError } from 'openai';
+import {
+  ChatCompletionMessageParam,
+  ChatCompletionContentPart,
+} from 'openai/resources/chat/completions';
+import {
+  MessageContentBlock,
+  MessageContentType,
+  TextContentBlock,
+  ToolUseContentBlock,
+  ToolResultContentBlock,
+  ImageContentBlock,
+  isUserActionContentBlock,
+  isComputerToolUseContentBlock,
+  isImageContentBlock,
+  ThinkingContentBlock,
+} from '@bytebot/shared';
+import { Message, Role } from '@prisma/client';
+import { doubaoTools } from './doubao.tools';
+import {
+  BytebotAgentService,
+  BytebotAgentInterrupt,
+  BytebotAgentResponse,
+} from '../agent/agent.types';
+import { DEFAULT_DISPLAY_SIZE } from '../agent/agent.constants';
+import { DEFAULT_MODEL } from './doubao.constants';
+
+/** Endpoint used when DOUBAO_BASE_URL is not configured. */
+const DEFAULT_BASE_URL = 'https://ark.cn-beijing.volces.com/api/v3';
+
+/** Doubao returns coordinates based on a 1000*1000 screen. */
+const DOUBAO_COORDINATE_SPACE = 1000;
+
+/**
+ * Tool call that Doubao placed in `content`, wrapped in placeholder tokens.
+ * Group 1 is the JSON between the square brackets.
+ */
+const PLACEHOLDER_TOOL_CALL_PATTERN =
+  /<\[PLHD\d+_never_used_[a-f0-9]+\]>\[(.*?)\]<\[PLHD\d+_never_used_[a-f0-9]+\]>/s;
+
+/**
+ * Tool call that Doubao placed in `content`, terminated by
+ * `<|FunctionCallEnd|>`. Group 1 starts at a `{`, so leading text, a begin
+ * marker or the array brackets never end up in the captured JSON.
+ */
+const FUNCTION_CALL_PATTERN = /(\{.*?\})\s*\]?\s*<\|FunctionCallEnd\|>/s;
+
+/** Assistant request message carrying the extra `reasoning_content` field. */
+type DoubaoAssistantMessageParam =
+  OpenAI.Chat.ChatCompletionAssistantMessageParam & {
+    reasoning_content?: string;
+  };
+
+/** Response message carrying the extra `reasoning_content` field. */
+type DoubaoCompletionMessage = OpenAI.Chat.ChatCompletionMessage & {
+  reasoning_content?: string;
+};
+
+/** Narrows a value to a plain (non-null, non-array) object. */
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === 'object' && value !== null && !Array.isArray(value);
+}
+
+/** Converts one axis value from Doubao's grid to whole display pixels. */
+function scaleAxis(value: number, displayExtent: number): number {
+  return Math.round((value / DOUBAO_COORDINATE_SPACE) * displayExtent);
+}
+
+/**
+ * Agent service for Doubao models, talking to the Chat Completions API
+ * through the OpenAI SDK client configured with the Doubao base URL.
+ */
+@Injectable()
+export class DoubaoService implements BytebotAgentService {
+  private readonly openai: OpenAI;
+  private readonly logger = new Logger(DoubaoService.name);
+
+  constructor(private readonly configService: ConfigService) {
+    const apiKey = this.configService.get<string>('DOUBAO_API_KEY');
+    const baseURL = this.configService.get<string>('DOUBAO_BASE_URL');
+
+    if (!apiKey) {
+      this.logger.warn(
+        'DOUBAO_API_KEY is not set. DoubaoService will not work properly.',
+      );
+    }
+
+    // `||` rather than `??`: an empty string also counts as "not configured".
+    this.openai = new OpenAI({
+      apiKey: apiKey || 'dummy-key-for-initialization',
+      baseURL: baseURL || DEFAULT_BASE_URL,
+    });
+  }
+
+  /**
+   * Sends the conversation to the Chat Completions API and converts the reply
+   * into Bytebot content blocks.
+   *
+   * @param systemPrompt Text sent as the leading system message.
+   * @param messages Stored conversation messages, in order.
+   * @param model Model name passed to the API.
+   * @param useTools Whether the tool definitions are attached to the request.
+   * @param signal Optional signal used to abort the request.
+   * @returns The reply's content blocks and the reported token usage.
+   * @throws BytebotAgentInterrupt when the request was aborted.
+   */
+  async generateMessage(
+    systemPrompt: string,
+    messages: Message[],
+    model: string,
+    useTools: boolean = true,
+    signal?: AbortSignal,
+  ): Promise<BytebotAgentResponse> {
+    const chatMessages = this.formatMessagesForChatCompletion(
+      systemPrompt,
+      messages,
+    );
+    // Only the count is logged: the payload embeds base64 screenshots, and
+    // serializing it on every call is expensive even when debug is off.
+    this.logger.debug(`chatMessages count: ${chatMessages.length}`);
+
+    try {
+      const completionRequest: OpenAI.Chat.ChatCompletionCreateParams = {
+        model,
+        messages: chatMessages,
+        max_tokens: 8192,
+        ...(useTools && { tools: doubaoTools }),
+        reasoning_effort: 'high',
+      };
+
+      const completion = await this.openai.chat.completions.create(
+        completionRequest,
+        { signal },
+      );
+
+      this.logger.debug('completion' + JSON.stringify(completion, null, 2));
+
+      const choice = completion.choices[0];
+      if (!choice || !choice.message) {
+        throw new Error('No valid response from Chat Completion API');
+      }
+
+      const contentBlocks = this.formatChatCompletionResponse(
+        choice.message,
+        model,
+      );
+
+      this.logger.debug(
+        'contentBlocks' + JSON.stringify(contentBlocks, null, 2),
+      );
+
+      return {
+        contentBlocks,
+        tokenUsage: {
+          inputTokens: completion.usage?.prompt_tokens || 0,
+          outputTokens: completion.usage?.completion_tokens || 0,
+          totalTokens: completion.usage?.total_tokens || 0,
+        },
+      };
+    } catch (error: unknown) {
+      if (error instanceof APIUserAbortError) {
+        this.logger.log('Chat Completion API call aborted');
+        throw new BytebotAgentInterrupt();
+      }
+
+      const details =
+        error instanceof Error ? error : new Error(String(error));
+      this.logger.error(
+        `Error sending message to Doubao: ${details.message}`,
+        details.stack,
+      );
+      throw error;
+    }
+  }
+
+  /**
+   * Convert Bytebot messages to Chat Completion format.
+   *
+   * The system prompt becomes the first message and every content block
+   * becomes its own chat message. Images are always sent with the `user`
+   * role, including those that belong to a tool result.
+   */
+  private formatMessagesForChatCompletion(
+    systemPrompt: string,
+    messages: Message[],
+  ): ChatCompletionMessageParam[] {
+    const chatMessages: ChatCompletionMessageParam[] = [
+      { role: 'system', content: systemPrompt },
+    ];
+
+    for (const message of messages) {
+      const messageContentBlocks = message.content as MessageContentBlock[];
+
+      // Messages made only of user actions are replayed as user input.
+      if (
+        messageContentBlocks.every((block) => isUserActionContentBlock(block))
+      ) {
+        const userActionBlocks = messageContentBlocks.flatMap(
+          (block) => block.content,
+        );
+
+        for (const block of userActionBlocks) {
+          if (isComputerToolUseContentBlock(block)) {
+            chatMessages.push({
+              role: 'user',
+              content: `User performed action: ${block.name}\n${JSON.stringify(
+                block.input,
+                null,
+                2,
+              )}`,
+            });
+          } else if (isImageContentBlock(block)) {
+            chatMessages.push({
+              role: 'user',
+              content: [this.toImagePart(block.source)],
+            });
+          }
+        }
+        continue;
+      }
+
+      for (const block of messageContentBlocks) {
+        switch (block.type) {
+          case MessageContentType.Text: {
+            chatMessages.push({
+              role: message.role === Role.USER ? 'user' : 'assistant',
+              content: block.text,
+            });
+            break;
+          }
+          case MessageContentType.Image: {
+            const imageBlock = block as ImageContentBlock;
+            chatMessages.push({
+              role: 'user',
+              content: [this.toImagePart(imageBlock.source)],
+            });
+            break;
+          }
+          case MessageContentType.ToolUse: {
+            const toolBlock = block as ToolUseContentBlock;
+            chatMessages.push({
+              role: 'assistant',
+              content: '',
+              tool_calls: [
+                {
+                  id: toolBlock.id,
+                  type: 'function',
+                  function: {
+                    name: toolBlock.name,
+                    arguments: JSON.stringify(toolBlock.input),
+                  },
+                },
+              ],
+            });
+            break;
+          }
+          case MessageContentType.Thinking: {
+            const thinkingBlock = block as ThinkingContentBlock;
+            // Named so that it does not shadow the loop's `message`.
+            const thinkingMessage: DoubaoAssistantMessageParam = {
+              role: 'assistant',
+              content: '',
+              reasoning_content: thinkingBlock.thinking,
+            };
+            chatMessages.push(thinkingMessage);
+            break;
+          }
+          case MessageContentType.ToolResult: {
+            const toolResultBlock = block as ToolResultContentBlock;
+
+            // Image-only results still get a `tool` message for their call
+            // id; the images themselves follow as user messages.
+            if (
+              toolResultBlock.content.every(
+                (content) => content.type === MessageContentType.Image,
+              )
+            ) {
+              chatMessages.push({
+                role: 'tool',
+                tool_call_id: toolResultBlock.tool_use_id,
+                content: 'screenshot',
+              });
+            }
+
+            for (const content of toolResultBlock.content) {
+              if (content.type === MessageContentType.Text) {
+                chatMessages.push({
+                  role: 'tool',
+                  tool_call_id: toolResultBlock.tool_use_id,
+                  content: content.text,
+                });
+              }
+
+              if (content.type === MessageContentType.Image) {
+                chatMessages.push({
+                  role: 'user',
+                  content: [
+                    { type: 'text', text: 'Screenshot' },
+                    this.toImagePart(content.source),
+                  ],
+                });
+              }
+            }
+            break;
+          }
+        }
+      }
+    }
+
+    return chatMessages;
+  }
+
+  /** Builds a high-detail `image_url` part holding the image as a data URL. */
+  private toImagePart(
+    source: ImageContentBlock['source'],
+  ): ChatCompletionContentPart {
+    return {
+      type: 'image_url',
+      image_url: {
+        url: `data:${source.media_type};base64,${source.data}`,
+        detail: 'high',
+      },
+    };
+  }
+
+  /**
+   * Convert Chat Completion response to MessageContentBlocks.
+   *
+   * Blocks are emitted in this order: content (inline tool calls or text),
+   * reasoning, structured tool calls, refusal.
+   *
+   * @param message Assistant message taken from the first completion choice.
+   * @param _model Model name; currently unused.
+   */
+  private formatChatCompletionResponse(
+    message: OpenAI.Chat.ChatCompletionMessage,
+    _model: string,
+  ): MessageContentBlock[] {
+    const contentBlocks: MessageContentBlock[] = [];
+
+    if (message.content) {
+      // Doubao sometimes places the JSON for tool_calls in the content field.
+      // When nothing usable can be extracted the content is kept as text
+      // instead of being dropped.
+      const inlineToolCalls = this.extractInlineToolCalls(message.content);
+      if (inlineToolCalls.length > 0) {
+        contentBlocks.push(...inlineToolCalls);
+      } else {
+        contentBlocks.push({
+          type: MessageContentType.Text,
+          text: message.content,
+        } as TextContentBlock);
+      }
+    }
+
+    const reasoning = (message as DoubaoCompletionMessage).reasoning_content;
+    if (reasoning) {
+      contentBlocks.push({
+        type: MessageContentType.Thinking,
+        thinking: reasoning,
+        signature: reasoning,
+      } as ThinkingContentBlock);
+    }
+
+    for (const toolCall of message.tool_calls ?? []) {
+      if (toolCall.type === 'function') {
+        contentBlocks.push({
+          type: MessageContentType.ToolUse,
+          id: toolCall.id,
+          name: toolCall.function.name,
+          input: this.parseToolArguments(toolCall.function.arguments),
+        } as ToolUseContentBlock);
+      }
+    }
+
+    if (message.refusal) {
+      contentBlocks.push({
+        type: MessageContentType.Text,
+        text: `Refusal: ${message.refusal}`,
+      } as TextContentBlock);
+    }
+
+    return contentBlocks;
+  }
+
+  /**
+   * Extracts tool calls that Doubao wrote into `content` instead of
+   * `tool_calls`. Returns an empty array when `content` holds no recognizable
+   * or parseable tool call.
+   */
+  private extractInlineToolCalls(content: string): ToolUseContentBlock[] {
+    const match =
+      content.match(PLACEHOLDER_TOOL_CALL_PATTERN) ??
+      content.match(FUNCTION_CALL_PATTERN);
+    if (!match) {
+      return [];
+    }
+
+    this.logger.debug(
+      'Doubao places the JSON for tool_calls in the content field,content:' +
+        JSON.stringify(content),
+    );
+
+    try {
+      // Wrapping in brackets accepts one object as well as a comma-separated
+      // list of them.
+      const parsed: unknown = JSON.parse(`[${match[1]}]`);
+      const entries: unknown[] = Array.isArray(parsed) ? parsed : [];
+
+      return entries.flatMap((entry) => {
+        if (!isRecord(entry) || typeof entry.name !== 'string') {
+          return [];
+        }
+        return [
+          {
+            type: MessageContentType.ToolUse,
+            id: `call_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`,
+            name: entry.name,
+            input: this.scaleCoordinates(entry.parameters),
+          } as ToolUseContentBlock,
+        ];
+      });
+    } catch (error: unknown) {
+      this.logger.error(
+        'extract from content and moved into the tool_calls field error:',
+        error,
+      );
+      return [];
+    }
+  }
+
+  /**
+   * Parses the JSON arguments of a structured tool call. Unparseable
+   * arguments are logged and replaced by an empty input.
+   */
+  private parseToolArguments(rawArguments: string): Record<string, unknown> {
+    try {
+      return this.scaleCoordinates(JSON.parse(rawArguments || '{}'));
+    } catch {
+      this.logger.warn(
+        `Failed to parse tool call arguments: ${rawArguments}`,
+      );
+      return {};
+    }
+  }
+
+  /**
+   * Doubao returns coordinates based on a 1000*1000 screen; this converts
+   * `input.coordinates` to display pixels. Only integer `x`/`y` values are
+   * converted, everything else is passed through unchanged. Non-object
+   * inputs yield an empty object.
+   *
+   * NOTE(review): only `coordinates` is converted. If any tool also takes a
+   * list of points (e.g. a `path`), it presumably needs the same treatment;
+   * verify against the tool schemas.
+   */
+  private scaleCoordinates(input: unknown): Record<string, unknown> {
+    if (!isRecord(input)) {
+      return {};
+    }
+
+    const { coordinates } = input;
+    if (!isRecord(coordinates)) {
+      return input;
+    }
+
+    const scaled: Record<string, unknown> = { ...coordinates };
+    if (typeof coordinates.x === 'number' && Number.isInteger(coordinates.x)) {
+      scaled.x = scaleAxis(coordinates.x, DEFAULT_DISPLAY_SIZE.width);
+    }
+    if (typeof coordinates.y === 'number' && Number.isInteger(coordinates.y)) {
+      scaled.y = scaleAxis(coordinates.y, DEFAULT_DISPLAY_SIZE.height);
+    }
+
+    this.logger.debug(
+      `doubao coordinates transfer: ${JSON.stringify(coordinates)} -> ` +
+        JSON.stringify(scaled),
+    );
+
+    return { ...input, coordinates: scaled };
+  }
+}
diff --git a/packages/bytebot-agent/src/doubao/doubao.tools.ts b/packages/bytebot-agent/src/doubao/doubao.tools.ts
new file mode 100644
index 00000000..6dee4a8d
--- /dev/null
+++ b/packages/bytebot-agent/src/doubao/doubao.tools.ts
@@ -0,0 +1,70 @@
+import { ChatCompletionTool } from 'openai/resources';
+import { agentTools } from '../agent/agent.tools';
+
+/**
+ * Converts an agent tool definition to OpenAI Chat Completion tool format.
+ * Reads `name`, `description` and `input_schema` from the definition.
+ */
+function agentToolToChatCompletionTool(agentTool: any): ChatCompletionTool {
+  return {
+    type: 'function',
+    function: {
+      name: agentTool.name,
+      description: agentTool.description,
+      parameters: agentTool.input_schema,
+    },
+  };
+}
+
+/**
+ * Convert a tool name from snake_case to camelCase, dropping `computer`
+ * segments: `computer_move_mouse` -> `moveMouse`,
+ * `set_task_status` -> `setTaskStatus`.
+ */
+function convertToCamelCase(name: string): string {
+  return name
+    .split('_')
+    .filter((part) => part !== 'computer')
+    .map((part, index) =>
+      index === 0 ? part : part.charAt(0).toUpperCase() + part.slice(1),
+    )
+    .join('');
+}
+
+/**
+ * All tools converted to Chat Completion format
+ */
+export const doubaoTools: ChatCompletionTool[] = agentTools.map((tool) =>
+  agentToolToChatCompletionTool(tool),
+);
+
+/**
+ * Tools keyed by `<camelCaseName>Tool`, backing the individual exports below.
+ */
+const toolMap = agentTools.reduce(
+  (acc, tool) => {
+    const chatCompletionTool = agentToolToChatCompletionTool(tool);
+    const camelCaseName = convertToCamelCase(tool.name);
+    acc[camelCaseName + 'Tool'] = chatCompletionTool;
+    return acc;
+  },
+  {} as Record<string, ChatCompletionTool>,
+);
+
+// Export individual tools with proper names
+export const moveMouseTool = toolMap.moveMouseTool;
+export const traceMouseTool = toolMap.traceMouseTool;
+export const clickMouseTool = toolMap.clickMouseTool;
+export const pressMouseTool = toolMap.pressMouseTool;
+export const dragMouseTool = toolMap.dragMouseTool;
+export const scrollTool = toolMap.scrollTool;
+export const typeKeysTool = toolMap.typeKeysTool;
+export const pressKeysTool = toolMap.pressKeysTool;
+export const typeTextTool = toolMap.typeTextTool;
+export const pasteTextTool = toolMap.pasteTextTool;
+export const waitTool = toolMap.waitTool;
+export const screenshotTool = toolMap.screenshotTool;
+export const cursorPositionTool = toolMap.cursorPositionTool;
+export const setTaskStatusTool = toolMap.setTaskStatusTool;
+export const createTaskTool = toolMap.createTaskTool;
+export const applicationTool = toolMap.applicationTool;
diff --git a/packages/bytebot-agent/src/tasks/tasks.controller.ts b/packages/bytebot-agent/src/tasks/tasks.controller.ts
index 982c4a4f..a2962523 100644
--- a/packages/bytebot-agent/src/tasks/tasks.controller.ts
+++ b/packages/bytebot-agent/src/tasks/tasks.controller.ts
@@ -19,10 +19,12 @@ import { ANTHROPIC_MODELS } from '../anthropic/anthropic.constants';
 import { OPENAI_MODELS } from '../openai/openai.constants';
 import { GOOGLE_MODELS } from '../google/google.constants';
 import { BytebotAgentModel } from 'src/agent/agent.types';
+import { DOUBAO_MODELS } from '../doubao/doubao.constants';
 
 const geminiApiKey = process.env.GEMINI_API_KEY;
 const anthropicApiKey = process.env.ANTHROPIC_API_KEY;
 const openaiApiKey = process.env.OPENAI_API_KEY;
+const doubaoApiKey = process.env.DOUBAO_API_KEY;
 
 const proxyUrl = process.env.BYTEBOT_LLM_PROXY_URL;
 
@@ -30,6 +32,7 @@ const models = [
   ...(anthropicApiKey ? ANTHROPIC_MODELS : []),
   ...(openaiApiKey ? OPENAI_MODELS : []),
   ...(geminiApiKey ? GOOGLE_MODELS : []),
+  ...(doubaoApiKey ? DOUBAO_MODELS : []),
 ];
 
 @Controller('tasks')