Skip to content

Commit 592afd9

Browse files
authored
Handle text case in computer-use tool (#543)
## Summary

Change the computer-use tool to handle text content in addition to image content. Previously, the tool accepted only a single image block; it can now also process text blocks, mixed content (text + images), and multiple text blocks.

- Relaxed validation to allow multiple content blocks instead of requiring exactly one
- Added support for text-based tool results with proper concatenation
- Enhanced error messages to be more descriptive about content requirements

## How was it tested?

Added unit tests.

## Community Contribution License

All community contributions in this pull request are licensed to the project maintainers under the terms of the [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0).

By creating this pull request I represent that I have the right to license the contributions to the project maintainers under the Apache 2 License as stated in the [Community Contribution License](https://github.com/jetify-com/opensource/blob/main/CONTRIBUTING.md#community-contribution-license).
1 parent 5570197 commit 592afd9

File tree

2 files changed

+189
-41
lines changed

2 files changed

+189
-41
lines changed

aisdk/ai/provider/openai/internal/codec/encode_prompt.go

Lines changed: 88 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"encoding/base64"
55
"encoding/json"
66
"fmt"
7+
"strings"
78

89
"github.com/openai/openai-go"
910
"github.com/openai/openai-go/responses"
@@ -409,52 +410,104 @@ func EncodeToolMessage(message *api.ToolMessage) ([]responses.ResponseInputItemU
409410

410411
// encodeComputerToolResult handles encoding the result of a computer use tool call
411412
func encodeComputerToolResult(result *api.ToolResultBlock) (responses.ResponseInputItemUnionParam, error) {
412-
if len(result.Content) != 1 {
413-
return responses.ResponseInputItemUnionParam{}, fmt.Errorf("expected 1 content block for computer use tool result, got %d", len(result.Content))
414-
}
413+
// No content at all - this is an error
414+
if len(result.Content) == 0 {
415+
return responses.ResponseInputItemUnionParam{}, fmt.Errorf("expected at least 1 content block for computer use tool result, got 0")
416+
}
417+
418+
// Single content block - try to handle as screenshot
419+
if len(result.Content) == 1 {
420+
content := result.Content[0]
421+
var imageBlock *api.ImageBlock
422+
switch b := content.(type) {
423+
case *api.ImageBlock:
424+
imageBlock = b
425+
case api.ImageBlock:
426+
imageBlock = &b
427+
default:
428+
// Single non-image block - check if it's text
429+
if _, ok := content.(*api.TextBlock); ok {
430+
return encodeTextToolResult(result)
431+
}
432+
// Single block that's neither image nor text - this is invalid
433+
return responses.ResponseInputItemUnionParam{}, fmt.Errorf("computer use tool result has 1 content block of type %s, expected image or text", content.Type())
434+
}
415435

416-
content := result.Content[0]
417-
var imageBlock *api.ImageBlock
418-
switch b := content.(type) {
419-
case *api.ImageBlock:
420-
imageBlock = b
421-
case api.ImageBlock:
422-
imageBlock = &b
423-
default:
424-
return responses.ResponseInputItemUnionParam{}, fmt.Errorf("expected image block for computer use tool result, got %T", content)
436+
// Create data URL from image data
437+
// TODO: Add helper methods to the image and file blocks to make this easier
438+
dataURL := "data:" + imageBlock.MediaType + ";base64," + base64.StdEncoding.EncodeToString(imageBlock.Data)
439+
440+
screenshot := responses.ResponseComputerToolCallOutputScreenshotParam{
441+
Type: "computer_screenshot",
442+
ImageURL: openai.String(dataURL),
443+
}
444+
445+
// Extract safety checks from provider metadata if available
446+
var acknowledgedSafetyChecks []responses.ResponseInputItemComputerCallOutputAcknowledgedSafetyCheckParam
447+
if metadata := GetMetadata(result); metadata != nil {
448+
for _, check := range metadata.ComputerSafetyChecks {
449+
acknowledgedSafetyChecks = append(acknowledgedSafetyChecks, responses.ResponseInputItemComputerCallOutputAcknowledgedSafetyCheckParam{
450+
ID: check.ID,
451+
Code: openai.String(check.Code),
452+
Message: openai.String(check.Message),
453+
})
454+
}
455+
}
456+
457+
// Create the computer call output parameter
458+
output := responses.ResponseInputItemComputerCallOutputParam{
459+
CallID: result.ToolCallID,
460+
Output: screenshot,
461+
AcknowledgedSafetyChecks: acknowledgedSafetyChecks,
462+
}
463+
464+
return responses.ResponseInputItemUnionParam{
465+
OfComputerCallOutput: &output,
466+
}, nil
425467
}
426468

427-
// Create data URL from image data
428-
// TODO: Add helper methods to the image and file blocks to make this easier
429-
dataURL := "data:" + imageBlock.MediaType + ";base64," + base64.StdEncoding.EncodeToString(imageBlock.Data)
469+
// Multiple blocks - check if any are text
470+
hasText := false
471+
for _, content := range result.Content {
472+
if _, ok := content.(*api.TextBlock); ok {
473+
hasText = true
474+
break
475+
}
476+
}
430477

431-
screenshot := responses.ResponseComputerToolCallOutputScreenshotParam{
432-
Type: "computer_screenshot",
433-
ImageURL: openai.String(dataURL),
478+
if !hasText {
479+
// Multiple blocks but none are text - this is ambiguous
480+
return responses.ResponseInputItemUnionParam{}, fmt.Errorf("computer use tool result has %d content blocks but no text content", len(result.Content))
434481
}
435482

436-
// Extract safety checks from provider metadata if available
437-
var acknowledgedSafetyChecks []responses.ResponseInputItemComputerCallOutputAcknowledgedSafetyCheckParam
438-
if metadata := GetMetadata(result); metadata != nil {
439-
for _, check := range metadata.ComputerSafetyChecks {
440-
acknowledgedSafetyChecks = append(acknowledgedSafetyChecks, responses.ResponseInputItemComputerCallOutputAcknowledgedSafetyCheckParam{
441-
ID: check.ID,
442-
Code: openai.String(check.Code),
443-
Message: openai.String(check.Message),
444-
})
483+
// Has text content - use fallback
484+
return encodeTextToolResult(result)
485+
}
486+
487+
// encodeTextToolResult handles encoding text-based tool results, with Content[] taking precedence over Result
488+
func encodeTextToolResult(result *api.ToolResultBlock) (responses.ResponseInputItemUnionParam, error) {
489+
output := ""
490+
491+
// Check Content[] first - more expressive when available
492+
if len(result.Content) > 0 {
493+
for _, content := range result.Content {
494+
if textBlock, ok := content.(*api.TextBlock); ok {
495+
output += textBlock.Text + "\n"
496+
}
445497
}
498+
output = strings.TrimSuffix(output, "\n")
446499
}
447500

448-
// Create the computer call output parameter
449-
output := responses.ResponseInputItemComputerCallOutputParam{
450-
CallID: result.ToolCallID,
451-
Output: screenshot,
452-
AcknowledgedSafetyChecks: acknowledgedSafetyChecks,
501+
// If no text content found, use Result field
502+
if output == "" && result.Result != nil {
503+
resultJSON, err := json.Marshal(result.Result)
504+
if err != nil {
505+
return responses.ResponseInputItemUnionParam{}, fmt.Errorf("failed to marshal tool result: %v", err)
506+
}
507+
output = string(resultJSON)
453508
}
454509

455-
return responses.ResponseInputItemUnionParam{
456-
OfComputerCallOutput: &output,
457-
}, nil
510+
return responses.ResponseInputItemParamOfFunctionCallOutput(result.ToolCallID, output), nil
458511
}
459512

460513
func EncodeToolResultBlock(result *api.ToolResultBlock) (responses.ResponseInputItemUnionParam, error) {

aisdk/ai/provider/openai/internal/codec/encode_prompt_test.go

Lines changed: 101 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1023,10 +1023,10 @@ var toolMessageTests = []testCase{
10231023
},
10241024
},
10251025
},
1026-
expectedError: "expected 1 content block for computer use tool result, got 0",
1026+
expectedError: "expected at least 1 content block for computer use tool result, got 0",
10271027
},
10281028
{
1029-
name: "computer tool result with multiple content blocks",
1029+
name: "computer tool result with multiple content blocks but no text",
10301030
input: []api.Message{
10311031
&api.ToolMessage{
10321032
Content: []api.ToolResultBlock{
@@ -1046,25 +1046,120 @@ var toolMessageTests = []testCase{
10461046
},
10471047
},
10481048
},
1049-
expectedError: "expected 1 content block for computer use tool result, got 2",
1049+
expectedError: "computer use tool result has 2 content blocks but no text content",
10501050
},
10511051
{
1052-
name: "computer tool result with wrong content type",
1052+
name: "computer tool result with text content",
10531053
input: []api.Message{
10541054
&api.ToolMessage{
10551055
Content: []api.ToolResultBlock{
10561056
{
10571057
ToolCallID: "openai.computer_use_preview",
10581058
Content: []api.ContentBlock{
10591059
&api.TextBlock{
1060-
Text: "not an image",
1060+
Text: "Action completed successfully",
10611061
},
10621062
},
10631063
},
10641064
},
10651065
},
10661066
},
1067-
expectedError: "expected image block for computer use tool result",
1067+
expectedMessages: []string{
1068+
`{
1069+
"type": "function_call_output",
1070+
"call_id": "openai.computer_use_preview",
1071+
"output": "Action completed successfully"
1072+
}`,
1073+
},
1074+
},
1075+
{
1076+
name: "computer tool result with mixed content (text and image)",
1077+
input: []api.Message{
1078+
&api.ToolMessage{
1079+
Content: []api.ToolResultBlock{
1080+
{
1081+
ToolCallID: "openai.computer_use_preview",
1082+
Content: []api.ContentBlock{
1083+
&api.TextBlock{
1084+
Text: "Screenshot taken",
1085+
},
1086+
&api.ImageBlock{
1087+
Data: []byte("test-image-data"),
1088+
MediaType: "image/png",
1089+
},
1090+
},
1091+
},
1092+
},
1093+
},
1094+
},
1095+
expectedMessages: []string{
1096+
`{
1097+
"type": "function_call_output",
1098+
"call_id": "openai.computer_use_preview",
1099+
"output": "Screenshot taken"
1100+
}`,
1101+
},
1102+
},
1103+
{
1104+
name: "computer tool result with multiple text blocks",
1105+
input: []api.Message{
1106+
&api.ToolMessage{
1107+
Content: []api.ToolResultBlock{
1108+
{
1109+
ToolCallID: "openai.computer_use_preview",
1110+
Content: []api.ContentBlock{
1111+
&api.TextBlock{
1112+
Text: "First line",
1113+
},
1114+
&api.TextBlock{
1115+
Text: "Second line",
1116+
},
1117+
},
1118+
},
1119+
},
1120+
},
1121+
},
1122+
expectedMessages: []string{
1123+
`{
1124+
"type": "function_call_output",
1125+
"call_id": "openai.computer_use_preview",
1126+
"output": "First line\nSecond line"
1127+
}`,
1128+
},
1129+
},
1130+
{
1131+
name: "computer tool result falls back to Result field",
1132+
input: []api.Message{
1133+
&api.ToolMessage{
1134+
Content: []api.ToolResultBlock{
1135+
{
1136+
ToolCallID: "openai.computer_use_preview",
1137+
Content: []api.ContentBlock{},
1138+
Result: json.RawMessage(`{"status":"error","message":"Action failed"}`),
1139+
},
1140+
},
1141+
},
1142+
},
1143+
expectedError: "expected at least 1 content block for computer use tool result, got 0",
1144+
},
1145+
{
1146+
name: "computer tool result with single reasoning block",
1147+
input: []api.Message{
1148+
&api.ToolMessage{
1149+
Content: []api.ToolResultBlock{
1150+
{
1151+
ToolCallID: "openai.computer_use_preview",
1152+
Content: []api.ContentBlock{
1153+
&api.ReasoningBlock{
1154+
Text: "Processing the request...",
1155+
Signature: "signature_123",
1156+
},
1157+
},
1158+
},
1159+
},
1160+
},
1161+
},
1162+
expectedError: "computer use tool result has 1 content block of type reasoning, expected image or text",
10681163
},
10691164
{
10701165
name: "tool message value type (not pointer)",

0 commit comments

Comments
 (0)