mirror of
https://github.com/anomalyco/opencode.git
synced 2026-05-23 04:26:05 +00:00
fix(llm): emit structured input_image content for tool-result media in OpenAI Responses (#28754)
This commit is contained in:
parent
59e486a917
commit
700d012025
5 changed files with 224 additions and 2 deletions
|
|
@ -14,6 +14,8 @@ import {
|
|||
type TextPart,
|
||||
type ToolCallPart,
|
||||
type ToolDefinition,
|
||||
type ToolResultContentPart,
|
||||
type ToolResultPart,
|
||||
} from "../schema"
|
||||
import { JsonObject, optionalArray, optionalNull, ProviderShared } from "./shared"
|
||||
import { OpenAIOptions } from "./utils/openai-options"
|
||||
|
|
@ -55,6 +57,19 @@ const OpenAIResponsesReasoningItem = Schema.Struct({
|
|||
encrypted_content: optionalNull(Schema.String),
|
||||
})
|
||||
|
||||
// `function_call_output.output` accepts either a plain string or an ordered
|
||||
// array of content items so tools can return images in addition to text.
|
||||
// https://platform.openai.com/docs/api-reference/responses/object
|
||||
// One item of a structured tool output: a union of the input-text and
// input-image schemas (both are declared outside this excerpt).
const OpenAIResponsesFunctionCallOutputContent = Schema.Union([
|
||||
OpenAIResponsesInputText,
|
||||
OpenAIResponsesInputImage,
|
||||
])
|
||||
|
||||
// Schema for the `output` field of a `function_call_output` input item:
// either a plain string or an array of
// OpenAIResponsesFunctionCallOutputContent items.
const OpenAIResponsesFunctionCallOutput = Schema.Union([
|
||||
Schema.String,
|
||||
Schema.Array(OpenAIResponsesFunctionCallOutputContent),
|
||||
])
|
||||
|
||||
const OpenAIResponsesInputItem = Schema.Union([
|
||||
Schema.Struct({ role: Schema.tag("system"), content: Schema.String }),
|
||||
Schema.Struct({ role: Schema.tag("user"), content: Schema.Array(OpenAIResponsesInputContent) }),
|
||||
|
|
@ -69,7 +84,7 @@ const OpenAIResponsesInputItem = Schema.Union([
|
|||
Schema.Struct({
|
||||
type: Schema.tag("function_call_output"),
|
||||
call_id: Schema.String,
|
||||
output: Schema.String,
|
||||
output: OpenAIResponsesFunctionCallOutput,
|
||||
}),
|
||||
])
|
||||
type OpenAIResponsesInputItem = Schema.Schema.Type<typeof OpenAIResponsesInputItem>
|
||||
|
|
@ -250,6 +265,27 @@ const lowerUserContent = Effect.fn("OpenAIResponses.lowerUserContent")(function*
|
|||
return yield* ProviderShared.unsupportedContent("OpenAI Responses", "user", ["text", "media"])
|
||||
})
|
||||
|
||||
// Tool results may carry structured text/images. Keep media as provider-native
|
||||
// content instead of JSON-stringifying base64 into a prompt string.
|
||||
// Lowers one tool-result content part into a Responses content item.
//   - text part                      -> { type: "input_text", text }
//   - media part with an image/* type -> { type: "input_image", image_url }
//   - any other media type           -> fails through `invalid(...)`, with a
//     message that names the offending media type.
// `invalid` and `ProviderShared.mediaDataUrl` are defined outside this excerpt.
const lowerToolResultContentItem = Effect.fn("OpenAIResponses.lowerToolResultContentItem")(function* (
|
||||
item: ToolResultContentPart,
|
||||
) {
|
||||
// Text parts map one-to-one onto `input_text`.
if (item.type === "text") return { type: "input_text" as const, text: item.text }
|
||||
// Only image media is accepted; the media type prefix is the sole check made here.
if (item.mediaType.startsWith("image/"))
|
||||
return {
|
||||
type: "input_image" as const,
|
||||
// The tests in this change expect a `data:<mediaType>;base64,<data>` URL here.
image_url: ProviderShared.mediaDataUrl(item),
|
||||
}
|
||||
// Everything else (e.g. audio) is rejected rather than silently dropped.
return yield* invalid(`OpenAI Responses tool-result media content only supports images, got ${item.mediaType}`)
|
||||
})
|
||||
|
||||
// Builds the `output` value for a tool result part.
// Results whose type is not "content" are returned as the string produced by
// `ProviderShared.toolResultText(part)`; "content" results are lowered item by
// item through `lowerToolResultContentItem`, yielding an array.
const lowerToolResultOutput = Effect.fn("OpenAIResponses.lowerToolResultOutput")(function* (part: ToolResultPart) {
|
||||
// Text/json/error results are encoded as a plain string for backward
|
||||
// compatibility with existing cassettes and provider expectations.
|
||||
if (part.result.type !== "content") return ProviderShared.toolResultText(part)
|
||||
// "content" results: lower every entry of `part.result.value`.
return yield* Effect.forEach(part.result.value, lowerToolResultContentItem)
|
||||
})
|
||||
|
||||
const lowerMessages = Effect.fn("OpenAIResponses.lowerMessages")(function* (request: LLMRequest) {
|
||||
const system: OpenAIResponsesInputItem[] =
|
||||
request.system.length === 0 ? [] : [{ role: "system", content: ProviderShared.joinText(request.system) }]
|
||||
|
|
@ -298,7 +334,11 @@ const lowerMessages = Effect.fn("OpenAIResponses.lowerMessages")(function* (requ
|
|||
for (const part of message.content) {
|
||||
if (!ProviderShared.supportsContent(part, ["tool-result"]))
|
||||
return yield* ProviderShared.unsupportedContent("OpenAI Responses", "tool", ["tool-result"])
|
||||
input.push({ type: "function_call_output", call_id: part.id, output: ProviderShared.toolResultText(part) })
|
||||
input.push({
|
||||
type: "function_call_output",
|
||||
call_id: part.id,
|
||||
output: yield* lowerToolResultOutput(part),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -87,6 +87,7 @@ describeRecordedGoldenScenarios([
|
|||
{ id: "reasoning-continuation", temperature: false },
|
||||
{ id: "tool-call", temperature: false },
|
||||
{ id: "tool-loop", temperature: false },
|
||||
{ id: "image-tool-result", temperature: false, maxTokens: 40 },
|
||||
],
|
||||
},
|
||||
{
|
||||
|
|
|
|||
|
|
@ -26,6 +26,19 @@ const request = LLM.request({
|
|||
|
||||
const configEnv = (env: Record<string, string>) => Effect.provide(ConfigProvider.layer(ConfigProvider.fromEnv({ env })))
|
||||
|
||||
// The `function_call_output` member of the request body's `input` item union,
// extracted so the tests can type the tool-output item they inspect.
type OpenAIToolOutput = Extract<
|
||||
OpenAIResponses.OpenAIResponsesBody["input"][number],
|
||||
{ readonly type: "function_call_output" }
|
||||
>
|
||||
|
||||
// Test helper: finds the first `function_call_output` item in `body.input`,
// asserts that one exists, and returns it typed as OpenAIToolOutput.
const expectToolOutput = (body: OpenAIResponses.OpenAIResponsesBody): OpenAIToolOutput => {
|
||||
const output = body.input.find(
|
||||
// Role-based items (system/user) carry no `type` key, hence the `in` check first.
(item): item is OpenAIToolOutput => "type" in item && item.type === "function_call_output",
|
||||
)
|
||||
expect(output).toBeDefined()
|
||||
// The non-null assertion relies on the expectation above having failed already when nothing matched.
return output!
|
||||
}
|
||||
|
||||
describe("OpenAI Responses route", () => {
|
||||
it.effect("prepares OpenAI Responses target", () =>
|
||||
Effect.gen(function* () {
|
||||
|
|
@ -248,6 +261,84 @@ describe("OpenAI Responses route", () => {
|
|||
}),
|
||||
)
|
||||
|
||||
// Regression: screenshot/read tool results must stay structured so base64
|
||||
// image data is not JSON-stringified into `function_call_output.output`.
|
||||
it.effect("lowers image tool-result content as structured input_image items", () =>
|
||||
Effect.gen(function* () {
|
||||
const prepared = yield* LLMClient.prepare<OpenAIResponses.OpenAIResponsesBody>(
|
||||
LLM.request({
|
||||
id: "req_tool_result_image",
|
||||
model,
|
||||
messages: [
|
||||
Message.user("Show me the screenshot."),
|
||||
Message.assistant([ToolCallPart.make({ id: "call_1", name: "read", input: { filePath: "shot.png" } })]),
|
||||
Message.tool({
|
||||
id: "call_1",
|
||||
name: "read",
|
||||
resultType: "content",
|
||||
result: [
|
||||
{ type: "text", text: "Image read successfully" },
|
||||
{ type: "media", mediaType: "image/png", data: "AAECAw==" },
|
||||
],
|
||||
}),
|
||||
],
|
||||
}),
|
||||
)
|
||||
|
||||
expect(expectToolOutput(prepared.body).output).toEqual([
|
||||
{ type: "input_text", text: "Image read successfully" },
|
||||
{ type: "input_image", image_url: "data:image/png;base64,AAECAw==" },
|
||||
])
|
||||
}),
|
||||
)
|
||||
|
||||
it.effect("lowers single-image tool-result content as structured input_image array", () =>
|
||||
Effect.gen(function* () {
|
||||
const prepared = yield* LLMClient.prepare<OpenAIResponses.OpenAIResponsesBody>(
|
||||
LLM.request({
|
||||
id: "req_tool_result_image_only",
|
||||
model,
|
||||
messages: [
|
||||
Message.assistant([ToolCallPart.make({ id: "call_1", name: "screenshot", input: {} })]),
|
||||
Message.tool({
|
||||
id: "call_1",
|
||||
name: "screenshot",
|
||||
resultType: "content",
|
||||
result: [{ type: "media", mediaType: "image/png", data: "AAECAw==" }],
|
||||
}),
|
||||
],
|
||||
}),
|
||||
)
|
||||
|
||||
expect(expectToolOutput(prepared.body).output).toEqual([
|
||||
{ type: "input_image", image_url: "data:image/png;base64,AAECAw==" },
|
||||
])
|
||||
}),
|
||||
)
|
||||
|
||||
it.effect("rejects non-image media in tool-result content with a clear error", () =>
|
||||
Effect.gen(function* () {
|
||||
const error = yield* LLMClient.prepare(
|
||||
LLM.request({
|
||||
id: "req_tool_result_unsupported_media",
|
||||
model,
|
||||
messages: [
|
||||
Message.assistant([ToolCallPart.make({ id: "call_1", name: "fetch", input: {} })]),
|
||||
Message.tool({
|
||||
id: "call_1",
|
||||
name: "fetch",
|
||||
resultType: "content",
|
||||
result: [{ type: "media", mediaType: "audio/mpeg", data: "AAECAw==" }],
|
||||
}),
|
||||
],
|
||||
}),
|
||||
).pipe(Effect.flip)
|
||||
|
||||
expect(error.message).toContain("OpenAI Responses")
|
||||
expect(error.message).toContain("audio/mpeg")
|
||||
}),
|
||||
)
|
||||
|
||||
it.effect("prepares the composed native continuation request", () =>
|
||||
Effect.gen(function* () {
|
||||
const prepared = yield* LLMClient.prepare<OpenAIResponses.OpenAIResponsesBody>(
|
||||
|
|
|
|||
|
|
@ -317,6 +317,49 @@ const runImageScenario = (context: GoldenScenarioContext) =>
|
|||
])
|
||||
})
|
||||
|
||||
// Reproduces a tool-result image round trip: a tool returns image bytes, and
|
||||
// the next model turn must receive provider-native image content instead of a
|
||||
// JSON-stringified base64 blob.
|
||||
// Tool name used by the scripted tool call, its tool result, and the tool definition in the scenario below.
const screenshotToolName = "read_screenshot"
|
||||
// Golden scenario: replays a scripted conversation (user prompt, assistant
// tool call, tool result carrying text plus an image/png media part) and
// checks that the model finishes with "stop" and that its normalized reply
// equals RESTROOM_IMAGE_TEXT.
// `restroomImage`, `generate`, `generation`, `expectFinish` and
// `normalizeImageText` are helpers defined outside this excerpt.
const runImageToolResultScenario = (context: GoldenScenarioContext) =>
|
||||
Effect.gen(function* () {
|
||||
// Image payload that is placed in the tool result's media part below.
const image = yield* restroomImage()
|
||||
const response = yield* generate(
|
||||
LLM.request({
|
||||
id: `${context.id}_image_tool_result`,
|
||||
model: context.model,
|
||||
system: "Read images carefully. Reply only with the visible text, lowercase, no punctuation.",
|
||||
cache: "none",
|
||||
// Falls back to 40 max tokens when the scenario entry does not set `maxTokens`.
generation: generation(context, context.maxTokens ?? 40),
|
||||
messages: [
|
||||
Message.user("Use the read_screenshot tool, then reply with the words shown."),
|
||||
// Scripted assistant turn: the tool call is supplied here, not produced by the model.
Message.assistant([
|
||||
{ type: "tool-call", id: "call_screenshot_1", name: screenshotToolName, input: {} },
|
||||
]),
|
||||
// Tool result paired with the call above through the shared id "call_screenshot_1".
Message.tool({
|
||||
id: "call_screenshot_1",
|
||||
name: screenshotToolName,
|
||||
resultType: "content",
|
||||
result: [
|
||||
{ type: "text", text: "Image read successfully" },
|
||||
{ type: "media", mediaType: "image/png", data: image },
|
||||
],
|
||||
}),
|
||||
],
|
||||
tools: [
|
||||
ToolDefinition.make({
|
||||
name: screenshotToolName,
|
||||
description: "Capture a screenshot of the current screen.",
|
||||
inputSchema: { type: "object", properties: {}, additionalProperties: false },
|
||||
}),
|
||||
],
|
||||
}),
|
||||
)
|
||||
|
||||
// The reply must end normally and contain exactly the text visible in the image.
expectFinish(response.events, "stop")
|
||||
expect(normalizeImageText(response.text)).toBe(RESTROOM_IMAGE_TEXT)
|
||||
})
|
||||
|
||||
const runReasoningScenario = (context: GoldenScenarioContext) =>
|
||||
runGeneratedConversation(context, [
|
||||
user("Think briefly, then reply exactly with: Hello!"),
|
||||
|
|
@ -359,6 +402,11 @@ const goldenScenarios = {
|
|||
"tool-call": { title: "streams tool call", tags: ["tool", "tool-call", "golden"], run: runToolCallScenario },
|
||||
"tool-loop": { title: "drives a tool loop", tags: ["tool", "tool-loop", "golden"], run: runToolLoopScenario },
|
||||
image: { title: "reads image text", tags: ["media", "image", "vision", "golden"], run: runImageScenario },
|
||||
"image-tool-result": {
|
||||
title: "reads image returned from tool result",
|
||||
tags: ["media", "image", "vision", "tool", "tool-result", "golden"],
|
||||
run: runImageToolResultScenario,
|
||||
},
|
||||
reasoning: { title: "uses reasoning", tags: ["reasoning", "golden"], run: runReasoningScenario },
|
||||
"reasoning-continuation": {
|
||||
title: "continues encrypted reasoning",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue