fix(llm): emit structured input_image content for tool-result media in OpenAI Responses (#28754)

This commit is contained in:
Kit Langton 2026-05-22 12:23:23 -04:00 committed by GitHub
parent 59e486a917
commit 700d012025
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 224 additions and 2 deletions

View file

@ -14,6 +14,8 @@ import {
type TextPart,
type ToolCallPart,
type ToolDefinition,
type ToolResultContentPart,
type ToolResultPart,
} from "../schema"
import { JsonObject, optionalArray, optionalNull, ProviderShared } from "./shared"
import { OpenAIOptions } from "./utils/openai-options"
@ -55,6 +57,19 @@ const OpenAIResponsesReasoningItem = Schema.Struct({
encrypted_content: optionalNull(Schema.String),
})
// One entry of a structured `function_call_output.output` array: either an
// input-text item or an input-image item.
const OpenAIResponsesFunctionCallOutputContent = Schema.Union([OpenAIResponsesInputText, OpenAIResponsesInputImage])

// Complete `function_call_output.output` payload. A bare string is still
// accepted; the array form keeps content items in order so a tool can hand
// back images in addition to text.
// https://platform.openai.com/docs/api-reference/responses/object
const OpenAIResponsesFunctionCallOutput = Schema.Union([
  Schema.String,
  Schema.Array(OpenAIResponsesFunctionCallOutputContent),
])
const OpenAIResponsesInputItem = Schema.Union([
Schema.Struct({ role: Schema.tag("system"), content: Schema.String }),
Schema.Struct({ role: Schema.tag("user"), content: Schema.Array(OpenAIResponsesInputContent) }),
@ -69,7 +84,7 @@ const OpenAIResponsesInputItem = Schema.Union([
Schema.Struct({
type: Schema.tag("function_call_output"),
call_id: Schema.String,
output: Schema.String,
output: OpenAIResponsesFunctionCallOutput,
}),
])
type OpenAIResponsesInputItem = Schema.Schema.Type<typeof OpenAIResponsesInputItem>
@ -250,6 +265,27 @@ const lowerUserContent = Effect.fn("OpenAIResponses.lowerUserContent")(function*
return yield* ProviderShared.unsupportedContent("OpenAI Responses", "user", ["text", "media"])
})
// Lowers one structured tool-result part into an OpenAI Responses content item.
// Media stays provider-native rather than being JSON-stringified as base64
// inside a prompt string:
//   - text parts become `input_text`
//   - `image/*` media becomes `input_image` carrying a data URL
//   - any other media type fails with an `invalid(...)` error naming the type
const lowerToolResultContentItem = Effect.fn("OpenAIResponses.lowerToolResultContentItem")(function* (
  item: ToolResultContentPart,
) {
  if (item.type === "text") {
    return { type: "input_text" as const, text: item.text }
  }

  // Past this point the part is media; only images are lowered.
  const isImage = item.mediaType.startsWith("image/")
  if (!isImage) {
    return yield* invalid(`OpenAI Responses tool-result media content only supports images, got ${item.mediaType}`)
  }

  return { type: "input_image" as const, image_url: ProviderShared.mediaDataUrl(item) }
})
// Produces the `output` value for a `function_call_output` input item.
const lowerToolResultOutput = Effect.fn("OpenAIResponses.lowerToolResultOutput")(function* (part: ToolResultPart) {
  // Structured `content` results are lowered item by item, preserving order.
  if (part.result.type === "content") {
    return yield* Effect.forEach(part.result.value, lowerToolResultContentItem)
  }
  // Every other result type keeps the plain-string encoding, for backward
  // compatibility with existing cassettes and provider expectations.
  return ProviderShared.toolResultText(part)
})
const lowerMessages = Effect.fn("OpenAIResponses.lowerMessages")(function* (request: LLMRequest) {
const system: OpenAIResponsesInputItem[] =
request.system.length === 0 ? [] : [{ role: "system", content: ProviderShared.joinText(request.system) }]
@ -298,7 +334,11 @@ const lowerMessages = Effect.fn("OpenAIResponses.lowerMessages")(function* (requ
for (const part of message.content) {
if (!ProviderShared.supportsContent(part, ["tool-result"]))
return yield* ProviderShared.unsupportedContent("OpenAI Responses", "tool", ["tool-result"])
input.push({ type: "function_call_output", call_id: part.id, output: ProviderShared.toolResultText(part) })
input.push({
type: "function_call_output",
call_id: part.id,
output: yield* lowerToolResultOutput(part),
})
}
}

File diff suppressed because one or more lines are too long

View file

@ -87,6 +87,7 @@ describeRecordedGoldenScenarios([
{ id: "reasoning-continuation", temperature: false },
{ id: "tool-call", temperature: false },
{ id: "tool-loop", temperature: false },
{ id: "image-tool-result", temperature: false, maxTokens: 40 },
],
},
{

View file

@ -26,6 +26,19 @@ const request = LLM.request({
const configEnv = (env: Record<string, string>) => Effect.provide(ConfigProvider.layer(ConfigProvider.fromEnv({ env })))
/** The `function_call_output` variant of the request body's `input` items. */
type OpenAIToolOutput = Extract<
  OpenAIResponses.OpenAIResponsesBody["input"][number],
  { readonly type: "function_call_output" }
>

/**
 * Returns the first `function_call_output` item found in `body.input`.
 *
 * The missing-item case is handled with an explicit guard rather than a
 * non-null assertion, so the return type is narrowed by a real runtime check
 * and the failure message says what was expected.
 *
 * @param body - Prepared OpenAI Responses request body to inspect.
 * @returns The first input item whose `type` is `"function_call_output"`.
 * @throws Error when `body.input` contains no `function_call_output` item.
 */
const expectToolOutput = (body: OpenAIResponses.OpenAIResponsesBody): OpenAIToolOutput => {
  const output = body.input.find(
    // Not every input item has a `type` field, so probe for it before comparing.
    (item): item is OpenAIToolOutput => "type" in item && item.type === "function_call_output",
  )
  if (output === undefined) {
    throw new Error("expected a function_call_output item in the request input")
  }
  return output
}
describe("OpenAI Responses route", () => {
it.effect("prepares OpenAI Responses target", () =>
Effect.gen(function* () {
@ -248,6 +261,84 @@ describe("OpenAI Responses route", () => {
}),
)
// Regression guard: a screenshot/read tool result has to stay structured, so
// base64 image data is never JSON-stringified into
// `function_call_output.output`.
it.effect("lowers image tool-result content as structured input_image items", () =>
  Effect.gen(function* () {
    // Conversation whose tool result carries a text part followed by a PNG.
    const request = LLM.request({
      id: "req_tool_result_image",
      model,
      messages: [
        Message.user("Show me the screenshot."),
        Message.assistant([ToolCallPart.make({ id: "call_1", name: "read", input: { filePath: "shot.png" } })]),
        Message.tool({
          id: "call_1",
          name: "read",
          resultType: "content",
          result: [
            { type: "text", text: "Image read successfully" },
            { type: "media", mediaType: "image/png", data: "AAECAw==" },
          ],
        }),
      ],
    })

    const prepared = yield* LLMClient.prepare<OpenAIResponses.OpenAIResponsesBody>(request)
    const { output } = expectToolOutput(prepared.body)

    // Both parts must come through, in order, as provider-native items.
    expect(output).toEqual([
      { type: "input_text", text: "Image read successfully" },
      { type: "input_image", image_url: "data:image/png;base64,AAECAw==" },
    ])
  }),
)
// A tool result holding only an image is still emitted as an array with a
// single `input_image` item.
it.effect("lowers single-image tool-result content as structured input_image array", () =>
  Effect.gen(function* () {
    const request = LLM.request({
      id: "req_tool_result_image_only",
      model,
      messages: [
        Message.assistant([ToolCallPart.make({ id: "call_1", name: "screenshot", input: {} })]),
        Message.tool({
          id: "call_1",
          name: "screenshot",
          resultType: "content",
          result: [{ type: "media", mediaType: "image/png", data: "AAECAw==" }],
        }),
      ],
    })

    const prepared = yield* LLMClient.prepare<OpenAIResponses.OpenAIResponsesBody>(request)
    const { output } = expectToolOutput(prepared.body)

    expect(output).toEqual([{ type: "input_image", image_url: "data:image/png;base64,AAECAw==" }])
  }),
)
// Non-image media in a tool result must fail preparation, and the error has to
// name both the provider and the offending media type.
it.effect("rejects non-image media in tool-result content with a clear error", () =>
  Effect.gen(function* () {
    const request = LLM.request({
      id: "req_tool_result_unsupported_media",
      model,
      messages: [
        Message.assistant([ToolCallPart.make({ id: "call_1", name: "fetch", input: {} })]),
        Message.tool({
          id: "call_1",
          name: "fetch",
          resultType: "content",
          result: [{ type: "media", mediaType: "audio/mpeg", data: "AAECAw==" }],
        }),
      ],
    })

    // Flip the effect so the expected failure arrives as the success value.
    const failure = yield* Effect.flip(LLMClient.prepare(request))

    expect(failure.message).toContain("OpenAI Responses")
    expect(failure.message).toContain("audio/mpeg")
  }),
)
it.effect("prepares the composed native continuation request", () =>
Effect.gen(function* () {
const prepared = yield* LLMClient.prepare<OpenAIResponses.OpenAIResponsesBody>(

View file

@ -317,6 +317,49 @@ const runImageScenario = (context: GoldenScenarioContext) =>
])
})
// Golden scenario for a tool-result image round trip. The conversation already
// contains a tool call and its result (a text part plus a PNG), so the next
// model turn has to receive provider-native image content rather than a
// JSON-stringified base64 blob in order to read the text in the image.
const screenshotToolName = "read_screenshot"
const runImageToolResultScenario = (context: GoldenScenarioContext) =>
  Effect.gen(function* () {
    const image = yield* restroomImage()
    // Shared by the assistant tool call and the tool result that answers it.
    const toolCallId = "call_screenshot_1"

    const request = LLM.request({
      id: `${context.id}_image_tool_result`,
      model: context.model,
      system: "Read images carefully. Reply only with the visible text, lowercase, no punctuation.",
      cache: "none",
      generation: generation(context, context.maxTokens ?? 40),
      messages: [
        Message.user("Use the read_screenshot tool, then reply with the words shown."),
        Message.assistant([{ type: "tool-call", id: toolCallId, name: screenshotToolName, input: {} }]),
        Message.tool({
          id: toolCallId,
          name: screenshotToolName,
          resultType: "content",
          result: [
            { type: "text", text: "Image read successfully" },
            { type: "media", mediaType: "image/png", data: image },
          ],
        }),
      ],
      tools: [
        ToolDefinition.make({
          name: screenshotToolName,
          description: "Capture a screenshot of the current screen.",
          inputSchema: { type: "object", properties: {}, additionalProperties: false },
        }),
      ],
    })

    const reply = yield* generate(request)

    // The turn must finish with "stop" and echo the text shown in the image.
    expectFinish(reply.events, "stop")
    const seenText = normalizeImageText(reply.text)
    expect(seenText).toBe(RESTROOM_IMAGE_TEXT)
  })
const runReasoningScenario = (context: GoldenScenarioContext) =>
runGeneratedConversation(context, [
user("Think briefly, then reply exactly with: Hello!"),
@ -359,6 +402,11 @@ const goldenScenarios = {
"tool-call": { title: "streams tool call", tags: ["tool", "tool-call", "golden"], run: runToolCallScenario },
"tool-loop": { title: "drives a tool loop", tags: ["tool", "tool-loop", "golden"], run: runToolLoopScenario },
image: { title: "reads image text", tags: ["media", "image", "vision", "golden"], run: runImageScenario },
"image-tool-result": {
title: "reads image returned from tool result",
tags: ["media", "image", "vision", "tool", "tool-result", "golden"],
run: runImageToolResultScenario,
},
reasoning: { title: "uses reasoning", tags: ["reasoning", "golden"], run: runReasoningScenario },
"reasoning-continuation": {
title: "continues encrypted reasoning",