fix(llm): emit structured input_image content for tool-result media in OpenAI Responses (#28754)

2026-05-23 04:26:05 +00:00 · 2026-05-22 12:23:23 -04:00 · 2026-05-22 12:23:23 -04:00 · 700d012025
commit 700d012025
parent 59e486a917
5 changed files with 224 additions and 2 deletions
--- a/packages/llm/src/protocols/openai-responses.ts
+++ b/packages/llm/src/protocols/openai-responses.ts
@ -14,6 +14,8 @@ import {
  type TextPart,
  type ToolCallPart,
  type ToolDefinition,
+  type ToolResultContentPart,
+  type ToolResultPart,
 } from "../schema"
 import { JsonObject, optionalArray, optionalNull, ProviderShared } from "./shared"
 import { OpenAIOptions } from "./utils/openai-options"
@ -55,6 +57,19 @@ const OpenAIResponsesReasoningItem = Schema.Struct({
  encrypted_content: optionalNull(Schema.String),
 })

+// `function_call_output.output` is either a plain string or an ordered array
+// of these items (input text or input image), so tools can return images too.
+// https://platform.openai.com/docs/api-reference/responses/object
+const OpenAIResponsesFunctionCallOutputContent = Schema.Union([
+  OpenAIResponsesInputText,
+  OpenAIResponsesInputImage,
+])
+
+const OpenAIResponsesFunctionCallOutput = Schema.Union([
+  Schema.String, // plain-string output
+  Schema.Array(OpenAIResponsesFunctionCallOutputContent), // ordered content items
+])
+
 const OpenAIResponsesInputItem = Schema.Union([
  Schema.Struct({ role: Schema.tag("system"), content: Schema.String }),
  Schema.Struct({ role: Schema.tag("user"), content: Schema.Array(OpenAIResponsesInputContent) }),
@ -69,7 +84,7 @@ const OpenAIResponsesInputItem = Schema.Union([
  Schema.Struct({
    type: Schema.tag("function_call_output"),
    call_id: Schema.String,
-    output: Schema.String,
+    output: OpenAIResponsesFunctionCallOutput,
  }),
 ])
 type OpenAIResponsesInputItem = Schema.Schema.Type<typeof OpenAIResponsesInputItem>
@ -250,6 +265,27 @@ const lowerUserContent = Effect.fn("OpenAIResponses.lowerUserContent")(function*
  return yield* ProviderShared.unsupportedContent("OpenAI Responses", "user", ["text", "media"])
 })

+// Maps one tool-result content part onto its provider-native counterpart so
+// image bytes travel as `input_image` data URLs, never as stringified base64.
+const lowerToolResultContentItem = Effect.fn("OpenAIResponses.lowerToolResultContentItem")(function* (
+  item: ToolResultContentPart,
+) {
+  if (item.type === "text") {
+    return { type: "input_text" as const, text: item.text }
+  }
+  const isImage = item.mediaType.startsWith("image/")
+  if (!isImage)
+    return yield* invalid(`OpenAI Responses tool-result media content only supports images, got ${item.mediaType}`)
+  return { type: "input_image" as const, image_url: ProviderShared.mediaDataUrl(item) }
+})
+
+const lowerToolResultOutput = Effect.fn("OpenAIResponses.lowerToolResultOutput")(function* (part: ToolResultPart) {
+  // Only `content` results become a structured item array; text/json/error
+  // results stay a plain string for compatibility with existing cassettes.
+  if (part.result.type === "content") return yield* Effect.forEach(part.result.value, lowerToolResultContentItem)
+  return ProviderShared.toolResultText(part)
+})
+
 const lowerMessages = Effect.fn("OpenAIResponses.lowerMessages")(function* (request: LLMRequest) {
  const system: OpenAIResponsesInputItem[] =
    request.system.length === 0 ? [] : [{ role: "system", content: ProviderShared.joinText(request.system) }]
@ -298,7 +334,11 @@ const lowerMessages = Effect.fn("OpenAIResponses.lowerMessages")(function* (requ
    for (const part of message.content) {
      if (!ProviderShared.supportsContent(part, ["tool-result"]))
        return yield* ProviderShared.unsupportedContent("OpenAI Responses", "tool", ["tool-result"])
-      input.push({ type: "function_call_output", call_id: part.id, output: ProviderShared.toolResultText(part) })
+      input.push({
+        type: "function_call_output",
+        call_id: part.id,
+        output: yield* lowerToolResultOutput(part),
+      })
    }
  }

--- a/packages/llm/test/fixtures/recordings/openai-responses/openai-responses-gpt-5-5-image-tool-result.json
+++ b/packages/llm/test/fixtures/recordings/openai-responses/openai-responses-gpt-5-5-image-tool-result.json
--- a/packages/llm/test/provider/golden.recorded.test.ts
+++ b/packages/llm/test/provider/golden.recorded.test.ts
@ -87,6 +87,7 @@ describeRecordedGoldenScenarios([
      { id: "reasoning-continuation", temperature: false },
      { id: "tool-call", temperature: false },
      { id: "tool-loop", temperature: false },
+      { id: "image-tool-result", temperature: false, maxTokens: 40 },
    ],
  },
  {
--- a/packages/llm/test/provider/openai-responses.test.ts
+++ b/packages/llm/test/provider/openai-responses.test.ts
@ -26,6 +26,19 @@ const request = LLM.request({

 const configEnv = (env: Record<string, string>) => Effect.provide(ConfigProvider.layer(ConfigProvider.fromEnv({ env })))

+// The `function_call_output` member of the request body's input item union.
+type OpenAIToolOutput = Extract<
+  OpenAIResponses.OpenAIResponsesBody["input"][number],
+  { readonly type: "function_call_output" }
+>
+
+// Finds the first `function_call_output` item in a prepared body's input and
+// throws a descriptive error when there is none, so callers receive a narrowed
+// value without relying on a non-null assertion.
+const expectToolOutput = (body: OpenAIResponses.OpenAIResponsesBody): OpenAIToolOutput => {
+  const output = body.input.find(
+    (item): item is OpenAIToolOutput => "type" in item && item.type === "function_call_output",
+  )
+  if (output === undefined) throw new Error("Expected a function_call_output item in the prepared request input")
+  return output
+}
+
 describe("OpenAI Responses route", () => {
  it.effect("prepares OpenAI Responses target", () =>
    Effect.gen(function* () {
@ -248,6 +261,84 @@ describe("OpenAI Responses route", () => {
    }),
  )

+  // Regression guard: results from screenshot/read tools must be lowered to
+  // structured items; base64 image data must never be JSON-stringified into
+  // `function_call_output.output`.
+  it.effect("lowers image tool-result content as structured input_image items", () =>
+    Effect.gen(function* () {
+      const toolCall = ToolCallPart.make({ id: "call_1", name: "read", input: { filePath: "shot.png" } })
+      const toolResult = Message.tool({
+        id: "call_1",
+        name: "read",
+        resultType: "content",
+        result: [
+          { type: "text", text: "Image read successfully" },
+          { type: "media", mediaType: "image/png", data: "AAECAw==" },
+        ],
+      })
+      const imageRequest = LLM.request({
+        id: "req_tool_result_image",
+        model,
+        messages: [Message.user("Show me the screenshot."), Message.assistant([toolCall]), toolResult],
+      })
+
+      const prepared = yield* LLMClient.prepare<OpenAIResponses.OpenAIResponsesBody>(imageRequest)
+      const { output } = expectToolOutput(prepared.body)
+
+      expect(output).toEqual([
+        { type: "input_text", text: "Image read successfully" },
+        { type: "input_image", image_url: "data:image/png;base64,AAECAw==" },
+      ])
+    }),
+  )
+
+  it.effect("lowers single-image tool-result content as structured input_image array", () =>
+    Effect.gen(function* () {
+      // A tool result holding nothing but one image must still lower to an array.
+      const screenshotResult = Message.tool({
+        id: "call_1",
+        name: "screenshot",
+        resultType: "content",
+        result: [{ type: "media", mediaType: "image/png", data: "AAECAw==" }],
+      })
+      const imageOnlyRequest = LLM.request({
+        id: "req_tool_result_image_only",
+        model,
+        messages: [
+          Message.assistant([ToolCallPart.make({ id: "call_1", name: "screenshot", input: {} })]),
+          screenshotResult,
+        ],
+      })
+      const prepared = yield* LLMClient.prepare<OpenAIResponses.OpenAIResponsesBody>(imageOnlyRequest)
+
+      const expectedOutput = [{ type: "input_image", image_url: "data:image/png;base64,AAECAw==" }]
+      expect(expectToolOutput(prepared.body).output).toEqual(expectedOutput)
+    }),
+  )
+
+  it.effect("rejects non-image media in tool-result content with a clear error", () =>
+    Effect.gen(function* () {
+      // Audio is media but not an image, so preparing this request must fail.
+      const audioResult = Message.tool({
+        id: "call_1",
+        name: "fetch",
+        resultType: "content",
+        result: [{ type: "media", mediaType: "audio/mpeg", data: "AAECAw==" }],
+      })
+      const unsupportedRequest = LLM.request({
+        id: "req_tool_result_unsupported_media",
+        model,
+        messages: [
+          Message.assistant([ToolCallPart.make({ id: "call_1", name: "fetch", input: {} })]),
+          audioResult,
+        ],
+      })
+      const error = yield* Effect.flip(LLMClient.prepare(unsupportedRequest))
+      const { message } = error
+
+      expect(message).toContain("OpenAI Responses")
+      expect(message).toContain("audio/mpeg")
+    }),
+  )
+
  it.effect("prepares the composed native continuation request", () =>
    Effect.gen(function* () {
      const prepared = yield* LLMClient.prepare<OpenAIResponses.OpenAIResponsesBody>(
--- a/packages/llm/test/recorded-scenarios.ts
+++ b/packages/llm/test/recorded-scenarios.ts
@ -317,6 +317,49 @@ const runImageScenario = (context: GoldenScenarioContext) =>
    ])
  })

+// Golden scenario for a tool-result image round trip: the conversation already
+// contains a tool call and its image-bearing result, and the next model turn
+// must receive provider-native image content instead of a JSON-stringified
+// base64 blob.
+const screenshotToolName = "read_screenshot"
+const runImageToolResultScenario = (context: GoldenScenarioContext) =>
+  Effect.gen(function* () {
+    // One id links the assistant tool call to the tool result answering it.
+    const callId = "call_screenshot_1"
+    const image = yield* restroomImage()
+    const response = yield* generate(
+      LLM.request({
+        id: `${context.id}_image_tool_result`,
+        model: context.model,
+        system: "Read images carefully. Reply only with the visible text, lowercase, no punctuation.",
+        cache: "none",
+        generation: generation(context, context.maxTokens ?? 40),
+        messages: [
+          // Built from the constant so the prompt always names the declared tool.
+          Message.user(`Use the ${screenshotToolName} tool, then reply with the words shown.`),
+          Message.assistant([{ type: "tool-call", id: callId, name: screenshotToolName, input: {} }]),
+          Message.tool({
+            id: callId,
+            name: screenshotToolName,
+            resultType: "content",
+            result: [
+              { type: "text", text: "Image read successfully" },
+              { type: "media", mediaType: "image/png", data: image },
+            ],
+          }),
+        ],
+        tools: [
+          ToolDefinition.make({
+            name: screenshotToolName,
+            description: "Capture a screenshot of the current screen.",
+            inputSchema: { type: "object", properties: {}, additionalProperties: false },
+          }),
+        ],
+      }),
+    )
+
+    // The turn must finish with "stop" and its normalized text must equal the
+    // expected image text.
+    expectFinish(response.events, "stop")
+    expect(normalizeImageText(response.text)).toBe(RESTROOM_IMAGE_TEXT)
+  })
+
 const runReasoningScenario = (context: GoldenScenarioContext) =>
  runGeneratedConversation(context, [
    user("Think briefly, then reply exactly with: Hello!"),
@ -359,6 +402,11 @@ const goldenScenarios = {
  "tool-call": { title: "streams tool call", tags: ["tool", "tool-call", "golden"], run: runToolCallScenario },
  "tool-loop": { title: "drives a tool loop", tags: ["tool", "tool-loop", "golden"], run: runToolLoopScenario },
  image: { title: "reads image text", tags: ["media", "image", "vision", "golden"], run: runImageScenario },
+  "image-tool-result": {
+    title: "reads image returned from tool result",
+    tags: ["media", "image", "vision", "tool", "tool-result", "golden"],
+    run: runImageToolResultScenario,
+  },
  reasoning: { title: "uses reasoning", tags: ["reasoning", "golden"], run: runReasoningScenario },
  "reasoning-continuation": {
    title: "continues encrypted reasoning",