fix(core): split tool-result media into follow-up user message for strict OpenAI compat (#3617)

Fixes #3616.

Adds opt-in `splitToolMedia` flag (default false). When enabled, media parts (image / audio / video / file) returned by MCP tool calls are split into a follow-up `role: "user"` message instead of being embedded in the `role: "tool"` message. Required for strict OpenAI-compatible servers (e.g., LM Studio) that reject non-text content on tool messages with HTTP 400 "Invalid 'messages' in payload".

Media from parallel tool responses is accumulated and emitted as a single follow-up user message after all tool messages, preserving OpenAI's contiguity requirement for tool responses.

Default behavior is unchanged for permissive providers.
This commit is contained in:
Bramha.dev 2026-04-27 20:31:02 +05:30 committed by GitHub
parent 8a278767ed
commit 414b3304cd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 526 additions and 11 deletions

View file

@ -924,6 +924,17 @@ const SETTINGS_SCHEMA = {
parentKey: 'generationConfig',
showInDialog: false,
},
// Opt-in strict-OpenAI-compat flag: when enabled, media returned by MCP
// tool calls is moved out of `role: "tool"` messages into a follow-up
// user message. Default false keeps the prior embedded-media behavior
// for permissive providers. See QwenLM/qwen-code#3616.
splitToolMedia: {
type: 'boolean',
label: 'Split Tool Result Media',
category: 'Generation Configuration',
// Takes effect on the next request; no process restart needed.
requiresRestart: false,
default: false,
description:
'When true, media (images / audio / video / files) returned by MCP tool calls is split into a follow-up user message instead of being embedded in the tool message. Required for strict OpenAI-compatible servers (e.g., LM Studio) that reject non-text content on `role: "tool"` messages with HTTP 400 "Invalid \'messages\' in payload". Default false preserves the prior behavior for permissive providers. See QwenLM/qwen-code#3616.',
parentKey: 'generationConfig',
showInDialog: false,
},
schemaCompliance: {
type: 'enum',
label: 'Tool Schema Compliance',

View file

@ -1454,6 +1454,7 @@ export class Config {
this.contentGeneratorConfig.contextWindowSize = config.contextWindowSize;
this.contentGeneratorConfig.enableCacheControl =
config.enableCacheControl;
this.contentGeneratorConfig.splitToolMedia = config.splitToolMedia;
if ('model' in sources) {
this.contentGeneratorConfigSources['model'] = sources['model'];
@ -1470,6 +1471,10 @@ export class Config {
this.contentGeneratorConfigSources['contextWindowSize'] =
sources['contextWindowSize'];
}
if ('splitToolMedia' in sources) {
this.contentGeneratorConfigSources['splitToolMedia'] =
sources['splitToolMedia'];
}
return;
}

View file

@ -116,6 +116,15 @@ export type ContentGeneratorConfig = {
// Supported input modalities. Unsupported media types are replaced with text
// placeholders. Leave undefined to use automatic detection from model name.
modalities?: InputModalities;
// When true, media parts in MCP tool responses are split into a follow-up
// `role: "user"` message instead of being embedded inside the `role: "tool"`
// message. The OpenAI Chat Completions spec only permits string / text-part
// content on tool messages; strict OpenAI-compatible servers (notably
// LM Studio) reject anything else with HTTP 400 "Invalid 'messages' in
// payload". Enable this for any provider that strictly validates tool
// message content. Default: false (preserves prior behavior for permissive
// providers). See QwenLM/qwen-code#3616.
splitToolMedia?: boolean;
};
// Keep the public ContentGeneratorConfigSources API, but reuse the generic

View file

@ -382,6 +382,422 @@ describe('OpenAIContentConverter', () => {
expect(userMessage).toBeUndefined();
});
it('should split tool-result media into a follow-up user message when splitToolMedia is enabled (issue #3616)', () => {
// Same shape as the embedded-image test above, but with the strict
// OpenAI-compat opt-in flag set. The tool message must stay
// spec-compliant (string / text-part content only) and the image must
// arrive in a follow-up user message.
const request: GenerateContentParameters = {
model: 'models/test',
contents: [
{
role: 'model',
parts: [
{
functionCall: {
id: 'call_1',
name: 'Read',
args: {},
},
},
],
},
{
role: 'user',
parts: [
{
functionResponse: {
id: 'call_1',
name: 'Read',
response: { output: 'Image content' },
parts: [
{
inlineData: {
mimeType: 'image/png',
data: 'base64encodedimagedata',
},
},
],
},
},
],
},
],
};
// Opt into the strict-compat behavior under test; all other context
// fields come from the shared describe-scope fixture.
const strictContext: RequestContext = {
...requestContext,
splitToolMedia: true,
};
const messages = converter.convertGeminiRequestToOpenAI(
request,
strictContext,
);
const toolMessage = messages.find((m) => m.role === 'tool');
expect(toolMessage).toBeDefined();
// Tool message content is a plain string (or text-part array) — no media
expect(typeof toolMessage?.content === 'string').toBe(true);
expect(toolMessage?.content).toContain('Image content');
// The image lives in a follow-up user message
const userMessage = messages.find((m) => m.role === 'user');
expect(userMessage).toBeDefined();
// Cast narrows the OpenAI content union so the image_url part can be
// inspected structurally.
const userContent = userMessage?.content as Array<{
type: string;
text?: string;
image_url?: { url: string };
}>;
expect(Array.isArray(userContent)).toBe(true);
const imagePart = userContent.find((p) => p.type === 'image_url');
expect(imagePart?.image_url?.url).toBe(
'data:image/png;base64,base64encodedimagedata',
);
});
it('should keep all tool messages contiguous and merge split media into a single follow-up user message for parallel tool calls (issue #3616)', () => {
// Two assistant tool calls in parallel. Both responses come back in the
// same `user` content as separate functionResponse parts. The first
// returns an image; the second returns text only. OpenAI Chat
// Completions requires every `role: "tool"` response to appear
// contiguously before any non-tool message, so the synthesised user
// message carrying split media MUST come after BOTH tool messages,
// not interleaved between them.
const request: GenerateContentParameters = {
model: 'models/test',
contents: [
{
role: 'model',
parts: [
{
functionCall: {
id: 'call_screenshot',
name: 'browser_take_screenshot',
args: {},
},
},
{
functionCall: {
id: 'call_console',
name: 'browser_console_messages',
args: {},
},
},
],
},
{
role: 'user',
parts: [
{
functionResponse: {
id: 'call_screenshot',
name: 'browser_take_screenshot',
response: { output: 'Captured screenshot' },
parts: [
{
inlineData: {
mimeType: 'image/png',
data: 'shotbase64',
},
},
],
},
},
{
functionResponse: {
id: 'call_console',
name: 'browser_console_messages',
response: { output: 'no console messages' },
},
},
],
},
],
};
const strictContext: RequestContext = {
...requestContext,
splitToolMedia: true,
};
const messages = converter.convertGeminiRequestToOpenAI(
request,
strictContext,
);
// Locate the assistant turn (with the two tool calls) and assert that
// the next two messages are both `tool`, contiguously, before any
// user message.
const assistantIdx = messages.findIndex((m) => m.role === 'assistant');
expect(assistantIdx).toBeGreaterThanOrEqual(0);
expect(messages[assistantIdx + 1]?.role).toBe('tool');
expect(messages[assistantIdx + 2]?.role).toBe('tool');
expect(messages[assistantIdx + 3]?.role).toBe('user');
// Both tool messages have spec-compliant content (string OR array of
// text-typed parts only — no image_url / input_audio / video_url /
// file parts allowed by OpenAI on tool messages).
const isSpecCompliantToolContent = (content: unknown): boolean => {
if (typeof content === 'string') return true;
if (!Array.isArray(content)) return false;
return (content as Array<{ type: string }>).every(
(p) => p.type === 'text',
);
};
expect(
isSpecCompliantToolContent(
(messages[assistantIdx + 1] as { content: unknown }).content,
),
).toBe(true);
expect(
isSpecCompliantToolContent(
(messages[assistantIdx + 2] as { content: unknown }).content,
),
).toBe(true);
// Exactly one synthesised user message exists, and it carries the
// single image from the first tool response. (The original request's
// functionResponse `user` content is converted into tool messages, so
// the only surviving user message is the synthesised one.)
const userMessages = messages.filter((m) => m.role === 'user');
expect(userMessages).toHaveLength(1);
const userContent = userMessages[0].content as Array<{
type: string;
text?: string;
image_url?: { url: string };
}>;
const imageParts = userContent.filter((p) => p.type === 'image_url');
expect(imageParts).toHaveLength(1);
expect(imageParts[0].image_url?.url).toBe(
'data:image/png;base64,shotbase64',
);
});
it('should merge media from multiple media-bearing parallel tool responses into one follow-up user message (issue #3616)', () => {
  // Both parallel tool responses carry an image. The converter must fold
  // every stripped media part into a SINGLE synthesised user message;
  // emitting one user message per tool response would split the tool
  // messages apart and break OpenAI's contiguity rule.
  const imageResponse = (
    id: string,
    name: string,
    output: string,
    data: string,
  ) => ({
    functionResponse: {
      id,
      name,
      response: { output },
      parts: [{ inlineData: { mimeType: 'image/png', data } }],
    },
  });
  const request: GenerateContentParameters = {
    model: 'models/test',
    contents: [
      {
        role: 'model',
        parts: [
          { functionCall: { id: 'call_a', name: 'shot_a', args: {} } },
          { functionCall: { id: 'call_b', name: 'shot_b', args: {} } },
        ],
      },
      {
        role: 'user',
        parts: [
          imageResponse('call_a', 'shot_a', 'A', 'aaa'),
          imageResponse('call_b', 'shot_b', 'B', 'bbb'),
        ],
      },
    ],
  };
  // Opt into the strict-compat splitting behavior under test.
  const strictContext: RequestContext = {
    ...requestContext,
    splitToolMedia: true,
  };
  const result = converter.convertGeminiRequestToOpenAI(
    request,
    strictContext,
  );
  expect(result.filter((m) => m.role === 'tool')).toHaveLength(2);
  const synthesised = result.filter((m) => m.role === 'user');
  expect(synthesised).toHaveLength(1);
  // Narrow the content union to inspect the accumulated image parts, in
  // tool-response order.
  const parts = synthesised[0].content as Array<{
    type: string;
    text?: string;
    image_url?: { url: string };
  }>;
  const urls = parts
    .filter((p) => p.type === 'image_url')
    .map((p) => p.image_url?.url);
  expect(urls).toEqual([
    'data:image/png;base64,aaa',
    'data:image/png;base64,bbb',
  ]);
});
it('should not synthesise a follow-up user message when splitToolMedia is enabled but the response has no media (issue #3616)', () => {
  // Regression guard: a text-only tool response must not trigger the
  // media-splitting path even with the flag on. A future refactor that
  // unconditionally emitted the follow-up user message would fail here.
  const request: GenerateContentParameters = {
    model: 'models/test',
    contents: [
      {
        role: 'model',
        parts: [{ functionCall: { id: 'c', name: 'echo', args: {} } }],
      },
      {
        role: 'user',
        parts: [
          {
            functionResponse: {
              id: 'c',
              name: 'echo',
              response: { output: 'plain text result' },
            },
          },
        ],
      },
    ],
  };
  // Flag enabled, but the response above is text-only.
  const strictContext: RequestContext = {
    ...requestContext,
    splitToolMedia: true,
  };
  const messages = converter.convertGeminiRequestToOpenAI(
    request,
    strictContext,
  );
  const byRole = (role: string) => messages.filter((m) => m.role === role);
  expect(byRole('tool')).toHaveLength(1);
  expect(byRole('user')).toHaveLength(0);
});
it('should fall back to a placeholder string when the tool response is media-only (issue #3616)', () => {
  // When extractFunctionResponseContent returns empty AND parts contain
  // only media, the tool message must end up with the placeholder string
  // rather than an empty array (which would be invalid spec).
  const request: GenerateContentParameters = {
    model: 'models/test',
    contents: [
      {
        role: 'model',
        parts: [{ functionCall: { id: 'c', name: 'shot', args: {} } }],
      },
      {
        role: 'user',
        parts: [
          {
            functionResponse: {
              id: 'c',
              name: 'shot',
              // null response triggers extractFunctionResponseContent
              // to return "" — the empty-text branch we want to cover.
              response: null as unknown as Record<string, unknown>,
              parts: [
                { inlineData: { mimeType: 'image/png', data: 'xxx' } },
              ],
            },
          },
        ],
      },
    ],
  };
  const strictContext: RequestContext = {
    ...requestContext,
    splitToolMedia: true,
  };
  const messages = converter.convertGeminiRequestToOpenAI(
    request,
    strictContext,
  );
  const toolMessage = messages.find((m) => m.role === 'tool');
  expect(toolMessage).toBeDefined();
  expect(toolMessage?.content).toBe(
    '[media attached in following user message]',
  );
  const userMessage = messages.find((m) => m.role === 'user');
  // Guard before the cast: without this, a missing synthesised user
  // message would surface as a TypeError on `userContent.find` below
  // instead of a clear assertion failure.
  expect(userMessage).toBeDefined();
  const userContent = userMessage?.content as Array<{
    type: string;
    image_url?: { url: string };
  }>;
  const img = userContent.find((p) => p.type === 'image_url');
  expect(img?.image_url?.url).toBe('data:image/png;base64,xxx');
});
it('should preserve prior embedded-media behavior when splitToolMedia is false (default) on parallel tool calls (issue #3616)', () => {
// Same input as the parallel-tool-calls split test, but with the flag
// off. Asserts that the opt-in is actually opt-in: media stays embedded
// in the tool message and no follow-up user message is synthesised.
const request: GenerateContentParameters = {
model: 'models/test',
contents: [
{
role: 'model',
parts: [
{ functionCall: { id: 'c1', name: 's1', args: {} } },
{ functionCall: { id: 'c2', name: 's2', args: {} } },
],
},
{
role: 'user',
parts: [
{
functionResponse: {
id: 'c1',
name: 's1',
response: { output: 'r1' },
parts: [
{ inlineData: { mimeType: 'image/png', data: 'aaa' } },
],
},
},
{
functionResponse: {
id: 'c2',
name: 's2',
response: { output: 'r2' },
},
},
],
},
],
};
// requestContext default has splitToolMedia undefined / false
const messages = converter.convertGeminiRequestToOpenAI(
request,
requestContext,
);
const toolMessages = messages.filter((m) => m.role === 'tool');
const userMessages = messages.filter((m) => m.role === 'user');
expect(toolMessages).toHaveLength(2);
expect(userMessages).toHaveLength(0);
// First tool message should still carry the embedded image — the legacy
// (permissive-provider) wire shape.
const firstToolContent = toolMessages[0].content as Array<{
type: string;
image_url?: { url: string };
}>;
const img = firstToolContent.find((p) => p.type === 'image_url');
expect(img?.image_url?.url).toBe('data:image/png;base64,aaa');
});
it('should convert function responses with fileData to tool message with embedded image_url', () => {
const request: GenerateContentParameters = {
model: 'models/test',

View file

@ -400,6 +400,14 @@ function processContent(
const reasoningParts: string[] = [];
const toolCalls: OpenAI.Chat.ChatCompletionMessageToolCall[] = [];
let toolCallIndex = 0;
// When `splitToolMedia` is enabled, media stripped from tool messages is
// accumulated here and emitted as a single follow-up user message after
// ALL tool messages in this group have been pushed. OpenAI Chat
// Completions requires every `role: "tool"` response for a given assistant
// turn to appear contiguously before any non-tool message; emitting the
// user message inline (after each tool message) would interleave and
// break that contract when multiple parallel tool calls return media.
const accumulatedSplitMedia: OpenAIContentPart[] = [];
for (const part of parts) {
if (typeof part === 'string') {
@ -441,11 +449,65 @@ function processContent(
requestContext,
);
if (toolMessage) {
// Opt-in only (ContentGeneratorConfig.splitToolMedia). OpenAI spec
// only permits string / text-part content on `role: "tool"` messages.
// Strict OpenAI-compatible servers (e.g. LM Studio) reject tool
// messages containing image_url / input_audio / video_url / file
// parts with HTTP 400 "Invalid 'messages' in payload". When the flag
// is set, strip non-text media from this tool message and accumulate
// it; the combined media is emitted as a single follow-up user
// message after the parts loop completes — preserving the
// "all tool responses contiguous" requirement for parallel tool
// calls. Default (flag false) preserves prior behavior: media is
// embedded in the tool message and permissive providers continue
// to receive it that way. See #3616.
if (
requestContext.splitToolMedia &&
Array.isArray(toolMessage.content)
) {
const mediaParts: OpenAIContentPart[] = [];
const textParts: OpenAI.Chat.ChatCompletionContentPartText[] = [];
for (const cp of toolMessage.content as OpenAIContentPart[]) {
if (
cp &&
(cp.type === 'image_url' ||
cp.type === 'input_audio' ||
cp.type === 'video_url' ||
cp.type === 'file')
) {
mediaParts.push(cp);
} else if (cp && cp.type === 'text') {
textParts.push(cp);
}
}
if (mediaParts.length > 0) {
const textOnly = textParts.map((p) => p.text).join('\n');
toolMessage.content =
textOnly || '[media attached in following user message]';
accumulatedSplitMedia.push(...mediaParts);
}
}
messages.push(toolMessage);
}
}
}
// Emit one combined user message containing all media stripped from the
// tool messages in this group. Runs after the parts loop so all tool
// messages remain contiguous (OpenAI requirement for parallel tool calls).
if (accumulatedSplitMedia.length > 0) {
messages.push({
role: 'user',
content: [
{
type: 'text',
text: '(attached media from previous tool call)',
},
...accumulatedSplitMedia,
] as unknown as OpenAI.Chat.ChatCompletionContentPartText[],
});
}
if (role === 'assistant') {
if (
contentParts.length === 0 &&

View file

@ -521,6 +521,7 @@ export class ContentGenerationPipeline {
model: effectiveModel,
modalities: this.contentGeneratorConfig.modalities ?? {},
startTime: Date.now(),
splitToolMedia: this.contentGeneratorConfig.splitToolMedia ?? false,
...(toolCallParser ? { toolCallParser } : {}),
};
}

View file

@ -18,6 +18,10 @@ export interface RequestContext {
modalities: InputModalities;
startTime: number;
toolCallParser?: StreamingToolCallParser;
// When true, media parts in tool-result messages are split into a follow-up
// user message for strict OpenAI-compat servers. See ContentGeneratorConfig
// for details.
splitToolMedia?: boolean;
}
export interface ErrorHandler {

View file

@ -30,6 +30,7 @@ export const MODEL_GENERATION_CONFIG_FIELDS = [
'customHeaders',
'extra_body',
'modalities',
'splitToolMedia',
] as const satisfies ReadonlyArray<keyof ContentGeneratorConfig>;
/**

View file

@ -38,6 +38,7 @@ export type ModelGenerationConfig = Pick<
| 'extra_body'
| 'contextWindowSize'
| 'modalities'
| 'splitToolMedia'
>;
/**

View file

@ -359,6 +359,11 @@
"type": "boolean",
"default": true
},
"splitToolMedia": {
"description": "When true, media (images / audio / video / files) returned by MCP tool calls is split into a follow-up user message instead of being embedded in the tool message. Required for strict OpenAI-compatible servers (e.g., LM Studio) that reject non-text content on `role: \"tool\"` messages with HTTP 400 \"Invalid 'messages' in payload\". Default false preserves the prior behavior for permissive providers. See QwenLM/qwen-code#3616.",
"type": "boolean",
"default": false
},
"schemaCompliance": {
"description": "The compliance mode for tool schemas sent to the model. Use \"openapi_30\" for strict OpenAPI 3.0 compatibility (e.g., for Gemini). Options: auto, openapi_30",
"enum": [