fix(core): split tool-result media into follow-up user message for strict OpenAI compat (#3617)

Fixes #3616.

Adds opt-in `splitToolMedia` flag (default false). When enabled, media parts (image / audio / video / file) returned by MCP tool calls are split into a follow-up `role: "user"` message instead of being embedded in the `role: "tool"` message. Required for strict OpenAI-compatible servers (e.g., LM Studio) that reject non-text content on tool messages with HTTP 400 "Invalid 'messages' in payload".

Media from parallel tool responses is accumulated and emitted as a single follow-up user message after all tool messages, preserving OpenAI's contiguity requirement for tool responses.

Default behavior is unchanged for permissive providers.
This commit is contained in:
Bramha.dev 2026-04-27 20:31:02 +05:30 committed by GitHub
parent 8a278767ed
commit 414b3304cd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 526 additions and 11 deletions

View file

@ -924,6 +924,17 @@ const SETTINGS_SCHEMA = {
parentKey: 'generationConfig',
showInDialog: false,
},
// Opt-in strict-OpenAI-compat flag: when enabled, media returned by MCP
// tool calls is moved out of `role: "tool"` messages into a follow-up
// user message. Default false keeps the prior embedded-media behavior
// for permissive providers. See QwenLM/qwen-code#3616.
splitToolMedia: {
type: 'boolean',
label: 'Split Tool Result Media',
category: 'Generation Configuration',
// Takes effect on the next request; no process restart needed.
requiresRestart: false,
default: false,
description:
'When true, media (images / audio / video / files) returned by MCP tool calls is split into a follow-up user message instead of being embedded in the tool message. Required for strict OpenAI-compatible servers (e.g., LM Studio) that reject non-text content on `role: "tool"` messages with HTTP 400 "Invalid \'messages\' in payload". Default false preserves the prior behavior for permissive providers. See QwenLM/qwen-code#3616.',
parentKey: 'generationConfig',
showInDialog: false,
},
schemaCompliance: {
type: 'enum',
label: 'Tool Schema Compliance',

View file

@ -1454,6 +1454,7 @@ export class Config {
this.contentGeneratorConfig.contextWindowSize = config.contextWindowSize;
this.contentGeneratorConfig.enableCacheControl =
config.enableCacheControl;
this.contentGeneratorConfig.splitToolMedia = config.splitToolMedia;
if ('model' in sources) {
this.contentGeneratorConfigSources['model'] = sources['model'];
@ -1470,6 +1471,10 @@ export class Config {
this.contentGeneratorConfigSources['contextWindowSize'] =
sources['contextWindowSize'];
}
if ('splitToolMedia' in sources) {
this.contentGeneratorConfigSources['splitToolMedia'] =
sources['splitToolMedia'];
}
return;
}

View file

@ -116,6 +116,15 @@ export type ContentGeneratorConfig = {
// Supported input modalities. Unsupported media types are replaced with text
// placeholders. Leave undefined to use automatic detection from model name.
modalities?: InputModalities;
// When true, media parts in MCP tool responses are split into a follow-up
// `role: "user"` message instead of being embedded inside the `role: "tool"`
// message. The OpenAI Chat Completions spec only permits string / text-part
// content on tool messages; strict OpenAI-compatible servers (notably
// LM Studio) reject anything else with HTTP 400 "Invalid 'messages' in
// payload". Enable this for any provider that strictly validates tool
// message content. Default: false (preserves prior behavior for permissive
// providers). See QwenLM/qwen-code#3616.
splitToolMedia?: boolean;
};
// Keep the public ContentGeneratorConfigSources API, but reuse the generic

View file

@ -382,6 +382,422 @@ describe('OpenAIContentConverter', () => {
expect(userMessage).toBeUndefined();
});
it('should split tool-result media into a follow-up user message when splitToolMedia is enabled (issue #3616)', () => {
// Same shape as the embedded-image test above, but with the strict
// OpenAI-compat opt-in flag set. The tool message must stay
// spec-compliant (string / text-part content only) and the image must
// arrive in a follow-up user message.
const request: GenerateContentParameters = {
model: 'models/test',
contents: [
{
role: 'model',
parts: [
{
functionCall: {
id: 'call_1',
name: 'Read',
args: {},
},
},
],
},
{
role: 'user',
parts: [
{
functionResponse: {
id: 'call_1',
name: 'Read',
response: { output: 'Image content' },
parts: [
{
inlineData: {
mimeType: 'image/png',
data: 'base64encodedimagedata',
},
},
],
},
},
],
},
],
};
// Opt into the strict-compat behavior under test; all other context
// fields come from the shared describe-scope fixture.
const strictContext: RequestContext = {
...requestContext,
splitToolMedia: true,
};
const messages = converter.convertGeminiRequestToOpenAI(
request,
strictContext,
);
const toolMessage = messages.find((m) => m.role === 'tool');
expect(toolMessage).toBeDefined();
// Tool message content is a plain string (or text-part array) — no media
expect(typeof toolMessage?.content === 'string').toBe(true);
expect(toolMessage?.content).toContain('Image content');
// The image lives in a follow-up user message
const userMessage = messages.find((m) => m.role === 'user');
expect(userMessage).toBeDefined();
// Cast narrows the OpenAI content union so the image_url part can be
// inspected structurally.
const userContent = userMessage?.content as Array<{
type: string;
text?: string;
image_url?: { url: string };
}>;
expect(Array.isArray(userContent)).toBe(true);
const imagePart = userContent.find((p) => p.type === 'image_url');
expect(imagePart?.image_url?.url).toBe(
'data:image/png;base64,base64encodedimagedata',
);
});
it('should keep all tool messages contiguous and merge split media into a single follow-up user message for parallel tool calls (issue #3616)', () => {
// Two assistant tool calls in parallel. Both responses come back in the
// same `user` content as separate functionResponse parts. The first
// returns an image; the second returns text only. OpenAI Chat
// Completions requires every `role: "tool"` response to appear
// contiguously before any non-tool message, so the synthesised user
// message carrying split media MUST come after BOTH tool messages,
// not interleaved between them.
const request: GenerateContentParameters = {
model: 'models/test',
contents: [
{
role: 'model',
parts: [
{
functionCall: {
id: 'call_screenshot',
name: 'browser_take_screenshot',
args: {},
},
},
{
functionCall: {
id: 'call_console',
name: 'browser_console_messages',
args: {},
},
},
],
},
{
role: 'user',
parts: [
{
functionResponse: {
id: 'call_screenshot',
name: 'browser_take_screenshot',
response: { output: 'Captured screenshot' },
parts: [
{
inlineData: {
mimeType: 'image/png',
data: 'shotbase64',
},
},
],
},
},
{
functionResponse: {
id: 'call_console',
name: 'browser_console_messages',
response: { output: 'no console messages' },
},
},
],
},
],
};
const strictContext: RequestContext = {
...requestContext,
splitToolMedia: true,
};
const messages = converter.convertGeminiRequestToOpenAI(
request,
strictContext,
);
// Locate the assistant turn (with the two tool calls) and assert that
// the next two messages are both `tool`, contiguously, before any
// user message.
const assistantIdx = messages.findIndex((m) => m.role === 'assistant');
expect(assistantIdx).toBeGreaterThanOrEqual(0);
expect(messages[assistantIdx + 1]?.role).toBe('tool');
expect(messages[assistantIdx + 2]?.role).toBe('tool');
expect(messages[assistantIdx + 3]?.role).toBe('user');
// Both tool messages have spec-compliant content (string OR array of
// text-typed parts only — no image_url / input_audio / video_url /
// file parts allowed by OpenAI on tool messages).
const isSpecCompliantToolContent = (content: unknown): boolean => {
if (typeof content === 'string') return true;
if (!Array.isArray(content)) return false;
return (content as Array<{ type: string }>).every(
(p) => p.type === 'text',
);
};
expect(
isSpecCompliantToolContent(
(messages[assistantIdx + 1] as { content: unknown }).content,
),
).toBe(true);
expect(
isSpecCompliantToolContent(
(messages[assistantIdx + 2] as { content: unknown }).content,
),
).toBe(true);
// Exactly one synthesised user message exists, and it carries the
// single image from the first tool response. (The original request's
// functionResponse `user` content is converted into tool messages, so
// the only surviving user message is the synthesised one.)
const userMessages = messages.filter((m) => m.role === 'user');
expect(userMessages).toHaveLength(1);
const userContent = userMessages[0].content as Array<{
type: string;
text?: string;
image_url?: { url: string };
}>;
const imageParts = userContent.filter((p) => p.type === 'image_url');
expect(imageParts).toHaveLength(1);
expect(imageParts[0].image_url?.url).toBe(
'data:image/png;base64,shotbase64',
);
});
it('should merge media from multiple media-bearing parallel tool responses into one follow-up user message (issue #3616)', () => {
  // Both parallel tool responses carry an image. The converter must fold
  // every stripped media part into a SINGLE synthesised user message;
  // emitting one user message per tool response would split the tool
  // messages apart and break OpenAI's contiguity rule.
  const imageResponse = (
    id: string,
    name: string,
    output: string,
    data: string,
  ) => ({
    functionResponse: {
      id,
      name,
      response: { output },
      parts: [{ inlineData: { mimeType: 'image/png', data } }],
    },
  });
  const request: GenerateContentParameters = {
    model: 'models/test',
    contents: [
      {
        role: 'model',
        parts: [
          { functionCall: { id: 'call_a', name: 'shot_a', args: {} } },
          { functionCall: { id: 'call_b', name: 'shot_b', args: {} } },
        ],
      },
      {
        role: 'user',
        parts: [
          imageResponse('call_a', 'shot_a', 'A', 'aaa'),
          imageResponse('call_b', 'shot_b', 'B', 'bbb'),
        ],
      },
    ],
  };
  // Opt into the strict-compat splitting behavior under test.
  const strictContext: RequestContext = {
    ...requestContext,
    splitToolMedia: true,
  };
  const result = converter.convertGeminiRequestToOpenAI(
    request,
    strictContext,
  );
  expect(result.filter((m) => m.role === 'tool')).toHaveLength(2);
  const synthesised = result.filter((m) => m.role === 'user');
  expect(synthesised).toHaveLength(1);
  // Narrow the content union to inspect the accumulated image parts, in
  // tool-response order.
  const parts = synthesised[0].content as Array<{
    type: string;
    text?: string;
    image_url?: { url: string };
  }>;
  const urls = parts
    .filter((p) => p.type === 'image_url')
    .map((p) => p.image_url?.url);
  expect(urls).toEqual([
    'data:image/png;base64,aaa',
    'data:image/png;base64,bbb',
  ]);
});
it('should not synthesise a follow-up user message when splitToolMedia is enabled but the response has no media (issue #3616)', () => {
  // Regression guard: a text-only tool response must not trigger the
  // media-splitting path even with the flag on. A future refactor that
  // unconditionally emitted the follow-up user message would fail here.
  const request: GenerateContentParameters = {
    model: 'models/test',
    contents: [
      {
        role: 'model',
        parts: [{ functionCall: { id: 'c', name: 'echo', args: {} } }],
      },
      {
        role: 'user',
        parts: [
          {
            functionResponse: {
              id: 'c',
              name: 'echo',
              response: { output: 'plain text result' },
            },
          },
        ],
      },
    ],
  };
  // Flag enabled, but the response above is text-only.
  const strictContext: RequestContext = {
    ...requestContext,
    splitToolMedia: true,
  };
  const messages = converter.convertGeminiRequestToOpenAI(
    request,
    strictContext,
  );
  const byRole = (role: string) => messages.filter((m) => m.role === role);
  expect(byRole('tool')).toHaveLength(1);
  expect(byRole('user')).toHaveLength(0);
});
it('should fall back to a placeholder string when the tool response is media-only (issue #3616)', () => {
  // When extractFunctionResponseContent returns empty AND parts contain
  // only media, the tool message must end up with the placeholder string
  // rather than an empty array (which would be invalid spec).
  const request: GenerateContentParameters = {
    model: 'models/test',
    contents: [
      {
        role: 'model',
        parts: [{ functionCall: { id: 'c', name: 'shot', args: {} } }],
      },
      {
        role: 'user',
        parts: [
          {
            functionResponse: {
              id: 'c',
              name: 'shot',
              // null response triggers extractFunctionResponseContent
              // to return "" — the empty-text branch we want to cover.
              response: null as unknown as Record<string, unknown>,
              parts: [
                { inlineData: { mimeType: 'image/png', data: 'xxx' } },
              ],
            },
          },
        ],
      },
    ],
  };
  const strictContext: RequestContext = {
    ...requestContext,
    splitToolMedia: true,
  };
  const messages = converter.convertGeminiRequestToOpenAI(
    request,
    strictContext,
  );
  const toolMessage = messages.find((m) => m.role === 'tool');
  expect(toolMessage).toBeDefined();
  expect(toolMessage?.content).toBe(
    '[media attached in following user message]',
  );
  const userMessage = messages.find((m) => m.role === 'user');
  // Guard before the cast: without this, a missing synthesised user
  // message would surface as a TypeError on `userContent.find` below
  // instead of a clear assertion failure.
  expect(userMessage).toBeDefined();
  const userContent = userMessage?.content as Array<{
    type: string;
    image_url?: { url: string };
  }>;
  const img = userContent.find((p) => p.type === 'image_url');
  expect(img?.image_url?.url).toBe('data:image/png;base64,xxx');
});
it('should preserve prior embedded-media behavior when splitToolMedia is false (default) on parallel tool calls (issue #3616)', () => {
// Same input as the parallel-tool-calls split test, but with the flag
// off. Asserts that the opt-in is actually opt-in: media stays embedded
// in the tool message and no follow-up user message is synthesised.
const request: GenerateContentParameters = {
model: 'models/test',
contents: [
{
role: 'model',
parts: [
{ functionCall: { id: 'c1', name: 's1', args: {} } },
{ functionCall: { id: 'c2', name: 's2', args: {} } },
],
},
{
role: 'user',
parts: [
{
functionResponse: {
id: 'c1',
name: 's1',
response: { output: 'r1' },
parts: [
{ inlineData: { mimeType: 'image/png', data: 'aaa' } },
],
},
},
{
functionResponse: {
id: 'c2',
name: 's2',
response: { output: 'r2' },
},
},
],
},
],
};
// requestContext default has splitToolMedia undefined / false
const messages = converter.convertGeminiRequestToOpenAI(
request,
requestContext,
);
const toolMessages = messages.filter((m) => m.role === 'tool');
const userMessages = messages.filter((m) => m.role === 'user');
expect(toolMessages).toHaveLength(2);
expect(userMessages).toHaveLength(0);
// First tool message should still carry the embedded image — the legacy
// (permissive-provider) wire shape.
const firstToolContent = toolMessages[0].content as Array<{
type: string;
image_url?: { url: string };
}>;
const img = firstToolContent.find((p) => p.type === 'image_url');
expect(img?.image_url?.url).toBe('data:image/png;base64,aaa');
});
it('should convert function responses with fileData to tool message with embedded image_url', () => {
const request: GenerateContentParameters = {
model: 'models/test',

View file

@ -400,6 +400,14 @@ function processContent(
const reasoningParts: string[] = [];
const toolCalls: OpenAI.Chat.ChatCompletionMessageToolCall[] = [];
let toolCallIndex = 0;
// When `splitToolMedia` is enabled, media stripped from tool messages is
// accumulated here and emitted as a single follow-up user message after
// ALL tool messages in this group have been pushed. OpenAI Chat
// Completions requires every `role: "tool"` response for a given assistant
// turn to appear contiguously before any non-tool message; emitting the
// user message inline (after each tool message) would interleave and
// break that contract when multiple parallel tool calls return media.
const accumulatedSplitMedia: OpenAIContentPart[] = [];
for (const part of parts) {
if (typeof part === 'string') {
@ -441,11 +449,65 @@ function processContent(
requestContext,
);
if (toolMessage) {
// Opt-in only (ContentGeneratorConfig.splitToolMedia). OpenAI spec
// only permits string / text-part content on `role: "tool"` messages.
// Strict OpenAI-compatible servers (e.g. LM Studio) reject tool
// messages containing image_url / input_audio / video_url / file
// parts with HTTP 400 "Invalid 'messages' in payload". When the flag
// is set, strip non-text media from this tool message and accumulate
// it; the combined media is emitted as a single follow-up user
// message after the parts loop completes — preserving the
// "all tool responses contiguous" requirement for parallel tool
// calls. Default (flag false) preserves prior behavior: media is
// embedded in the tool message and permissive providers continue
// to receive it that way. See #3616.
if (
requestContext.splitToolMedia &&
Array.isArray(toolMessage.content)
) {
const mediaParts: OpenAIContentPart[] = [];
const textParts: OpenAI.Chat.ChatCompletionContentPartText[] = [];
for (const cp of toolMessage.content as OpenAIContentPart[]) {
if (
cp &&
(cp.type === 'image_url' ||
cp.type === 'input_audio' ||
cp.type === 'video_url' ||
cp.type === 'file')
) {
mediaParts.push(cp);
} else if (cp && cp.type === 'text') {
textParts.push(cp);
}
}
if (mediaParts.length > 0) {
const textOnly = textParts.map((p) => p.text).join('\n');
toolMessage.content =
textOnly || '[media attached in following user message]';
accumulatedSplitMedia.push(...mediaParts);
}
}
messages.push(toolMessage);
}
}
}
// Emit one combined user message containing all media stripped from the
// tool messages in this group. Runs after the parts loop so all tool
// messages remain contiguous (OpenAI requirement for parallel tool calls).
if (accumulatedSplitMedia.length > 0) {
messages.push({
role: 'user',
content: [
{
type: 'text',
text: '(attached media from previous tool call)',
},
...accumulatedSplitMedia,
] as unknown as OpenAI.Chat.ChatCompletionContentPartText[],
});
}
if (role === 'assistant') {
if (
contentParts.length === 0 &&

View file

@ -521,6 +521,7 @@ export class ContentGenerationPipeline {
model: effectiveModel,
modalities: this.contentGeneratorConfig.modalities ?? {},
startTime: Date.now(),
splitToolMedia: this.contentGeneratorConfig.splitToolMedia ?? false,
...(toolCallParser ? { toolCallParser } : {}),
};
}

View file

@ -18,6 +18,10 @@ export interface RequestContext {
modalities: InputModalities;
startTime: number;
toolCallParser?: StreamingToolCallParser;
// When true, media parts in tool-result messages are split into a follow-up
// user message for strict OpenAI-compat servers. See ContentGeneratorConfig
// for details.
splitToolMedia?: boolean;
}
export interface ErrorHandler {

View file

@ -30,6 +30,7 @@ export const MODEL_GENERATION_CONFIG_FIELDS = [
'customHeaders',
'extra_body',
'modalities',
'splitToolMedia',
] as const satisfies ReadonlyArray<keyof ContentGeneratorConfig>;
/**

View file

@ -38,6 +38,7 @@ export type ModelGenerationConfig = Pick<
| 'extra_body'
| 'contextWindowSize'
| 'modalities'
| 'splitToolMedia'
>;
/**

View file

@ -359,6 +359,11 @@
"type": "boolean",
"default": true
},
"splitToolMedia": {
"description": "When true, media (images / audio / video / files) returned by MCP tool calls is split into a follow-up user message instead of being embedded in the tool message. Required for strict OpenAI-compatible servers (e.g., LM Studio) that reject non-text content on `role: \"tool\"` messages with HTTP 400 \"Invalid 'messages' in payload\". Default false preserves the prior behavior for permissive providers. See QwenLM/qwen-code#3616.",
"type": "boolean",
"default": false
},
"schemaCompliance": {
"description": "The compliance mode for tool schemas sent to the model. Use \"openapi_30\" for strict OpenAPI 3.0 compatibility (e.g., for Gemini). Options: auto, openapi_30",
"enum": [