From b9a0d904de82070a4ac6594c9aa6bc7853cfe5ba Mon Sep 17 00:00:00 2001
From: tanzhenxin
Date: Wed, 21 Jan 2026 15:44:58 +0800
Subject: [PATCH] feat: add multi-modal input support (image, PDF, audio) across all content generators

---
 .gitignore | 2 +-
 .../src/ui/components/InputPrompt.test.tsx | 4 +-
 packages/cli/src/ui/utils/clipboardUtils.ts | 4 +-
 .../converter.test.ts | 232 ++++++++++
 .../anthropicContentGenerator/converter.ts | 260 ++++++-----
 .../core/src/core/coreToolScheduler.test.ts | 18 +-
 packages/core/src/core/coreToolScheduler.ts | 38 +-
 .../core/nonInteractiveToolExecutor.test.ts | 6 +-
 .../openaiContentGenerator/converter.test.ts | 411 ++++++++++++++++-
 .../core/openaiContentGenerator/converter.ts | 428 +++++++++---------
 .../core/openaiContentGenerator/pipeline.ts | 24 +-
 .../provider/dashscope.test.ts | 21 +-
 .../provider/dashscope.ts | 28 +-
 packages/core/src/tools/read-file.test.ts | 6 +-
 .../core/src/tools/read-many-files.test.ts | 16 +-
 packages/core/src/utils/fileUtils.test.ts | 15 +-
 packages/core/src/utils/fileUtils.ts | 17 +-
 packages/core/src/utils/pathReader.test.ts | 2 +
 18 files changed, 1104 insertions(+), 428 deletions(-)

diff --git a/.gitignore b/.gitignore
index 705216c80..450168925 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,7 +12,7 @@
 !.gemini/config.yaml
 !.gemini/commands/

-# Note: .gemini-clipboard/ is NOT in gitignore so Gemini can access pasted images
+# Note: .qwen-clipboard/ is NOT in gitignore so Qwen Code can access pasted images

 # Dependency directory
 node_modules
diff --git a/packages/cli/src/ui/components/InputPrompt.test.tsx b/packages/cli/src/ui/components/InputPrompt.test.tsx
index de4cd1dee..4c604c37e 100644
--- a/packages/cli/src/ui/components/InputPrompt.test.tsx
+++ b/packages/cli/src/ui/components/InputPrompt.test.tsx
@@ -376,7 +376,7 @@ describe('InputPrompt', () => {
 it('should handle Ctrl+V when clipboard has an image', async () => {
 vi.mocked(clipboardUtils.clipboardHasImage).mockResolvedValue(true);
 vi.mocked(clipboardUtils.saveClipboardImage).mockResolvedValue(
- '/test/.gemini-clipboard/clipboard-123.png',
+ '/test/.qwen-clipboard/clipboard-123.png',
 );

 const { stdin, unmount } = renderWithProviders(
@@ -436,7 +436,7 @@ describe('InputPrompt', () => {
 it('should insert image path at cursor position with proper spacing', async () => {
 const imagePath = path.join(
 'test',
- '.gemini-clipboard',
+ '.qwen-clipboard',
 'clipboard-456.png',
 );
 vi.mocked(clipboardUtils.clipboardHasImage).mockResolvedValue(true);
diff --git a/packages/cli/src/ui/utils/clipboardUtils.ts b/packages/cli/src/ui/utils/clipboardUtils.ts
index f6d2380b9..513b5fddc 100644
--- a/packages/cli/src/ui/utils/clipboardUtils.ts
+++ b/packages/cli/src/ui/utils/clipboardUtils.ts
@@ -44,7 +44,7 @@ export async function saveClipboardImage(
 // Create a temporary directory for clipboard images within the target directory
 // This avoids security restrictions on paths outside the target directory
 const baseDir = targetDir || process.cwd();
- const tempDir = path.join(baseDir, '.gemini-clipboard');
+ const tempDir = path.join(baseDir, '.qwen-clipboard');

 await fs.mkdir(tempDir, { recursive: true });

 // Generate a unique filename with timestamp
@@ -120,7 +120,7 @@ export async function cleanupOldClipboardImages(
 ): Promise<void> {
 try {
 const baseDir = targetDir || process.cwd();
- const tempDir = path.join(baseDir, '.gemini-clipboard');
+ const tempDir = path.join(baseDir, '.qwen-clipboard');

 const files = await fs.readdir(tempDir);
 const oneHourAgo = Date.now() - 60 * 60 * 1000;
diff 
--git a/packages/core/src/core/anthropicContentGenerator/converter.test.ts b/packages/core/src/core/anthropicContentGenerator/converter.test.ts index f2ab79411..5b3316886 100644 --- a/packages/core/src/core/anthropicContentGenerator/converter.test.ts +++ b/packages/core/src/core/anthropicContentGenerator/converter.test.ts @@ -208,6 +208,238 @@ describe('AnthropicContentConverter', () => { ], }); }); + + it('converts function response with inlineData image parts into tool_result with images', () => { + const { messages } = converter.convertGeminiRequestToAnthropic({ + model: 'models/test', + contents: [ + { + role: 'user', + parts: [ + { + functionResponse: { + id: 'call-1', + name: 'Read', + response: { output: 'Image content' }, + parts: [ + { + inlineData: { + mimeType: 'image/png', + data: 'base64encodeddata', + }, + }, + ], + }, + }, + ], + }, + ], + }); + + expect(messages).toEqual([ + { + role: 'user', + content: [ + { + type: 'tool_result', + tool_use_id: 'call-1', + content: [ + { type: 'text', text: 'Image content' }, + { + type: 'image', + source: { + type: 'base64', + media_type: 'image/png', + data: 'base64encodeddata', + }, + }, + ], + }, + ], + }, + ]); + }); + + it('renders non-image inlineData as a text block (avoids invalid image media_type)', () => { + const { messages } = converter.convertGeminiRequestToAnthropic({ + model: 'models/test', + contents: [ + { + role: 'user', + parts: [ + { + functionResponse: { + id: 'call-1', + name: 'Read', + response: { output: 'Audio content' }, + parts: [ + { + inlineData: { + mimeType: 'audio/mpeg', + data: 'base64encodedaudiodata', + }, + }, + ], + }, + }, + ], + }, + ], + }); + + expect(messages).toHaveLength(1); + expect(messages[0]?.role).toBe('user'); + + const toolResult = messages[0]?.content?.[0] as { + type: string; + content: Array<{ type: string; text?: string }>; + }; + expect(toolResult.type).toBe('tool_result'); + expect(Array.isArray(toolResult.content)).toBe(true); + expect(toolResult.content[0]).toEqual({ + type: 'text', + text: 'Audio content', + }); + expect(toolResult.content[1]?.type).toBe('text'); + expect(toolResult.content[1]?.text).toContain( + 'Unsupported inline media type for Anthropic', + ); + expect(toolResult.content[1]?.text).toContain('audio/mpeg'); + }); + + it('converts fileData with PDF into document block', () => { + const { messages } = converter.convertGeminiRequestToAnthropic({ + model: 'models/test', + contents: [ + { + role: 'user', + parts: [ + { + functionResponse: { + id: 'call-1', + name: 'Read', + response: { output: 'PDF content' }, + parts: [ + { + fileData: { + mimeType: 'application/pdf', + fileUri: 'pdfbase64data', + }, + }, + ], + }, + }, + ], + }, + ], + }); + + expect(messages).toEqual([ + { + role: 'user', + content: [ + { + type: 'tool_result', + tool_use_id: 'call-1', + content: [ + { type: 'text', text: 'PDF content' }, + { + type: 'document', + source: { + type: 'base64', + media_type: 'application/pdf', + data: 'pdfbase64data', + }, + }, + ], + }, + ], + }, + ]); + }); + + it('associates each image with its preceding functionResponse', () => { + const { messages } = converter.convertGeminiRequestToAnthropic({ + model: 'models/test', + contents: [ + { + role: 'user', + parts: [ + // Tool 1 with image 1 + { + functionResponse: { + id: 'call-1', + name: 'Read', + response: { output: 'File 1' }, + parts: [ + { + inlineData: { + mimeType: 'image/png', + data: 'image1data', + }, + }, + ], + }, + }, + // Tool 2 with image 2 + { + functionResponse: { + id: 'call-2', + name: 'Read', 
+ response: { output: 'File 2' },
+ parts: [
+ {
+ inlineData: {
+ mimeType: 'image/jpeg',
+ data: 'image2data',
+ },
+ },
+ ],
+ },
+ },
+ ],
+ },
+ ],
+ });
+
+ // Multiple tool_result blocks are emitted in order
+ expect(messages).toHaveLength(1);
+ expect(messages[0]).toEqual({
+ role: 'user',
+ content: [
+ {
+ type: 'tool_result',
+ tool_use_id: 'call-1',
+ content: [
+ { type: 'text', text: 'File 1' },
+ {
+ type: 'image',
+ source: {
+ type: 'base64',
+ media_type: 'image/png',
+ data: 'image1data',
+ },
+ },
+ ],
+ },
+ {
+ type: 'tool_result',
+ tool_use_id: 'call-2',
+ content: [
+ { type: 'text', text: 'File 2' },
+ {
+ type: 'image',
+ source: {
+ type: 'base64',
+ media_type: 'image/jpeg',
+ data: 'image2data',
+ },
+ },
+ ],
+ },
+ ],
+ });
+ });
 });

 describe('convertGeminiToolsToAnthropic', () => {
diff --git a/packages/core/src/core/anthropicContentGenerator/converter.ts b/packages/core/src/core/anthropicContentGenerator/converter.ts
index 2fb9b7fee..48e04d798 100644
--- a/packages/core/src/core/anthropicContentGenerator/converter.ts
+++ b/packages/core/src/core/anthropicContentGenerator/converter.ts
@@ -10,7 +10,6 @@ import type {
 Content,
 ContentListUnion,
 ContentUnion,
- FunctionCall,
 FunctionResponse,
 GenerateContentParameters,
 Part,
@@ -30,15 +29,6 @@ type AnthropicMessageParam = Anthropic.MessageParam;
 type AnthropicToolParam = Anthropic.Tool;
 type AnthropicContentBlockParam = Anthropic.ContentBlockParam;

-type ThoughtPart = { text: string; signature?: string };
-
-interface ParsedParts {
- thoughtParts: ThoughtPart[];
- contentParts: string[];
- functionCalls: FunctionCall[];
- functionResponses: FunctionResponse[];
-}
-
 export class AnthropicContentConverter {
 private model: string;
 private schemaCompliance: SchemaComplianceMode;
@@ -228,127 +218,161 @@ export class AnthropicContentConverter {
 }

 if (!this.isContentObject(content)) return;
-
- const parsed = this.parseParts(content.parts || []);
-
- if (parsed.functionResponses.length > 0) {
- for (const response of parsed.functionResponses) {
- messages.push({
- role: 'user',
- content: [
- {
- type: 'tool_result',
- tool_use_id: response.id || '',
- content: this.extractFunctionResponseContent(response.response),
- },
- ],
- });
- }
- return;
- }
-
- if (content.role === 'model' && parsed.functionCalls.length > 0) {
- const thinkingBlocks: AnthropicContentBlockParam[] =
- parsed.thoughtParts.map((part) => {
- const thinkingBlock: unknown = {
- type: 'thinking',
- thinking: part.text,
- };
- if (part.signature) {
- (thinkingBlock as { signature?: string }).signature =
- part.signature;
- }
- return thinkingBlock as AnthropicContentBlockParam;
- });
- const toolUses: AnthropicContentBlockParam[] = parsed.functionCalls.map(
- (call, index) => ({
- type: 'tool_use',
- id: call.id || `tool_${index}`,
- name: call.name || '',
- input: (call.args as Record<string, unknown>) || {},
- }),
- );
-
- const textBlocks: AnthropicContentBlockParam[] = parsed.contentParts.map(
- (text) => ({
- type: 'text' as const,
- text,
- }),
- );
-
- messages.push({
- role: 'assistant',
- content: [...thinkingBlocks, ...textBlocks, ...toolUses],
- });
- return;
- }
-
+ const parts = content.parts || [];
 const role = content.role === 'model' ? 'assistant' : 'user';
- const thinkingBlocks: AnthropicContentBlockParam[] =
- role === 'assistant'
- ? 
parsed.thoughtParts.map((part) => {
- const thinkingBlock: unknown = {
- type: 'thinking',
- thinking: part.text,
- };
- if (part.signature) {
- (thinkingBlock as { signature?: string }).signature =
- part.signature;
- }
- return thinkingBlock as AnthropicContentBlockParam;
- })
- : [];
- const textBlocks: AnthropicContentBlockParam[] = [
- ...thinkingBlocks,
- ...parsed.contentParts.map((text) => ({
- type: 'text' as const,
- text,
- })),
- ];
- if (textBlocks.length > 0) {
- messages.push({ role, content: textBlocks });
- }
- }
-
- private parseParts(parts: Part[]): ParsedParts {
- const thoughtParts: ThoughtPart[] = [];
- const contentParts: string[] = [];
- const functionCalls: FunctionCall[] = [];
- const functionResponses: FunctionResponse[] = [];
+ const contentBlocks: AnthropicContentBlockParam[] = [];
+ let toolCallIndex = 0;

 for (const part of parts) {
 if (typeof part === 'string') {
- contentParts.push(part);
- } else if (
- 'text' in part &&
- part.text &&
- !('thought' in part && part.thought)
- ) {
- contentParts.push(part.text);
- } else if ('text' in part && 'thought' in part && part.thought) {
- thoughtParts.push({
- text: part.text || '',
- signature:
+ contentBlocks.push({ type: 'text', text: part });
+ continue;
+ }
+
+ if ('text' in part && 'thought' in part && part.thought) {
+ if (role === 'assistant') {
+ const thinkingBlock: unknown = {
+ type: 'thinking',
+ thinking: part.text || '',
+ };
+ if (
 'thoughtSignature' in part &&
 typeof part.thoughtSignature === 'string'
- ? part.thoughtSignature
- : undefined,
- });
- } else if ('functionCall' in part && part.functionCall) {
- functionCalls.push(part.functionCall);
- } else if ('functionResponse' in part && part.functionResponse) {
- functionResponses.push(part.functionResponse);
+ ) {
+ (thinkingBlock as { signature?: string }).signature =
+ part.thoughtSignature;
+ }
+ contentBlocks.push(thinkingBlock as AnthropicContentBlockParam);
+ }
+ }
+
+ if ('text' in part && part.text && !('thought' in part && part.thought)) {
+ contentBlocks.push({ type: 'text', text: part.text });
+ }
+
+ const mediaBlock = this.createMediaBlockFromPart(part);
+ if (mediaBlock) {
+ contentBlocks.push(mediaBlock);
+ }
+
+ if ('functionCall' in part && part.functionCall) {
+ if (role === 'assistant') {
+ contentBlocks.push({
+ type: 'tool_use',
+ id: part.functionCall.id || `tool_${toolCallIndex}`,
+ name: part.functionCall.name || '',
+ input: (part.functionCall.args as Record<string, unknown>) || {},
+ });
+ toolCallIndex += 1;
+ }
+ }
+
+ if (part.functionResponse) {
+ const toolResultBlock = this.createToolResultBlock(
+ part.functionResponse,
+ );
+ if (toolResultBlock && role === 'user') {
+ contentBlocks.push(toolResultBlock);
+ }
+ }
 }

+ if (contentBlocks.length > 0) {
+ messages.push({ role, content: contentBlocks });
+ }
+ }
+
+ private createToolResultBlock(
+ response: FunctionResponse,
+ ): Anthropic.ToolResultBlockParam | null {
+ const textContent = this.extractFunctionResponseContent(response.response);
+
+ type ToolResultContent = Anthropic.ToolResultBlockParam['content'];
+ const partBlocks: AnthropicContentBlockParam[] = [];
+
+ for (const part of response.parts || []) {
+ const block = this.createMediaBlockFromPart(part);
+ if (block) {
+ partBlocks.push(block);
+ }
+ }
+
+ let content: ToolResultContent;
+ if (partBlocks.length > 0) {
+ const blocks: AnthropicContentBlockParam[] = [];
+ if (textContent) {
+ blocks.push({ type: 'text', text: textContent });
+ }
+ blocks.push(...partBlocks);
+ content = blocks as unknown as 
ToolResultContent; + } else { + content = textContent; + } + return { - thoughtParts, - contentParts, - functionCalls, - functionResponses, + type: 'tool_result', + tool_use_id: response.id || '', + content, }; } + private createMediaBlockFromPart( + part: Part, + ): AnthropicContentBlockParam | null { + if (part.inlineData?.mimeType && part.inlineData?.data) { + if (!this.isSupportedAnthropicImageMimeType(part.inlineData.mimeType)) { + const displayName = part.inlineData.displayName ?? ''; + return { + type: 'text', + text: `Unsupported inline media type for Anthropic: ${part.inlineData.mimeType}${displayName}.`, + }; + } + return { + type: 'image', + source: { + type: 'base64', + media_type: part.inlineData.mimeType as + | 'image/jpeg' + | 'image/png' + | 'image/gif' + | 'image/webp', + data: part.inlineData.data, + }, + }; + } + + if (part.fileData?.mimeType && part.fileData?.fileUri) { + if (part.fileData.mimeType !== 'application/pdf') { + const displayName = part.fileData.displayName ?? ''; + return { + type: 'text', + text: `Unsupported file media for Anthropic: ${part.fileData.mimeType}${displayName}`, + }; + } + return { + type: 'document', + source: { + type: 'base64', + media_type: part.fileData.mimeType as 'application/pdf', + data: part.fileData.fileUri, + }, + }; + } + + return null; + } + + private isSupportedAnthropicImageMimeType( + mimeType: string, + ): mimeType is 'image/jpeg' | 'image/png' | 'image/gif' | 'image/webp' { + return ( + mimeType === 'image/jpeg' || + mimeType === 'image/png' || + mimeType === 'image/gif' || + mimeType === 'image/webp' + ); + } + private extractTextFromContentUnion(contentUnion: unknown): string { if (typeof contentUnion === 'string') { return contentUnion; diff --git a/packages/core/src/core/coreToolScheduler.test.ts b/packages/core/src/core/coreToolScheduler.test.ts index 1cf3c565c..32d390173 100644 --- a/packages/core/src/core/coreToolScheduler.test.ts +++ b/packages/core/src/core/coreToolScheduler.test.ts @@ -800,11 +800,11 @@ describe('convertToFunctionResponse', () => { name: toolName, id: callId, response: { - output: 'Binary content of type image/png was processed.', + output: '', }, + parts: [{ inlineData: { mimeType: 'image/png', data: 'base64...' } }], }, }, - llmContent, ]); }); @@ -819,11 +819,15 @@ describe('convertToFunctionResponse', () => { name: toolName, id: callId, response: { - output: 'Binary content of type application/pdf was processed.', + output: '', }, + parts: [ + { + fileData: { mimeType: 'application/pdf', fileUri: 'gs://...' }, + }, + ], }, }, - llmContent, ]); }); @@ -857,11 +861,13 @@ describe('convertToFunctionResponse', () => { name: toolName, id: callId, response: { - output: 'Binary content of type image/gif was processed.', + output: '', }, + parts: [ + { inlineData: { mimeType: 'image/gif', data: 'gifdata...' 
} }, + ], }, }, - ...llmContent, ]); }); diff --git a/packages/core/src/core/coreToolScheduler.ts b/packages/core/src/core/coreToolScheduler.ts index c7e2806ac..5f7f5d490 100644 --- a/packages/core/src/core/coreToolScheduler.ts +++ b/packages/core/src/core/coreToolScheduler.ts @@ -30,7 +30,12 @@ import { ToolOutputTruncatedEvent, InputFormat, } from '../index.js'; -import type { Part, PartListUnion } from '@google/genai'; +import type { + FunctionResponse, + FunctionResponsePart, + Part, + PartListUnion, +} from '@google/genai'; import { getResponseTextFromParts } from '../utils/generateContentResponseUtilities.js'; import type { ModifyContext } from '../tools/modifiable-tool.js'; import { @@ -151,13 +156,17 @@ function createFunctionResponsePart( callId: string, toolName: string, output: string, + mediaParts?: FunctionResponsePart[], ): Part { + const functionResponse: FunctionResponse = { + id: callId, + name: toolName, + response: { output }, + ...(mediaParts && mediaParts.length > 0 ? { parts: mediaParts } : {}), + }; + return { - functionResponse: { - id: callId, - name: toolName, - response: { output }, - }, + functionResponse, }; } @@ -198,16 +207,21 @@ export function convertToFunctionResponse( } if (contentToProcess.inlineData || contentToProcess.fileData) { - const mimeType = - contentToProcess.inlineData?.mimeType || - contentToProcess.fileData?.mimeType || - 'unknown'; + const mediaParts: FunctionResponsePart[] = []; + if (contentToProcess.inlineData) { + mediaParts.push({ inlineData: contentToProcess.inlineData }); + } + if (contentToProcess.fileData) { + mediaParts.push({ fileData: contentToProcess.fileData }); + } + const functionResponse = createFunctionResponsePart( callId, toolName, - `Binary content of type ${mimeType} was processed.`, + '', + mediaParts, ); - return [functionResponse, contentToProcess]; + return [functionResponse]; } if (contentToProcess.text !== undefined) { diff --git a/packages/core/src/core/nonInteractiveToolExecutor.test.ts b/packages/core/src/core/nonInteractiveToolExecutor.test.ts index 5b319deda..cbc4c145a 100644 --- a/packages/core/src/core/nonInteractiveToolExecutor.test.ts +++ b/packages/core/src/core/nonInteractiveToolExecutor.test.ts @@ -309,11 +309,13 @@ describe('executeToolCall', () => { name: 'testTool', id: 'call6', response: { - output: 'Binary content of type image/png was processed.', + output: '', }, + parts: [ + { inlineData: { mimeType: 'image/png', data: 'base64data' } }, + ], }, }, - imageDataPart, ], }); }); diff --git a/packages/core/src/core/openaiContentGenerator/converter.test.ts b/packages/core/src/core/openaiContentGenerator/converter.test.ts index c896cb9b7..a5c88a889 100644 --- a/packages/core/src/core/openaiContentGenerator/converter.test.ts +++ b/packages/core/src/core/openaiContentGenerator/converter.test.ts @@ -122,7 +122,13 @@ describe('OpenAIContentConverter', () => { const toolMessage = messages.find((message) => message.role === 'tool'); expect(toolMessage).toBeDefined(); - expect(toolMessage?.content).toBe('Raw output text'); + expect(Array.isArray(toolMessage?.content)).toBe(true); + const contentArray = toolMessage?.content as Array<{ + type: string; + text?: string; + }>; + expect(contentArray[0].type).toBe('text'); + expect(contentArray[0].text).toBe('Raw output text'); }); it('should prioritize error field when present', () => { @@ -134,7 +140,13 @@ describe('OpenAIContentConverter', () => { const toolMessage = messages.find((message) => message.role === 'tool'); expect(toolMessage).toBeDefined(); - 
expect(toolMessage?.content).toBe('Command failed'); + expect(Array.isArray(toolMessage?.content)).toBe(true); + const contentArray = toolMessage?.content as Array<{ + type: string; + text?: string; + }>; + expect(contentArray[0].type).toBe('text'); + expect(contentArray[0].text).toBe('Command failed'); }); it('should stringify non-string responses', () => { @@ -146,7 +158,318 @@ describe('OpenAIContentConverter', () => { const toolMessage = messages.find((message) => message.role === 'tool'); expect(toolMessage).toBeDefined(); - expect(toolMessage?.content).toBe('{"data":{"value":42}}'); + expect(Array.isArray(toolMessage?.content)).toBe(true); + const contentArray = toolMessage?.content as Array<{ + type: string; + text?: string; + }>; + expect(contentArray[0].type).toBe('text'); + expect(contentArray[0].text).toBe('{"data":{"value":42}}'); + }); + + it('should convert function responses with inlineData to tool message with embedded image_url', () => { + const request: GenerateContentParameters = { + model: 'models/test', + contents: [ + { + role: 'model', + parts: [ + { + functionCall: { + id: 'call_1', + name: 'Read', + args: {}, + }, + }, + ], + }, + { + role: 'user', + parts: [ + { + functionResponse: { + id: 'call_1', + name: 'Read', + response: { output: 'Image content' }, + parts: [ + { + inlineData: { + mimeType: 'image/png', + data: 'base64encodedimagedata', + }, + }, + ], + }, + }, + ], + }, + ], + }; + + const messages = converter.convertGeminiRequestToOpenAI(request); + + // Should have tool message with both text and image content + const toolMessage = messages.find((message) => message.role === 'tool'); + expect(toolMessage).toBeDefined(); + expect((toolMessage as { tool_call_id?: string }).tool_call_id).toBe( + 'call_1', + ); + expect(Array.isArray(toolMessage?.content)).toBe(true); + const contentArray = toolMessage?.content as Array<{ + type: string; + text?: string; + image_url?: { url: string }; + }>; + expect(contentArray).toHaveLength(2); + expect(contentArray[0].type).toBe('text'); + expect(contentArray[0].text).toBe('Image content'); + expect(contentArray[1].type).toBe('image_url'); + expect(contentArray[1].image_url?.url).toBe( + 'data:image/png;base64,base64encodedimagedata', + ); + + // No separate user message should be created + const userMessage = messages.find((message) => message.role === 'user'); + expect(userMessage).toBeUndefined(); + }); + + it('should convert function responses with fileData to tool message with embedded input_file', () => { + const request: GenerateContentParameters = { + model: 'models/test', + contents: [ + { + role: 'model', + parts: [ + { + functionCall: { + id: 'call_1', + name: 'Read', + args: {}, + }, + }, + ], + }, + { + role: 'user', + parts: [ + { + functionResponse: { + id: 'call_1', + name: 'Read', + response: { output: 'File content' }, + parts: [ + { + fileData: { + mimeType: 'image/jpeg', + fileUri: 'base64imagedata', + }, + }, + ], + }, + }, + ], + }, + ], + }; + + const messages = converter.convertGeminiRequestToOpenAI(request); + + // Should have tool message with both text and file content + const toolMessage = messages.find((message) => message.role === 'tool'); + expect(toolMessage).toBeDefined(); + expect(Array.isArray(toolMessage?.content)).toBe(true); + const contentArray = toolMessage?.content as Array<{ + type: string; + text?: string; + file?: { filename: string; file_data: string }; + }>; + expect(contentArray).toHaveLength(2); + expect(contentArray[0].type).toBe('text'); + 
expect(contentArray[0].text).toBe('File content'); + expect(contentArray[1].type).toBe('file'); + expect(contentArray[1].file?.filename).toBe('file'); // Default filename when displayName not provided + expect(contentArray[1].file?.file_data).toBe( + 'data:image/jpeg;base64,base64imagedata', + ); + + // No separate user message should be created + const userMessage = messages.find((message) => message.role === 'user'); + expect(userMessage).toBeUndefined(); + }); + + it('should convert PDF fileData to tool message with embedded input_file', () => { + const request: GenerateContentParameters = { + model: 'models/test', + contents: [ + { + role: 'model', + parts: [ + { + functionCall: { + id: 'call_1', + name: 'Read', + args: {}, + }, + }, + ], + }, + { + role: 'user', + parts: [ + { + functionResponse: { + id: 'call_1', + name: 'Read', + response: { output: 'PDF content' }, + parts: [ + { + fileData: { + mimeType: 'application/pdf', + fileUri: 'base64pdfdata', + displayName: 'document.pdf', + }, + }, + ], + }, + }, + ], + }, + ], + }; + + const messages = converter.convertGeminiRequestToOpenAI(request); + + // Should have tool message with both text and file content + const toolMessage = messages.find((message) => message.role === 'tool'); + expect(toolMessage).toBeDefined(); + expect(Array.isArray(toolMessage?.content)).toBe(true); + const contentArray = toolMessage?.content as Array<{ + type: string; + text?: string; + file?: { filename: string; file_data: string }; + }>; + expect(contentArray).toHaveLength(2); + expect(contentArray[0].type).toBe('text'); + expect(contentArray[0].text).toBe('PDF content'); + expect(contentArray[1].type).toBe('file'); + expect(contentArray[1].file?.filename).toBe('document.pdf'); + expect(contentArray[1].file?.file_data).toBe( + 'data:application/pdf;base64,base64pdfdata', + ); + + // No separate user message should be created + const userMessage = messages.find((message) => message.role === 'user'); + expect(userMessage).toBeUndefined(); + }); + + it('should convert audio parts to tool message with embedded input_audio', () => { + const request: GenerateContentParameters = { + model: 'models/test', + contents: [ + { + role: 'model', + parts: [ + { + functionCall: { + id: 'call_1', + name: 'Record', + args: {}, + }, + }, + ], + }, + { + role: 'user', + parts: [ + { + functionResponse: { + id: 'call_1', + name: 'Record', + response: { output: 'Audio recorded' }, + parts: [ + { + inlineData: { + mimeType: 'audio/wav', + data: 'audiobase64data', + }, + }, + ], + }, + }, + ], + }, + ], + }; + + const messages = converter.convertGeminiRequestToOpenAI(request); + + // Should have tool message with both text and audio content + const toolMessage = messages.find((message) => message.role === 'tool'); + expect(toolMessage).toBeDefined(); + expect(Array.isArray(toolMessage?.content)).toBe(true); + const contentArray = toolMessage?.content as Array<{ + type: string; + text?: string; + input_audio?: { data: string; format: string }; + }>; + expect(contentArray).toHaveLength(2); + expect(contentArray[0].type).toBe('text'); + expect(contentArray[0].text).toBe('Audio recorded'); + expect(contentArray[1].type).toBe('input_audio'); + expect(contentArray[1].input_audio?.data).toBe('audiobase64data'); + expect(contentArray[1].input_audio?.format).toBe('wav'); + + // No separate user message should be created + const userMessage = messages.find((message) => message.role === 'user'); + expect(userMessage).toBeUndefined(); + }); + + it('should create tool message with text-only 
content when no media parts', () => { + const request = createRequestWithFunctionResponse({ + output: 'Plain text output', + }); + + const messages = converter.convertGeminiRequestToOpenAI(request); + const toolMessage = messages.find((message) => message.role === 'tool'); + + expect(toolMessage).toBeDefined(); + expect(Array.isArray(toolMessage?.content)).toBe(true); + const contentArray = toolMessage?.content as Array<{ + type: string; + text?: string; + }>; + expect(contentArray).toHaveLength(1); + expect(contentArray[0].type).toBe('text'); + expect(contentArray[0].text).toBe('Plain text output'); + + // No user message should be created when there's no media + const userMessage = messages.find((message) => message.role === 'user'); + expect(userMessage).toBeUndefined(); + }); + + it('should skip empty function responses with no media and no text', () => { + const request: GenerateContentParameters = { + model: 'models/test', + contents: [ + { + role: 'user', + parts: [ + { + functionResponse: { + id: 'call_1', + name: 'Empty', + response: { output: '' }, + }, + }, + ], + }, + ], + }; + + const messages = converter.convertGeminiRequestToOpenAI(request); + + // Should have no messages for empty response + expect(messages).toHaveLength(0); }); }); @@ -180,6 +503,35 @@ describe('OpenAIContentConverter', () => { ); }); + it('should convert reasoning to a thought part for non-streaming responses', () => { + const response = converter.convertOpenAIResponseToGemini({ + object: 'chat.completion', + id: 'chatcmpl-2', + created: 123, + model: 'gpt-test', + choices: [ + { + index: 0, + message: { + role: 'assistant', + content: 'final answer', + reasoning: 'chain-of-thought', + }, + finish_reason: 'stop', + logprobs: null, + }, + ], + } as unknown as OpenAI.Chat.ChatCompletion); + + const parts = response.candidates?.[0]?.content?.parts; + expect(parts?.[0]).toEqual( + expect.objectContaining({ thought: true, text: 'chain-of-thought' }), + ); + expect(parts?.[1]).toEqual( + expect.objectContaining({ text: 'final answer' }), + ); + }); + it('should convert streaming reasoning_content delta to a thought part', () => { const chunk = converter.convertOpenAIChunkToGemini({ object: 'chat.completion.chunk', @@ -208,6 +560,34 @@ describe('OpenAIContentConverter', () => { ); }); + it('should convert streaming reasoning delta to a thought part', () => { + const chunk = converter.convertOpenAIChunkToGemini({ + object: 'chat.completion.chunk', + id: 'chunk-1b', + created: 456, + choices: [ + { + index: 0, + delta: { + content: 'visible text', + reasoning: 'thinking...', + }, + finish_reason: 'stop', + logprobs: null, + }, + ], + model: 'gpt-test', + } as unknown as OpenAI.Chat.ChatCompletionChunk); + + const parts = chunk.candidates?.[0]?.content?.parts; + expect(parts?.[0]).toEqual( + expect.objectContaining({ thought: true, text: 'thinking...' 
}), + ); + expect(parts?.[1]).toEqual( + expect.objectContaining({ text: 'visible text' }), + ); + }); + it('should not throw when streaming chunk has no delta', () => { const chunk = converter.convertOpenAIChunkToGemini({ object: 'chat.completion.chunk', @@ -584,11 +964,7 @@ describe('OpenAIContentConverter', () => { expect(messages).toHaveLength(1); expect(messages[0].role).toBe('assistant'); - const content = messages[0] - .content as OpenAI.Chat.ChatCompletionContentPart[]; - expect(content).toHaveLength(2); - expect(content[0]).toEqual({ type: 'text', text: 'First part' }); - expect(content[1]).toEqual({ type: 'text', text: 'Second part' }); + expect(messages[0].content).toBe('First partSecond part'); }); it('should merge multiple consecutive assistant messages', () => { @@ -614,9 +990,7 @@ describe('OpenAIContentConverter', () => { expect(messages).toHaveLength(1); expect(messages[0].role).toBe('assistant'); - const content = messages[0] - .content as OpenAI.Chat.ChatCompletionContentPart[]; - expect(content).toHaveLength(3); + expect(messages[0].content).toBe('Part 1Part 2Part 3'); }); it('should merge tool_calls from consecutive assistant messages', () => { @@ -674,7 +1048,9 @@ describe('OpenAIContentConverter', () => { ], }; - const messages = converter.convertGeminiRequestToOpenAI(request); + const messages = converter.convertGeminiRequestToOpenAI(request, { + cleanOrphanToolCalls: false, + }); // Should have: assistant (tool_call_1), tool (result_1), assistant (tool_call_2), tool (result_2) expect(messages).toHaveLength(4); @@ -729,10 +1105,7 @@ describe('OpenAIContentConverter', () => { const messages = converter.convertGeminiRequestToOpenAI(request); expect(messages).toHaveLength(1); - const content = messages[0] - .content as OpenAI.Chat.ChatCompletionContentPart[]; - expect(Array.isArray(content)).toBe(true); - expect(content).toHaveLength(2); + expect(messages[0].content).toBe('Text partAnother text'); }); it('should merge empty content correctly', () => { @@ -758,11 +1131,7 @@ describe('OpenAIContentConverter', () => { // Empty messages should be filtered out expect(messages).toHaveLength(1); - const content = messages[0] - .content as OpenAI.Chat.ChatCompletionContentPart[]; - expect(content).toHaveLength(2); - expect(content[0]).toEqual({ type: 'text', text: 'First' }); - expect(content[1]).toEqual({ type: 'text', text: 'Second' }); + expect(messages[0].content).toBe('FirstSecond'); }); }); }); diff --git a/packages/core/src/core/openaiContentGenerator/converter.ts b/packages/core/src/core/openaiContentGenerator/converter.ts index 690751a2a..a46a343c1 100644 --- a/packages/core/src/core/openaiContentGenerator/converter.ts +++ b/packages/core/src/core/openaiContentGenerator/converter.ts @@ -11,7 +11,6 @@ import type { Tool, ToolListUnion, CallableTool, - FunctionCall, FunctionResponse, ContentListUnion, ContentUnion, @@ -47,11 +46,13 @@ type ExtendedChatCompletionMessageParam = export interface ExtendedCompletionMessage extends OpenAI.Chat.ChatCompletionMessage { reasoning_content?: string | null; + reasoning?: string | null; } export interface ExtendedCompletionChunkDelta extends OpenAI.Chat.ChatCompletionChunk.Choice.Delta { reasoning_content?: string | null; + reasoning?: string | null; } /** @@ -63,21 +64,17 @@ export interface ToolCallAccumulator { arguments: string; } -/** - * Parsed parts from Gemini content, categorized by type - */ -interface ParsedParts { - thoughtParts: string[]; - contentParts: string[]; - functionCalls: FunctionCall[]; - functionResponses: 
FunctionResponse[]; - mediaParts: Array<{ - type: 'image' | 'audio' | 'file'; - data: string; - mimeType: string; - fileUri?: string; - }>; -} +type OpenAIContentPart = + | OpenAI.Chat.ChatCompletionContentPartText + | OpenAI.Chat.ChatCompletionContentPartImage + | OpenAI.Chat.ChatCompletionContentPartInputAudio + | { + type: 'file'; + file: { + filename: string; + file_data: string; + }; + }; /** * Converter class for transforming data between Gemini and OpenAI formats @@ -271,28 +268,48 @@ export class OpenAIContentConverter { ): OpenAI.Chat.ChatCompletion { const candidate = response.candidates?.[0]; const parts = (candidate?.content?.parts || []) as Part[]; - const parsedParts = this.parseParts(parts); + + // Parse parts inline + const thoughtParts: string[] = []; + const contentParts: string[] = []; + const toolCalls: OpenAI.Chat.ChatCompletionMessageToolCall[] = []; + let toolCallIndex = 0; + + for (const part of parts) { + if (typeof part === 'string') { + contentParts.push(part); + } else if ('text' in part && part.text) { + if ('thought' in part && part.thought) { + thoughtParts.push(part.text); + } else { + contentParts.push(part.text); + } + } else if ('functionCall' in part && part.functionCall) { + toolCalls.push({ + id: part.functionCall.id || `call_${toolCallIndex}`, + type: 'function' as const, + function: { + name: part.functionCall.name || '', + arguments: JSON.stringify(part.functionCall.args || {}), + }, + }); + toolCallIndex += 1; + } + } const message: ExtendedCompletionMessage = { role: 'assistant', - content: parsedParts.contentParts.join('') || null, + content: contentParts.join('') || null, refusal: null, }; - const reasoningContent = parsedParts.thoughtParts.join(''); + const reasoningContent = thoughtParts.join(''); if (reasoningContent) { message.reasoning_content = reasoningContent; } - if (parsedParts.functionCalls.length > 0) { - message.tool_calls = parsedParts.functionCalls.map((call, index) => ({ - id: call.id || `call_${index}`, - type: 'function' as const, - function: { - name: call.name || '', - arguments: JSON.stringify(call.args || {}), - }, - })); + if (toolCalls.length > 0) { + message.tool_calls = toolCalls; } const finishReason = this.mapGeminiFinishReasonToOpenAI( @@ -390,40 +407,82 @@ export class OpenAIContentConverter { } if (!this.isContentObject(content)) return; + const parts = content.parts || []; + const role = content.role === 'model' ? 
'assistant' : 'user'; - const parsedParts = this.parseParts(content.parts || []); + const contentParts: OpenAIContentPart[] = []; + const reasoningParts: string[] = []; + const toolCalls: OpenAI.Chat.ChatCompletionMessageToolCall[] = []; + let toolCallIndex = 0; - // Handle function responses (tool results) first - if (parsedParts.functionResponses.length > 0) { - for (const funcResponse of parsedParts.functionResponses) { - messages.push({ - role: 'tool' as const, - tool_call_id: funcResponse.id || '', - content: this.extractFunctionResponseContent(funcResponse.response), - }); + for (const part of parts) { + if (typeof part === 'string') { + contentParts.push({ type: 'text' as const, text: part }); + continue; + } + + if ('text' in part && 'thought' in part && part.thought) { + if (role === 'assistant' && part.text) { + reasoningParts.push(part.text); + } + } + + if ('text' in part && part.text && !('thought' in part && part.thought)) { + contentParts.push({ type: 'text' as const, text: part.text }); + } + + const mediaPart = this.createMediaContentPart(part); + if (mediaPart && role === 'user') { + contentParts.push(mediaPart); + } + + if ('functionCall' in part && part.functionCall && role === 'assistant') { + toolCalls.push({ + id: part.functionCall.id || `call_${toolCallIndex}`, + type: 'function' as const, + function: { + name: part.functionCall.name || '', + arguments: JSON.stringify(part.functionCall.args || {}), + }, + }); + toolCallIndex += 1; + } + + if (part.functionResponse && role === 'user') { + // Create tool message for the function response (with embedded media) + const toolMessage = this.createToolMessage(part.functionResponse); + if (toolMessage) { + messages.push(toolMessage); + } } - return; } - // Handle model messages with function calls - if (content.role === 'model' && parsedParts.functionCalls.length > 0) { - const toolCalls = parsedParts.functionCalls.map((fc, index) => ({ - id: fc.id || `call_${index}`, - type: 'function' as const, - function: { - name: fc.name || '', - arguments: JSON.stringify(fc.args || {}), - }, - })); + if (role === 'assistant') { + if ( + contentParts.length === 0 && + toolCalls.length === 0 && + reasoningParts.length === 0 + ) { + return; + } + const assistantTextContent = contentParts + .filter( + (part): part is OpenAI.Chat.ChatCompletionContentPartText => + part.type === 'text', + ) + .map((part) => part.text) + .join(''); const assistantMessage: ExtendedChatCompletionAssistantMessageParam = { - role: 'assistant' as const, - content: parsedParts.contentParts.join('') || null, - tool_calls: toolCalls, + role: 'assistant', + content: assistantTextContent || null, }; - // Only include reasoning_content if it has actual content - const reasoningContent = parsedParts.thoughtParts.join(''); + if (toolCalls.length > 0) { + assistantMessage.tool_calls = toolCalls; + } + + const reasoningContent = reasoningParts.join(''); if (reasoningContent) { assistantMessage.reasoning_content = reasoningContent; } @@ -432,79 +491,15 @@ export class OpenAIContentConverter { return; } - // Handle regular messages with multimodal content - const role = content.role === 'model' ? 
'assistant' : 'user'; - const openAIMessage = this.createMultimodalMessage(role, parsedParts); - - if (openAIMessage) { - messages.push(openAIMessage); + if (contentParts.length > 0) { + messages.push({ + role: 'user', + content: + contentParts as unknown as OpenAI.Chat.ChatCompletionContentPart[], + }); } } - /** - * Parse Gemini parts into categorized components - */ - private parseParts(parts: Part[]): ParsedParts { - const thoughtParts: string[] = []; - const contentParts: string[] = []; - const functionCalls: FunctionCall[] = []; - const functionResponses: FunctionResponse[] = []; - const mediaParts: Array<{ - type: 'image' | 'audio' | 'file'; - data: string; - mimeType: string; - fileUri?: string; - }> = []; - - for (const part of parts) { - if (typeof part === 'string') { - contentParts.push(part); - } else if ( - 'text' in part && - part.text && - !('thought' in part && part.thought) - ) { - contentParts.push(part.text); - } else if ( - 'text' in part && - part.text && - 'thought' in part && - part.thought - ) { - thoughtParts.push(part.text); - } else if ('functionCall' in part && part.functionCall) { - functionCalls.push(part.functionCall); - } else if ('functionResponse' in part && part.functionResponse) { - functionResponses.push(part.functionResponse); - } else if ('inlineData' in part && part.inlineData) { - const { data, mimeType } = part.inlineData; - if (data && mimeType) { - const mediaType = this.getMediaType(mimeType); - mediaParts.push({ type: mediaType, data, mimeType }); - } - } else if ('fileData' in part && part.fileData) { - const { fileUri, mimeType } = part.fileData; - if (fileUri && mimeType) { - const mediaType = this.getMediaType(mimeType); - mediaParts.push({ - type: mediaType, - data: '', - mimeType, - fileUri, - }); - } - } - } - - return { - thoughtParts, - contentParts, - functionCalls, - functionResponses, - mediaParts, - }; - } - private extractFunctionResponseContent(response: unknown): string { if (response === null || response === undefined) { return ''; @@ -535,6 +530,96 @@ export class OpenAIContentConverter { } } + /** + * Create a tool message from function response (with embedded media parts) + */ + private createToolMessage( + response: FunctionResponse, + ): OpenAI.Chat.ChatCompletionToolMessageParam | null { + const textContent = this.extractFunctionResponseContent(response.response); + const contentParts: OpenAIContentPart[] = []; + + // Add text content first if present + if (textContent) { + contentParts.push({ type: 'text' as const, text: textContent }); + } + + // Add media parts from function response + for (const part of response.parts || []) { + const mediaPart = this.createMediaContentPart(part); + if (mediaPart) { + contentParts.push(mediaPart); + } + } + + // Tool messages require content, so skip if empty + if (contentParts.length === 0) { + return null; + } + + // Cast to OpenAI type - some OpenAI-compatible APIs support richer content in tool messages + return { + role: 'tool' as const, + tool_call_id: response.id || '', + content: contentParts as unknown as + | string + | OpenAI.Chat.ChatCompletionContentPartText[], + }; + } + + /** + * Create OpenAI media content part from Gemini part + */ + private createMediaContentPart(part: Part): OpenAIContentPart | null { + if (part.inlineData?.mimeType && part.inlineData?.data) { + const mediaType = this.getMediaType(part.inlineData.mimeType); + if (mediaType === 'image') { + const dataUrl = `data:${part.inlineData.mimeType};base64,${part.inlineData.data}`; + return { + type: 
'image_url' as const, + image_url: { url: dataUrl }, + }; + } + if (mediaType === 'audio') { + const format = this.getAudioFormat(part.inlineData.mimeType); + if (format) { + return { + type: 'input_audio' as const, + input_audio: { + data: part.inlineData.data, + format, + }, + }; + } + } + } + + if (part.fileData?.mimeType && part.fileData?.fileUri) { + const filename = part.fileData.displayName || 'file'; + const fileUri = part.fileData.fileUri; + + if (fileUri.startsWith('data:')) { + return { + type: 'file' as const, + file: { + filename, + file_data: fileUri, + }, + }; + } + + return { + type: 'file' as const, + file: { + filename, + file_data: `data:${part.fileData.mimeType};base64,${fileUri}`, + }, + }; + } + + return null; + } + /** * Determine media type from MIME type */ @@ -544,85 +629,6 @@ export class OpenAIContentConverter { return 'file'; } - /** - * Create multimodal OpenAI message from parsed parts - */ - private createMultimodalMessage( - role: 'user' | 'assistant', - parsedParts: Pick< - ParsedParts, - 'contentParts' | 'mediaParts' | 'thoughtParts' - >, - ): ExtendedChatCompletionMessageParam | null { - const { contentParts, mediaParts, thoughtParts } = parsedParts; - const reasoningContent = thoughtParts.join(''); - const content = contentParts.map((text) => ({ - type: 'text' as const, - text, - })); - - // If no media parts, return simple text message - if (mediaParts.length === 0) { - if (content.length === 0) return null; - const message: ExtendedChatCompletionMessageParam = { role, content }; - // Only include reasoning_content if it has actual content - if (reasoningContent) { - ( - message as ExtendedChatCompletionAssistantMessageParam - ).reasoning_content = reasoningContent; - } - return message; - } - - // For assistant messages with media, convert to text only - // since OpenAI assistant messages don't support media content arrays - if (role === 'assistant') { - return content.length > 0 - ? { role: 'assistant' as const, content } - : null; - } - - const contentArray: OpenAI.Chat.ChatCompletionContentPart[] = [...content]; - - // Add media content - for (const mediaPart of mediaParts) { - if (mediaPart.type === 'image') { - if (mediaPart.fileUri) { - // For file URIs, use the URI directly - contentArray.push({ - type: 'image_url' as const, - image_url: { url: mediaPart.fileUri }, - }); - } else if (mediaPart.data) { - // For inline data, create data URL - const dataUrl = `data:${mediaPart.mimeType};base64,${mediaPart.data}`; - contentArray.push({ - type: 'image_url' as const, - image_url: { url: dataUrl }, - }); - } - } else if (mediaPart.type === 'audio' && mediaPart.data) { - // Convert audio format from MIME type - const format = this.getAudioFormat(mediaPart.mimeType); - if (format) { - contentArray.push({ - type: 'input_audio' as const, - input_audio: { - data: mediaPart.data, - format: format as 'wav' | 'mp3', - }, - }); - } - } - // Note: File type is not directly supported in OpenAI's current API - // Could be extended in the future or handled as text description - } - - return contentArray.length > 0 - ? { role: 'user' as const, content: contentArray } - : null; - } - /** * Convert MIME type to OpenAI audio format */ @@ -693,8 +699,9 @@ export class OpenAIContentConverter { const parts: Part[] = []; // Handle reasoning content (thoughts) - const reasoningText = (choice.message as ExtendedCompletionMessage) - .reasoning_content; + const reasoningText = + (choice.message as ExtendedCompletionMessage).reasoning_content ?? 
+ (choice.message as ExtendedCompletionMessage).reasoning;
 if (reasoningText) {
 parts.push({ text: reasoningText, thought: true });
 }
@@ -798,8 +805,9 @@
 if (choice) {
 const parts: Part[] = [];

- const reasoningText = (choice.delta as ExtendedCompletionChunkDelta)
- ?.reasoning_content;
+ const reasoningText =
+ (choice.delta as ExtendedCompletionChunkDelta)?.reasoning_content ??
+ (choice.delta as ExtendedCompletionChunkDelta)?.reasoning;
 if (reasoningText) {
 parts.push({ text: reasoningText, thought: true });
 }
@@ -1130,6 +1138,10 @@
 // If the last message is also an assistant message, merge them
 if (lastMessage.role === 'assistant') {
+ const lastToolCalls =
+ 'tool_calls' in lastMessage ? lastMessage.tool_calls || [] : [];
+ const currentToolCalls =
+ 'tool_calls' in message ? message.tool_calls || [] : [];
 // Combine content
 const lastContent = lastMessage.content;
 const currentContent = message.content;
@@ -1171,10 +1183,6 @@
 }

 // Combine tool calls
- const lastToolCalls =
- 'tool_calls' in lastMessage ? lastMessage.tool_calls || [] : [];
- const currentToolCalls =
- 'tool_calls' in message ? message.tool_calls || [] : [];
 const combinedToolCalls = [...lastToolCalls, ...currentToolCalls];

 // Update the last message with combined data
diff --git a/packages/core/src/core/openaiContentGenerator/pipeline.ts b/packages/core/src/core/openaiContentGenerator/pipeline.ts
index 0f00ecb30..0ee0f1e25 100644
--- a/packages/core/src/core/openaiContentGenerator/pipeline.ts
+++ b/packages/core/src/core/openaiContentGenerator/pipeline.ts
@@ -320,13 +320,15 @@ export class ContentGenerationPipeline {
 'frequency_penalty',
 'frequencyPenalty',
 ),
- ...this.buildReasoningConfig(),
+ ...this.buildReasoningConfig(request),
 };

 return params;
 }

- private buildReasoningConfig(): Record<string, unknown> {
+ private buildReasoningConfig(
+ request: GenerateContentParameters,
+ ): Record<string, unknown> {
 // Reasoning configuration for OpenAI-compatible endpoints is highly fragmented.
 // For example, across common providers and models:
 //
 // - gpt-5.x series — thinking is enabled by default; can be disabled via `reasoning.effort`
 // - qwen3 series — model-dependent; can be manually disabled via `extra_body.enable_thinking`
 //
- // Given this inconsistency, we choose not to set any reasoning config here and
- // instead rely on each model’s default behavior.
+ // Given this inconsistency, we avoid mapping values and only pass through the
+ // configured reasoning object when explicitly enabled. This keeps provider- and
+ // model-specific semantics intact while honoring request-level opt-out.

- // We plan to introduce provider- and model-specific settings to enable more
- // fine-grained control over reasoning configuration.
+ if (request.config?.thinkingConfig?.includeThoughts === false) { + return {}; + } - return {}; + const reasoning = this.contentGeneratorConfig.reasoning; + + if (reasoning === false || reasoning === undefined) { + return {}; + } + + return { reasoning }; } /** diff --git a/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts b/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts index e7c951fd9..6d8fd8a5f 100644 --- a/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts +++ b/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts @@ -608,7 +608,7 @@ describe('DashScopeOpenAICompatibleProvider', () => { }); }); - it('should add empty text item with cache control if last item is not text for streaming requests', () => { + it('should add cache control to last item even if not text for streaming requests', () => { const requestWithNonTextLast: OpenAI.Chat.ChatCompletionCreateParams = { model: 'qwen-max', stream: true, // This will trigger cache control on last message @@ -633,12 +633,12 @@ describe('DashScopeOpenAICompatibleProvider', () => { const content = result.messages[0] .content as OpenAI.Chat.ChatCompletionContentPart[]; - expect(content).toHaveLength(3); + expect(content).toHaveLength(2); - // Should add empty text item with cache control - expect(content[2]).toEqual({ - type: 'text', - text: '', + // Cache control should be added to the last item (image) + expect(content[1]).toEqual({ + type: 'image_url', + image_url: { url: 'https://example.com/image.jpg' }, cache_control: { type: 'ephemeral' }, }); }); @@ -709,13 +709,8 @@ describe('DashScopeOpenAICompatibleProvider', () => { const content = result.messages[0] .content as OpenAI.Chat.ChatCompletionContentPart[]; - expect(content).toEqual([ - { - type: 'text', - text: '', - cache_control: { type: 'ephemeral' }, - }, - ]); + // Empty content array should remain empty + expect(content).toEqual([]); }); }); diff --git a/packages/core/src/core/openaiContentGenerator/provider/dashscope.ts b/packages/core/src/core/openaiContentGenerator/provider/dashscope.ts index 45b0568a0..e931d08ce 100644 --- a/packages/core/src/core/openaiContentGenerator/provider/dashscope.ts +++ b/packages/core/src/core/openaiContentGenerator/provider/dashscope.ts @@ -257,31 +257,15 @@ export class DashScopeOpenAICompatibleProvider contentArray: ChatCompletionContentPartWithCache[], ): ChatCompletionContentPartWithCache[] { if (contentArray.length === 0) { - return [ - { - type: 'text', - text: '', - cache_control: { type: 'ephemeral' }, - } as ChatCompletionContentPartTextWithCache, - ]; + return contentArray; } + // Add cache_control to the last text item const lastItem = contentArray[contentArray.length - 1]; - - if (lastItem.type === 'text') { - // Add cache_control to the last text item - contentArray[contentArray.length - 1] = { - ...lastItem, - cache_control: { type: 'ephemeral' }, - } as ChatCompletionContentPartTextWithCache; - } else { - // If the last item is not text, add a new text item with cache_control - contentArray.push({ - type: 'text', - text: '', - cache_control: { type: 'ephemeral' }, - } as ChatCompletionContentPartTextWithCache); - } + contentArray[contentArray.length - 1] = { + ...lastItem, + cache_control: { type: 'ephemeral' }, + } as ChatCompletionContentPartTextWithCache; return contentArray; } diff --git a/packages/core/src/tools/read-file.test.ts b/packages/core/src/tools/read-file.test.ts index 01568eed9..17ce30763 100644 --- 
a/packages/core/src/tools/read-file.test.ts +++ b/packages/core/src/tools/read-file.test.ts @@ -283,6 +283,7 @@ describe('ReadFileTool', () => { inlineData: { data: pngHeader.toString('base64'), mimeType: 'image/png', + displayName: 'image.png', }, }); expect(result.returnDisplay).toBe('Read image file: image.png'); @@ -301,9 +302,10 @@ describe('ReadFileTool', () => { const result = await invocation.execute(abortSignal); expect(result.llmContent).toEqual({ - inlineData: { - data: pdfHeader.toString('base64'), + fileData: { + fileUri: pdfHeader.toString('base64'), mimeType: 'application/pdf', + displayName: 'document.pdf', }, }); expect(result.returnDisplay).toBe('Read pdf file: document.pdf'); diff --git a/packages/core/src/tools/read-many-files.test.ts b/packages/core/src/tools/read-many-files.test.ts index 758fb5d6a..ec20db671 100644 --- a/packages/core/src/tools/read-many-files.test.ts +++ b/packages/core/src/tools/read-many-files.test.ts @@ -383,6 +383,7 @@ describe('ReadManyFilesTool', () => { 0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, ]).toString('base64'), mimeType: 'image/png', + displayName: 'image.png', }, }, '\n--- End of content ---', @@ -407,6 +408,7 @@ describe('ReadManyFilesTool', () => { 0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, ]).toString('base64'), mimeType: 'image/png', + displayName: 'myExactImage.png', }, }, '\n--- End of content ---', @@ -434,32 +436,34 @@ describe('ReadManyFilesTool', () => { ); }); - it('should include PDF files as inlineData parts if explicitly requested by extension', async () => { + it('should include PDF files as fileData parts if explicitly requested by extension', async () => { createBinaryFile('important.pdf', Buffer.from('%PDF-1.4...')); const params = { paths: ['*.pdf'] }; // Explicitly requesting .pdf files const invocation = tool.build(params); const result = await invocation.execute(new AbortController().signal); expect(result.llmContent).toEqual([ { - inlineData: { - data: Buffer.from('%PDF-1.4...').toString('base64'), + fileData: { + fileUri: Buffer.from('%PDF-1.4...').toString('base64'), mimeType: 'application/pdf', + displayName: 'important.pdf', }, }, '\n--- End of content ---', ]); }); - it('should include PDF files as inlineData parts if explicitly requested by name', async () => { + it('should include PDF files as fileData parts if explicitly requested by name', async () => { createBinaryFile('report-final.pdf', Buffer.from('%PDF-1.4...')); const params = { paths: ['report-final.pdf'] }; const invocation = tool.build(params); const result = await invocation.execute(new AbortController().signal); expect(result.llmContent).toEqual([ { - inlineData: { - data: Buffer.from('%PDF-1.4...').toString('base64'), + fileData: { + fileUri: Buffer.from('%PDF-1.4...').toString('base64'), mimeType: 'application/pdf', + displayName: 'report-final.pdf', }, }, '\n--- End of content ---', diff --git a/packages/core/src/utils/fileUtils.test.ts b/packages/core/src/utils/fileUtils.test.ts index 92af55e42..92a43e2b7 100644 --- a/packages/core/src/utils/fileUtils.test.ts +++ b/packages/core/src/utils/fileUtils.test.ts @@ -731,6 +731,10 @@ describe('fileUtils', () => { expect( (result.llmContent as { inlineData: { data: string } }).inlineData.data, ).toBe(fakePngData.toString('base64')); + expect( + (result.llmContent as { inlineData: { displayName?: string } }) + .inlineData.displayName, + ).toBe('image.png'); expect(result.returnDisplay).toContain('Read image file: image.png'); }); @@ -743,15 +747,20 @@ describe('fileUtils', () => { 
mockConfig, ); expect( - (result.llmContent as { inlineData: unknown }).inlineData, + (result.llmContent as { fileData: unknown }).fileData, ).toBeDefined(); expect( - (result.llmContent as { inlineData: { mimeType: string } }).inlineData + (result.llmContent as { fileData: { mimeType: string } }).fileData .mimeType, ).toBe('application/pdf'); expect( - (result.llmContent as { inlineData: { data: string } }).inlineData.data, + (result.llmContent as { fileData: { fileUri: string } }).fileData + .fileUri, ).toBe(fakePdfData.toString('base64')); + expect( + (result.llmContent as { fileData: { displayName?: string } }).fileData + .displayName, + ).toBe('document.pdf'); expect(result.returnDisplay).toContain('Read pdf file: document.pdf'); }); diff --git a/packages/core/src/utils/fileUtils.ts b/packages/core/src/utils/fileUtils.ts index 940e9794d..70f207757 100644 --- a/packages/core/src/utils/fileUtils.ts +++ b/packages/core/src/utils/fileUtils.ts @@ -351,6 +351,7 @@ export async function processSingleFileContent( .relative(rootDirectory, filePath) .replace(/\\/g, '/'); + const displayName = path.basename(filePath); switch (fileType) { case 'binary': { return { @@ -456,7 +457,6 @@ export async function processSingleFileContent( }; } case 'image': - case 'pdf': case 'audio': case 'video': { const contentBuffer = await fs.promises.readFile(filePath); @@ -466,6 +466,21 @@ export async function processSingleFileContent( inlineData: { data: base64Data, mimeType: mime.getType(filePath) || 'application/octet-stream', + displayName, + }, + }, + returnDisplay: `Read ${fileType} file: ${relativePathForDisplay}`, + }; + } + case 'pdf': { + const contentBuffer = await fs.promises.readFile(filePath); + const base64Data = contentBuffer.toString('base64'); + return { + llmContent: { + fileData: { + fileUri: base64Data, + mimeType: mime.getType(filePath) || 'application/octet-stream', + displayName, }, }, returnDisplay: `Read ${fileType} file: ${relativePathForDisplay}`, diff --git a/packages/core/src/utils/pathReader.test.ts b/packages/core/src/utils/pathReader.test.ts index fd6ff2245..5de10765b 100644 --- a/packages/core/src/utils/pathReader.test.ts +++ b/packages/core/src/utils/pathReader.test.ts @@ -113,6 +113,7 @@ describe('readPathFromWorkspace', () => { inlineData: { mimeType: 'image/png', data: imageData.toString('base64'), + displayName: 'image.png', }, }, ]); @@ -263,6 +264,7 @@ describe('readPathFromWorkspace', () => { inlineData: { mimeType: 'image/png', data: imageData.toString('base64'), + displayName: 'photo.png', }, }); });
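
---

Editor's note (not part of the applied patch): the sketch below condenses the tool-result
shape the hunks above converge on. It is a hedged illustration assembled only from the
types and test expectations visible in this diff; identifiers such as 'call-1' and the
base64 placeholder are invented for the example, not excerpts from the codebase.

// TypeScript sketch — how a binary tool result is represented after this change.
import type { Part } from '@google/genai';

// convertToFunctionResponse() now nests media inside functionResponse.parts
// instead of emitting a sibling Part alongside a
// "Binary content of type ... was processed." stub.
const toolResult: Part = {
  functionResponse: {
    id: 'call-1', // hypothetical tool-call id
    name: 'Read',
    response: { output: '' }, // text output stays empty for pure media results
    parts: [
      {
        inlineData: {
          mimeType: 'image/png',
          data: '<base64>', // placeholder payload
          displayName: 'image.png', // now populated by processSingleFileContent
        },
      },
    ],
  },
};

// Per the tests above, each converter then renders the embedded part natively:
// - Anthropic: the tool_result content gains { type: 'image', source: { type: 'base64', ... } };
//   PDFs travel as fileData (base64 in fileUri) and become { type: 'document', ... } blocks.
// - OpenAI-compatible: the tool message content becomes an array such as
//   [{ type: 'text', ... }, { type: 'image_url', image_url: { url: 'data:image/png;base64,...' } }],
//   with audio mapped to input_audio and other files to { type: 'file', file: { ... } }.
void toolResult; // keeps the sketch compilable as a standalone module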