diff --git a/packages/cli/src/ui/components/ModelDialog.test.tsx b/packages/cli/src/ui/components/ModelDialog.test.tsx index 3ce25bfa9..b5900c80c 100644 --- a/packages/cli/src/ui/components/ModelDialog.test.tsx +++ b/packages/cli/src/ui/components/ModelDialog.test.tsx @@ -108,7 +108,7 @@ describe('', () => { it('renders the title and help text', () => { const { getByText } = renderComponent(); expect(getByText('Select Model')).toBeDefined(); - expect(getByText('(Press Esc to close)')).toBeDefined(); + expect(getByText('Enter to select · Esc to close')).toBeDefined(); }); it('passes all model options to DescriptiveRadioButtonSelect', () => { @@ -251,11 +251,12 @@ describe('', () => { expect(props.onClose).toHaveBeenCalledTimes(1); }); - it('does not pass onHighlight to DescriptiveRadioButtonSelect', () => { + it('passes onHighlight to DescriptiveRadioButtonSelect', () => { renderComponent(); const childOnHighlight = mockedSelect.mock.calls[0][0].onHighlight; - expect(childOnHighlight).toBeUndefined(); + expect(childOnHighlight).toBeDefined(); + expect(typeof childOnHighlight).toBe('function'); }); it('calls onClose prop when "escape" key is pressed', () => { diff --git a/packages/cli/src/ui/components/ModelDialog.tsx b/packages/cli/src/ui/components/ModelDialog.tsx index 8c102890f..056dfa571 100644 --- a/packages/cli/src/ui/components/ModelDialog.tsx +++ b/packages/cli/src/ui/components/ModelDialog.tsx @@ -13,8 +13,7 @@ import { logModelSlashCommand, type AvailableModel as CoreAvailableModel, type ContentGeneratorConfig, - type ContentGeneratorConfigSource, - type ContentGeneratorConfigSources, + type InputModalities, } from '@qwen-code/qwen-code-core'; import { useKeypress } from '../hooks/useKeypress.js'; import { theme } from '../semantic-colors.js'; @@ -26,57 +25,21 @@ import { MAINLINE_CODER } from '../models/availableModels.js'; import { getPersistScopeForModelSelection } from '../../config/modelProvidersScope.js'; import { t } from '../../i18n/index.js'; 
+function formatModalities(modalities?: InputModalities): string { + if (!modalities) return 'text-only'; + const parts: string[] = []; + if (modalities.image) parts.push('image'); + if (modalities.pdf) parts.push('pdf'); + if (modalities.audio) parts.push('audio'); + if (modalities.video) parts.push('video'); + if (parts.length === 0) return 'text-only'; + return `text · ${parts.join(' · ')}`; +} + interface ModelDialogProps { onClose: () => void; } -function formatSourceBadge( - source: ContentGeneratorConfigSource | undefined, -): string | undefined { - if (!source) return undefined; - - switch (source.kind) { - case 'cli': - return source.detail ? `CLI ${source.detail}` : 'CLI'; - case 'env': - return source.envKey ? `ENV ${source.envKey}` : 'ENV'; - case 'settings': - return source.settingsPath - ? `Settings ${source.settingsPath}` - : 'Settings'; - case 'modelProviders': { - const suffix = - source.authType && source.modelId - ? `${source.authType}:${source.modelId}` - : source.authType - ? `${source.authType}` - : source.modelId - ? `${source.modelId}` - : ''; - return suffix ? `ModelProviders ${suffix}` : 'ModelProviders'; - } - case 'default': - return source.detail ? `Default ${source.detail}` : 'Default'; - case 'computed': - return source.detail ? `Computed ${source.detail}` : 'Computed'; - case 'programmatic': - return source.detail ? `Programmatic ${source.detail}` : 'Programmatic'; - case 'unknown': - default: - return undefined; - } -} - -function readSourcesFromConfig(config: unknown): ContentGeneratorConfigSources { - if (!config) { - return {}; - } - const maybe = config as { - getContentGeneratorConfigSources?: () => ContentGeneratorConfigSources; - }; - return maybe.getContentGeneratorConfigSources?.() ?? 
{}; -} - function maskApiKey(apiKey: string | undefined): string { if (!apiKey) return '(not set)'; const trimmed = apiKey.trim(); @@ -143,35 +106,26 @@ function handleModelSwitchSuccess({ ); } -function ConfigRow({ +function formatContextWindow(size?: number): string { + if (!size) return '(unknown)'; + return `${size.toLocaleString('en-US')} tokens`; +} + +function DetailRow({ label, value, - badge, }: { label: string; value: React.ReactNode; - badge?: string; }): React.JSX.Element { return ( - - - - {label}: - - - {value} - + + + {label}: + + + {value} - {badge ? ( - - - - - - {badge} - - - ) : null} ); } @@ -183,13 +137,9 @@ export function ModelDialog({ onClose }: ModelDialogProps): React.JSX.Element { // Local error state for displaying errors within the dialog const [errorMessage, setErrorMessage] = useState(null); + const [highlightedValue, setHighlightedValue] = useState(null); const authType = config?.getAuthType(); - const effectiveConfig = - (config?.getContentGeneratorConfig?.() as - | ContentGeneratorConfig - | undefined) ?? undefined; - const sources = readSourcesFromConfig(config); const availableModelEntries = useMemo(() => { const allModels = config ? config.getAllConfiguredModels() : []; @@ -319,6 +269,20 @@ export function ModelDialog({ onClose }: ModelDialogProps): React.JSX.Element { return index === -1 ? 0 : index; }, [MODEL_OPTIONS, preferredKey]); + const handleHighlight = useCallback((value: string) => { + setHighlightedValue(value); + }, []); + + const highlightedEntry = useMemo(() => { + const key = highlightedValue ?? preferredKey; + return availableModelEntries.find( + ({ authType: t2, model, isRuntime, snapshotId }) => { + const v = isRuntime && snapshotId ? 
snapshotId : `${t2}::${model.id}`; + return v === key; + }, + ); + }, [highlightedValue, preferredKey, availableModelEntries]); + const handleSelect = useCallback( async (selected: string) => { setErrorMessage(null); @@ -413,35 +377,6 @@ export function ModelDialog({ onClose }: ModelDialogProps): React.JSX.Element { > {t('Select Model')} - - - {t('Current (effective) configuration')} - - - - - - {authType !== AuthType.QWEN_OAUTH && ( - <> - - - - )} - - - {!hasModels ? ( @@ -465,12 +400,50 @@ export function ModelDialog({ onClose }: ModelDialogProps): React.JSX.Element { )} + {highlightedEntry && ( + + + + + + {highlightedEntry.authType !== AuthType.QWEN_OAUTH && ( + <> + + + + )} + + + )} + {errorMessage && ( @@ -480,7 +453,9 @@ export function ModelDialog({ onClose }: ModelDialogProps): React.JSX.Element { )} - {t('(Press Esc to close)')} + + {t('Enter to select · Esc to close')} + ); diff --git a/packages/core/src/core/contentGenerator.ts b/packages/core/src/core/contentGenerator.ts index f3af06bda..078729af6 100644 --- a/packages/core/src/core/contentGenerator.ts +++ b/packages/core/src/core/contentGenerator.ts @@ -60,6 +60,17 @@ export enum AuthType { USE_ANTHROPIC = 'anthropic', } +/** + * Supported input modalities for a model. + * Omitted or false fields mean the model does not support that input type. + */ +export type InputModalities = { + image?: boolean; + pdf?: boolean; + audio?: boolean; + video?: boolean; +}; + export type ContentGeneratorConfig = { model: string; apiKey?: string; @@ -98,6 +109,9 @@ export type ContentGeneratorConfig = { customHeaders?: Record; // Extra body parameters to be merged into the request body extra_body?: Record; + // Supported input modalities. Unsupported media types are replaced with text + // placeholders. Leave undefined to use automatic detection from model name. 
+ modalities?: InputModalities; }; // Keep the public ContentGeneratorConfigSources API, but reuse the generic diff --git a/packages/core/src/core/modalityDefaults.test.ts b/packages/core/src/core/modalityDefaults.test.ts new file mode 100644 index 000000000..8aae4be76 --- /dev/null +++ b/packages/core/src/core/modalityDefaults.test.ts @@ -0,0 +1,219 @@ +/** + * @license + * Copyright 2025 Qwen Team + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect } from 'vitest'; +import { defaultModalities } from './modalityDefaults.js'; + +describe('defaultModalities', () => { + describe('Google Gemini', () => { + it('returns full multimodal for gemini-3-pro', () => { + expect(defaultModalities('gemini-3-pro-preview')).toEqual({ + image: true, + pdf: true, + audio: true, + video: true, + }); + }); + + it('returns full multimodal for gemini-3-flash', () => { + expect(defaultModalities('gemini-3-flash-preview')).toEqual({ + image: true, + pdf: true, + audio: true, + video: true, + }); + }); + + it('returns full multimodal for gemini-3.1-pro', () => { + expect(defaultModalities('gemini-3.1-pro-preview')).toEqual({ + image: true, + pdf: true, + audio: true, + video: true, + }); + }); + + it('returns full multimodal for gemini-2.5-pro', () => { + expect(defaultModalities('gemini-2.5-pro')).toEqual({ + image: true, + pdf: true, + audio: true, + video: true, + }); + }); + + it('returns full multimodal for gemini-1.5-flash', () => { + expect(defaultModalities('gemini-1.5-flash')).toEqual({ + image: true, + pdf: true, + audio: true, + video: true, + }); + }); + }); + + describe('OpenAI', () => { + it('returns image for gpt-5.2', () => { + const m = defaultModalities('gpt-5.2'); + expect(m.image).toBe(true); + expect(m.audio).toBeUndefined(); + expect(m.pdf).toBeUndefined(); + expect(m.video).toBeUndefined(); + }); + + it('returns image for gpt-5-mini', () => { + expect(defaultModalities('gpt-5-mini').image).toBe(true); + }); + + it('returns image for gpt-4o', 
() => { + expect(defaultModalities('gpt-4o').image).toBe(true); + }); + + it('returns image for o3', () => { + expect(defaultModalities('o3').image).toBe(true); + }); + }); + + describe('Anthropic Claude', () => { + it('returns image + pdf for claude-opus-4-6', () => { + const m = defaultModalities('claude-opus-4-6'); + expect(m.image).toBe(true); + expect(m.pdf).toBe(true); + expect(m.audio).toBeUndefined(); + expect(m.video).toBeUndefined(); + }); + + it('returns image + pdf for claude-sonnet-4-6', () => { + const m = defaultModalities('claude-sonnet-4-6'); + expect(m.image).toBe(true); + expect(m.pdf).toBe(true); + }); + + it('returns image + pdf for claude-sonnet-4', () => { + const m = defaultModalities('claude-sonnet-4'); + expect(m.image).toBe(true); + expect(m.pdf).toBe(true); + }); + + it('returns image + pdf for claude-3.5-sonnet', () => { + const m = defaultModalities('claude-3.5-sonnet'); + expect(m.image).toBe(true); + expect(m.pdf).toBe(true); + }); + }); + + describe('Qwen', () => { + it('returns image + video for qwen-vl-max', () => { + const m = defaultModalities('qwen-vl-max'); + expect(m.image).toBe(true); + expect(m.video).toBe(true); + expect(m.pdf).toBeUndefined(); + expect(m.audio).toBeUndefined(); + }); + + it('returns image + video for qwen3-vl-plus', () => { + const m = defaultModalities('qwen3-vl-plus'); + expect(m.image).toBe(true); + expect(m.video).toBe(true); + }); + + it('returns image + video for vision-model', () => { + const m = defaultModalities('vision-model'); + expect(m.image).toBe(true); + expect(m.video).toBe(true); + }); + + it('returns text-only for qwen3-coder-plus', () => { + expect(defaultModalities('qwen3-coder-plus')).toEqual({}); + }); + + it('returns image + video for coder-model (same as qwen3.5-plus)', () => { + expect(defaultModalities('coder-model')).toEqual({ + image: true, + video: true, + }); + }); + + it('returns image + video for qwen3.5-plus', () => { + const m = defaultModalities('qwen3.5-plus'); + 
expect(m.image).toBe(true); + expect(m.video).toBe(true); + expect(m.pdf).toBeUndefined(); + expect(m.audio).toBeUndefined(); + }); + + it('returns text-only for qwen-turbo', () => { + expect(defaultModalities('qwen-turbo')).toEqual({}); + }); + }); + + describe('DeepSeek', () => { + it('returns text-only for deepseek-chat', () => { + expect(defaultModalities('deepseek-chat')).toEqual({}); + }); + + it('returns text-only for deepseek-reasoner', () => { + expect(defaultModalities('deepseek-reasoner')).toEqual({}); + }); + }); + + describe('Zhipu GLM', () => { + it('returns image for glm-4.5v', () => { + const m = defaultModalities('glm-4.5v'); + expect(m.image).toBe(true); + expect(m.pdf).toBeUndefined(); + }); + + it('returns text-only for glm-5', () => { + expect(defaultModalities('glm-5')).toEqual({}); + }); + + it('returns text-only for glm-4.7', () => { + expect(defaultModalities('glm-4.7')).toEqual({}); + }); + }); + + describe('MiniMax', () => { + it('returns text-only for MiniMax-M2.5', () => { + expect(defaultModalities('MiniMax-M2.5')).toEqual({}); + }); + }); + + describe('Kimi', () => { + it('returns image + video for kimi-k2.5', () => { + const m = defaultModalities('kimi-k2.5'); + expect(m.image).toBe(true); + expect(m.video).toBe(true); + expect(m.pdf).toBeUndefined(); + expect(m.audio).toBeUndefined(); + }); + + it('returns text-only for kimi-k2', () => { + expect(defaultModalities('kimi-k2')).toEqual({}); + }); + }); + + describe('unknown models', () => { + it('returns text-only for unrecognized models', () => { + expect(defaultModalities('some-random-model-xyz')).toEqual({}); + }); + }); + + describe('normalization', () => { + it('normalizes provider prefixes', () => { + expect(defaultModalities('openai/gpt-4o')).toEqual( + defaultModalities('gpt-4o'), + ); + }); + + it('returns a fresh copy each time', () => { + const a = defaultModalities('gemini-2.5-pro'); + const b = defaultModalities('gemini-2.5-pro'); + expect(a).toEqual(b); + 
 expect(a).not.toBe(b); + }); + }); +}); diff --git a/packages/core/src/core/modalityDefaults.ts b/packages/core/src/core/modalityDefaults.ts new file mode 100644 index 000000000..790499dfe --- /dev/null +++ b/packages/core/src/core/modalityDefaults.ts @@ -0,0 +1,95 @@ +/** + * @license + * Copyright 2025 Qwen Team + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { InputModalities } from './contentGenerator.js'; +import { normalize } from './tokenLimits.js'; + +const FULL_MULTIMODAL: InputModalities = { + image: true, + pdf: true, + audio: true, + video: true, +}; + +/** + * Ordered regex patterns: most specific -> most general (first match wins). + * Default for unknown models is text-only (empty object = all false). + */ +const MODALITY_PATTERNS: Array<[RegExp, InputModalities]> = [ + // ------------------- + // Google Gemini — full multimodal + // ------------------- + [/^gemini-3/, FULL_MULTIMODAL], + [/^gemini-/, FULL_MULTIMODAL], + + // ------------------- + // OpenAI — image by default for all gpt/o-series models + // ------------------- + [/^gpt-5/, { image: true }], + [/^gpt-/, { image: true }], + [/^o\d/, { image: true }], + + // ------------------- + // Anthropic Claude — image + pdf + // ------------------- + [/^claude-/, { image: true, pdf: true }], + + // ------------------- + // Alibaba / Qwen + // ------------------- + // Qwen3.5-Plus: image + video support + [/^qwen3\.5-plus/, { image: true, video: true }], + [/^coder-model$/, { image: true, video: true }], + + // Qwen VL (vision-language) models: image + video + [/^qwen-vl-/, { image: true, video: true }], + [/^qwen3-vl-/, { image: true, video: true }], + [/^vision-model$/, { image: true, video: true }], + + // Qwen coder / text models: text-only + [/^qwen3-coder-/, {}], + [/^qwen/, {}], + + // ------------------- + // DeepSeek — text-only + // ------------------- + [/^deepseek/, {}], + + // ------------------- + // Zhipu GLM + // ------------------- + [/^glm-4\.5v/, { image: true }], + 
[/^glm-5(?:-|$)/, {}], + [/^glm-/, {}], + + // ------------------- + // MiniMax — text-only + // ------------------- + [/^minimax-/, {}], + + // ------------------- + // Moonshot / Kimi + // ------------------- + [/^kimi-k2\.5/, { image: true, video: true }], + [/^kimi-/, {}], +]; + +/** + * Return the default input modalities for a model based on its name. + * + * Uses the same normalize-then-regex pattern as {@link tokenLimit}. + * Unknown models default to text-only (empty object) to avoid sending + * unsupported media types that would cause unrecoverable API errors. + */ +export function defaultModalities(model: string): InputModalities { + const norm = normalize(model); + for (const [regex, modalities] of MODALITY_PATTERNS) { + if (regex.test(norm)) { + return { ...modalities }; + } + } + return {}; +} diff --git a/packages/core/src/core/openaiContentGenerator/converter.test.ts b/packages/core/src/core/openaiContentGenerator/converter.test.ts index 36bbc812d..12b8b8982 100644 --- a/packages/core/src/core/openaiContentGenerator/converter.test.ts +++ b/packages/core/src/core/openaiContentGenerator/converter.test.ts @@ -22,7 +22,12 @@ describe('OpenAIContentConverter', () => { let converter: OpenAIContentConverter; beforeEach(() => { - converter = new OpenAIContentConverter('test-model'); + converter = new OpenAIContentConverter('test-model', 'auto', { + image: true, + pdf: true, + audio: true, + video: true, + }); }); describe('resetStreamingToolCalls', () => { @@ -1684,7 +1689,12 @@ describe('MCP tool result end-to-end through OpenAI converter (issue #1520)', () let converter: OpenAIContentConverter; beforeEach(() => { - converter = new OpenAIContentConverter('test-model'); + converter = new OpenAIContentConverter('test-model', 'auto', { + image: true, + pdf: true, + audio: true, + video: true, + }); }); it('should preserve MCP multi-text content in tool message (not leak to user message)', () => { @@ -1957,3 +1967,159 @@ describe('MCP tool result end-to-end 
through OpenAI converter (issue #1520)', () expect(contentArray[1].image_url?.url).toContain('data:image/png'); }); }); + +describe('modality filtering', () => { + function makeRequest(parts: Part[]): GenerateContentParameters { + return { + model: 'test-model', + contents: [{ role: 'user', parts }], + }; + } + + function getUserContentParts( + messages: OpenAI.Chat.ChatCompletionMessageParam[], + ): Array<{ type: string; text?: string }> { + const userMsg = messages.find((m) => m.role === 'user'); + if ( + !userMsg || + !('content' in userMsg) || + !Array.isArray(userMsg.content) + ) { + return []; + } + return userMsg.content as Array<{ type: string; text?: string }>; + } + + it('replaces image with placeholder when image modality is disabled', () => { + const conv = new OpenAIContentConverter('deepseek-chat', 'auto', {}); + const request = makeRequest([ + { + inlineData: { mimeType: 'image/png', data: 'abc123' }, + displayName: 'screenshot.png', + } as unknown as Part, + ]); + const messages = conv.convertGeminiRequestToOpenAI(request); + const parts = getUserContentParts(messages); + expect(parts).toHaveLength(1); + expect(parts[0].type).toBe('text'); + expect(parts[0].text).toContain('image file'); + expect(parts[0].text).toContain('was not provided to you'); + }); + + it('keeps image when image modality is enabled', () => { + const conv = new OpenAIContentConverter('gpt-4o', 'auto', { image: true }); + const request = makeRequest([ + { + inlineData: { mimeType: 'image/png', data: 'abc123' }, + } as unknown as Part, + ]); + const messages = conv.convertGeminiRequestToOpenAI(request); + const parts = getUserContentParts(messages); + expect(parts).toHaveLength(1); + expect(parts[0].type).toBe('image_url'); + }); + + it('replaces PDF with placeholder when pdf modality is disabled', () => { + const conv = new OpenAIContentConverter('test-model', 'auto', { + image: true, + }); + const request = makeRequest([ + { + inlineData: { + mimeType: 'application/pdf', + 
data: 'pdf-data', + displayName: 'doc.pdf', + }, + } as unknown as Part, + ]); + const messages = conv.convertGeminiRequestToOpenAI(request); + const parts = getUserContentParts(messages); + expect(parts).toHaveLength(1); + expect(parts[0].type).toBe('text'); + expect(parts[0].text).toContain('pdf file'); + expect(parts[0].text).toContain('was not provided to you'); + }); + + it('keeps PDF when pdf modality is enabled', () => { + const conv = new OpenAIContentConverter('claude-sonnet', 'auto', { + image: true, + pdf: true, + }); + const request = makeRequest([ + { + inlineData: { + mimeType: 'application/pdf', + data: 'pdf-data', + displayName: 'doc.pdf', + }, + } as unknown as Part, + ]); + const messages = conv.convertGeminiRequestToOpenAI(request); + const parts = getUserContentParts(messages); + expect(parts).toHaveLength(1); + expect(parts[0].type).toBe('file'); + }); + + it('replaces video with placeholder when video modality is disabled', () => { + const conv = new OpenAIContentConverter('test-model', 'auto', {}); + const request = makeRequest([ + { + inlineData: { mimeType: 'video/mp4', data: 'vid-data' }, + } as unknown as Part, + ]); + const messages = conv.convertGeminiRequestToOpenAI(request); + const parts = getUserContentParts(messages); + expect(parts).toHaveLength(1); + expect(parts[0].type).toBe('text'); + expect(parts[0].text).toContain('video file'); + }); + + it('replaces audio with placeholder when audio modality is disabled', () => { + const conv = new OpenAIContentConverter('test-model', 'auto', {}); + const request = makeRequest([ + { + inlineData: { mimeType: 'audio/wav', data: 'audio-data' }, + } as unknown as Part, + ]); + const messages = conv.convertGeminiRequestToOpenAI(request); + const parts = getUserContentParts(messages); + expect(parts).toHaveLength(1); + expect(parts[0].type).toBe('text'); + expect(parts[0].text).toContain('audio file'); + }); + + it('handles mixed content: keeps text + supported media, replaces unsupported', () 
=> { + const conv = new OpenAIContentConverter('gpt-4o', 'auto', { image: true }); + const request = makeRequest([ + { text: 'Analyze these files' }, + { + inlineData: { mimeType: 'image/png', data: 'img-data' }, + } as unknown as Part, + { + inlineData: { mimeType: 'video/mp4', data: 'vid-data' }, + } as unknown as Part, + ]); + const messages = conv.convertGeminiRequestToOpenAI(request); + const parts = getUserContentParts(messages); + expect(parts).toHaveLength(3); + expect(parts[0].type).toBe('text'); + expect(parts[0].text).toBe('Analyze these files'); + expect(parts[1].type).toBe('image_url'); + expect(parts[2].type).toBe('text'); + expect(parts[2].text).toContain('video file'); + }); + + it('defaults to text-only when no modalities are specified', () => { + const conv = new OpenAIContentConverter('unknown-model'); + const request = makeRequest([ + { + inlineData: { mimeType: 'image/png', data: 'img-data' }, + } as unknown as Part, + ]); + const messages = conv.convertGeminiRequestToOpenAI(request); + const parts = getUserContentParts(messages); + expect(parts).toHaveLength(1); + expect(parts[0].type).toBe('text'); + expect(parts[0].text).toContain('image file'); + }); +}); diff --git a/packages/core/src/core/openaiContentGenerator/converter.ts b/packages/core/src/core/openaiContentGenerator/converter.ts index 2ca7428bd..38a2f7745 100644 --- a/packages/core/src/core/openaiContentGenerator/converter.ts +++ b/packages/core/src/core/openaiContentGenerator/converter.ts @@ -20,12 +20,16 @@ import type { import { GenerateContentResponse, FinishReason } from '@google/genai'; import type OpenAI from 'openai'; import { safeJsonParse } from '../../utils/safeJsonParse.js'; +import { createDebugLogger } from '../../utils/debugLogger.js'; +import type { InputModalities } from '../contentGenerator.js'; import { StreamingToolCallParser } from './streamingToolCallParser.js'; import { convertSchema, type SchemaComplianceMode, } from '../../utils/schemaConverter.js'; +const 
debugLogger = createDebugLogger('CONVERTER'); + /** * Extended usage type that supports both OpenAI standard format and alternative formats * Some models return cached_tokens at the top level instead of in prompt_tokens_details @@ -92,12 +96,18 @@ type OpenAIContentPart = export class OpenAIContentConverter { private model: string; private schemaCompliance: SchemaComplianceMode; + private modalities: InputModalities; private streamingToolCallParser: StreamingToolCallParser = new StreamingToolCallParser(); - constructor(model: string, schemaCompliance: SchemaComplianceMode = 'auto') { + constructor( + model: string, + schemaCompliance: SchemaComplianceMode = 'auto', + modalities: InputModalities = {}, + ) { this.model = model; this.schemaCompliance = schemaCompliance; + this.modalities = modalities; } /** @@ -108,6 +118,13 @@ export class OpenAIContentConverter { this.model = model; } + /** + * Update the supported input modalities. + */ + setModalities(modalities: InputModalities): void { + this.modalities = modalities; + } + /** * Reset streaming tool calls parser for new stream processing * This should be called at the beginning of each stream to prevent @@ -585,13 +602,19 @@ export class OpenAIContentConverter { } /** - * Create OpenAI media content part from Gemini part + * Create OpenAI media content part from Gemini part. + * Checks modality support before building each media type. 
*/ private createMediaContentPart(part: Part): OpenAIContentPart | null { if (part.inlineData?.mimeType && part.inlineData?.data) { const mimeType = part.inlineData.mimeType; const mediaType = this.getMediaType(mimeType); + const displayName = part.inlineData.displayName || mimeType; + if (mediaType === 'image') { + if (!this.modalities.image) { + return this.unsupportedModalityPlaceholder('image', displayName); + } const dataUrl = `data:${mimeType};base64,${part.inlineData.data}`; return { type: 'image_url' as const, @@ -600,6 +623,9 @@ export class OpenAIContentConverter { } if (mimeType === 'application/pdf') { + if (!this.modalities.pdf) { + return this.unsupportedModalityPlaceholder('pdf', displayName); + } const filename = part.inlineData.displayName || 'document.pdf'; return { type: 'file' as const, @@ -611,6 +637,9 @@ export class OpenAIContentConverter { } if (mediaType === 'audio') { + if (!this.modalities.audio) { + return this.unsupportedModalityPlaceholder('audio', displayName); + } const format = this.getAudioFormat(mimeType); if (format) { return { @@ -624,6 +653,9 @@ export class OpenAIContentConverter { } if (mediaType === 'video') { + if (!this.modalities.video) { + return this.unsupportedModalityPlaceholder('video', displayName); + } return { type: 'video_url' as const, video_url: { @@ -632,12 +664,9 @@ export class OpenAIContentConverter { }; } - const displayName = part.inlineData.displayName - ? 
` (${part.inlineData.displayName})` - : ''; return { type: 'text' as const, - text: `Unsupported inline media type: ${mimeType}${displayName}.`, + text: `Unsupported inline media type: ${mimeType} (${displayName}).`, }; } @@ -648,6 +677,9 @@ export class OpenAIContentConverter { const mediaType = this.getMediaType(mimeType); if (mediaType === 'image') { + if (!this.modalities.image) { + return this.unsupportedModalityPlaceholder('image', filename); + } return { type: 'image_url' as const, image_url: { url: fileUri }, @@ -655,6 +687,9 @@ export class OpenAIContentConverter { } if (mimeType === 'application/pdf') { + if (!this.modalities.pdf) { + return this.unsupportedModalityPlaceholder('pdf', filename); + } return { type: 'file' as const, file: { @@ -665,6 +700,9 @@ export class OpenAIContentConverter { } if (mediaType === 'video') { + if (!this.modalities.video) { + return this.unsupportedModalityPlaceholder('video', filename); + } return { type: 'video_url' as const, video_url: { @@ -673,18 +711,43 @@ export class OpenAIContentConverter { }; } - const displayName = part.fileData.displayName + const displayNameStr = part.fileData.displayName ? ` (${part.fileData.displayName})` : ''; return { type: 'text' as const, - text: `Unsupported file media type: ${mimeType}${displayName}.`, + text: `Unsupported file media type: ${mimeType}${displayNameStr}.`, }; } return null; } + /** + * Create a text placeholder for unsupported modalities. + */ + private unsupportedModalityPlaceholder( + modality: string, + displayName: string, + ): OpenAIContentPart { + debugLogger.warn( + `Model '${this.model}' does not support ${modality} input. ` + + `Replacing with text placeholder: ${displayName}`, + ); + let hint: string; + if (modality === 'pdf') { + hint = + 'The content cannot be accessed by the read_file tool. Try using other tools or commands that can extract text from PDF files.'; + } else { + hint = + 'The content cannot be accessed by the read_file tool. 
If you cannot find an alternative approach, let the user know you are unable to process this type of file.'; + } + return { + type: 'text' as const, + text: `[The ${modality} file "${displayName}" was not provided to you. ${hint}]`, + }; + } + /** * Determine media type from MIME type */ diff --git a/packages/core/src/core/openaiContentGenerator/pipeline.test.ts b/packages/core/src/core/openaiContentGenerator/pipeline.test.ts index 964f768a3..d71e23e91 100644 --- a/packages/core/src/core/openaiContentGenerator/pipeline.test.ts +++ b/packages/core/src/core/openaiContentGenerator/pipeline.test.ts @@ -47,6 +47,7 @@ describe('ContentGenerationPipeline', () => { // Mock converter mockConverter = { setModel: vi.fn(), + setModalities: vi.fn(), convertGeminiRequestToOpenAI: vi.fn(), convertOpenAIResponseToGemini: vi.fn(), convertOpenAIChunkToGemini: vi.fn(), @@ -104,6 +105,7 @@ describe('ContentGenerationPipeline', () => { expect(OpenAIContentConverter).toHaveBeenCalledWith( 'test-model', undefined, + {}, ); }); }); diff --git a/packages/core/src/core/openaiContentGenerator/pipeline.ts b/packages/core/src/core/openaiContentGenerator/pipeline.ts index 1865adb48..8d2cc9fc7 100644 --- a/packages/core/src/core/openaiContentGenerator/pipeline.ts +++ b/packages/core/src/core/openaiContentGenerator/pipeline.ts @@ -46,6 +46,7 @@ export class ContentGenerationPipeline { this.converter = new OpenAIContentConverter( this.contentGeneratorConfig.model, this.contentGeneratorConfig.schemaCompliance, + this.contentGeneratorConfig.modalities ?? {}, ); } @@ -58,6 +59,7 @@ export class ContentGenerationPipeline { // that is not valid/available for the OpenAI-compatible backend. const effectiveModel = this.contentGeneratorConfig.model; this.converter.setModel(effectiveModel); + this.converter.setModalities(this.contentGeneratorConfig.modalities ?? 
{}); return this.executeWithErrorHandling( request, userPromptId, @@ -85,6 +87,7 @@ export class ContentGenerationPipeline { ): Promise> { const effectiveModel = this.contentGeneratorConfig.model; this.converter.setModel(effectiveModel); + this.converter.setModalities(this.contentGeneratorConfig.modalities ?? {}); return this.executeWithErrorHandling( request, userPromptId, diff --git a/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts b/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts index a57bbacb7..006cf1abd 100644 --- a/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts +++ b/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts @@ -800,7 +800,7 @@ describe('DashScopeOpenAICompatibleProvider', () => { const result = provider.buildRequest(request, 'test-prompt-id'); - expect(result.max_tokens).toBe(4096); // Should be limited to default output limit (4K) + expect(result.max_tokens).toBe(8192); // Should be limited to default output limit (8K) }); it('should preserve other request parameters when limiting max_tokens', () => { @@ -872,7 +872,7 @@ describe('DashScopeOpenAICompatibleProvider', () => { ], }, ], - max_tokens: 50000, + max_tokens: 50000, // Exceeds the 32768 limit }; const result = provider.buildRequest(request, 'test-prompt-id'); @@ -899,12 +899,12 @@ describe('DashScopeOpenAICompatibleProvider', () => { ], }, ], - max_tokens: 9000, + max_tokens: 50000, // Exceeds the 32768 limit }; const result = provider.buildRequest(request, 'test-prompt-id'); - expect(result.max_tokens).toBe(8192); // Limited to model's output limit (8K) + expect(result.max_tokens).toBe(32768); // Limited to model's output limit (32K) expect( (result as { vl_high_resolution_images?: boolean }) .vl_high_resolution_images, diff --git a/packages/core/src/core/openaiContentGenerator/provider/deepseek.test.ts b/packages/core/src/core/openaiContentGenerator/provider/deepseek.test.ts index 
68693393b..9a69cd326 100644 --- a/packages/core/src/core/openaiContentGenerator/provider/deepseek.test.ts +++ b/packages/core/src/core/openaiContentGenerator/provider/deepseek.test.ts @@ -5,7 +5,6 @@ */ import { describe, it, expect, vi, beforeEach } from 'vitest'; -import type OpenAI from 'openai'; import { DeepSeekOpenAICompatibleProvider } from './deepseek.js'; import type { ContentGeneratorConfig } from '../../contentGenerator.js'; import type { Config } from '../../../config/config.js'; @@ -18,7 +17,6 @@ vi.mock('openai', () => ({ })); describe('DeepSeekOpenAICompatibleProvider', () => { - let provider: DeepSeekOpenAICompatibleProvider; let mockContentGeneratorConfig: ContentGeneratorConfig; let mockCliConfig: Config; @@ -34,11 +32,6 @@ describe('DeepSeekOpenAICompatibleProvider', () => { mockCliConfig = { getCliVersion: vi.fn().mockReturnValue('1.0.0'), } as unknown as Config; - - provider = new DeepSeekOpenAICompatibleProvider( - mockContentGeneratorConfig, - mockCliConfig, - ); }); describe('isDeepSeekProvider', () => { @@ -61,72 +54,15 @@ describe('DeepSeekOpenAICompatibleProvider', () => { }); }); - describe('buildRequest', () => { - const userPromptId = 'prompt-123'; - - it('converts array content into a string', () => { - const originalRequest: OpenAI.Chat.ChatCompletionCreateParams = { - model: 'deepseek-chat', - messages: [ - { - role: 'user', - content: [ - { type: 'text', text: 'Hello' }, - { type: 'text', text: ' world' }, - ], - }, - ], - }; - - const result = provider.buildRequest(originalRequest, userPromptId); - - expect(result.messages).toHaveLength(1); - expect(result.messages?.[0]).toEqual({ - role: 'user', - content: 'Hello world', + describe('getDefaultGenerationConfig', () => { + it('returns temperature 0', () => { + const provider = new DeepSeekOpenAICompatibleProvider( + mockContentGeneratorConfig, + mockCliConfig, + ); + expect(provider.getDefaultGenerationConfig()).toEqual({ + temperature: 0, }); - 
expect(originalRequest.messages?.[0].content).toEqual([ - { type: 'text', text: 'Hello' }, - { type: 'text', text: ' world' }, - ]); - }); - - it('leaves string content unchanged', () => { - const originalRequest: OpenAI.Chat.ChatCompletionCreateParams = { - model: 'deepseek-chat', - messages: [ - { - role: 'user', - content: 'Hello world', - }, - ], - }; - - const result = provider.buildRequest(originalRequest, userPromptId); - - expect(result.messages?.[0].content).toBe('Hello world'); - }); - - it('throws when encountering non-text multimodal parts', () => { - const originalRequest: OpenAI.Chat.ChatCompletionCreateParams = { - model: 'deepseek-chat', - messages: [ - { - role: 'user', - content: [ - { type: 'text', text: 'Hello' }, - { - type: 'image_url', - image_url: { url: 'https://example.com/image.png' }, - }, - ], - }, - ], - }; - - expect(() => - provider.buildRequest(originalRequest, userPromptId), - ).toThrow(/only supports text content/i); }); }); }); diff --git a/packages/core/src/core/openaiContentGenerator/provider/deepseek.ts b/packages/core/src/core/openaiContentGenerator/provider/deepseek.ts index 9b5fd7479..0e246725f 100644 --- a/packages/core/src/core/openaiContentGenerator/provider/deepseek.ts +++ b/packages/core/src/core/openaiContentGenerator/provider/deepseek.ts @@ -4,7 +4,6 @@ * SPDX-License-Identifier: Apache-2.0 */ -import type OpenAI from 'openai'; import type { Config } from '../../../config/config.js'; import type { ContentGeneratorConfig } from '../../contentGenerator.js'; import { DefaultOpenAICompatibleProvider } from './default.js'; @@ -26,58 +25,6 @@ export class DeepSeekOpenAICompatibleProvider extends DefaultOpenAICompatiblePro return baseUrl.toLowerCase().includes('api.deepseek.com'); } - override buildRequest( - request: OpenAI.Chat.ChatCompletionCreateParams, - userPromptId: string, - ): OpenAI.Chat.ChatCompletionCreateParams { - const baseRequest = super.buildRequest(request, userPromptId); - if 
(!baseRequest.messages?.length) { - return baseRequest; - } - - const messages = baseRequest.messages.map((message) => { - if (!('content' in message)) { - return message; - } - - const { content } = message; - - if ( - typeof content === 'string' || - content === null || - content === undefined - ) { - return message; - } - - if (!Array.isArray(content)) { - return message; - } - - const text = content - .map((part) => { - if (part.type !== 'text') { - throw new Error( - `DeepSeek provider only supports text content. Found non-text part of type '${part.type}' in message with role '${message.role}'.`, - ); - } - - return part.text ?? ''; - }) - .join(''); - - return { - ...message, - content: text, - } as OpenAI.Chat.ChatCompletionMessageParam; - }); - - return { - ...baseRequest, - messages, - }; - } - override getDefaultGenerationConfig(): GenerateContentConfig { return { temperature: 0, diff --git a/packages/core/src/core/tokenLimits.test.ts b/packages/core/src/core/tokenLimits.test.ts index ffd71cd4b..8aa947262 100644 --- a/packages/core/src/core/tokenLimits.test.ts +++ b/packages/core/src/core/tokenLimits.test.ts @@ -91,183 +91,144 @@ describe('normalize', () => { }); describe('tokenLimit', () => { - // Test cases for each model family describe('Google Gemini', () => { - it('should return the correct limit for Gemini 1.5 Pro', () => { - expect(tokenLimit('gemini-1.5-pro')).toBe(2097152); + it('should return 1M for Gemini 3.x (latest)', () => { + expect(tokenLimit('gemini-3-pro-preview')).toBe(1000000); + expect(tokenLimit('gemini-3-flash-preview')).toBe(1000000); + expect(tokenLimit('gemini-3.1-pro-preview')).toBe(1000000); }); - it('should return the correct limit for Gemini 1.5 Flash', () => { - expect(tokenLimit('gemini-1.5-flash')).toBe(1048576); - }); - it('should return the correct limit for Gemini 2.5 Pro', () => { - expect(tokenLimit('gemini-2.5-pro')).toBe(1048576); - }); - it('should return the correct limit for Gemini 2.5 Flash', () => { - 
expect(tokenLimit('gemini-2.5-flash')).toBe(1048576); - }); - it('should return the correct limit for Gemini 2.0 Flash with image generation', () => { - expect(tokenLimit('gemini-2.0-flash-image-generation')).toBe(32768); - }); - it('should return the correct limit for Gemini 2.0 Flash', () => { - expect(tokenLimit('gemini-2.0-flash')).toBe(1048576); + + it('should return 1M for legacy Gemini (fallback)', () => { + expect(tokenLimit('gemini-2.5-pro')).toBe(1000000); + expect(tokenLimit('gemini-2.5-flash')).toBe(1000000); + expect(tokenLimit('gemini-2.0-flash')).toBe(1000000); + expect(tokenLimit('gemini-1.5-pro')).toBe(1000000); + expect(tokenLimit('gemini-1.5-flash')).toBe(1000000); }); }); describe('OpenAI', () => { - it('should return the correct limit for o3-mini', () => { - expect(tokenLimit('o3-mini')).toBe(200000); + it('should return 400K for GPT-5.x (latest)', () => { + expect(tokenLimit('gpt-5')).toBe(400000); + expect(tokenLimit('gpt-5-mini')).toBe(400000); + expect(tokenLimit('gpt-5.2')).toBe(400000); + expect(tokenLimit('gpt-5.2-pro')).toBe(400000); }); - it('should return the correct limit for o3 models', () => { - expect(tokenLimit('o3')).toBe(200000); - }); - it('should return the correct limit for o4-mini', () => { - expect(tokenLimit('o4-mini')).toBe(200000); - }); - it('should return the correct limit for gpt-4o-mini', () => { - expect(tokenLimit('gpt-4o-mini')).toBe(131072); - }); - it('should return the correct limit for gpt-4o', () => { + + it('should return 128K for legacy GPT (fallback)', () => { expect(tokenLimit('gpt-4o')).toBe(131072); - }); - it('should return the correct limit for gpt-4.1-mini', () => { - expect(tokenLimit('gpt-4.1-mini')).toBe(1048576); - }); - it('should return the correct limit for gpt-4.1 models', () => { - expect(tokenLimit('gpt-4.1')).toBe(1048576); - }); - it('should return the correct limit for gpt-4', () => { + expect(tokenLimit('gpt-4o-mini')).toBe(131072); + expect(tokenLimit('gpt-4.1')).toBe(131072); 
expect(tokenLimit('gpt-4')).toBe(131072); }); + + it('should return 200K for o-series', () => { + expect(tokenLimit('o3')).toBe(200000); + expect(tokenLimit('o3-mini')).toBe(200000); + expect(tokenLimit('o4-mini')).toBe(200000); + }); }); describe('Anthropic Claude', () => { - it('should return the correct limit for Claude 3.5 Sonnet', () => { + it('should return 200K for all Claude models', () => { + expect(tokenLimit('claude-opus-4-6')).toBe(200000); + expect(tokenLimit('claude-sonnet-4-6')).toBe(200000); + expect(tokenLimit('claude-sonnet-4')).toBe(200000); + expect(tokenLimit('claude-opus-4')).toBe(200000); expect(tokenLimit('claude-3.5-sonnet')).toBe(200000); - }); - it('should return the correct limit for Claude 3.7 Sonnet', () => { - expect(tokenLimit('claude-3.7-sonnet')).toBe(1048576); - }); - it('should return the correct limit for Claude Sonnet 4', () => { - expect(tokenLimit('claude-sonnet-4')).toBe(1048576); - }); - it('should return the correct limit for Claude Opus 4', () => { - expect(tokenLimit('claude-opus-4')).toBe(1048576); + expect(tokenLimit('claude-3.7-sonnet')).toBe(200000); }); }); describe('Alibaba Qwen', () => { - it('should return the correct limit for qwen3-coder commercial models', () => { - expect(tokenLimit('qwen3-coder-plus')).toBe(1048576); - expect(tokenLimit('qwen3-coder-plus-20250601')).toBe(1048576); - expect(tokenLimit('qwen3-coder-flash')).toBe(1048576); - expect(tokenLimit('qwen3-coder-flash-20250601')).toBe(1048576); + it('should return 1M for commercial Qwen3 models', () => { + expect(tokenLimit('qwen3-coder-plus')).toBe(1000000); + expect(tokenLimit('qwen3-coder-plus-20250601')).toBe(1000000); + expect(tokenLimit('qwen3-coder-flash')).toBe(1000000); + expect(tokenLimit('qwen3.5-plus')).toBe(1000000); + expect(tokenLimit('coder-model')).toBe(1000000); }); - it('should return the correct limit for qwen3-coder open source models', () => { + it('should return 256K for Qwen3 non-commercial models', () => { + 
expect(tokenLimit('qwen3-max')).toBe(262144); + expect(tokenLimit('qwen3-max-2026-01-23')).toBe(262144); + expect(tokenLimit('qwen3-vl-plus')).toBe(262144); expect(tokenLimit('qwen3-coder-7b')).toBe(262144); - expect(tokenLimit('qwen3-coder-480b-a35b-instruct')).toBe(262144); - expect(tokenLimit('qwen3-coder-30b-a3b-instruct')).toBe(262144); + expect(tokenLimit('qwen3-coder-next')).toBe(262144); }); - it('should return the correct limit for qwen3 2507 variants', () => { - expect(tokenLimit('qwen3-some-model-2507-instruct')).toBe(262144); + it('should return 1M for studio latest models', () => { + expect(tokenLimit('qwen-plus-latest')).toBe(1000000); + expect(tokenLimit('qwen-flash-latest')).toBe(1000000); }); - it('should return the correct limit for qwen2.5-1m', () => { - expect(tokenLimit('qwen2.5-1m')).toBe(1048576); - expect(tokenLimit('qwen2.5-1m-instruct')).toBe(1048576); - }); - - it('should return the correct limit for qwen2.5', () => { - expect(tokenLimit('qwen2.5')).toBe(131072); - expect(tokenLimit('qwen2.5-instruct')).toBe(131072); - }); - - it('should return the correct limit for qwen-plus', () => { - expect(tokenLimit('qwen-plus-latest')).toBe(1048576); - expect(tokenLimit('qwen-plus')).toBe(131072); - }); - - it('should return the correct limit for qwen-flash', () => { - expect(tokenLimit('qwen-flash-latest')).toBe(1048576); - }); - - it('should return the correct limit for qwen-turbo', () => { - expect(tokenLimit('qwen-turbo')).toBe(131072); - expect(tokenLimit('qwen-turbo-latest')).toBe(131072); - }); - }); - - describe('ByteDance Seed-OSS', () => { - it('should return the correct limit for seed-oss', () => { - expect(tokenLimit('seed-oss')).toBe(524288); - }); - }); - - describe('Zhipu GLM', () => { - it('should return the correct limit for glm-4.5v', () => { - expect(tokenLimit('glm-4.5v')).toBe(65536); - }); - it('should return the correct limit for glm-4.5-air', () => { - expect(tokenLimit('glm-4.5-air')).toBe(131072); - }); - it('should return 
the correct limit for glm-4.5', () => { - expect(tokenLimit('glm-4.5')).toBe(131072); - }); - it('should return the correct limit for glm-4.6', () => { - expect(tokenLimit('glm-4.6')).toBe(202752); + it('should return 256K for Qwen fallback', () => { + expect(tokenLimit('qwen-plus')).toBe(262144); + expect(tokenLimit('qwen-turbo')).toBe(262144); + expect(tokenLimit('qwen2.5')).toBe(262144); + expect(tokenLimit('qwen-vl-max-latest')).toBe(262144); + expect(tokenLimit('vision-model')).toBe(262144); }); }); describe('DeepSeek', () => { - it('should return the correct limit for deepseek-r1', () => { + it('should return 128K for DeepSeek models', () => { expect(tokenLimit('deepseek-r1')).toBe(131072); - }); - it('should return the correct limit for deepseek-v3', () => { expect(tokenLimit('deepseek-v3')).toBe(131072); + expect(tokenLimit('deepseek-chat')).toBe(131072); }); - it('should return the correct limit for deepseek-v3.1', () => { - expect(tokenLimit('deepseek-v3.1')).toBe(131072); + }); + + describe('Zhipu GLM', () => { + it('should return 200K for GLM-5 and GLM-4.7 (latest)', () => { + expect(tokenLimit('glm-5')).toBe(202752); + expect(tokenLimit('glm-4.7')).toBe(202752); }); - it('should return the correct limit for deepseek-v3.2', () => { - expect(tokenLimit('deepseek-v3.2-exp')).toBe(131072); + + it('should return 200K for legacy GLM (fallback)', () => { + expect(tokenLimit('glm-4.5')).toBe(202752); + expect(tokenLimit('glm-4.5v')).toBe(202752); + expect(tokenLimit('glm-4.5-air')).toBe(202752); + }); + }); + + describe('MiniMax', () => { + it('should return 1M for MiniMax-M2.5 (latest)', () => { + expect(tokenLimit('MiniMax-M2.5')).toBe(1000000); + }); + + it('should return 200K for MiniMax fallback', () => { + expect(tokenLimit('MiniMax-M2.1')).toBe(200000); }); }); describe('Moonshot Kimi', () => { - it('should return the correct limit for kimi-k2 variants', () => { - expect(tokenLimit('kimi-k2-0905-preview')).toBe(262144); // 256K + it('should return 256K 
for Kimi models', () => { + expect(tokenLimit('kimi-k2.5')).toBe(262144); expect(tokenLimit('kimi-k2-0905')).toBe(262144); - expect(tokenLimit('kimi-k2-turbo-preview')).toBe(262144); expect(tokenLimit('kimi-k2-turbo')).toBe(262144); - expect(tokenLimit('kimi-k2-0711-preview')).toBe(262144); - expect(tokenLimit('kimi-k2-instruct')).toBe(262144); }); }); describe('Other models', () => { - it('should return the correct limit for gpt-oss', () => { - expect(tokenLimit('gpt-oss')).toBe(131072); + it('should return correct limits for other known models', () => { + expect(tokenLimit('seed-oss')).toBe(524288); }); - it('should return the correct limit for llama-4-scout', () => { - expect(tokenLimit('llama-4-scout')).toBe(10485760); - }); - it('should return the correct limit for mistral-large-2', () => { - expect(tokenLimit('mistral-large-2')).toBe(131072); + + it('should return the default token limit for unknown models', () => { + expect(tokenLimit('llama-4-scout')).toBe(DEFAULT_TOKEN_LIMIT); }); }); - // Test for default limit it('should return the default token limit for an unknown model', () => { expect(tokenLimit('unknown-model-v1.0')).toBe(DEFAULT_TOKEN_LIMIT); + expect(tokenLimit('mistral-large-2')).toBe(DEFAULT_TOKEN_LIMIT); }); - // Test with complex model string it('should return the correct limit for a complex model string', () => { expect(tokenLimit(' a/b/c|GPT-4o:gpt-4o-2024-05-13-q4 ')).toBe(131072); }); - // Test case-insensitive matching it('should handle case-insensitive model names', () => { expect(tokenLimit('GPT-4O')).toBe(131072); expect(tokenLimit('CLAUDE-3.5-SONNET')).toBe(200000); @@ -275,99 +236,96 @@ describe('tokenLimit', () => { }); describe('tokenLimit with output type', () => { - describe('Qwen models with output limits', () => { - it('should return the correct output limit for qwen3-coder-plus', () => { - expect(tokenLimit('qwen3-coder-plus', 'output')).toBe(65536); - expect(tokenLimit('qwen3-coder-plus-20250601', 'output')).toBe(65536); + 
describe('latest models output limits', () => { + it('should return correct output limits for GPT-5.x', () => { + expect(tokenLimit('gpt-5.2', 'output')).toBe(131072); + expect(tokenLimit('gpt-5-mini', 'output')).toBe(131072); }); - it('should return the correct output limit for qwen-vl-max-latest', () => { - expect(tokenLimit('qwen-vl-max-latest', 'output')).toBe(8192); + it('should return correct output limits for Gemini 3.x', () => { + expect(tokenLimit('gemini-3-pro-preview', 'output')).toBe(65536); + expect(tokenLimit('gemini-3-flash-preview', 'output')).toBe(65536); + }); + + it('should return correct output limits for Claude 4.6', () => { + expect(tokenLimit('claude-opus-4-6', 'output')).toBe(131072); + expect(tokenLimit('claude-sonnet-4-6', 'output')).toBe(65536); }); }); - describe('Default output limits', () => { + describe('legacy model output fallbacks', () => { + it('should return fallback output limits for legacy GPT', () => { + expect(tokenLimit('gpt-4o', 'output')).toBe(16384); + }); + + it('should return fallback output limits for legacy Gemini', () => { + expect(tokenLimit('gemini-2.5-pro', 'output')).toBe(8192); + }); + + it('should return fallback output limits for legacy Claude', () => { + expect(tokenLimit('claude-sonnet-4', 'output')).toBe(65536); + expect(tokenLimit('claude-opus-4', 'output')).toBe(65536); + }); + }); + + describe('Qwen output limits', () => { + it('should return correct output limits for Qwen models', () => { + expect(tokenLimit('qwen3-coder-plus', 'output')).toBe(65536); + expect(tokenLimit('qwen3-coder-next', 'output')).toBe(65536); + expect(tokenLimit('qwen3.5-plus', 'output')).toBe(65536); + expect(tokenLimit('qwen3-max', 'output')).toBe(65536); + expect(tokenLimit('qwen3-max-2026-01-23', 'output')).toBe(65536); + expect(tokenLimit('qwen3-vl-plus', 'output')).toBe(32768); + expect(tokenLimit('qwen-vl-max-latest', 'output')).toBe(8192); + expect(tokenLimit('vision-model', 'output')).toBe(32768); + }); + }); + + 
describe('other output limits', () => { + it('should return correct output limits for DeepSeek', () => { + expect(tokenLimit('deepseek-reasoner', 'output')).toBe(65536); + expect(tokenLimit('deepseek-chat', 'output')).toBe(8192); + }); + + it('should return correct output limits for GLM', () => { + expect(tokenLimit('glm-5', 'output')).toBe(16384); + expect(tokenLimit('glm-4.7', 'output')).toBe(16384); + }); + + it('should return correct output limits for MiniMax', () => { + expect(tokenLimit('MiniMax-M2.5', 'output')).toBe(65536); + }); + + it('should return correct output limits for Kimi', () => { + expect(tokenLimit('kimi-k2.5', 'output')).toBe(32768); + }); + }); + + describe('default output limits', () => { it('should return the default output limit for unknown models', () => { expect(tokenLimit('unknown-model', 'output')).toBe( DEFAULT_OUTPUT_TOKEN_LIMIT, ); - expect(tokenLimit('gpt-4', 'output')).toBe(DEFAULT_OUTPUT_TOKEN_LIMIT); - expect(tokenLimit('claude-3.5-sonnet', 'output')).toBe( - DEFAULT_OUTPUT_TOKEN_LIMIT, - ); - }); - - it('should return the default output limit for models without specific output patterns', () => { - expect(tokenLimit('qwen3-coder-7b', 'output')).toBe( - DEFAULT_OUTPUT_TOKEN_LIMIT, - ); - expect(tokenLimit('qwen-plus', 'output')).toBe( - DEFAULT_OUTPUT_TOKEN_LIMIT, - ); - expect(tokenLimit('qwen-vl-max', 'output')).toBe( - DEFAULT_OUTPUT_TOKEN_LIMIT, - ); }); }); - describe('Input vs Output limits comparison', () => { - it('should return different limits for input vs output for qwen3-coder-plus', () => { - expect(tokenLimit('qwen3-coder-plus', 'input')).toBe(1048576); // 1M input - expect(tokenLimit('qwen3-coder-plus', 'output')).toBe(65536); // 64K output + describe('input vs output comparison', () => { + it('should return different limits for input vs output', () => { + expect(tokenLimit('qwen3-coder-plus', 'input')).toBe(1000000); + expect(tokenLimit('qwen3-coder-plus', 'output')).toBe(65536); }); - it('should return different 
limits for input vs output for qwen-vl-max-latest', () => { - expect(tokenLimit('qwen-vl-max-latest', 'input')).toBe(131072); // 128K input - expect(tokenLimit('qwen-vl-max-latest', 'output')).toBe(8192); // 8K output - }); - - it('should return different limits for input vs output for qwen3-vl-plus', () => { - expect(tokenLimit('qwen3-vl-plus', 'input')).toBe(262144); // 256K input - expect(tokenLimit('qwen3-vl-plus', 'output')).toBe(32768); // 32K output - }); - - it('should return same default limits for unknown models', () => { - expect(tokenLimit('unknown-model', 'input')).toBe(DEFAULT_TOKEN_LIMIT); // 128K input - expect(tokenLimit('unknown-model', 'output')).toBe( - DEFAULT_OUTPUT_TOKEN_LIMIT, - ); // 4K output - }); - }); - - describe('Backward compatibility', () => { it('should default to input type when no type is specified', () => { - expect(tokenLimit('qwen3-coder-plus')).toBe(1048576); // Should be input limit - expect(tokenLimit('qwen-vl-max-latest')).toBe(131072); // Should be input limit - expect(tokenLimit('unknown-model')).toBe(DEFAULT_TOKEN_LIMIT); // Should be input default - }); - - it('should work with explicit input type', () => { - expect(tokenLimit('qwen3-coder-plus', 'input')).toBe(1048576); - expect(tokenLimit('qwen-vl-max-latest', 'input')).toBe(131072); - expect(tokenLimit('unknown-model', 'input')).toBe(DEFAULT_TOKEN_LIMIT); + expect(tokenLimit('qwen3-coder-plus')).toBe(1000000); + expect(tokenLimit('unknown-model')).toBe(DEFAULT_TOKEN_LIMIT); }); }); - describe('Model normalization with output limits', () => { + describe('normalization with output limits', () => { it('should handle normalized model names for output limits', () => { expect(tokenLimit('QWEN3-CODER-PLUS', 'output')).toBe(65536); expect(tokenLimit('qwen3-coder-plus-20250601', 'output')).toBe(65536); expect(tokenLimit('QWEN-VL-MAX-LATEST', 'output')).toBe(8192); }); - - it('should handle complex model strings for output limits', () => { - expect( - tokenLimit( - ' 
a/b/c|QWEN3-CODER-PLUS:qwen3-coder-plus-2024-05-13 ', - 'output', - ), - ).toBe(65536); - expect( - tokenLimit( - 'provider/qwen-vl-max-latest:qwen-vl-max-latest-v1', - 'output', - ), - ).toBe(8192); - }); }); }); diff --git a/packages/core/src/core/tokenLimits.ts b/packages/core/src/core/tokenLimits.ts index ae6cbd9e2..7d18497b7 100644 --- a/packages/core/src/core/tokenLimits.ts +++ b/packages/core/src/core/tokenLimits.ts @@ -9,23 +9,23 @@ type TokenCount = number; export type TokenLimitType = 'input' | 'output'; export const DEFAULT_TOKEN_LIMIT: TokenCount = 131_072; // 128K (power-of-two) -export const DEFAULT_OUTPUT_TOKEN_LIMIT: TokenCount = 4_096; // 4K tokens +export const DEFAULT_OUTPUT_TOKEN_LIMIT: TokenCount = 8_192; // 8K tokens /** * Accurate numeric limits: * - power-of-two approximations (128K -> 131072, 256K -> 262144, etc.) - * - vendor-declared exact values (e.g., 200k -> 200000) are used as stated in docs. + * - vendor-declared exact values (e.g., 200k -> 200000, 1m -> 1000000) are + * used as stated in docs. */ const LIMITS = { '32k': 32_768, '64k': 65_536, '128k': 131_072, - '200k': 200_000, // vendor-declared decimal, used by OpenAI, Anthropic, GLM etc. + '200k': 200_000, // vendor-declared decimal, used by OpenAI, Anthropic, etc. 
'256k': 262_144, + '400k': 400_000, // vendor-declared decimal, used by OpenAI GPT-5.x '512k': 524_288, - '1m': 1_048_576, - '2m': 2_097_152, - '10m': 10_485_760, // 10 million tokens + '1m': 1_000_000, // Output token limits (typically much smaller than input limits) '4k': 4_096, '8k': 8_192, @@ -81,113 +81,67 @@ const PATTERNS: Array<[RegExp, TokenCount]> = [ // ------------------- // Google Gemini // ------------------- - [/^gemini-1\.5-pro$/, LIMITS['2m']], - [/^gemini-1\.5-flash$/, LIMITS['1m']], - [/^gemini-2\.5-pro.*$/, LIMITS['1m']], - [/^gemini-2\.5-flash.*$/, LIMITS['1m']], - [/^gemini-2\.0-flash-image-generation$/, LIMITS['32k']], - [/^gemini-2\.0-flash.*$/, LIMITS['1m']], + [/^gemini-3/, LIMITS['1m']], // Gemini 3.x (Pro, Flash, 3.1, etc.): 1M + [/^gemini-/, LIMITS['1m']], // Gemini fallback (1.5, 2.x): 1M // ------------------- - // OpenAI (o3 / o4-mini / gpt-4.1 / gpt-4o family) - // o3 and o4-mini document a 200,000-token context window (decimal). - // Note: GPT-4.1 models typically report 1_048_576 (1M) context in OpenAI announcements. - [/^o3(?:-mini|$).*$/, LIMITS['200k']], - [/^o3.*$/, LIMITS['200k']], - [/^o4-mini.*$/, LIMITS['200k']], - [/^gpt-4\.1-mini.*$/, LIMITS['1m']], - [/^gpt-4\.1.*$/, LIMITS['1m']], - [/^gpt-4o-mini.*$/, LIMITS['128k']], - [/^gpt-4o.*$/, LIMITS['128k']], - [/^gpt-4.*$/, LIMITS['128k']], + // OpenAI + // ------------------- + [/^gpt-5/, LIMITS['400k']], // GPT-5.x: 400K + [/^gpt-/, LIMITS['128k']], // GPT fallback (4o, 4.1, etc.): 128K + [/^o\d/, LIMITS['200k']], // o-series (o3, o4-mini, etc.): 200K // ------------------- // Anthropic Claude - // - Claude Sonnet / Sonnet 3.5 and related Sonnet variants: 200,000 tokens documented. - // - Some Sonnet/Opus models offer 1M in beta/enterprise tiers (handled separately if needed). 
- [/^claude-3\.5-sonnet.*$/, LIMITS['200k']], - [/^claude-3\.7-sonnet.*$/, LIMITS['1m']], // some Sonnet 3.7/Opus variants advertise 1M beta in docs - [/^claude-sonnet-4.*$/, LIMITS['1m']], - [/^claude-opus-4.*$/, LIMITS['1m']], + // ------------------- + [/^claude-/, LIMITS['200k']], // All Claude models: 200K // ------------------- // Alibaba / Qwen // ------------------- - // Commercial Qwen3-Coder-Plus: 1M token context - [/^qwen3-coder-plus(-.*)?$/, LIMITS['1m']], // catches "qwen3-coder-plus" and date variants - - // Commercial Qwen3-Coder-Flash: 1M token context - [/^qwen3-coder-flash(-.*)?$/, LIMITS['1m']], // catches "qwen3-coder-flash" and date variants - - // Commercial Qwen3.5-Plus: 1M token context - [/^qwen3\.5-plus(-.*)?$/, LIMITS['1m']], // catches "qwen3.5-plus" and date variants - - // Generic coder-model: same as qwen3.5-plus (1M token context) + // Commercial API models (1,000,000 context) + [/^qwen3-coder-plus/, LIMITS['1m']], + [/^qwen3-coder-flash/, LIMITS['1m']], + [/^qwen3\.5-plus/, LIMITS['1m']], [/^coder-model$/, LIMITS['1m']], - - // Commercial Qwen3-Max-Preview: 256K token context - [/^qwen3-max(-preview)?(-.*)?$/, LIMITS['256k']], // catches "qwen3-max" or "qwen3-max-preview" and date variants - - // Open-source Qwen3-Coder variants: 256K native - [/^qwen3-coder-.*$/, LIMITS['256k']], - // Open-source Qwen3 2507 variants: 256K native - [/^qwen3-.*-2507-.*$/, LIMITS['256k']], - - // Open-source long-context Qwen2.5-1M - [/^qwen2\.5-1m.*$/, LIMITS['1m']], - - // Standard Qwen2.5: 128K - [/^qwen2\.5.*$/, LIMITS['128k']], - - // Studio commercial Qwen-Plus / Qwen-Flash / Qwen-Turbo - [/^qwen-plus-latest$/, LIMITS['1m']], // Commercial latest: 1M - [/^qwen-plus.*$/, LIMITS['128k']], // Standard: 128K + // Commercial API models (256K context) + [/^qwen3-max/, LIMITS['256k']], + [/^qwen3-vl-plus$/, LIMITS['256k']], + [/^vision-model$/, LIMITS['256k']], + // Open-source Qwen3 variants: 256K native + [/^qwen3-coder-/, LIMITS['256k']], + // 
Studio commercial Qwen-Plus / Qwen-Flash + [/^qwen-plus-latest$/, LIMITS['1m']], [/^qwen-flash-latest$/, LIMITS['1m']], - [/^qwen-turbo.*$/, LIMITS['128k']], - - // Qwen Vision Models - [/^qwen3-vl-plus$/, LIMITS['256k']], // Qwen3-VL-Plus: 256K input - [/^qwen-vl-max.*$/, LIMITS['128k']], - - // Generic vision-model: same as qwen-vl-max (128K token context) - [/^vision-model$/, LIMITS['128k']], - - // ------------------- - // ByteDance Seed-OSS (512K) - // ------------------- - [/^seed-oss.*$/, LIMITS['512k']], - - // ------------------- - // Zhipu GLM - // ------------------- - [/^glm-4\.5v(?:-.*)?$/, LIMITS['64k']], - [/^glm-4\.5-air(?:-.*)?$/, LIMITS['128k']], - [/^glm-4\.5(?:-.*)?$/, LIMITS['128k']], - [/^glm-4\.6(?:-.*)?$/, 202_752 as unknown as TokenCount], // exact limit from the model config file - [/^glm-4\.7(?:-.*)?$/, LIMITS['200k']], + // Qwen fallback (VL, turbo, plus, 2.5, etc.): 256K + [/^qwen/, LIMITS['256k']], // ------------------- // DeepSeek // ------------------- - [/^deepseek(?:-.*)?$/, LIMITS['128k']], + [/^deepseek/, LIMITS['128k']], // ------------------- - // Moonshot / Kimi + // Zhipu GLM // ------------------- - [/^kimi-2\.5.*$/, LIMITS['256k']], // Kimi-2.5: 256K context - [/^kimi-k2.*$/, LIMITS['256k']], // Kimi-k2 variants: 256K context - - // ------------------- - // GPT-OSS / Llama & Mistral examples - // ------------------- - [/^gpt-oss.*$/, LIMITS['128k']], - [/^llama-4-scout.*$/, LIMITS['10m']], - [/^mistral-large-2.*$/, LIMITS['128k']], + [/^glm-5/, 202_752 as TokenCount], // GLM-5: exact vendor limit + [/^glm-/, 202_752 as TokenCount], // GLM fallback: 202,752 (vendor-declared exact limit) // ------------------- // MiniMax // ------------------- - [/^minimax-m2\.1.*$/i, LIMITS['200k']], // MiniMax-M2.1: 200K context + [/^minimax-m2\.5/i, LIMITS['1m']], // MiniMax-M2.5: 1,000,000 + [/^minimax-/i, LIMITS['200k']], // MiniMax fallback: 200K + + // ------------------- + // Moonshot / Kimi + // ------------------- + [/^kimi-/, LIMITS['256k']], // Kimi fallback: 
256K + + // ------------------- + // Other + // ------------------- + [/^seed-oss/, LIMITS['512k']], ]; /** @@ -196,35 +150,40 @@ const PATTERNS: Array<[RegExp, TokenCount]> = [ * in a single response for specific models. */ const OUTPUT_PATTERNS: Array<[RegExp, TokenCount]> = [ - // ------------------- - // Alibaba / Qwen - DashScope Models - // ------------------- - // Qwen3-Coder-Plus: 65,536 max output tokens - [/^qwen3-coder-plus(-.*)?$/, LIMITS['64k']], + // Google Gemini + [/^gemini-3/, LIMITS['64k']], // Gemini 3.x: 64K + [/^gemini-/, LIMITS['8k']], // Gemini fallback: 8K - // Qwen3.5-Plus: 65,536 max output tokens - [/^qwen3\.5-plus(-.*)?$/, LIMITS['64k']], + // OpenAI + [/^gpt-5/, LIMITS['128k']], // GPT-5.x: 128K + [/^gpt-/, LIMITS['16k']], // GPT fallback: 16K + [/^o\d/, LIMITS['128k']], // o-series: 128K - // Generic coder-model: same as qwen3.5-plus (64K max output tokens) + // Anthropic Claude + [/^claude-opus-4-6/, LIMITS['128k']], // Opus 4.6: 128K + [/^claude-sonnet-4-6/, LIMITS['64k']], // Sonnet 4.6: 64K + [/^claude-/, LIMITS['64k']], // Claude fallback: 64K + + // Alibaba / Qwen + [/^qwen3\.5/, LIMITS['64k']], [/^coder-model$/, LIMITS['64k']], - - // Qwen3-Max: 65,536 max output tokens - [/^qwen3-max(-preview)?(-.*)?$/, LIMITS['64k']], - - // Qwen-VL-Max-Latest: 8,192 max output tokens - [/^qwen-vl-max-latest$/, LIMITS['8k']], - - // Generic vision-model: same as qwen-vl-max-latest (8K max output tokens) - [/^vision-model$/, LIMITS['8k']], - - // Qwen3-VL-Plus: 32K max output tokens [/^qwen3-vl-plus$/, LIMITS['32k']], + [/^vision-model$/, LIMITS['32k']], + [/^qwen3-/, LIMITS['64k']], - // Deepseek-chat: 8k max tokens - [/^deepseek-chat$/, LIMITS['8k']], + // DeepSeek + [/^deepseek-reasoner/, LIMITS['64k']], + [/^deepseek-chat/, LIMITS['8k']], - // Deepseek-reasoner: 64k max tokens - [/^deepseek-reasoner$/, LIMITS['64k']], + // Zhipu GLM + [/^glm-5/, LIMITS['16k']], + [/^glm-4\.7/, LIMITS['16k']], + + // MiniMax + [/^minimax-m2\.5/i, 
LIMITS['64k']], + + // Kimi + [/^kimi-k2\.5/, LIMITS['32k']], ]; /** diff --git a/packages/core/src/models/constants.ts b/packages/core/src/models/constants.ts index 9e5d15009..4ed57ae42 100644 --- a/packages/core/src/models/constants.ts +++ b/packages/core/src/models/constants.ts @@ -28,6 +28,7 @@ export const MODEL_GENERATION_CONFIG_FIELDS = [ 'contextWindowSize', 'customHeaders', 'extra_body', + 'modalities', ] as const satisfies ReadonlyArray; /** @@ -107,7 +108,7 @@ export const QWEN_OAUTH_MODELS: ModelConfig[] = [ name: 'coder-model', description: 'Qwen 3.5 Plus — efficient hybrid model with leading coding performance', - capabilities: { vision: false }, + capabilities: { vision: true }, }, { id: 'vision-model', diff --git a/packages/core/src/models/modelRegistry.ts b/packages/core/src/models/modelRegistry.ts index 7b9bdad77..c2815fb32 100644 --- a/packages/core/src/models/modelRegistry.ts +++ b/packages/core/src/models/modelRegistry.ts @@ -5,6 +5,8 @@ */ import { AuthType } from '../core/contentGenerator.js'; +import { defaultModalities } from '../core/modalityDefaults.js'; +import { tokenLimit } from '../core/tokenLimits.js'; import { DEFAULT_OPENAI_BASE_URL } from '../core/openaiContentGenerator/constants.js'; import { type ModelConfig, @@ -121,7 +123,12 @@ export class ModelRegistry { capabilities: model.capabilities, authType: model.authType, isVision: model.capabilities?.vision ?? false, - contextWindowSize: model.generationConfig.contextWindowSize, + contextWindowSize: + model.generationConfig.contextWindowSize ?? tokenLimit(model.id), + modalities: + model.generationConfig.modalities ?? 
defaultModalities(model.id), + baseUrl: model.baseUrl, + envKey: model.envKey, })); } diff --git a/packages/core/src/models/modelsConfig.ts b/packages/core/src/models/modelsConfig.ts index 9311c9279..3b53c868c 100644 --- a/packages/core/src/models/modelsConfig.ts +++ b/packages/core/src/models/modelsConfig.ts @@ -11,6 +11,7 @@ import type { ContentGeneratorConfig } from '../core/contentGenerator.js'; import type { ContentGeneratorConfigSources } from '../core/contentGenerator.js'; import { DEFAULT_QWEN_MODEL } from '../config/models.js'; import { tokenLimit } from '../core/tokenLimits.js'; +import { defaultModalities } from '../core/modalityDefaults.js'; import { ModelRegistry } from './modelRegistry.js'; import { @@ -769,6 +770,15 @@ export class ModelsConfig { detail: 'auto-detected from model', }; } + + // modalities fallback: auto-detect from model when not set by provider + if (gc.modalities === undefined) { + this._generationConfig.modalities = defaultModalities(model.id); + this.generationConfigSources['modalities'] = { + kind: 'computed', + detail: 'auto-detected from model', + }; + } } /** diff --git a/packages/core/src/models/types.ts b/packages/core/src/models/types.ts index 69c286729..5c9c9b51d 100644 --- a/packages/core/src/models/types.ts +++ b/packages/core/src/models/types.ts @@ -7,6 +7,7 @@ import type { AuthType, ContentGeneratorConfig, + InputModalities, } from '../core/contentGenerator.js'; import type { ConfigSources } from '../utils/configResolver.js'; @@ -35,6 +36,7 @@ export type ModelGenerationConfig = Pick< | 'customHeaders' | 'extra_body' | 'contextWindowSize' + | 'modalities' >; /** @@ -93,6 +95,9 @@ export interface AvailableModel { authType: AuthType; isVision?: boolean; contextWindowSize?: number; + modalities?: InputModalities; + baseUrl?: string; + envKey?: string; /** Whether this is a runtime model (not from modelProviders) */ isRuntimeModel?: boolean;