From c7facf5013ee2c7f47604a1487fd25dd5af47a61 Mon Sep 17 00:00:00 2001 From: John London Date: Tue, 5 May 2026 13:07:32 -0500 Subject: [PATCH] fix(core): use per-model settings for fast model side queries (#3815) Fixes #3765. Side queries (session recap, title generation, tool-use summary) running on the fast model previously inherited the main model's ContentGeneratorConfig, leaking extra_body / samplingParams / reasoning between models. GeminiClient.generateContent() now resolves the target model's own config via buildAgentContentGeneratorConfig() when the requested model differs from the main model, and creates a dedicated ContentGenerator (cached, cleared on resetChat). Cross-authType resolution lets the fast model live under a different provider than the main model. Uses the target model's authType for retry logic so provider-specific checks (e.g. QWEN_OAUTH quota detection) reference the correct provider. Falls back to the main generator if the model is not in the registry. --- packages/core/src/core/client.test.ts | 402 ++++++++++++++++++++++++++ packages/core/src/core/client.ts | 144 ++++++++- 2 files changed, 544 insertions(+), 2 deletions(-) diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts index 42187a1fc..ca586d0a5 100644 --- a/packages/core/src/core/client.test.ts +++ b/packages/core/src/core/client.test.ts @@ -19,18 +19,27 @@ import { GeminiClient, SendMessageType } from './client.js'; import { findCompressSplitPoint } from '../services/chatCompressionService.js'; import { AuthType, + createContentGenerator, type ContentGenerator, type ContentGeneratorConfig, } from './contentGenerator.js'; +import { buildAgentContentGeneratorConfig } from '../models/content-generator-config.js'; import { type GeminiChat } from './geminiChat.js'; import type { Config } from '../config/config.js'; import { ApprovalMode } from '../config/config.js'; +import type { ModelsConfig } from '../models/modelsConfig.js'; +import { retryWithBackoff } from '../utils/retry.js'; import { CompressionStatus, GeminiEventType, Turn, type ChatCompressionInfo, } from './turn.js'; + +vi.mock('../utils/retry.js', () => ({ + retryWithBackoff: vi.fn(async (fn) => await fn()), + isUnattendedMode: vi.fn(() => false), +})); import { getCoreSystemPrompt, getCustomSystemPrompt } from './prompts.js'; import { DEFAULT_QWEN_FLASH_MODEL } from '../config/models.js'; import { FileDiscoveryService } from '../services/fileDiscoveryService.js'; @@ -90,6 +99,25 @@ vi.mock('./turn', async (importOriginal) => { vi.mock('../config/config.js'); vi.mock('./prompts'); +vi.mock('../models/content-generator-config.js', async (importOriginal) => { + const actual = + await importOriginal< + typeof import('../models/content-generator-config.js') + >(); + return { + ...actual, + buildAgentContentGeneratorConfig: vi + .fn() + .mockImplementation(actual.buildAgentContentGeneratorConfig), + }; +}); +vi.mock('./contentGenerator.js', async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + createContentGenerator: vi.fn(), + }; +}); vi.mock('../utils/getFolderStructure', () => ({ getFolderStructure: vi.fn().mockResolvedValue('Mock Folder Structure'), })); @@ -151,6 +179,7 @@ vi.mock('../telemetry/uiTelemetry.js', () => ({ vi.mock('../telemetry/loggers.js', () => ({ logChatCompression: vi.fn(), logNextSpeakerCheck: vi.fn(), + logApiRequest: vi.fn(), })); // Mock RequestTokenizer to use simple character-based estimation @@ -277,6 +306,12 @@ describe('Gemini Client (client.ts)', () => { vi.resetAllMocks(); vi.mocked(uiTelemetryService.setLastPromptTokenCount).mockClear(); + // Default: createContentGenerator rejects (simulates test env without auth). + // Individual tests can override with mockResolvedValue for success path. + vi.mocked(createContentGenerator).mockRejectedValue( + new Error('no auth in test env'), + ); + mockMemoryManager = { scheduleExtract: vi.fn().mockResolvedValue({ touchedTopics: [], @@ -389,6 +424,9 @@ describe('Gemini Client (client.ts)', () => { getArenaAgentClient: vi.fn().mockReturnValue(null), getManagedAutoMemoryEnabled: vi.fn().mockReturnValue(true), getMemoryManager: vi.fn().mockReturnValue(mockMemoryManager), + getModelsConfig: vi.fn().mockReturnValue({ + getResolvedModel: vi.fn().mockReturnValue(undefined), + }), getDisableAllHooks: vi.fn().mockReturnValue(true), getArenaManager: vi.fn().mockReturnValue(null), getMessageBus: vi.fn().mockReturnValue(undefined), @@ -3150,4 +3188,368 @@ Other open files: // Note: there is currently no "fallback mode" model routing; the model used // is always the one explicitly requested by the caller. }); + + describe('generateContent with fast model', () => { + it('should resolve per-model config and fall back when createContentGenerator fails', async () => { + const contents = [{ role: 'user', parts: [{ text: 'hello' }] }]; + const abortSignal = new AbortController().signal; + + // Set up a resolved model for the fast model, but createContentGenerator + // will fail in the test env (no auth), so it falls back to the main + // content generator. Verify the resolution was attempted. + const mockResolvedModel = { + id: 'fast-model', + authType: 'openai' as const, + name: 'Fast Model', + baseUrl: 'https://fast-api.example.com', + generationConfig: { + extra_body: { enable_thinking: false }, + samplingParams: { temperature: 0.1 }, + }, + capabilities: {}, + }; + + const getResolvedModel = vi.fn().mockReturnValue(mockResolvedModel); + vi.mocked(mockConfig.getModelsConfig).mockReturnValue({ + getResolvedModel, + } as unknown as ModelsConfig); + + await client.generateContent( + contents, + { temperature: 0.5 }, + abortSignal, + 'fast-model', + ); + + // Verify that getResolvedModel was called with the fast model ID + expect(getResolvedModel).toHaveBeenCalledWith( + expect.any(String), + 'fast-model', + ); + + // The main content generator is used as fallback (since creating a new + // one fails in test env without auth). In production, a dedicated + // content generator with the fast model's settings would be created. + expect(mockContentGenerator.generateContent).toHaveBeenCalledWith( + expect.objectContaining({ + model: 'fast-model', + }), + expect.any(String), + ); + }); + + it('should use a dedicated content generator for the fast model on success', async () => { + const contents = [{ role: 'user', parts: [{ text: 'hello' }] }]; + const abortSignal = new AbortController().signal; + + // Create a mock dedicated content generator + const mockFastContentGenerator = { + generateContent: vi.fn().mockResolvedValue({ + text: 'fast response', + }), + } as unknown as ContentGenerator; + + // Set up a resolved model for the fast model + const mockResolvedModel = { + id: 'fast-model', + authType: 'openai' as const, + name: 'Fast Model', + baseUrl: 'https://fast-api.example.com', + envKey: 'FAST_API_KEY', + generationConfig: { + extra_body: { enable_thinking: false }, + samplingParams: { temperature: 0.1 }, + }, + capabilities: {}, + }; + + const getResolvedModel = vi.fn().mockReturnValue(mockResolvedModel); + vi.mocked(mockConfig.getModelsConfig).mockReturnValue({ + getResolvedModel, + } as unknown as ModelsConfig); + + // Override createContentGenerator to return our test double (success path) + vi.mocked(createContentGenerator).mockResolvedValue( + mockFastContentGenerator, + ); + + await client.generateContent( + contents, + { temperature: 0.5 }, + abortSignal, + 'fast-model', + ); + + // Verify buildAgentContentGeneratorConfig was called with correct args + expect(buildAgentContentGeneratorConfig).toHaveBeenCalledWith( + mockConfig, + 'fast-model', + expect.objectContaining({ + baseUrl: 'https://fast-api.example.com', + }), + ); + + // The dedicated fast content generator should be used + expect(mockFastContentGenerator.generateContent).toHaveBeenCalledWith( + expect.objectContaining({ + model: 'fast-model', + }), + expect.any(String), + ); + + // The original main content generator should NOT have been called + expect(mockContentGenerator.generateContent).not.toHaveBeenCalled(); + }); + + it('should use the main content generator when the requested model matches the main model', async () => { + const contents = [{ role: 'user', parts: [{ text: 'hello' }] }]; + const abortSignal = new AbortController().signal; + + const getResolvedModel = vi.fn(); + vi.mocked(mockConfig.getModelsConfig).mockReturnValue({ + getResolvedModel, + } as unknown as ModelsConfig); + + await client.generateContent( + contents, + {}, + abortSignal, + 'test-model', // same as getModel() return value + ); + + // getResolvedModel should NOT be called when model matches main + expect(getResolvedModel).not.toHaveBeenCalled(); + + // The main content generator should be used directly + expect(mockContentGenerator.generateContent).toHaveBeenCalledWith( + expect.objectContaining({ + model: 'test-model', + }), + expect.any(String), + ); + }); + + it('should fall back to main generator when model is not in registry', async () => { + const contents = [{ role: 'user', parts: [{ text: 'hello' }] }]; + const abortSignal = new AbortController().signal; + + // getResolvedModel returns undefined — model not found in registry + const getResolvedModel = vi.fn().mockReturnValue(undefined); + vi.mocked(mockConfig.getModelsConfig).mockReturnValue({ + getResolvedModel, + } as unknown as ModelsConfig); + + // Should not throw — falls back to main generator + await expect( + client.generateContent( + contents, + { temperature: 0.5 }, + abortSignal, + 'unknown-model', + ), + ).resolves.toBeDefined(); + + // getResolvedModel was called to look up the model + expect(getResolvedModel).toHaveBeenCalledWith( + expect.any(String), + 'unknown-model', + ); + + // The main content generator is used as fallback + expect(mockContentGenerator.generateContent).toHaveBeenCalledWith( + expect.objectContaining({ + model: 'unknown-model', + }), + expect.any(String), + ); + + // buildAgentContentGeneratorConfig must NOT be called when the model is + // not in the registry — the fallback path skips config construction. + expect(buildAgentContentGeneratorConfig).not.toHaveBeenCalled(); + }); + + it('should use fast model authType for retry, not main model authType', async () => { + const contents = [{ role: 'user', parts: [{ text: 'hello' }] }]; + const abortSignal = new AbortController().signal; + + const mockResolvedModel = { + id: 'fast-model', + authType: 'openai' as const, + name: 'Fast Model', + baseUrl: 'https://fast-api.example.com', + generationConfig: {}, + capabilities: {}, + }; + + const getResolvedModel = vi.fn().mockReturnValue(mockResolvedModel); + vi.mocked(mockConfig.getModelsConfig).mockReturnValue({ + getResolvedModel, + } as unknown as ModelsConfig); + + // Main config uses a different authType + vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({ + authType: AuthType.QWEN_OAUTH, + apiKey: 'test-key', + apiModel: 'test-model', + } as unknown as ContentGeneratorConfig); + + // Success path for createContentGenerator + vi.mocked(createContentGenerator).mockResolvedValue(mockContentGenerator); + + await client.generateContent( + contents, + { temperature: 0.5 }, + abortSignal, + 'fast-model', + ); + + // VERIFY: retryWithBackoff was called with the fast model's authType ('openai'), + // not the main model's authType ('QWEN_OAUTH'). + expect(retryWithBackoff).toHaveBeenCalledWith( + expect.any(Function), + expect.objectContaining({ + authType: 'openai', + }), + ); + }); + + it('should cache per-model content generators', async () => { + const contents = [{ role: 'user', parts: [{ text: 'hello' }] }]; + const abortController = new AbortController(); + const mockResolvedModel = { + id: 'fast-model', + authType: 'openai' as const, + name: 'Fast Model', + baseUrl: 'https://fast-api.example.com', + generationConfig: {}, + capabilities: {}, + }; + + vi.mocked(mockConfig.getModelsConfig).mockReturnValue({ + getResolvedModel: vi.fn().mockReturnValue(mockResolvedModel), + } as unknown as ModelsConfig); + + vi.mocked(createContentGenerator).mockResolvedValue(mockContentGenerator); + + // First call + await client.generateContent( + contents, + {}, + abortController.signal, + 'fast-model', + ); + expect(createContentGenerator).toHaveBeenCalledTimes(1); + + // Second call - should use cache + await client.generateContent( + contents, + {}, + abortController.signal, + 'fast-model', + ); + expect(createContentGenerator).toHaveBeenCalledTimes(1); + }); + + it('should resolve model across authTypes when main authType misses', async () => { + const contents = [{ role: 'user', parts: [{ text: 'hello' }] }]; + const abortSignal = new AbortController().signal; + + const mockResolvedModel = { + id: 'fast-model', + authType: 'openai' as const, + name: 'Fast Model', + baseUrl: 'https://fast-api.example.com', + generationConfig: {}, + capabilities: {}, + envKey: undefined, + }; + + // resolveModelAcrossAuthTypes calls getResolvedModel multiple times: + // 1. main authType (QWEN_OAUTH) → undefined (miss) + // 2. secondary authType (USE_OPENAI) → mockResolvedModel (hit) + // 3. buildAgentContentGeneratorConfig calls getResolvedModel again + // with the resolved authType → mockResolvedModel (hit) + const getResolvedModel = vi + .fn() + .mockReturnValueOnce(undefined) + .mockReturnValue(mockResolvedModel); + + vi.mocked(mockConfig.getModelsConfig).mockReturnValue({ + getResolvedModel, + } as unknown as ModelsConfig); + + // Main config uses QWEN_OAUTH — fast model registered under USE_OPENAI + vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({ + authType: AuthType.QWEN_OAUTH, + apiKey: 'test-key', + apiModel: 'test-model', + } as unknown as ContentGeneratorConfig); + + // Mock createContentGenerator to succeed so the cross-authType + // resolution path completes without falling back + vi.mocked(createContentGenerator).mockResolvedValue(mockContentGenerator); + + await client.generateContent( + contents, + { temperature: 0.5 }, + abortSignal, + 'fast-model', + ); + + // First call uses main authType (QWEN_OAUTH) — misses + expect(getResolvedModel).toHaveBeenNthCalledWith( + 1, + AuthType.QWEN_OAUTH, + 'fast-model', + ); + // Second call falls through to secondary authType — hits + expect(getResolvedModel).toHaveBeenNthCalledWith( + 2, + AuthType.USE_OPENAI, + 'fast-model', + ); + // Generator was created using the resolved model's config + expect(createContentGenerator).toHaveBeenCalled(); + }); + + it('should clear per-model generator cache on resetChat', async () => { + const contents = [{ role: 'user', parts: [{ text: 'hello' }] }]; + const abortController = new AbortController(); + const mockResolvedModel = { + id: 'fast-model', + authType: 'openai' as const, + name: 'Fast Model', + baseUrl: 'https://fast-api.example.com', + generationConfig: {}, + capabilities: {}, + }; + + vi.mocked(mockConfig.getModelsConfig).mockReturnValue({ + getResolvedModel: vi.fn().mockReturnValue(mockResolvedModel), + } as unknown as ModelsConfig); + + vi.mocked(createContentGenerator).mockResolvedValue(mockContentGenerator); + + // First call — populates cache + await client.generateContent( + contents, + {}, + abortController.signal, + 'fast-model', + ); + expect(createContentGenerator).toHaveBeenCalledTimes(1); + + // Reset chat should clear the cache + await client.resetChat(); + + // Second call after reset — cache should be cleared, generator recreated + await client.generateContent( + contents, + {}, + abortController.signal, + 'fast-model', + ); + expect(createContentGenerator).toHaveBeenCalledTimes(2); + }); + }); }); diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts index a5c079da9..9089e2203 100644 --- a/packages/core/src/core/client.ts +++ b/packages/core/src/core/client.ts @@ -22,6 +22,8 @@ const debugLogger = createDebugLogger('CLIENT'); // Core modules import type { ContentGenerator } from './contentGenerator.js'; +import type { ResolvedModelConfig } from '../models/types.js'; +import { AuthType, createContentGenerator } from './contentGenerator.js'; import { GeminiChat } from './geminiChat.js'; import { getArenaSystemReminder, @@ -46,6 +48,9 @@ import { } from '../services/chatCompressionService.js'; import { LoopDetectionService } from '../services/loopDetectionService.js'; +// Models +import { buildAgentContentGeneratorConfig } from '../models/content-generator-config.js'; + // Tools import type { RelevantAutoMemoryPromptResult } from '../memory/manager.js'; import { ToolNames } from '../tools/tool-names.js'; @@ -135,6 +140,14 @@ export class GeminiClient { private lastSentIdeContext: IdeContext | undefined; private forceFullIdeContext = true; + /** + * Cache of per-model ContentGenerators keyed by model ID. + * Avoids rebuilding the generator (SDK instantiation, config resolution) + * on every side query (recap, title, tool summary). + * Cleared on session reset (resetChat) to pick up config changes. + */ + private perModelGeneratorCache = new Map>(); + /** * At any point in this conversation, was compression triggered without * being forced and did it fail? @@ -271,6 +284,7 @@ export class GeminiClient { // pointing at content the model can no longer retrieve. debugLogger.debug('[FILE_READ_CACHE] clear after resetChat'); this.config.getFileReadCache().clear(); + this.perModelGeneratorCache.clear(); await this.startChat(); } @@ -1136,10 +1150,31 @@ export class GeminiClient { systemInstruction: finalSystemInstruction, }; + // When the requested model differs from the main model (e.g. fast model + // side queries for session recap / title / summary), resolve the target + // model's own ContentGeneratorConfig so that per-model settings like + // extra_body, samplingParams, and reasoning are not inherited from the + // main model's config. + const mainModel = this.config.getModel() ?? model; + const isPerModel = model !== mainModel; + + // Resolve the authType for retry logic. When using a per-model content + // generator (e.g. fast model side queries), the retry authType must match + // the target model's provider, not the main session's provider. This + // ensures QWEN_OAUTH quota detection checks against the right provider. + const retryAuthType = isPerModel + ? (this.createRetryAuthTypeForModel(model) ?? + this.config.getContentGeneratorConfig()?.authType ?? + AuthType.USE_OPENAI) + : this.config.getContentGeneratorConfig()?.authType; + + const contentGenerator = isPerModel + ? await this.createContentGeneratorForModel(model) + : this.getContentGeneratorOrFail(); const apiCall = () => { currentAttemptModel = model; - return this.getContentGeneratorOrFail().generateContent( + return contentGenerator.generateContent( { model, config: requestConfig, @@ -1149,7 +1184,7 @@ export class GeminiClient { ); }; const result = await retryWithBackoff(apiCall, { - authType: this.config.getContentGeneratorConfig()?.authType, + authType: retryAuthType, persistentMode: isUnattendedMode(), signal: abortSignal, heartbeatFn: (info) => { @@ -1179,6 +1214,111 @@ export class GeminiClient { } } + /** + * Resolve a model across all authTypes. Handles the case where the target + * model is registered under a different authType than the main model + * (e.g. main=QWEN_OAUTH, fast=USE_ANTHROPIC). + * + * TODO: Move cross-authType resolution to ModelRegistry for a cleaner + * data-layer solution. Follow-up PR. + */ + + private resolveModelAcrossAuthTypes( + model: string, + ): ResolvedModelConfig | undefined { + const modelsConfig = this.config.getModelsConfig(); + const allAuthTypes: AuthType[] = [ + AuthType.QWEN_OAUTH, + AuthType.USE_OPENAI, + AuthType.USE_VERTEX_AI, + AuthType.USE_ANTHROPIC, + AuthType.USE_GEMINI, + ]; + + // Try the main authType first for early exit + const mainAuthType = this.config.getContentGeneratorConfig()?.authType; + if (mainAuthType) { + const resolved = modelsConfig.getResolvedModel(mainAuthType, model); + if (resolved) return resolved; + } + + for (const authType of allAuthTypes) { + if (authType === mainAuthType) continue; + const resolved = modelsConfig.getResolvedModel(authType, model); + if (resolved) return resolved; + } + + return undefined; + } + + /** + * Resolve the authType for a given model without creating a full generator. + * Used by retry logic to ensure provider-specific checks (e.g. QWEN_OAUTH + * quota detection) reference the correct provider. + */ + private createRetryAuthTypeForModel(model: string): string | undefined { + return this.resolveModelAcrossAuthTypes(model)?.authType; + } + + /** + * Return a ContentGenerator for a specific model (e.g. the fast model) with + * its own per-model settings from modelProviders. This prevents the main + * model's extra_body / samplingParams / reasoning from leaking into side + * queries that target a different model. + * + * Falls back to the main content generator when the target model is not in + * the registry or when creating a dedicated generator fails (e.g. in test + * environments without full auth setup). + * + * Results are cached by model ID to avoid rebuilding the generator + * (SDK instantiation, config resolution) on every side query. + */ + private async createContentGeneratorForModel( + model: string, + ): Promise { + // Check cache first (Promise coalescing to prevent redundant SDK instantiations) + const cached = this.perModelGeneratorCache.get(model); + if (cached) return cached; + + const generatorPromise = (async () => { + try { + const resolvedModel = this.resolveModelAcrossAuthTypes(model); + + if (!resolvedModel) { + debugLogger.warn( + `Model "${model}" not found in registry across all authTypes, falling back to main generator.`, + ); + return this.getContentGeneratorOrFail(); + } + + const targetConfig = buildAgentContentGeneratorConfig( + this.config, + model, + { + authType: resolvedModel.authType, + apiKey: resolvedModel.envKey + ? (process.env[resolvedModel.envKey] ?? undefined) + : undefined, + baseUrl: resolvedModel.baseUrl, + }, + ); + + return await createContentGenerator(targetConfig, this.config); + } catch (err: unknown) { + debugLogger.warn( + `Failed to create content generator for model "${model}", falling back to main generator.`, + err instanceof Error ? err.message : String(err), + ); + // On failure, delete from cache so subsequent attempts can retry. + this.perModelGeneratorCache.delete(model); + return this.getContentGeneratorOrFail(); + } + })(); + + this.perModelGeneratorCache.set(model, generatorPromise); + return generatorPromise; + } + async tryCompressChat( prompt_id: string, force: boolean = false,