fix(core): use per-model settings for fast model side queries (#3815)

Fixes #3765. Side queries (session recap, title generation, tool-use summary) running on the fast model previously inherited the main model's ContentGeneratorConfig, leaking extra_body / samplingParams / reasoning between models.

GeminiClient.generateContent() now resolves the target model's own config via buildAgentContentGeneratorConfig() when the requested model differs from the main model, and creates a dedicated ContentGenerator (cached, cleared on resetChat). Cross-authType resolution lets the fast model live under a different provider than the main model. Uses the target model's authType for retry logic so provider-specific checks (e.g. QWEN_OAUTH quota detection) reference the correct provider. Falls back to the main generator if the model is not in the registry.
This commit is contained in:
John London 2026-05-05 13:07:32 -05:00 committed by GitHub
parent 5d0adbb5bd
commit c7facf5013
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 544 additions and 2 deletions

View file

@ -19,18 +19,27 @@ import { GeminiClient, SendMessageType } from './client.js';
import { findCompressSplitPoint } from '../services/chatCompressionService.js';
import {
AuthType,
createContentGenerator,
type ContentGenerator,
type ContentGeneratorConfig,
} from './contentGenerator.js';
import { buildAgentContentGeneratorConfig } from '../models/content-generator-config.js';
import { type GeminiChat } from './geminiChat.js';
import type { Config } from '../config/config.js';
import { ApprovalMode } from '../config/config.js';
import type { ModelsConfig } from '../models/modelsConfig.js';
import { retryWithBackoff } from '../utils/retry.js';
import {
CompressionStatus,
GeminiEventType,
Turn,
type ChatCompressionInfo,
} from './turn.js';
vi.mock('../utils/retry.js', () => ({
retryWithBackoff: vi.fn(async (fn) => await fn()),
isUnattendedMode: vi.fn(() => false),
}));
import { getCoreSystemPrompt, getCustomSystemPrompt } from './prompts.js';
import { DEFAULT_QWEN_FLASH_MODEL } from '../config/models.js';
import { FileDiscoveryService } from '../services/fileDiscoveryService.js';
@ -90,6 +99,25 @@ vi.mock('./turn', async (importOriginal) => {
vi.mock('../config/config.js');
vi.mock('./prompts');
vi.mock('../models/content-generator-config.js', async (importOriginal) => {
const actual =
await importOriginal<
typeof import('../models/content-generator-config.js')
>();
return {
...actual,
buildAgentContentGeneratorConfig: vi
.fn()
.mockImplementation(actual.buildAgentContentGeneratorConfig),
};
});
vi.mock('./contentGenerator.js', async (importOriginal) => {
const actual = await importOriginal<typeof import('./contentGenerator.js')>();
return {
...actual,
createContentGenerator: vi.fn(),
};
});
vi.mock('../utils/getFolderStructure', () => ({
getFolderStructure: vi.fn().mockResolvedValue('Mock Folder Structure'),
}));
@ -151,6 +179,7 @@ vi.mock('../telemetry/uiTelemetry.js', () => ({
vi.mock('../telemetry/loggers.js', () => ({
logChatCompression: vi.fn(),
logNextSpeakerCheck: vi.fn(),
logApiRequest: vi.fn(),
}));
// Mock RequestTokenizer to use simple character-based estimation
@ -277,6 +306,12 @@ describe('Gemini Client (client.ts)', () => {
vi.resetAllMocks();
vi.mocked(uiTelemetryService.setLastPromptTokenCount).mockClear();
// Default: createContentGenerator rejects (simulates test env without auth).
// Individual tests can override with mockResolvedValue for success path.
vi.mocked(createContentGenerator).mockRejectedValue(
new Error('no auth in test env'),
);
mockMemoryManager = {
scheduleExtract: vi.fn().mockResolvedValue({
touchedTopics: [],
@ -389,6 +424,9 @@ describe('Gemini Client (client.ts)', () => {
getArenaAgentClient: vi.fn().mockReturnValue(null),
getManagedAutoMemoryEnabled: vi.fn().mockReturnValue(true),
getMemoryManager: vi.fn().mockReturnValue(mockMemoryManager),
getModelsConfig: vi.fn().mockReturnValue({
getResolvedModel: vi.fn().mockReturnValue(undefined),
}),
getDisableAllHooks: vi.fn().mockReturnValue(true),
getArenaManager: vi.fn().mockReturnValue(null),
getMessageBus: vi.fn().mockReturnValue(undefined),
@ -3150,4 +3188,368 @@ Other open files:
// Note: there is currently no "fallback mode" model routing; the model used
// is always the one explicitly requested by the caller.
});
describe('generateContent with fast model', () => {
it('should resolve per-model config and fall back when createContentGenerator fails', async () => {
const contents = [{ role: 'user', parts: [{ text: 'hello' }] }];
const abortSignal = new AbortController().signal;
// Set up a resolved model for the fast model, but createContentGenerator
// will fail in the test env (no auth), so it falls back to the main
// content generator. Verify the resolution was attempted.
const mockResolvedModel = {
id: 'fast-model',
authType: 'openai' as const,
name: 'Fast Model',
baseUrl: 'https://fast-api.example.com',
generationConfig: {
extra_body: { enable_thinking: false },
samplingParams: { temperature: 0.1 },
},
capabilities: {},
};
const getResolvedModel = vi.fn().mockReturnValue(mockResolvedModel);
vi.mocked(mockConfig.getModelsConfig).mockReturnValue({
getResolvedModel,
} as unknown as ModelsConfig);
await client.generateContent(
contents,
{ temperature: 0.5 },
abortSignal,
'fast-model',
);
// Verify that getResolvedModel was called with the fast model ID
expect(getResolvedModel).toHaveBeenCalledWith(
expect.any(String),
'fast-model',
);
// The main content generator is used as fallback (since creating a new
// one fails in test env without auth). In production, a dedicated
// content generator with the fast model's settings would be created.
expect(mockContentGenerator.generateContent).toHaveBeenCalledWith(
expect.objectContaining({
model: 'fast-model',
}),
expect.any(String),
);
});
it('should use a dedicated content generator for the fast model on success', async () => {
const contents = [{ role: 'user', parts: [{ text: 'hello' }] }];
const abortSignal = new AbortController().signal;
// Create a mock dedicated content generator
const mockFastContentGenerator = {
generateContent: vi.fn().mockResolvedValue({
text: 'fast response',
}),
} as unknown as ContentGenerator;
// Set up a resolved model for the fast model
const mockResolvedModel = {
id: 'fast-model',
authType: 'openai' as const,
name: 'Fast Model',
baseUrl: 'https://fast-api.example.com',
envKey: 'FAST_API_KEY',
generationConfig: {
extra_body: { enable_thinking: false },
samplingParams: { temperature: 0.1 },
},
capabilities: {},
};
const getResolvedModel = vi.fn().mockReturnValue(mockResolvedModel);
vi.mocked(mockConfig.getModelsConfig).mockReturnValue({
getResolvedModel,
} as unknown as ModelsConfig);
// Override createContentGenerator to return our test double (success path)
vi.mocked(createContentGenerator).mockResolvedValue(
mockFastContentGenerator,
);
await client.generateContent(
contents,
{ temperature: 0.5 },
abortSignal,
'fast-model',
);
// Verify buildAgentContentGeneratorConfig was called with correct args
expect(buildAgentContentGeneratorConfig).toHaveBeenCalledWith(
mockConfig,
'fast-model',
expect.objectContaining({
baseUrl: 'https://fast-api.example.com',
}),
);
// The dedicated fast content generator should be used
expect(mockFastContentGenerator.generateContent).toHaveBeenCalledWith(
expect.objectContaining({
model: 'fast-model',
}),
expect.any(String),
);
// The original main content generator should NOT have been called
expect(mockContentGenerator.generateContent).not.toHaveBeenCalled();
});
it('should use the main content generator when the requested model matches the main model', async () => {
const contents = [{ role: 'user', parts: [{ text: 'hello' }] }];
const abortSignal = new AbortController().signal;
const getResolvedModel = vi.fn();
vi.mocked(mockConfig.getModelsConfig).mockReturnValue({
getResolvedModel,
} as unknown as ModelsConfig);
await client.generateContent(
contents,
{},
abortSignal,
'test-model', // same as getModel() return value
);
// getResolvedModel should NOT be called when model matches main
expect(getResolvedModel).not.toHaveBeenCalled();
// The main content generator should be used directly
expect(mockContentGenerator.generateContent).toHaveBeenCalledWith(
expect.objectContaining({
model: 'test-model',
}),
expect.any(String),
);
});
it('should fall back to main generator when model is not in registry', async () => {
const contents = [{ role: 'user', parts: [{ text: 'hello' }] }];
const abortSignal = new AbortController().signal;
// getResolvedModel returns undefined — model not found in registry
const getResolvedModel = vi.fn().mockReturnValue(undefined);
vi.mocked(mockConfig.getModelsConfig).mockReturnValue({
getResolvedModel,
} as unknown as ModelsConfig);
// Should not throw — falls back to main generator
await expect(
client.generateContent(
contents,
{ temperature: 0.5 },
abortSignal,
'unknown-model',
),
).resolves.toBeDefined();
// getResolvedModel was called to look up the model
expect(getResolvedModel).toHaveBeenCalledWith(
expect.any(String),
'unknown-model',
);
// The main content generator is used as fallback
expect(mockContentGenerator.generateContent).toHaveBeenCalledWith(
expect.objectContaining({
model: 'unknown-model',
}),
expect.any(String),
);
// buildAgentContentGeneratorConfig must NOT be called when the model is
// not in the registry — the fallback path skips config construction.
expect(buildAgentContentGeneratorConfig).not.toHaveBeenCalled();
});
it('should use fast model authType for retry, not main model authType', async () => {
const contents = [{ role: 'user', parts: [{ text: 'hello' }] }];
const abortSignal = new AbortController().signal;
const mockResolvedModel = {
id: 'fast-model',
authType: 'openai' as const,
name: 'Fast Model',
baseUrl: 'https://fast-api.example.com',
generationConfig: {},
capabilities: {},
};
const getResolvedModel = vi.fn().mockReturnValue(mockResolvedModel);
vi.mocked(mockConfig.getModelsConfig).mockReturnValue({
getResolvedModel,
} as unknown as ModelsConfig);
// Main config uses a different authType
vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
authType: AuthType.QWEN_OAUTH,
apiKey: 'test-key',
apiModel: 'test-model',
} as unknown as ContentGeneratorConfig);
// Success path for createContentGenerator
vi.mocked(createContentGenerator).mockResolvedValue(mockContentGenerator);
await client.generateContent(
contents,
{ temperature: 0.5 },
abortSignal,
'fast-model',
);
// VERIFY: retryWithBackoff was called with the fast model's authType ('openai'),
// not the main model's authType ('QWEN_OAUTH').
expect(retryWithBackoff).toHaveBeenCalledWith(
expect.any(Function),
expect.objectContaining({
authType: 'openai',
}),
);
});
it('should cache per-model content generators', async () => {
const contents = [{ role: 'user', parts: [{ text: 'hello' }] }];
const abortController = new AbortController();
const mockResolvedModel = {
id: 'fast-model',
authType: 'openai' as const,
name: 'Fast Model',
baseUrl: 'https://fast-api.example.com',
generationConfig: {},
capabilities: {},
};
vi.mocked(mockConfig.getModelsConfig).mockReturnValue({
getResolvedModel: vi.fn().mockReturnValue(mockResolvedModel),
} as unknown as ModelsConfig);
vi.mocked(createContentGenerator).mockResolvedValue(mockContentGenerator);
// First call
await client.generateContent(
contents,
{},
abortController.signal,
'fast-model',
);
expect(createContentGenerator).toHaveBeenCalledTimes(1);
// Second call - should use cache
await client.generateContent(
contents,
{},
abortController.signal,
'fast-model',
);
expect(createContentGenerator).toHaveBeenCalledTimes(1);
});
it('should resolve model across authTypes when main authType misses', async () => {
const contents = [{ role: 'user', parts: [{ text: 'hello' }] }];
const abortSignal = new AbortController().signal;
const mockResolvedModel = {
id: 'fast-model',
authType: 'openai' as const,
name: 'Fast Model',
baseUrl: 'https://fast-api.example.com',
generationConfig: {},
capabilities: {},
envKey: undefined,
};
// resolveModelAcrossAuthTypes calls getResolvedModel multiple times:
// 1. main authType (QWEN_OAUTH) → undefined (miss)
// 2. secondary authType (USE_OPENAI) → mockResolvedModel (hit)
// 3. buildAgentContentGeneratorConfig calls getResolvedModel again
// with the resolved authType → mockResolvedModel (hit)
const getResolvedModel = vi
.fn()
.mockReturnValueOnce(undefined)
.mockReturnValue(mockResolvedModel);
vi.mocked(mockConfig.getModelsConfig).mockReturnValue({
getResolvedModel,
} as unknown as ModelsConfig);
// Main config uses QWEN_OAUTH — fast model registered under USE_OPENAI
vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
authType: AuthType.QWEN_OAUTH,
apiKey: 'test-key',
apiModel: 'test-model',
} as unknown as ContentGeneratorConfig);
// Mock createContentGenerator to succeed so the cross-authType
// resolution path completes without falling back
vi.mocked(createContentGenerator).mockResolvedValue(mockContentGenerator);
await client.generateContent(
contents,
{ temperature: 0.5 },
abortSignal,
'fast-model',
);
// First call uses main authType (QWEN_OAUTH) — misses
expect(getResolvedModel).toHaveBeenNthCalledWith(
1,
AuthType.QWEN_OAUTH,
'fast-model',
);
// Second call falls through to secondary authType — hits
expect(getResolvedModel).toHaveBeenNthCalledWith(
2,
AuthType.USE_OPENAI,
'fast-model',
);
// Generator was created using the resolved model's config
expect(createContentGenerator).toHaveBeenCalled();
});
it('should clear per-model generator cache on resetChat', async () => {
const contents = [{ role: 'user', parts: [{ text: 'hello' }] }];
const abortController = new AbortController();
const mockResolvedModel = {
id: 'fast-model',
authType: 'openai' as const,
name: 'Fast Model',
baseUrl: 'https://fast-api.example.com',
generationConfig: {},
capabilities: {},
};
vi.mocked(mockConfig.getModelsConfig).mockReturnValue({
getResolvedModel: vi.fn().mockReturnValue(mockResolvedModel),
} as unknown as ModelsConfig);
vi.mocked(createContentGenerator).mockResolvedValue(mockContentGenerator);
// First call — populates cache
await client.generateContent(
contents,
{},
abortController.signal,
'fast-model',
);
expect(createContentGenerator).toHaveBeenCalledTimes(1);
// Reset chat should clear the cache
await client.resetChat();
// Second call after reset — cache should be cleared, generator recreated
await client.generateContent(
contents,
{},
abortController.signal,
'fast-model',
);
expect(createContentGenerator).toHaveBeenCalledTimes(2);
});
});
});

View file

@ -22,6 +22,8 @@ const debugLogger = createDebugLogger('CLIENT');
// Core modules
import type { ContentGenerator } from './contentGenerator.js';
import type { ResolvedModelConfig } from '../models/types.js';
import { AuthType, createContentGenerator } from './contentGenerator.js';
import { GeminiChat } from './geminiChat.js';
import {
getArenaSystemReminder,
@ -46,6 +48,9 @@ import {
} from '../services/chatCompressionService.js';
import { LoopDetectionService } from '../services/loopDetectionService.js';
// Models
import { buildAgentContentGeneratorConfig } from '../models/content-generator-config.js';
// Tools
import type { RelevantAutoMemoryPromptResult } from '../memory/manager.js';
import { ToolNames } from '../tools/tool-names.js';
@ -135,6 +140,14 @@ export class GeminiClient {
private lastSentIdeContext: IdeContext | undefined;
private forceFullIdeContext = true;
/**
* Cache of per-model ContentGenerators keyed by model ID.
* Avoids rebuilding the generator (SDK instantiation, config resolution)
* on every side query (recap, title, tool summary).
* Cleared on session reset (resetChat) to pick up config changes.
*/
private perModelGeneratorCache = new Map<string, Promise<ContentGenerator>>();
/**
* At any point in this conversation, was compression triggered without
* being forced and did it fail?
@ -271,6 +284,7 @@ export class GeminiClient {
// pointing at content the model can no longer retrieve.
debugLogger.debug('[FILE_READ_CACHE] clear after resetChat');
this.config.getFileReadCache().clear();
this.perModelGeneratorCache.clear();
await this.startChat();
}
@ -1136,10 +1150,31 @@ export class GeminiClient {
systemInstruction: finalSystemInstruction,
};
// When the requested model differs from the main model (e.g. fast model
// side queries for session recap / title / summary), resolve the target
// model's own ContentGeneratorConfig so that per-model settings like
// extra_body, samplingParams, and reasoning are not inherited from the
// main model's config.
const mainModel = this.config.getModel() ?? model;
const isPerModel = model !== mainModel;
// Resolve the authType for retry logic. When using a per-model content
// generator (e.g. fast model side queries), the retry authType must match
// the target model's provider, not the main session's provider. This
// ensures QWEN_OAUTH quota detection checks against the right provider.
const retryAuthType = isPerModel
? (this.createRetryAuthTypeForModel(model) ??
this.config.getContentGeneratorConfig()?.authType ??
AuthType.USE_OPENAI)
: this.config.getContentGeneratorConfig()?.authType;
const contentGenerator = isPerModel
? await this.createContentGeneratorForModel(model)
: this.getContentGeneratorOrFail();
const apiCall = () => {
currentAttemptModel = model;
return this.getContentGeneratorOrFail().generateContent(
return contentGenerator.generateContent(
{
model,
config: requestConfig,
@ -1149,7 +1184,7 @@ export class GeminiClient {
);
};
const result = await retryWithBackoff(apiCall, {
authType: this.config.getContentGeneratorConfig()?.authType,
authType: retryAuthType,
persistentMode: isUnattendedMode(),
signal: abortSignal,
heartbeatFn: (info) => {
@ -1179,6 +1214,111 @@ export class GeminiClient {
}
}
/**
* Resolve a model across all authTypes. Handles the case where the target
* model is registered under a different authType than the main model
* (e.g. main=QWEN_OAUTH, fast=USE_ANTHROPIC).
*
* TODO: Move cross-authType resolution to ModelRegistry for a cleaner
* data-layer solution. Follow-up PR.
*/
private resolveModelAcrossAuthTypes(
model: string,
): ResolvedModelConfig | undefined {
const modelsConfig = this.config.getModelsConfig();
const allAuthTypes: AuthType[] = [
AuthType.QWEN_OAUTH,
AuthType.USE_OPENAI,
AuthType.USE_VERTEX_AI,
AuthType.USE_ANTHROPIC,
AuthType.USE_GEMINI,
];
// Try the main authType first for early exit
const mainAuthType = this.config.getContentGeneratorConfig()?.authType;
if (mainAuthType) {
const resolved = modelsConfig.getResolvedModel(mainAuthType, model);
if (resolved) return resolved;
}
for (const authType of allAuthTypes) {
if (authType === mainAuthType) continue;
const resolved = modelsConfig.getResolvedModel(authType, model);
if (resolved) return resolved;
}
return undefined;
}
/**
* Resolve the authType for a given model without creating a full generator.
* Used by retry logic to ensure provider-specific checks (e.g. QWEN_OAUTH
* quota detection) reference the correct provider.
*/
private createRetryAuthTypeForModel(model: string): string | undefined {
return this.resolveModelAcrossAuthTypes(model)?.authType;
}
/**
* Return a ContentGenerator for a specific model (e.g. the fast model) with
* its own per-model settings from modelProviders. This prevents the main
* model's extra_body / samplingParams / reasoning from leaking into side
* queries that target a different model.
*
* Falls back to the main content generator when the target model is not in
* the registry or when creating a dedicated generator fails (e.g. in test
* environments without full auth setup).
*
* Results are cached by model ID to avoid rebuilding the generator
* (SDK instantiation, config resolution) on every side query.
*/
private async createContentGeneratorForModel(
model: string,
): Promise<ContentGenerator> {
// Check cache first (Promise coalescing to prevent redundant SDK instantiations)
const cached = this.perModelGeneratorCache.get(model);
if (cached) return cached;
const generatorPromise = (async () => {
try {
const resolvedModel = this.resolveModelAcrossAuthTypes(model);
if (!resolvedModel) {
debugLogger.warn(
`Model "${model}" not found in registry across all authTypes, falling back to main generator.`,
);
return this.getContentGeneratorOrFail();
}
const targetConfig = buildAgentContentGeneratorConfig(
this.config,
model,
{
authType: resolvedModel.authType,
apiKey: resolvedModel.envKey
? (process.env[resolvedModel.envKey] ?? undefined)
: undefined,
baseUrl: resolvedModel.baseUrl,
},
);
return await createContentGenerator(targetConfig, this.config);
} catch (err: unknown) {
debugLogger.warn(
`Failed to create content generator for model "${model}", falling back to main generator.`,
err instanceof Error ? err.message : String(err),
);
// On failure, delete from cache so subsequent attempts can retry.
this.perModelGeneratorCache.delete(model);
return this.getContentGeneratorOrFail();
}
})();
this.perModelGeneratorCache.set(model, generatorPromise);
return generatorPromise;
}
async tryCompressChat(
prompt_id: string,
force: boolean = false,