From c7facf5013ee2c7f47604a1487fd25dd5af47a61 Mon Sep 17 00:00:00 2001
From: John London <benevolentjoker@gmail.com>
Date: Tue, 5 May 2026 13:07:32 -0500
Subject: [PATCH] fix(core): use per-model settings for fast model side queries
 (#3815)

Fixes #3765. Side queries (session recap, title generation, tool-use summary) running on the fast model previously inherited the main model's ContentGeneratorConfig, leaking extra_body / samplingParams / reasoning between models.

GeminiClient.generateContent() now resolves the target model's own config via buildAgentContentGeneratorConfig() when the requested model differs from the main model, and creates a dedicated ContentGenerator (cached, cleared on resetChat). Cross-authType resolution lets the fast model live under a different provider than the main model. Uses the target model's authType for retry logic so provider-specific checks (e.g. QWEN_OAUTH quota detection) reference the correct provider. Falls back to the main generator if the model is not in the registry.
---
 packages/core/src/core/client.test.ts | 402 ++++++++++++++++++++++++++
 packages/core/src/core/client.ts      | 144 ++++++++-
 2 files changed, 544 insertions(+), 2 deletions(-)
diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts
index 42187a1fc..ca586d0a5 100644
--- a/packages/core/src/core/client.test.ts
+++ b/packages/core/src/core/client.test.ts
@@ -19,18 +19,27 @@ import { GeminiClient, SendMessageType } from './client.js';
 import { findCompressSplitPoint } from '../services/chatCompressionService.js';
 import {
   AuthType,
+  createContentGenerator,
   type ContentGenerator,
   type ContentGeneratorConfig,
 } from './contentGenerator.js';
+import { buildAgentContentGeneratorConfig } from '../models/content-generator-config.js';
 import { type GeminiChat } from './geminiChat.js';
 import type { Config } from '../config/config.js';
 import { ApprovalMode } from '../config/config.js';
+import type { ModelsConfig } from '../models/modelsConfig.js';
+import { retryWithBackoff } from '../utils/retry.js';
 import {
   CompressionStatus,
   GeminiEventType,
   Turn,
   type ChatCompressionInfo,
 } from './turn.js';
+
+vi.mock('../utils/retry.js', () => ({
+  retryWithBackoff: vi.fn(async (fn) => await fn()),
+  isUnattendedMode: vi.fn(() => false),
+}));
 import { getCoreSystemPrompt, getCustomSystemPrompt } from './prompts.js';
 import { DEFAULT_QWEN_FLASH_MODEL } from '../config/models.js';
 import { FileDiscoveryService } from '../services/fileDiscoveryService.js';
@@ -90,6 +99,25 @@ vi.mock('./turn', async (importOriginal) => {
 
 vi.mock('../config/config.js');
 vi.mock('./prompts');
+vi.mock('../models/content-generator-config.js', async (importOriginal) => {
+  const actual =
+    await importOriginal<
+      typeof import('../models/content-generator-config.js')
+    >();
+  return {
+    ...actual,
+    buildAgentContentGeneratorConfig: vi
+      .fn()
+      .mockImplementation(actual.buildAgentContentGeneratorConfig),
+  };
+});
+vi.mock('./contentGenerator.js', async (importOriginal) => {
+  const actual = await importOriginal<typeof import('./contentGenerator.js')>();
+  return {
+    ...actual,
+    createContentGenerator: vi.fn(),
+  };
+});
 vi.mock('../utils/getFolderStructure', () => ({
   getFolderStructure: vi.fn().mockResolvedValue('Mock Folder Structure'),
 }));
@@ -151,6 +179,7 @@ vi.mock('../telemetry/uiTelemetry.js', () => ({
 vi.mock('../telemetry/loggers.js', () => ({
   logChatCompression: vi.fn(),
   logNextSpeakerCheck: vi.fn(),
+  logApiRequest: vi.fn(),
 }));
 
 // Mock RequestTokenizer to use simple character-based estimation
@@ -277,6 +306,12 @@ describe('Gemini Client (client.ts)', () => {
     vi.resetAllMocks();
     vi.mocked(uiTelemetryService.setLastPromptTokenCount).mockClear();
 
+    // Default: createContentGenerator rejects (simulates test env without auth).
+    // Individual tests can override with mockResolvedValue for success path.
+    vi.mocked(createContentGenerator).mockRejectedValue(
+      new Error('no auth in test env'),
+    );
+
     mockMemoryManager = {
       scheduleExtract: vi.fn().mockResolvedValue({
         touchedTopics: [],
@@ -389,6 +424,9 @@ describe('Gemini Client (client.ts)', () => {
       getArenaAgentClient: vi.fn().mockReturnValue(null),
       getManagedAutoMemoryEnabled: vi.fn().mockReturnValue(true),
       getMemoryManager: vi.fn().mockReturnValue(mockMemoryManager),
+      getModelsConfig: vi.fn().mockReturnValue({
+        getResolvedModel: vi.fn().mockReturnValue(undefined),
+      }),
       getDisableAllHooks: vi.fn().mockReturnValue(true),
       getArenaManager: vi.fn().mockReturnValue(null),
       getMessageBus: vi.fn().mockReturnValue(undefined),
@@ -3150,4 +3188,368 @@ Other open files:
     // Note: there is currently no "fallback mode" model routing; the model used
     // is always the one explicitly requested by the caller.
   });
+
+  describe('generateContent with fast model', () => {
+    it('should resolve per-model config and fall back when createContentGenerator fails', async () => {
+      const contents = [{ role: 'user', parts: [{ text: 'hello' }] }];
+      const abortSignal = new AbortController().signal;
+
+      // Set up a resolved model for the fast model, but createContentGenerator
+      // will fail in the test env (no auth), so it falls back to the main
+      // content generator. Verify the resolution was attempted.
+      const mockResolvedModel = {
+        id: 'fast-model',
+        authType: 'openai' as const,
+        name: 'Fast Model',
+        baseUrl: 'https://fast-api.example.com',
+        generationConfig: {
+          extra_body: { enable_thinking: false },
+          samplingParams: { temperature: 0.1 },
+        },
+        capabilities: {},
+      };
+
+      const getResolvedModel = vi.fn().mockReturnValue(mockResolvedModel);
+      vi.mocked(mockConfig.getModelsConfig).mockReturnValue({
+        getResolvedModel,
+      } as unknown as ModelsConfig);
+
+      await client.generateContent(
+        contents,
+        { temperature: 0.5 },
+        abortSignal,
+        'fast-model',
+      );
+
+      // Verify that getResolvedModel was called with the fast model ID
+      expect(getResolvedModel).toHaveBeenCalledWith(
+        expect.any(String),
+        'fast-model',
+      );
+
+      // The main content generator is used as fallback (since creating a new
+      // one fails in test env without auth). In production, a dedicated
+      // content generator with the fast model's settings would be created.
+      expect(mockContentGenerator.generateContent).toHaveBeenCalledWith(
+        expect.objectContaining({
+          model: 'fast-model',
+        }),
+        expect.any(String),
+      );
+    });
+
+    it('should use a dedicated content generator for the fast model on success', async () => {
+      const contents = [{ role: 'user', parts: [{ text: 'hello' }] }];
+      const abortSignal = new AbortController().signal;
+
+      // Create a mock dedicated content generator
+      const mockFastContentGenerator = {
+        generateContent: vi.fn().mockResolvedValue({
+          text: 'fast response',
+        }),
+      } as unknown as ContentGenerator;
+
+      // Set up a resolved model for the fast model
+      const mockResolvedModel = {
+        id: 'fast-model',
+        authType: 'openai' as const,
+        name: 'Fast Model',
+        baseUrl: 'https://fast-api.example.com',
+        envKey: 'FAST_API_KEY',
+        generationConfig: {
+          extra_body: { enable_thinking: false },
+          samplingParams: { temperature: 0.1 },
+        },
+        capabilities: {},
+      };
+
+      const getResolvedModel = vi.fn().mockReturnValue(mockResolvedModel);
+      vi.mocked(mockConfig.getModelsConfig).mockReturnValue({
+        getResolvedModel,
+      } as unknown as ModelsConfig);
+
+      // Override createContentGenerator to return our test double (success path)
+      vi.mocked(createContentGenerator).mockResolvedValue(
+        mockFastContentGenerator,
+      );
+
+      await client.generateContent(
+        contents,
+        { temperature: 0.5 },
+        abortSignal,
+        'fast-model',
+      );
+
+      // Verify buildAgentContentGeneratorConfig was called with correct args
+      expect(buildAgentContentGeneratorConfig).toHaveBeenCalledWith(
+        mockConfig,
+        'fast-model',
+        expect.objectContaining({
+          baseUrl: 'https://fast-api.example.com',
+        }),
+      );
+
+      // The dedicated fast content generator should be used
+      expect(mockFastContentGenerator.generateContent).toHaveBeenCalledWith(
+        expect.objectContaining({
+          model: 'fast-model',
+        }),
+        expect.any(String),
+      );
+
+      // The original main content generator should NOT have been called
+      expect(mockContentGenerator.generateContent).not.toHaveBeenCalled();
+    });
+
+    it('should use the main content generator when the requested model matches the main model', async () => {
+      const contents = [{ role: 'user', parts: [{ text: 'hello' }] }];
+      const abortSignal = new AbortController().signal;
+
+      const getResolvedModel = vi.fn();
+      vi.mocked(mockConfig.getModelsConfig).mockReturnValue({
+        getResolvedModel,
+      } as unknown as ModelsConfig);
+
+      await client.generateContent(
+        contents,
+        {},
+        abortSignal,
+        'test-model', // same as getModel() return value
+      );
+
+      // getResolvedModel should NOT be called when model matches main
+      expect(getResolvedModel).not.toHaveBeenCalled();
+
+      // The main content generator should be used directly
+      expect(mockContentGenerator.generateContent).toHaveBeenCalledWith(
+        expect.objectContaining({
+          model: 'test-model',
+        }),
+        expect.any(String),
+      );
+    });
+
+    it('should fall back to main generator when model is not in registry', async () => {
+      const contents = [{ role: 'user', parts: [{ text: 'hello' }] }];
+      const abortSignal = new AbortController().signal;
+
+      // getResolvedModel returns undefined — model not found in registry
+      const getResolvedModel = vi.fn().mockReturnValue(undefined);
+      vi.mocked(mockConfig.getModelsConfig).mockReturnValue({
+        getResolvedModel,
+      } as unknown as ModelsConfig);
+
+      // Should not throw — falls back to main generator
+      await expect(
+        client.generateContent(
+          contents,
+          { temperature: 0.5 },
+          abortSignal,
+          'unknown-model',
+        ),
+      ).resolves.toBeDefined();
+
+      // getResolvedModel was called to look up the model
+      expect(getResolvedModel).toHaveBeenCalledWith(
+        expect.any(String),
+        'unknown-model',
+      );
+
+      // The main content generator is used as fallback
+      expect(mockContentGenerator.generateContent).toHaveBeenCalledWith(
+        expect.objectContaining({
+          model: 'unknown-model',
+        }),
+        expect.any(String),
+      );
+
+      // buildAgentContentGeneratorConfig must NOT be called when the model is
+      // not in the registry — the fallback path skips config construction.
+      expect(buildAgentContentGeneratorConfig).not.toHaveBeenCalled();
+    });
+
+    it('should use fast model authType for retry, not main model authType', async () => {
+      const contents = [{ role: 'user', parts: [{ text: 'hello' }] }];
+      const abortSignal = new AbortController().signal;
+
+      const mockResolvedModel = {
+        id: 'fast-model',
+        authType: 'openai' as const,
+        name: 'Fast Model',
+        baseUrl: 'https://fast-api.example.com',
+        generationConfig: {},
+        capabilities: {},
+      };
+
+      const getResolvedModel = vi.fn().mockReturnValue(mockResolvedModel);
+      vi.mocked(mockConfig.getModelsConfig).mockReturnValue({
+        getResolvedModel,
+      } as unknown as ModelsConfig);
+
+      // Main config uses a different authType
+      vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
+        authType: AuthType.QWEN_OAUTH,
+        apiKey: 'test-key',
+        apiModel: 'test-model',
+      } as unknown as ContentGeneratorConfig);
+
+      // Success path for createContentGenerator
+      vi.mocked(createContentGenerator).mockResolvedValue(mockContentGenerator);
+
+      await client.generateContent(
+        contents,
+        { temperature: 0.5 },
+        abortSignal,
+        'fast-model',
+      );
+
+      // VERIFY: retryWithBackoff was called with the fast model's authType ('openai'),
+      // not the main model's authType ('QWEN_OAUTH').
+      expect(retryWithBackoff).toHaveBeenCalledWith(
+        expect.any(Function),
+        expect.objectContaining({
+          authType: 'openai',
+        }),
+      );
+    });
+
+    it('should cache per-model content generators', async () => {
+      const contents = [{ role: 'user', parts: [{ text: 'hello' }] }];
+      const abortController = new AbortController();
+      const mockResolvedModel = {
+        id: 'fast-model',
+        authType: 'openai' as const,
+        name: 'Fast Model',
+        baseUrl: 'https://fast-api.example.com',
+        generationConfig: {},
+        capabilities: {},
+      };
+
+      vi.mocked(mockConfig.getModelsConfig).mockReturnValue({
+        getResolvedModel: vi.fn().mockReturnValue(mockResolvedModel),
+      } as unknown as ModelsConfig);
+
+      vi.mocked(createContentGenerator).mockResolvedValue(mockContentGenerator);
+
+      // First call
+      await client.generateContent(
+        contents,
+        {},
+        abortController.signal,
+        'fast-model',
+      );
+      expect(createContentGenerator).toHaveBeenCalledTimes(1);
+
+      // Second call - should use cache
+      await client.generateContent(
+        contents,
+        {},
+        abortController.signal,
+        'fast-model',
+      );
+      expect(createContentGenerator).toHaveBeenCalledTimes(1);
+    });
+
+    it('should resolve model across authTypes when main authType misses', async () => {
+      const contents = [{ role: 'user', parts: [{ text: 'hello' }] }];
+      const abortSignal = new AbortController().signal;
+
+      const mockResolvedModel = {
+        id: 'fast-model',
+        authType: 'openai' as const,
+        name: 'Fast Model',
+        baseUrl: 'https://fast-api.example.com',
+        generationConfig: {},
+        capabilities: {},
+        envKey: undefined,
+      };
+
+      // resolveModelAcrossAuthTypes calls getResolvedModel multiple times:
+      // 1. main authType (QWEN_OAUTH) → undefined (miss)
+      // 2. secondary authType (USE_OPENAI) → mockResolvedModel (hit)
+      // 3. buildAgentContentGeneratorConfig calls getResolvedModel again
+      //    with the resolved authType → mockResolvedModel (hit)
+      const getResolvedModel = vi
+        .fn()
+        .mockReturnValueOnce(undefined)
+        .mockReturnValue(mockResolvedModel);
+
+      vi.mocked(mockConfig.getModelsConfig).mockReturnValue({
+        getResolvedModel,
+      } as unknown as ModelsConfig);
+
+      // Main config uses QWEN_OAUTH — fast model registered under USE_OPENAI
+      vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
+        authType: AuthType.QWEN_OAUTH,
+        apiKey: 'test-key',
+        apiModel: 'test-model',
+      } as unknown as ContentGeneratorConfig);
+
+      // Mock createContentGenerator to succeed so the cross-authType
+      // resolution path completes without falling back
+      vi.mocked(createContentGenerator).mockResolvedValue(mockContentGenerator);
+
+      await client.generateContent(
+        contents,
+        { temperature: 0.5 },
+        abortSignal,
+        'fast-model',
+      );
+
+      // First call uses main authType (QWEN_OAUTH) — misses
+      expect(getResolvedModel).toHaveBeenNthCalledWith(
+        1,
+        AuthType.QWEN_OAUTH,
+        'fast-model',
+      );
+      // Second call falls through to secondary authType — hits
+      expect(getResolvedModel).toHaveBeenNthCalledWith(
+        2,
+        AuthType.USE_OPENAI,
+        'fast-model',
+      );
+      // Generator was created using the resolved model's config
+      expect(createContentGenerator).toHaveBeenCalled();
+    });
+
+    it('should clear per-model generator cache on resetChat', async () => {
+      const contents = [{ role: 'user', parts: [{ text: 'hello' }] }];
+      const abortController = new AbortController();
+      const mockResolvedModel = {
+        id: 'fast-model',
+        authType: 'openai' as const,
+        name: 'Fast Model',
+        baseUrl: 'https://fast-api.example.com',
+        generationConfig: {},
+        capabilities: {},
+      };
+
+      vi.mocked(mockConfig.getModelsConfig).mockReturnValue({
+        getResolvedModel: vi.fn().mockReturnValue(mockResolvedModel),
+      } as unknown as ModelsConfig);
+
+      vi.mocked(createContentGenerator).mockResolvedValue(mockContentGenerator);
+
+      // First call — populates cache
+      await client.generateContent(
+        contents,
+        {},
+        abortController.signal,
+        'fast-model',
+      );
+      expect(createContentGenerator).toHaveBeenCalledTimes(1);
+
+      // Reset chat should clear the cache
+      await client.resetChat();
+
+      // Second call after reset — cache should be cleared, generator recreated
+      await client.generateContent(
+        contents,
+        {},
+        abortController.signal,
+        'fast-model',
+      );
+      expect(createContentGenerator).toHaveBeenCalledTimes(2);
+    });
+  });
 });
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index a5c079da9..9089e2203 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -22,6 +22,8 @@ const debugLogger = createDebugLogger('CLIENT');
 
 // Core modules
 import type { ContentGenerator } from './contentGenerator.js';
+import type { ResolvedModelConfig } from '../models/types.js';
+import { AuthType, createContentGenerator } from './contentGenerator.js';
 import { GeminiChat } from './geminiChat.js';
 import {
   getArenaSystemReminder,
@@ -46,6 +48,9 @@ import {
 } from '../services/chatCompressionService.js';
 import { LoopDetectionService } from '../services/loopDetectionService.js';
 
+// Models
+import { buildAgentContentGeneratorConfig } from '../models/content-generator-config.js';
+
 // Tools
 import type { RelevantAutoMemoryPromptResult } from '../memory/manager.js';
 import { ToolNames } from '../tools/tool-names.js';
@@ -135,6 +140,14 @@ export class GeminiClient {
   private lastSentIdeContext: IdeContext | undefined;
   private forceFullIdeContext = true;
 
+  /**
+   * Cache of per-model ContentGenerators keyed by model ID.
+   * Avoids rebuilding the generator (SDK instantiation, config resolution)
+   * on every side query (recap, title, tool summary).
+   * Cleared on session reset (resetChat) to pick up config changes.
+   */
+  private perModelGeneratorCache = new Map<string, Promise<ContentGenerator>>();
+
   /**
    * At any point in this conversation, was compression triggered without
    * being forced and did it fail?
@@ -271,6 +284,7 @@ export class GeminiClient {
     // pointing at content the model can no longer retrieve.
     debugLogger.debug('[FILE_READ_CACHE] clear after resetChat');
     this.config.getFileReadCache().clear();
+    this.perModelGeneratorCache.clear();
     await this.startChat();
   }
 
@@ -1136,10 +1150,31 @@ export class GeminiClient {
         systemInstruction: finalSystemInstruction,
       };
 
+      // When the requested model differs from the main model (e.g. fast model
+      // side queries for session recap / title / summary), resolve the target
+      // model's own ContentGeneratorConfig so that per-model settings like
+      // extra_body, samplingParams, and reasoning are not inherited from the
+      // main model's config.
+      const mainModel = this.config.getModel() ?? model;
+      const isPerModel = model !== mainModel;
+
+      // Resolve the authType for retry logic. When using a per-model content
+      // generator (e.g. fast model side queries), the retry authType must match
+      // the target model's provider, not the main session's provider. This
+      // ensures QWEN_OAUTH quota detection checks against the right provider.
+      const retryAuthType = isPerModel
+        ? (this.createRetryAuthTypeForModel(model) ??
+          this.config.getContentGeneratorConfig()?.authType ??
+          AuthType.USE_OPENAI)
+        : this.config.getContentGeneratorConfig()?.authType;
+
+      const contentGenerator = isPerModel
+        ? await this.createContentGeneratorForModel(model)
+        : this.getContentGeneratorOrFail();
       const apiCall = () => {
         currentAttemptModel = model;
 
-        return this.getContentGeneratorOrFail().generateContent(
+        return contentGenerator.generateContent(
           {
             model,
             config: requestConfig,
@@ -1149,7 +1184,7 @@ export class GeminiClient {
         );
       };
       const result = await retryWithBackoff(apiCall, {
-        authType: this.config.getContentGeneratorConfig()?.authType,
+        authType: retryAuthType,
         persistentMode: isUnattendedMode(),
         signal: abortSignal,
         heartbeatFn: (info) => {
@@ -1179,6 +1214,111 @@ export class GeminiClient {
     }
   }
 
+  /**
+   * Resolve a model across all authTypes. Handles the case where the target
+   * model is registered under a different authType than the main model
+   * (e.g. main=QWEN_OAUTH, fast=USE_ANTHROPIC).
+   *
+   * TODO: Move cross-authType resolution to ModelRegistry for a cleaner
+   * data-layer solution. Follow-up PR.
+   */
+
+  private resolveModelAcrossAuthTypes(
+    model: string,
+  ): ResolvedModelConfig | undefined {
+    const modelsConfig = this.config.getModelsConfig();
+    const allAuthTypes: AuthType[] = [
+      AuthType.QWEN_OAUTH,
+      AuthType.USE_OPENAI,
+      AuthType.USE_VERTEX_AI,
+      AuthType.USE_ANTHROPIC,
+      AuthType.USE_GEMINI,
+    ];
+
+    // Try the main authType first for early exit
+    const mainAuthType = this.config.getContentGeneratorConfig()?.authType;
+    if (mainAuthType) {
+      const resolved = modelsConfig.getResolvedModel(mainAuthType, model);
+      if (resolved) return resolved;
+    }
+
+    for (const authType of allAuthTypes) {
+      if (authType === mainAuthType) continue;
+      const resolved = modelsConfig.getResolvedModel(authType, model);
+      if (resolved) return resolved;
+    }
+
+    return undefined;
+  }
+
+  /**
+   * Resolve the authType for a given model without creating a full generator.
+   * Used by retry logic to ensure provider-specific checks (e.g. QWEN_OAUTH
+   * quota detection) reference the correct provider.
+   */
+  private createRetryAuthTypeForModel(model: string): string | undefined {
+    return this.resolveModelAcrossAuthTypes(model)?.authType;
+  }
+
+  /**
+   * Return a ContentGenerator for a specific model (e.g. the fast model) with
+   * its own per-model settings from modelProviders.  This prevents the main
+   * model's extra_body / samplingParams / reasoning from leaking into side
+   * queries that target a different model.
+   *
+   * Falls back to the main content generator when the target model is not in
+   * the registry or when creating a dedicated generator fails (e.g. in test
+   * environments without full auth setup).
+   *
+   * Results are cached by model ID to avoid rebuilding the generator
+   * (SDK instantiation, config resolution) on every side query.
+   */
+  private async createContentGeneratorForModel(
+    model: string,
+  ): Promise<ContentGenerator> {
+    // Check cache first (Promise coalescing to prevent redundant SDK instantiations)
+    const cached = this.perModelGeneratorCache.get(model);
+    if (cached) return cached;
+
+    const generatorPromise = (async () => {
+      try {
+        const resolvedModel = this.resolveModelAcrossAuthTypes(model);
+
+        if (!resolvedModel) {
+          debugLogger.warn(
+            `Model "${model}" not found in registry across all authTypes, falling back to main generator.`,
+          );
+          return this.getContentGeneratorOrFail();
+        }
+
+        const targetConfig = buildAgentContentGeneratorConfig(
+          this.config,
+          model,
+          {
+            authType: resolvedModel.authType,
+            apiKey: resolvedModel.envKey
+              ? (process.env[resolvedModel.envKey] ?? undefined)
+              : undefined,
+            baseUrl: resolvedModel.baseUrl,
+          },
+        );
+
+        return await createContentGenerator(targetConfig, this.config);
+      } catch (err: unknown) {
+        debugLogger.warn(
+          `Failed to create content generator for model "${model}", falling back to main generator.`,
+          err instanceof Error ? err.message : String(err),
+        );
+        // On failure, delete from cache so subsequent attempts can retry.
+        this.perModelGeneratorCache.delete(model);
+        return this.getContentGeneratorOrFail();
+      }
+    })();
+
+    this.perModelGeneratorCache.set(model, generatorPromise);
+    return generatorPromise;
+  }
+
   async tryCompressChat(
     prompt_id: string,
     force: boolean = false,