Merge pull request #2897 from QwenLM/feat/thinking-cross-turn-retention-idle-cleanup

feat(core): thinking block cross-turn retention with idle cleanup
2026-05-22 03:03:56 +00:00 · 2026-04-08 15:26:53 +08:00 · 2026-04-08 15:26:53 +08:00 · 3c23952ef7
commit 3c23952ef7
parent db7488f3a2 6a55a9aeea
9 changed files with 434 additions and 2 deletions
--- a/docs/users/configuration/settings.md
+++ b/docs/users/configuration/settings.md
@ -206,6 +206,7 @@ The `extra_body` field allows you to add custom parameters to the request body s
 | `context.fileFiltering.respectQwenIgnore`         | boolean                    | Respect .qwenignore files when searching.                                                                                                                                                                                                                                                                                                                             | `true`      |
 | `context.fileFiltering.enableRecursiveFileSearch` | boolean                    | Whether to enable searching recursively for filenames under the current tree when completing `@` prefixes in the prompt.                                                                                                                                                                                                                                              | `true`      |
 | `context.fileFiltering.enableFuzzySearch`         | boolean                    | When `true`, enables fuzzy search capabilities when searching for files. Set to `false` to improve performance on projects with a large number of files.                                                                                                                                                                                                              | `true`      |
+| `context.gapThresholdMinutes`                     | number                     | Minutes of inactivity after which retained thinking blocks are cleared to free context tokens. Aligns with typical provider prompt-cache TTL. Set higher if your provider has a longer cache TTL.                                                                                                                                                                     | `5`         |

 #### Troubleshooting File Search Performance

--- a/packages/cli/src/config/config.ts
+++ b/packages/cli/src/config/config.ts
@ -1069,6 +1069,7 @@ export async function loadCliConfig(
    telemetry: telemetrySettings,
    usageStatisticsEnabled: settings.privacy?.usageStatisticsEnabled ?? true,
    fileFiltering: settings.context?.fileFiltering,
+    thinkingIdleThresholdMinutes: settings.context?.gapThresholdMinutes,
    checkpointing:
      argv.checkpointing || settings.general?.checkpointing?.enabled,
    proxy:
--- a/packages/cli/src/config/settingsSchema.ts
+++ b/packages/cli/src/config/settingsSchema.ts
@ -924,6 +924,16 @@ const SETTINGS_SCHEMA = {
          },
        },
      },
+      gapThresholdMinutes: {
+        type: 'number',
+        label: 'Thinking Block Idle Threshold (minutes)',
+        category: 'Context',
+        requiresRestart: false,
+        default: 5,
+        description:
+          'Minutes of inactivity after which retained thinking blocks are cleared to free context tokens. Aligns with provider prompt-cache TTL.',
+        showInDialog: false,
+      },
    },
  },

--- a/packages/core/src/config/config.ts
+++ b/packages/core/src/config/config.ts
@ -371,6 +371,8 @@ export interface ConfigParameters {
  model?: string;
  outputLanguageFilePath?: string;
  maxSessionTurns?: number;
+  /** Minutes of inactivity before clearing retained thinking blocks. */
+  thinkingIdleThresholdMinutes?: number;
  sessionTokenLimit?: number;
  experimentalZedIntegration?: boolean;
  cronEnabled?: boolean;
@ -559,6 +561,7 @@ export class Config {
  private ideMode: boolean;

  private readonly maxSessionTurns: number;
+  private readonly thinkingIdleThresholdMs: number;
  private readonly sessionTokenLimit: number;
  private readonly listExtensions: boolean;
  private readonly overrideExtensions?: string[];
@ -685,6 +688,8 @@ export class Config {
    this.fileDiscoveryService = params.fileDiscoveryService ?? null;
    this.bugCommand = params.bugCommand;
    this.maxSessionTurns = params.maxSessionTurns ?? -1;
+    this.thinkingIdleThresholdMs =
+      (params.thinkingIdleThresholdMinutes ?? 5) * 60 * 1000;
    this.sessionTokenLimit = params.sessionTokenLimit ?? -1;
    this.experimentalZedIntegration =
      params.experimentalZedIntegration ?? false;
@ -1331,6 +1336,10 @@ export class Config {
    return this.maxSessionTurns;
  }

+  getThinkingIdleThresholdMs(): number {
+    return this.thinkingIdleThresholdMs;
+  }
+
  getSessionTokenLimit(): number {
    return this.sessionTokenLimit;
  }
--- a/packages/core/src/core/client.test.ts
+++ b/packages/core/src/core/client.test.ts
@ -323,6 +323,7 @@ describe('Gemini Client (client.ts)', () => {
      getWorkingDir: vi.fn().mockReturnValue('/test/dir'),
      getFileService: vi.fn().mockReturnValue(fileService),
      getMaxSessionTurns: vi.fn().mockReturnValue(0),
+      getThinkingIdleThresholdMs: vi.fn().mockReturnValue(5 * 60 * 1000),
      getSessionTokenLimit: vi.fn().mockReturnValue(32000),
      getNoBrowser: vi.fn().mockReturnValue(false),
      getUsageStatisticsEnabled: vi.fn().mockReturnValue(true),
@ -427,6 +428,119 @@ describe('Gemini Client (client.ts)', () => {
    });
  });

+  describe('thinking block idle cleanup and latch', () => {
+    let mockChat: Partial<GeminiChat>;
+
+    beforeEach(() => {
+      const mockStream = (async function* () {
+        yield {
+          type: GeminiEventType.Content,
+          value: 'response',
+        };
+      })();
+      mockTurnRunFn.mockReturnValue(mockStream);
+
+      mockChat = {
+        addHistory: vi.fn(),
+        getHistory: vi.fn().mockReturnValue([]),
+        stripThoughtsFromHistory: vi.fn(),
+        stripThoughtsFromHistoryKeepRecent: vi.fn(),
+      };
+      client['chat'] = mockChat as GeminiChat;
+    });
+
+    it('should not strip thoughts on active session (< 5min idle)', async () => {
+      // Simulate a recent API completion (2 minutes ago — within default 5 min threshold)
+      client['lastApiCompletionTimestamp'] = Date.now() - 2 * 60 * 1000;
+      client['thinkingClearLatched'] = false;
+
+      const gen = client.sendMessageStream(
+        [{ text: 'Hello' }],
+        new AbortController().signal,
+        'prompt-1',
+        { type: SendMessageType.UserQuery },
+      );
+      for await (const _ of gen) {
+        /* drain */
+      }
+
+      expect(
+        mockChat.stripThoughtsFromHistoryKeepRecent,
+      ).not.toHaveBeenCalled();
+    });
+
+    it('should latch and strip thoughts after > 5min idle', async () => {
+      // Simulate an old API completion (10 minutes ago — exceeds default 5 min threshold)
+      client['lastApiCompletionTimestamp'] = Date.now() - 10 * 60 * 1000;
+      client['thinkingClearLatched'] = false;
+
+      const gen = client.sendMessageStream(
+        [{ text: 'Hello' }],
+        new AbortController().signal,
+        'prompt-2',
+        { type: SendMessageType.UserQuery },
+      );
+      for await (const _ of gen) {
+        /* drain */
+      }
+
+      expect(client['thinkingClearLatched']).toBe(true);
+      expect(mockChat.stripThoughtsFromHistoryKeepRecent).toHaveBeenCalledWith(
+        1,
+      );
+    });
+
+    it('should keep stripping once latched even if idle < 5min', async () => {
+      // Pre-set latch with a recent timestamp (2 minutes ago — within threshold)
+      client['lastApiCompletionTimestamp'] = Date.now() - 2 * 60 * 1000;
+      client['thinkingClearLatched'] = true;
+
+      const gen = client.sendMessageStream(
+        [{ text: 'Hello' }],
+        new AbortController().signal,
+        'prompt-3',
+        { type: SendMessageType.UserQuery },
+      );
+      for await (const _ of gen) {
+        /* drain */
+      }
+
+      expect(client['thinkingClearLatched']).toBe(true);
+      expect(mockChat.stripThoughtsFromHistoryKeepRecent).toHaveBeenCalledWith(
+        1,
+      );
+    });
+
+    it('should update lastApiCompletionTimestamp after API call', async () => {
+      client['lastApiCompletionTimestamp'] = null;
+
+      const before = Date.now();
+      const gen = client.sendMessageStream(
+        [{ text: 'Hello' }],
+        new AbortController().signal,
+        'prompt-4',
+        { type: SendMessageType.UserQuery },
+      );
+      for await (const _ of gen) {
+        /* drain */
+      }
+
+      expect(client['lastApiCompletionTimestamp']).toBeGreaterThanOrEqual(
+        before,
+      );
+    });
+
+    it('should reset latch and timestamp on resetChat', async () => {
+      client['lastApiCompletionTimestamp'] = Date.now();
+      client['thinkingClearLatched'] = true;
+
+      await client.resetChat();
+
+      expect(client['thinkingClearLatched']).toBe(false);
+      expect(client['lastApiCompletionTimestamp']).toBeNull();
+    });
+  });
+
  describe('tryCompressChat', () => {
    const mockGetHistory = vi.fn();

@ -436,6 +550,7 @@ describe('Gemini Client (client.ts)', () => {
        addHistory: vi.fn(),
        setHistory: vi.fn(),
        stripThoughtsFromHistory: vi.fn(),
+        stripThoughtsFromHistoryKeepRecent: vi.fn(),
      } as unknown as GeminiChat;
    });

@ -457,6 +572,7 @@ describe('Gemini Client (client.ts)', () => {
        getHistory: vi.fn((_curated?: boolean) => chatHistory),
        setHistory: vi.fn(),
        stripThoughtsFromHistory: vi.fn(),
+        stripThoughtsFromHistoryKeepRecent: vi.fn(),
      };
      client['chat'] = mockOriginalChat as GeminiChat;

@ -1149,6 +1265,7 @@ describe('Gemini Client (client.ts)', () => {
        addHistory: vi.fn(),
        getHistory: vi.fn().mockReturnValue([]),
        stripThoughtsFromHistory: vi.fn(),
+        stripThoughtsFromHistoryKeepRecent: vi.fn(),
      } as unknown as GeminiChat;
      client['chat'] = mockChat;

@ -1204,6 +1321,7 @@ Other open files:
        addHistory: vi.fn(),
        getHistory: vi.fn().mockReturnValue([]),
        stripThoughtsFromHistory: vi.fn(),
+        stripThoughtsFromHistoryKeepRecent: vi.fn(),
      };
      client['chat'] = mockChat as GeminiChat;

@ -1260,6 +1378,7 @@ Other open files:
        addHistory: vi.fn(),
        getHistory: vi.fn().mockReturnValue([]),
        stripThoughtsFromHistory: vi.fn(),
+        stripThoughtsFromHistoryKeepRecent: vi.fn(),
      };
      client['chat'] = mockChat as GeminiChat;

@ -1326,6 +1445,7 @@ hello
        addHistory: vi.fn(),
        getHistory: vi.fn().mockReturnValue([]),
        stripThoughtsFromHistory: vi.fn(),
+        stripThoughtsFromHistoryKeepRecent: vi.fn(),
      };
      client['chat'] = mockChat as GeminiChat;

@ -1365,6 +1485,7 @@ Other open files:
        addHistory: vi.fn(),
        getHistory: vi.fn().mockReturnValue([]),
        stripThoughtsFromHistory: vi.fn(),
+        stripThoughtsFromHistoryKeepRecent: vi.fn(),
      };
      client['chat'] = mockChat as GeminiChat;

@ -1410,6 +1531,7 @@ Other open files:
        addHistory: vi.fn(),
        getHistory: vi.fn().mockReturnValue([]),
        stripThoughtsFromHistory: vi.fn(),
+        stripThoughtsFromHistoryKeepRecent: vi.fn(),
      };
      client['chat'] = mockChat as GeminiChat;

@ -1498,6 +1620,7 @@ Other open files:
        addHistory: vi.fn(),
        getHistory: vi.fn().mockReturnValue([]),
        stripThoughtsFromHistory: vi.fn(),
+        stripThoughtsFromHistoryKeepRecent: vi.fn(),
      };
      client['chat'] = mockChat as GeminiChat;

@ -1555,6 +1678,7 @@ Other open files:
        addHistory: vi.fn(),
        getHistory: vi.fn().mockReturnValue([]),
        stripThoughtsFromHistory: vi.fn(),
+        stripThoughtsFromHistoryKeepRecent: vi.fn(),
      };
      client['chat'] = mockChat as GeminiChat;

@ -1636,6 +1760,7 @@ Other open files:
              { role: 'user', parts: [{ text: 'previous message' }] },
            ]),
          stripThoughtsFromHistory: vi.fn(),
+          stripThoughtsFromHistoryKeepRecent: vi.fn(),
        };
        client['chat'] = mockChat as GeminiChat;
      });
@ -1889,6 +2014,7 @@ Other open files:
          getHistory: vi.fn().mockReturnValue([]), // Default empty history
          setHistory: vi.fn(),
          stripThoughtsFromHistory: vi.fn(),
+          stripThoughtsFromHistoryKeepRecent: vi.fn(),
        };
        client['chat'] = mockChat as GeminiChat;

@ -2228,6 +2354,7 @@ Other open files:
        addHistory: vi.fn(),
        getHistory: vi.fn().mockReturnValue([]),
        stripThoughtsFromHistory: vi.fn(),
+        stripThoughtsFromHistoryKeepRecent: vi.fn(),
      };
      client['chat'] = mockChat as GeminiChat;

@ -2265,6 +2392,7 @@ Other open files:
        addHistory: vi.fn(),
        getHistory: vi.fn().mockReturnValue([]),
        stripThoughtsFromHistory: vi.fn(),
+        stripThoughtsFromHistoryKeepRecent: vi.fn(),
      };
      client['chat'] = mockChat as GeminiChat;

@ -2305,6 +2433,7 @@ Other open files:
        addHistory: vi.fn(),
        getHistory: vi.fn().mockReturnValue([]),
        stripThoughtsFromHistory: vi.fn(),
+        stripThoughtsFromHistoryKeepRecent: vi.fn(),
      };
      client['chat'] = mockChat as GeminiChat;

@ -2329,6 +2458,7 @@ Other open files:
          getHistory: vi.fn().mockReturnValue([]),
          setHistory: vi.fn(),
          stripThoughtsFromHistory: vi.fn(),
+          stripThoughtsFromHistoryKeepRecent: vi.fn(),
          stripOrphanedUserEntriesFromHistory: vi.fn(),
        };
        client['chat'] = mockChat as GeminiChat;
@ -2361,6 +2491,7 @@ Other open files:
          getHistory: vi.fn().mockReturnValue([]),
          setHistory: vi.fn(),
          stripThoughtsFromHistory: vi.fn(),
+          stripThoughtsFromHistoryKeepRecent: vi.fn(),
          stripOrphanedUserEntriesFromHistory: vi.fn(),
        };
        client['chat'] = mockChat as GeminiChat;
@ -2405,6 +2536,7 @@ Other open files:
          addHistory: vi.fn(),
          getHistory: vi.fn().mockReturnValue([]),
          stripThoughtsFromHistory: vi.fn(),
+          stripThoughtsFromHistoryKeepRecent: vi.fn(),
        };
        client['chat'] = mockChat as GeminiChat;
      });
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@ -126,6 +126,25 @@ export class GeminiClient {
   */
  private hasFailedCompressionAttempt = false;

+  /**
+   * Timestamp (epoch ms) of the last completed API call.
+   * Used to detect idle periods for thinking block cleanup.
+   * Starts as null — on the first query there is no prior thinking to clean,
+   * so the idle check is skipped until the first API call completes.
+   */
+  private lastApiCompletionTimestamp: number | null = null;
+
+  /**
+   * Sticky-on latch for clearing thinking blocks from prior turns.
+   * Triggered when idle exceeds the configured threshold (default 5 min,
+   * aligned with provider prompt-cache TTL). Once latched, stays true to
+   * prevent oscillation: without it, thinking would accumulate → get
+   * stripped → accumulate again, causing the message prefix to change
+   * repeatedly (bad for provider-side prompt caching and wastes context).
+   * Reset on /clear (resetChat).
+   */
+  private thinkingClearLatched = false;
+
  constructor(private readonly config: Config) {
    this.loopDetector = new LoopDetectionService(config);
  }
@ -199,6 +218,9 @@ export class GeminiClient {
  }

  async resetChat(): Promise<void> {
+    // Reset thinking clear latch — fresh chat, no prior thinking to clean up
+    this.thinkingClearLatched = false;
+    this.lastApiCompletionTimestamp = null;
    await this.startChat();
  }

@ -537,8 +559,27 @@ export class GeminiClient {
      // record user message for session management
      this.config.getChatRecordingService()?.recordUserMessage(request);

-      // strip thoughts from history before sending the message
-      this.stripThoughtsFromHistory();
+      // Thinking block cross-turn retention with idle cleanup:
+      // - Active session (< threshold idle): keep thinking blocks for reasoning coherence
+      // - Idle > threshold: clear old thinking, keep only last 1 turn to free context
+      // - Latch: once triggered, never revert — prevents oscillation
+      if (
+        !this.thinkingClearLatched &&
+        this.lastApiCompletionTimestamp !== null
+      ) {
+        const thresholdMs = this.config.getThinkingIdleThresholdMs();
+        const idleMs = Date.now() - this.lastApiCompletionTimestamp;
+        if (idleMs > thresholdMs) {
+          this.thinkingClearLatched = true;
+          debugLogger.debug(
+            `Thinking clear latched: idle ${Math.round(idleMs / 1000)}s > threshold ${thresholdMs / 1000}s`,
+          );
+        }
+      }
+      if (this.thinkingClearLatched) {
+        this.getChat().stripThoughtsFromHistoryKeepRecent(1);
+        debugLogger.debug('Stripped old thinking blocks (keeping last 1 turn)');
+      }
    }
    if (messageType !== SendMessageType.Retry) {
      this.sessionTurnCount++;
@ -680,6 +721,7 @@ export class GeminiClient {
          if (arenaAgentClient) {
            await arenaAgentClient.reportError('Loop detected');
          }
+          this.lastApiCompletionTimestamp = Date.now();
          return turn;
        }
      }
@ -698,9 +740,14 @@ export class GeminiClient {
              : 'Unknown error';
          await arenaAgentClient.reportError(errorMsg);
        }
+        this.lastApiCompletionTimestamp = Date.now();
        return turn;
      }
    }
+
+    // Track API completion time for thinking block idle cleanup
+    this.lastApiCompletionTimestamp = Date.now();
+
    // Fire Stop hook through MessageBus (only if hooks are enabled and registered)
    // This must be done before any early returns to ensure hooks are always triggered
    if (
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@ -1923,6 +1923,150 @@ describe('GeminiChat', async () => {
    });
  });

+  describe('stripThoughtsFromHistoryKeepRecent', () => {
+    it('should keep the most recent N model turns with thoughts', () => {
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'msg1' }] },
+        {
+          role: 'model',
+          parts: [
+            { text: 'old thinking', thought: true },
+            { text: 'response1' },
+          ],
+        },
+        { role: 'user', parts: [{ text: 'msg2' }] },
+        {
+          role: 'model',
+          parts: [
+            { text: 'mid thinking', thought: true },
+            { text: 'response2' },
+          ],
+        },
+        { role: 'user', parts: [{ text: 'msg3' }] },
+        {
+          role: 'model',
+          parts: [
+            { text: 'recent thinking', thought: true },
+            { text: 'response3' },
+          ],
+        },
+      ]);
+
+      chat.stripThoughtsFromHistoryKeepRecent(1);
+
+      const history = chat.getHistory();
+      // First two model turns should have thoughts stripped
+      expect(history[1]!.parts).toEqual([{ text: 'response1' }]);
+      expect(history[3]!.parts).toEqual([{ text: 'response2' }]);
+      // Last model turn should keep thoughts
+      expect(history[5]!.parts).toEqual([
+        { text: 'recent thinking', thought: true },
+        { text: 'response3' },
+      ]);
+    });
+
+    it('should not strip anything when keepTurns >= model turns with thoughts', () => {
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'msg1' }] },
+        {
+          role: 'model',
+          parts: [{ text: 'thinking', thought: true }, { text: 'response' }],
+        },
+      ]);
+
+      chat.stripThoughtsFromHistoryKeepRecent(1);
+
+      const history = chat.getHistory();
+      expect(history[1]!.parts).toEqual([
+        { text: 'thinking', thought: true },
+        { text: 'response' },
+      ]);
+    });
+
+    it('should remove model content objects that become empty after stripping', () => {
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'msg1' }] },
+        {
+          role: 'model',
+          parts: [{ text: 'only thinking', thought: true }],
+        },
+        { role: 'user', parts: [{ text: 'msg2' }] },
+        {
+          role: 'model',
+          parts: [
+            { text: 'recent thinking', thought: true },
+            { text: 'response' },
+          ],
+        },
+      ]);
+
+      chat.stripThoughtsFromHistoryKeepRecent(1);
+
+      const history = chat.getHistory();
+      // The first model turn (only thoughts) should be removed entirely
+      expect(history).toHaveLength(3);
+      expect(history[0]!.parts).toEqual([{ text: 'msg1' }]);
+      expect(history[1]!.parts).toEqual([{ text: 'msg2' }]);
+      expect(history[2]!.parts).toEqual([
+        { text: 'recent thinking', thought: true },
+        { text: 'response' },
+      ]);
+    });
+
+    it('should also strip thoughtSignature from stripped turns', () => {
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'msg1' }] },
+        {
+          role: 'model',
+          parts: [
+            { text: 'old thinking', thought: true },
+            {
+              text: 'with sig',
+              thoughtSignature: 'sig1',
+            } as unknown as { text: string; thoughtSignature: string },
+            { text: 'response1' },
+          ],
+        },
+        { role: 'user', parts: [{ text: 'msg2' }] },
+        {
+          role: 'model',
+          parts: [
+            { text: 'recent thinking', thought: true },
+            { text: 'response2' },
+          ],
+        },
+      ]);
+
+      chat.stripThoughtsFromHistoryKeepRecent(1);
+
+      const history = chat.getHistory();
+      // First model turn: thought stripped, thoughtSignature stripped
+      expect(history[1]!.parts).toEqual([
+        { text: 'with sig' },
+        { text: 'response1' },
+      ]);
+      expect(
+        (history[1]!.parts![0] as { thoughtSignature?: string })
+          .thoughtSignature,
+      ).toBeUndefined();
+    });
+
+    it('should handle keepTurns=0 by stripping all thoughts', () => {
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'msg1' }] },
+        {
+          role: 'model',
+          parts: [{ text: 'thinking', thought: true }, { text: 'response' }],
+        },
+      ]);
+
+      chat.stripThoughtsFromHistoryKeepRecent(0);
+
+      const history = chat.getHistory();
+      expect(history[1]!.parts).toEqual([{ text: 'response' }]);
+    });
+  });
+
  describe('stripOrphanedUserEntriesFromHistory', () => {
    it('should pop a single trailing user entry', () => {
      chat.setHistory([
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@ -625,6 +625,89 @@ export class GeminiChat {
      .filter((content) => content.parts && content.parts.length > 0);
  }

+  /**
+   * Strip thought parts from history, keeping the most recent `keepTurns`
+   * model turns that contain thinking blocks intact.
+   *
+   * Selection is based on thought-containing turns specifically (not all
+   * model turns) so the most recent reasoning chain is always preserved
+   * even if later model turns happen to have no thinking.
+   *
+   * Used for idle cleanup: after exceeding the configured idle threshold
+   * the old thinking blocks are no longer useful for reasoning coherence
+   * but still consume context tokens.
+   */
+  stripThoughtsFromHistoryKeepRecent(keepTurns: number): void {
+    keepTurns = Number.isFinite(keepTurns)
+      ? Math.max(0, Math.floor(keepTurns))
+      : 0;
+
+    // Find indices of model turns that contain thought parts
+    const modelTurnIndices: number[] = [];
+    for (let i = 0; i < this.history.length; i++) {
+      const content = this.history[i];
+      if (
+        content.role === 'model' &&
+        content.parts?.some(
+          (part) =>
+            part &&
+            typeof part === 'object' &&
+            'thought' in part &&
+            part.thought,
+        )
+      ) {
+        modelTurnIndices.push(i);
+      }
+    }
+
+    // Determine which model turns to keep (the most recent `keepTurns`)
+    const turnsToStrip = new Set(
+      modelTurnIndices.slice(
+        0,
+        Math.max(0, modelTurnIndices.length - keepTurns),
+      ),
+    );
+
+    if (turnsToStrip.size === 0) return;
+
+    this.history = this.history
+      .map((content, index) => {
+        if (!turnsToStrip.has(index) || !content.parts) return content;
+
+        // Strip thought parts from this turn
+        const filteredParts = content.parts
+          .filter(
+            (part) =>
+              !(
+                part &&
+                typeof part === 'object' &&
+                'thought' in part &&
+                part.thought
+              ),
+          )
+          .map((part) => {
+            if (
+              part &&
+              typeof part === 'object' &&
+              'thoughtSignature' in part
+            ) {
+              const newPart = { ...part };
+              delete (newPart as { thoughtSignature?: string })
+                .thoughtSignature;
+              return newPart;
+            }
+            return part;
+          });
+
+        return {
+          ...content,
+          parts: filteredParts,
+        };
+      })
+      // Remove Content objects that have no parts left after filtering
+      .filter((content) => content.parts && content.parts.length > 0);
+  }
+
  /**
   * Pop all orphaned trailing user entries from chat history.
   * In a valid conversation the last entry is always a model response;
--- a/packages/vscode-ide-companion/schemas/settings.schema.json
+++ b/packages/vscode-ide-companion/schemas/settings.schema.json
@ -388,6 +388,11 @@
              "default": true
            }
          }
+        },
+        "gapThresholdMinutes": {
+          "description": "Minutes of inactivity after which retained thinking blocks are cleared to free context tokens. Aligns with provider prompt-cache TTL.",
+          "type": "number",
+          "default": 5
        }
      }
    },