From 8a1ed565a6eaf16e1194018c8a97cb9bc1998fdc Mon Sep 17 00:00:00 2001 From: tanzhenxin Date: Wed, 6 May 2026 14:09:07 +0800 Subject: [PATCH] fix(core): auto-compact subagent context to prevent overflow (#3735) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(core): auto-compact subagent context to prevent overflow Subagent chats accumulated history without ever compacting, so a long multi-turn run could hit "max context length exceeded" before the compaction logic the main session uses had a chance to fire. Move compaction down into the chat layer so both main agent and subagent auto-compress at the configured threshold, and surface the result via a new chat stream event that bridges into the existing ChatCompressed UI path. The main-session wrapper still owns full /compress reset. Closes #3664 * fix(core): address subagent compaction review feedback Code fixes: - Seed `lastPromptTokenCount` on subagent chats so the first-send threshold gate sees the inherited history's true size. - Add `COMPRESSION_FAILED_TOKEN_COUNT_ERROR` to the fail-latch chain so token-counting failures stop retrying compression every send. - Restore `FileReadCache.clear()` after compaction in both the manual /compress wrapper and the auto-compaction path inside GeminiChat, preventing post-summary `file_unchanged` placeholders from pointing at content the model can no longer retrieve. - Refresh stale comment on the `compressed → ChatCompressed` bridge in turn.ts now that this path is the primary route, not a fallback. Tests: - turn.test.ts asserts the compressed → ChatCompressed bridge. - geminiChat.test.ts asserts COMPRESSED yields as the first stream event after auto-compaction succeeds. - chatCompressionService.test.ts bumps originalTokenCount above the cheap-gate so the NOOP test exercises findCompressSplitPoint. - client.test.ts asserts forceFullIdeContext flips when a ChatCompressed event flows through sendMessageStream's loop. * chore(core): drop redundant StreamEvent cast and document auto-compaction trade-off - Remove the `as StreamEvent` cast at the COMPRESSED yield site — the literal already matches the union member. - Add a 4-line comment at the auto-compaction setHistory point that points readers to GeminiClient for the env-refresh trade-off rationale, so readers don't have to chase the layering decision back across files. --- .../core/src/agents/runtime/agent-core.ts | 20 +- .../src/agents/runtime/agent-headless.test.ts | 4 + packages/core/src/core/client.test.ts | 815 ++++-------------- packages/core/src/core/client.ts | 90 +- packages/core/src/core/geminiChat.test.ts | 354 ++++++++ packages/core/src/core/geminiChat.ts | 157 +++- packages/core/src/core/turn.test.ts | 34 +- packages/core/src/core/turn.ts | 13 + .../services/chatCompressionService.test.ts | 794 +++++++++++------ .../src/services/chatCompressionService.ts | 181 +++- 10 files changed, 1434 insertions(+), 1028 deletions(-) diff --git a/packages/core/src/agents/runtime/agent-core.ts b/packages/core/src/agents/runtime/agent-core.ts index 8f8202965..bb959f4d0 100644 --- a/packages/core/src/agents/runtime/agent-core.ts +++ b/packages/core/src/agents/runtime/agent-core.ts @@ -317,11 +317,17 @@ export class AgentCore { } try { - return new GeminiChat( + const chat = new GeminiChat( this.runtimeContext, generationConfig, startHistory, ); + // Seed the per-chat token count so the auto-compaction threshold + // gate sees the inherited history's true size on the first send. + // Without this, fork subagents start at 0 and the gate NOOPs even + // when `startHistory` is already huge — first API call can 400. + chat.setLastPromptTokenCount(this.lastPromptTokenCount); + return chat; } catch (error) { await reportError( error, @@ -540,6 +546,18 @@ export class AgentCore { continue; } + // GeminiChat already mutated its own history; surface to the debug + // log so subagent compactions show up alongside the main session's. + if (streamEvent.type === 'compressed') { + this.runtimeContext + .getDebugLogger() + .debug( + `[AGENT-COMPACT] subagent=${this.subagentId} round=${turnCounter} ` + + `tokens ${streamEvent.info.originalTokenCount} -> ${streamEvent.info.newTokenCount}`, + ); + continue; + } + // Handle chunk events if (streamEvent.type === 'chunk') { const resp = streamEvent.value; diff --git a/packages/core/src/agents/runtime/agent-headless.test.ts b/packages/core/src/agents/runtime/agent-headless.test.ts index 9f3f329a2..d54a7af18 100644 --- a/packages/core/src/agents/runtime/agent-headless.test.ts +++ b/packages/core/src/agents/runtime/agent-headless.test.ts @@ -280,6 +280,7 @@ describe('subagent.ts', () => { () => ({ sendMessageStream: mockSendMessageStream, + setLastPromptTokenCount: vi.fn(), }) as unknown as GeminiChat, ); @@ -958,6 +959,7 @@ describe('subagent.ts', () => { () => ({ sendMessageStream: mockSendMessageStream, + setLastPromptTokenCount: vi.fn(), }) as unknown as GeminiChat, ); @@ -997,6 +999,7 @@ describe('subagent.ts', () => { () => ({ sendMessageStream: mockSendMessageStream, + setLastPromptTokenCount: vi.fn(), }) as unknown as GeminiChat, ); @@ -1061,6 +1064,7 @@ describe('subagent.ts', () => { () => ({ sendMessageStream: mockSendMessageStream, + setLastPromptTokenCount: vi.fn(), }) as unknown as GeminiChat, ); diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts index 93bfe771f..47a8c9a87 100644 --- a/packages/core/src/core/client.test.ts +++ b/packages/core/src/core/client.test.ts @@ -29,12 +29,7 @@ import type { Config } from '../config/config.js'; import { ApprovalMode } from '../config/config.js'; import type { ModelsConfig } from '../models/modelsConfig.js'; import { retryWithBackoff } from '../utils/retry.js'; -import { - CompressionStatus, - GeminiEventType, - Turn, - type ChatCompressionInfo, -} from './turn.js'; +import { CompressionStatus, GeminiEventType, Turn } from './turn.js'; vi.mock('../utils/retry.js', () => ({ retryWithBackoff: vi.fn(async (fn) => await fn()), @@ -161,6 +156,8 @@ vi.mock('../utils/generateContentResponseUtilities', () => ({ const mockUiTelemetryService = vi.hoisted(() => ({ setLastPromptTokenCount: vi.fn(), getLastPromptTokenCount: vi.fn(), + reset: vi.fn(), + addEvent: vi.fn(), })); vi.mock('../telemetry/index.js', async (importOriginal) => { @@ -264,15 +261,16 @@ describe('findCompressSplitPoint', () => { expect(findCompressSplitPoint(history, 0.8)).toBe(4); }); - it('should return earlier splitpoint if no valid ones are after threshhold', () => { + it('compresses everything before the trailing in-flight functionCall', () => { const history: Content[] = [ { role: 'user', parts: [{ text: 'This is the first message.' }] }, { role: 'model', parts: [{ text: 'This is the second message.' }] }, { role: 'user', parts: [{ text: 'This is the third message.' }] }, { role: 'model', parts: [{ functionCall: {} }] }, ]; - // Can't return 4 because the previous item has a function call. - expect(findCompressSplitPoint(history, 0.99)).toBe(2); + // Trailing m+fc is in-flight; the in-flight fallback compresses + // everything except the trailing fc (no preceding pair to retain). + expect(findCompressSplitPoint(history, 0.99)).toBe(3); }); it('should handle a history with only one item', () => { @@ -446,12 +444,48 @@ describe('Gemini Client (client.ts)', () => { client = new GeminiClient(mockConfig); await client.initialize(); vi.mocked(mockConfig.getGeminiClient).mockReturnValue(client); + + // GeminiClient.sendMessageStream calls this.tryCompressChat (which now + // delegates to chat.tryCompress) before each turn. Most tests use a + // hand-rolled chat mock that doesn't implement tryCompress; default the + // wrapper to a NOOP so those tests don't crash. Tests that exercise + // compression directly (the delegation tests below, the + // emits-compression-event test) override this spy. + vi.spyOn(client, 'tryCompressChat').mockResolvedValue({ + originalTokenCount: 0, + newTokenCount: 0, + compressionStatus: CompressionStatus.NOOP, + }); }); afterEach(() => { vi.restoreAllMocks(); }); + describe('initialize', () => { + it('seeds resumed chat with replayed prompt token count', async () => { + vi.mocked(mockConfig.getResumedSessionData).mockReturnValue({ + conversation: { + sessionId: 'resumed-session-id', + projectHash: 'project-hash', + startTime: new Date(0).toISOString(), + lastUpdated: new Date(0).toISOString(), + messages: [], + }, + filePath: '/test/session.jsonl', + lastCompletedUuid: null, + }); + vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue( + 123_456, + ); + + const resumedClient = new GeminiClient(mockConfig); + await resumedClient.initialize(); + + expect(resumedClient.getChat().getLastPromptTokenCount()).toBe(123_456); + }); + }); + describe('addHistory', () => { it('should call chat.addHistory with the provided content', async () => { const mockChat = { @@ -642,6 +676,11 @@ describe('Gemini Client (client.ts)', () => { mockChat = { addHistory: vi.fn(), getHistory: vi.fn().mockReturnValue([]), + tryCompress: vi.fn().mockResolvedValue({ + originalTokenCount: 0, + newTokenCount: 0, + compressionStatus: CompressionStatus.NOOP, + }), }; client['chat'] = mockChat as GeminiChat; }); @@ -768,66 +807,73 @@ describe('Gemini Client (client.ts)', () => { }); }); - describe('tryCompressChat', () => { - const mockGetHistory = vi.fn(); - + // tryCompressChat is now a thin wrapper around GeminiChat.tryCompress. + // The compression logic itself is exercised in chatCompressionService.test.ts + // (token math, threshold checks, hook firing) and geminiChat.test.ts (history + // mutation, recording, hasFailedCompressionAttempt). The tests below cover + // only what the wrapper itself adds: argument forwarding and the IDE-context + // flag flip. + describe('tryCompressChat (delegation)', () => { beforeEach(() => { - client['chat'] = { - getHistory: mockGetHistory, - addHistory: vi.fn(), - setHistory: vi.fn(), - } as unknown as GeminiChat; + // The top-level beforeEach stubs tryCompressChat to NOOP for unrelated + // tests; restore the real implementation here so we can observe it. + vi.mocked(client.tryCompressChat).mockRestore(); }); - function setup({ - chatHistory = [ - { role: 'user', parts: [{ text: 'Long conversation' }] }, - { role: 'model', parts: [{ text: 'Long response' }] }, - ] as Content[], - originalTokenCount = 1000, - summaryText = 'This is a summary.', - // Token counts returned in usageMetadata to simulate what the API would return - // Default values ensure successful compression: - // newTokenCount = originalTokenCount - (compressionInputTokenCount - 1000) + compressionOutputTokenCount - // = 1000 - (1600 - 1000) + 50 = 1000 - 600 + 50 = 450 (< 1000, success) - compressionInputTokenCount = 1600, - compressionOutputTokenCount = 50, - } = {}) { - const mockOriginalChat: Partial = { - getHistory: vi.fn((_curated?: boolean) => chatHistory), - setHistory: vi.fn(), - }; - client['chat'] = mockOriginalChat as GeminiChat; + it('forwards prompt id, model, force, and signal to chat.tryCompress', async () => { + const tryCompress = vi.fn().mockResolvedValue({ + originalTokenCount: 0, + newTokenCount: 0, + compressionStatus: CompressionStatus.NOOP, + }); + client['chat'] = { + tryCompress, + getHistory: vi.fn().mockReturnValue([]), + } as unknown as GeminiChat; + vi.mocked(mockConfig.getModel).mockReturnValue('the-model'); + const signal = new AbortController().signal; - vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue( - originalTokenCount, - ); + await client.tryCompressChat('p1', true, signal); - mockGenerateContentFn.mockResolvedValue({ - candidates: [ - { - content: { - role: 'model', - parts: [{ text: summaryText }], - }, - }, - ], - usageMetadata: { - promptTokenCount: compressionInputTokenCount, - candidatesTokenCount: compressionOutputTokenCount, - totalTokenCount: - compressionInputTokenCount + compressionOutputTokenCount, - }, - } as unknown as GenerateContentResponse); + expect(tryCompress).toHaveBeenCalledWith('p1', 'the-model', true, signal); + }); - // Calculate what the new history will be - const splitPoint = findCompressSplitPoint(chatHistory, 0.7); // 1 - 0.3 - const historyToKeep = chatHistory.slice(splitPoint); + it('flips forceFullIdeContext on a successful compression', async () => { + client['chat'] = { + tryCompress: vi.fn().mockResolvedValue({ + originalTokenCount: 1000, + newTokenCount: 200, + compressionStatus: CompressionStatus.COMPRESSED, + }), + getHistory: vi.fn().mockReturnValue([]), + } as unknown as GeminiChat; + client['forceFullIdeContext'] = false; - // This is the history that the new chat will have. - // It includes the default startChat history + the extra history from tryCompressChat - const newCompressedHistory: Content[] = [ - // Mocked envParts + canned response from startChat + await client.tryCompressChat('p2'); + + expect(client['forceFullIdeContext']).toBe(true); + }); + + it('re-prepends startup context and seeds the new chat after compression', async () => { + const compressedHistory: Content[] = [ + { role: 'user', parts: [{ text: 'summary' }] }, + { role: 'model', parts: [{ text: 'ok' }] }, + ]; + const originalChat = client.getChat(); + vi.spyOn(originalChat, 'tryCompress').mockImplementation(async () => { + originalChat.setHistory(compressedHistory); + return { + originalTokenCount: 1000, + newTokenCount: 200, + compressionStatus: CompressionStatus.COMPRESSED, + }; + }); + client['forceFullIdeContext'] = false; + + await client.tryCompressChat('p4'); + + expect(client.getChat()).not.toBe(originalChat); + expect(client.getHistory()).toEqual([ { role: 'user', parts: [{ text: 'Mocked env context' }], @@ -836,616 +882,71 @@ describe('Gemini Client (client.ts)', () => { role: 'model', parts: [{ text: 'Got it. Thanks for the context!' }], }, - // extraHistory from tryCompressChat - { - role: 'user', - parts: [{ text: summaryText }], - }, - { - role: 'model', - parts: [{ text: 'Got it. Thanks for the additional context!' }], - }, - ...historyToKeep, - ]; + ...compressedHistory, + ]); + expect(client.getChat().getLastPromptTokenCount()).toBe(200); + expect(client['forceFullIdeContext']).toBe(true); + }); - const mockNewChat: Partial = { - getHistory: vi.fn().mockReturnValue(newCompressedHistory), - setHistory: vi.fn(), - }; - - client['startChat'] = vi.fn().mockImplementation(async () => { - client['chat'] = mockNewChat as GeminiChat; - return mockNewChat as GeminiChat; - }); - - // New token count formula: originalTokenCount - (compressionInputTokenCount - 1000) + compressionOutputTokenCount - const estimatedNewTokenCount = Math.max( - 0, - originalTokenCount - - (compressionInputTokenCount - 1000) + - compressionOutputTokenCount, - ); - - return { - client, - mockOriginalChat, - mockNewChat, - estimatedNewTokenCount, - }; - } - - describe('when compression inflates the token count', () => { - it('allows compression to be forced/manual after a failure', async () => { - // Call 1 (Fails): Setup with token counts that will inflate - // newTokenCount = originalTokenCount - (compressionInputTokenCount - 1000) + compressionOutputTokenCount - // = 100 - (1010 - 1000) + 200 = 100 - 10 + 200 = 290 > 100 (inflation) - const longSummary = 'long summary '.repeat(100); - const { client, estimatedNewTokenCount: inflatedTokenCount } = setup({ - originalTokenCount: 100, - summaryText: longSummary, - compressionInputTokenCount: 1010, - compressionOutputTokenCount: 200, - }); - expect(inflatedTokenCount).toBeGreaterThan(100); // Ensure setup is correct - - await client.tryCompressChat('prompt-id-4', false); // Fails - - // Call 2 (Forced): Re-setup with token counts that will compress - // newTokenCount = 100 - (1100 - 1000) + 50 = 100 - 100 + 50 = 50 <= 100 (compression) - const shortSummary = 'short'; - const { estimatedNewTokenCount: compressedTokenCount } = setup({ - originalTokenCount: 100, - summaryText: shortSummary, - compressionInputTokenCount: 1100, - compressionOutputTokenCount: 50, - }); - expect(compressedTokenCount).toBeLessThanOrEqual(100); // Ensure setup is correct - - const result = await client.tryCompressChat('prompt-id-4', true); // Forced - - expect(result.compressionStatus).toBe(CompressionStatus.COMPRESSED); - expect(result.originalTokenCount).toBe(100); - // newTokenCount might be clamped to originalTokenCount due to tolerance logic - expect(result.newTokenCount).toBeLessThanOrEqual(100); - }); - - it('yields the result even if the compression inflated the tokens', async () => { - // newTokenCount = 100 - (1010 - 1000) + 200 = 100 - 10 + 200 = 290 > 100 (inflation) - const longSummary = 'long summary '.repeat(100); - const { client, estimatedNewTokenCount } = setup({ - originalTokenCount: 100, - summaryText: longSummary, - compressionInputTokenCount: 1010, - compressionOutputTokenCount: 200, - }); - expect(estimatedNewTokenCount).toBeGreaterThan(100); // Ensure setup is correct - - // Mock contextWindowSize to ensure compression is triggered - vi.spyOn(client['config'], 'getContentGeneratorConfig').mockReturnValue( - { - model: 'test-model', - apiKey: 'test-key', - vertexai: false, - authType: AuthType.USE_GEMINI, - contextWindowSize: 100, // Set to same as originalTokenCount to ensure threshold is exceeded - }, - ); - - const result = await client.tryCompressChat('prompt-id-4', false); - - expect(result.compressionStatus).toBe( - CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT, - ); - expect(result.originalTokenCount).toBe(100); - // The newTokenCount should be higher than original since compression failed due to inflation - expect(result.newTokenCount).toBeGreaterThan(100); - // IMPORTANT: The change in client.ts means setLastPromptTokenCount is NOT called on failure - expect( - uiTelemetryService.setLastPromptTokenCount, - ).not.toHaveBeenCalled(); - }); - - it('does not manipulate the source chat', async () => { - // newTokenCount = 100 - (1010 - 1000) + 200 = 100 - 10 + 200 = 290 > 100 (inflation) - const longSummary = 'long summary '.repeat(100); - const { client, mockOriginalChat, estimatedNewTokenCount } = setup({ - originalTokenCount: 100, - summaryText: longSummary, - compressionInputTokenCount: 1010, - compressionOutputTokenCount: 200, - }); - expect(estimatedNewTokenCount).toBeGreaterThan(100); // Ensure setup is correct - - await client.tryCompressChat('prompt-id-4', false); - - // On failure, the chat should NOT be replaced - expect(client['chat']).toBe(mockOriginalChat); - }); - - it('will not attempt to compress context after a failure', async () => { - // newTokenCount = 100 - (1010 - 1000) + 200 = 100 - 10 + 200 = 290 > 100 (inflation) - const longSummary = 'long summary '.repeat(100); - const { client, estimatedNewTokenCount } = setup({ - originalTokenCount: 100, - summaryText: longSummary, - compressionInputTokenCount: 1010, - compressionOutputTokenCount: 200, - }); - expect(estimatedNewTokenCount).toBeGreaterThan(100); // Ensure setup is correct - - // Mock contextWindowSize to ensure compression is triggered - vi.spyOn(client['config'], 'getContentGeneratorConfig').mockReturnValue( - { - model: 'test-model', - apiKey: 'test-key', - vertexai: false, - authType: AuthType.USE_GEMINI, - contextWindowSize: 100, // Set to same as originalTokenCount to ensure threshold is exceeded - }, - ); - - await client.tryCompressChat('prompt-id-4', false); // This fails and sets hasFailedCompressionAttempt = true - - // This call should now be a NOOP - const result = await client.tryCompressChat('prompt-id-5', false); - - // generateContent (for summary) should only have been called once - expect(mockGenerateContentFn).toHaveBeenCalledTimes(1); - expect(result).toEqual({ - compressionStatus: CompressionStatus.NOOP, - newTokenCount: 0, + it('does not flip forceFullIdeContext when compression NOOPs', async () => { + client['chat'] = { + tryCompress: vi.fn().mockResolvedValue({ originalTokenCount: 0, - }); - }); - }); - - it('should not trigger summarization if token count is below threshold', async () => { - const MOCKED_TOKEN_LIMIT = 1000; - vi.spyOn(client['config'], 'getContentGeneratorConfig').mockReturnValue({ - model: 'test-model', - apiKey: 'test-key', - vertexai: false, - authType: AuthType.USE_GEMINI, - contextWindowSize: MOCKED_TOKEN_LIMIT, - }); - mockGetHistory.mockReturnValue([ - { role: 'user', parts: [{ text: '...history...' }] }, - ]); - const originalTokenCount = MOCKED_TOKEN_LIMIT * 0.699; - vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue( - originalTokenCount, - ); - - const initialChat = client.getChat(); - const result = await client.tryCompressChat('prompt-id-2', false); - const newChat = client.getChat(); - - expect(result).toEqual({ - compressionStatus: CompressionStatus.NOOP, - newTokenCount: originalTokenCount, - originalTokenCount, - }); - expect(newChat).toBe(initialChat); - }); - - it('logs a telemetry event when compressing', async () => { - const { logChatCompression } = await import('../telemetry/loggers.js'); - vi.mocked(logChatCompression).mockClear(); - - const MOCKED_TOKEN_LIMIT = 1000; - const MOCKED_CONTEXT_PERCENTAGE_THRESHOLD = 0.5; - vi.spyOn(client['config'], 'getContentGeneratorConfig').mockReturnValue({ - model: 'test-model', - apiKey: 'test-key', - vertexai: false, - authType: AuthType.USE_GEMINI, - contextWindowSize: MOCKED_TOKEN_LIMIT, - }); - vi.spyOn(client['config'], 'getChatCompression').mockReturnValue({ - contextPercentageThreshold: MOCKED_CONTEXT_PERCENTAGE_THRESHOLD, - }); - // Need multiple history items so there's something to compress - const history = [ - { role: 'user', parts: [{ text: '...history 1...' }] }, - { role: 'model', parts: [{ text: '...history 2...' }] }, - { role: 'user', parts: [{ text: '...history 3...' }] }, - { role: 'model', parts: [{ text: '...history 4...' }] }, - ]; - mockGetHistory.mockReturnValue(history); - - // Token count needs to be ABOVE the threshold to trigger compression - const originalTokenCount = - MOCKED_TOKEN_LIMIT * MOCKED_CONTEXT_PERCENTAGE_THRESHOLD + 1; - - vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue( - originalTokenCount, - ); - - // Mock the summary response from the chat - // newTokenCount = 501 - (1400 - 1000) + 50 = 501 - 400 + 50 = 151 <= 501 (success) - const summaryText = 'This is a summary.'; - mockGenerateContentFn.mockResolvedValue({ - candidates: [ - { - content: { - role: 'model', - parts: [{ text: summaryText }], - }, - }, - ], - usageMetadata: { - promptTokenCount: 1400, - candidatesTokenCount: 50, - totalTokenCount: 1450, - }, - } as unknown as GenerateContentResponse); - - // Mock startChat to complete the compression flow - const splitPoint = findCompressSplitPoint(history, 0.7); - const historyToKeep = history.slice(splitPoint); - const newCompressedHistory: Content[] = [ - { role: 'user', parts: [{ text: 'Mocked env context' }] }, - { role: 'model', parts: [{ text: 'Got it. Thanks for the context!' }] }, - { role: 'user', parts: [{ text: summaryText }] }, - { - role: 'model', - parts: [{ text: 'Got it. Thanks for the additional context!' }], - }, - ...historyToKeep, - ]; - const mockNewChat: Partial = { - getHistory: vi.fn().mockReturnValue(newCompressedHistory), - }; - client['startChat'] = vi - .fn() - .mockResolvedValue(mockNewChat as GeminiChat); - - await client.tryCompressChat('prompt-id-3', false); - - expect(logChatCompression).toHaveBeenCalledWith( - expect.anything(), - expect.objectContaining({ - tokens_before: originalTokenCount, + newTokenCount: 0, + compressionStatus: CompressionStatus.NOOP, }), - ); - expect(uiTelemetryService.setLastPromptTokenCount).toHaveBeenCalled(); + getHistory: vi.fn().mockReturnValue([]), + } as unknown as GeminiChat; + client['forceFullIdeContext'] = false; + + await client.tryCompressChat('p3'); + + expect(client['forceFullIdeContext']).toBe(false); }); - it('should trigger summarization if token count is above threshold with contextPercentageThreshold setting', async () => { - const MOCKED_TOKEN_LIMIT = 1000; - const MOCKED_CONTEXT_PERCENTAGE_THRESHOLD = 0.5; - vi.spyOn(client['config'], 'getContentGeneratorConfig').mockReturnValue({ - model: 'test-model', - apiKey: 'test-key', - vertexai: false, - authType: AuthType.USE_GEMINI, - contextWindowSize: MOCKED_TOKEN_LIMIT, + it('flips forceFullIdeContext when ChatCompressed flows through sendMessageStream', async () => { + // Auto-compaction lives inside chat.sendMessageStream and surfaces via + // the compressed → ChatCompressed bridge in turn.ts. The flip on this + // path is owned by the for-await loop in client.sendMessageStream, not + // by tryCompressChat — so this test feeds the event in directly. + vi.spyOn(client, 'tryCompressChat').mockResolvedValue({ + originalTokenCount: 0, + newTokenCount: 0, + compressionStatus: CompressionStatus.NOOP, }); - vi.spyOn(client['config'], 'getChatCompression').mockReturnValue({ - contextPercentageThreshold: MOCKED_CONTEXT_PERCENTAGE_THRESHOLD, - }); - // Need multiple history items so there's something to compress - const history = [ - { role: 'user', parts: [{ text: '...history 1...' }] }, - { role: 'model', parts: [{ text: '...history 2...' }] }, - { role: 'user', parts: [{ text: '...history 3...' }] }, - { role: 'model', parts: [{ text: '...history 4...' }] }, - ]; - mockGetHistory.mockReturnValue(history); - - // Token count needs to be ABOVE the threshold to trigger compression - const originalTokenCount = - MOCKED_TOKEN_LIMIT * MOCKED_CONTEXT_PERCENTAGE_THRESHOLD + 1; - - vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue( - originalTokenCount, - ); - - // Mock summary and new chat - const summaryText = 'This is a summary.'; - const splitPoint = findCompressSplitPoint(history, 0.7); - const historyToKeep = history.slice(splitPoint); - const newCompressedHistory: Content[] = [ - { role: 'user', parts: [{ text: 'Mocked env context' }] }, - { role: 'model', parts: [{ text: 'Got it. Thanks for the context!' }] }, - { role: 'user', parts: [{ text: summaryText }] }, - { - role: 'model', - parts: [{ text: 'Got it. Thanks for the additional context!' }], - }, - ...historyToKeep, - ]; - const mockNewChat: Partial = { - getHistory: vi.fn().mockReturnValue(newCompressedHistory), - }; - client['startChat'] = vi.fn().mockImplementation(async () => { - client['chat'] = mockNewChat as GeminiChat; - return mockNewChat as GeminiChat; - }); - - // Mock the summary response from the chat - // newTokenCount = 501 - (1400 - 1000) + 50 = 501 - 400 + 50 = 151 <= 501 (success) - mockGenerateContentFn.mockResolvedValue({ - candidates: [ - { - content: { - role: 'model', - parts: [{ text: summaryText }], + mockTurnRunFn.mockReturnValue( + (async function* () { + yield { + type: GeminiEventType.ChatCompressed, + value: { + originalTokenCount: 1000, + newTokenCount: 200, + compressionStatus: CompressionStatus.COMPRESSED, }, - }, - ], - usageMetadata: { - promptTokenCount: 1400, - candidatesTokenCount: 50, - totalTokenCount: 1450, - }, - } as unknown as GenerateContentResponse); - - const initialChat = client.getChat(); - const result = await client.tryCompressChat('prompt-id-3', false); - const newChat = client.getChat(); - - expect(mockGenerateContentFn).toHaveBeenCalled(); - - // Assert that summarization happened - expect(result.compressionStatus).toBe(CompressionStatus.COMPRESSED); - expect(result.originalTokenCount).toBe(originalTokenCount); - // newTokenCount might be clamped to originalTokenCount due to tolerance logic - expect(result.newTokenCount).toBeLessThanOrEqual(originalTokenCount); - - // Assert that the chat was reset - expect(newChat).not.toBe(initialChat); - }); - - it('should not compress across a function call response', async () => { - const MOCKED_TOKEN_LIMIT = 1000; - vi.spyOn(client['config'], 'getContentGeneratorConfig').mockReturnValue({ - model: 'test-model', - apiKey: 'test-key', - vertexai: false, - authType: AuthType.USE_GEMINI, - contextWindowSize: MOCKED_TOKEN_LIMIT, - }); - const history: Content[] = [ - { role: 'user', parts: [{ text: '...history 1...' }] }, - { role: 'model', parts: [{ text: '...history 2...' }] }, - { role: 'user', parts: [{ text: '...history 3...' }] }, - { role: 'model', parts: [{ text: '...history 4...' }] }, - { role: 'user', parts: [{ text: '...history 5...' }] }, - { role: 'model', parts: [{ text: '...history 6...' }] }, - { role: 'user', parts: [{ text: '...history 7...' }] }, - { role: 'model', parts: [{ text: '...history 8...' }] }, - // Normally we would break here, but we have a function response. - { - role: 'user', - parts: [{ functionResponse: { name: '...history 8...' } }], - }, - { role: 'model', parts: [{ text: '...history 10...' }] }, - // Instead we will break here. - { role: 'user', parts: [{ text: '...history 10...' }] }, - ]; - mockGetHistory.mockReturnValue(history); - - const originalTokenCount = 1000 * 0.7; - vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue( - originalTokenCount, + }; + })(), ); + client['chat'] = { + addHistory: vi.fn(), + getHistory: vi.fn().mockReturnValue([]), + } as unknown as GeminiChat; + client['forceFullIdeContext'] = false; - // Mock summary and new chat - const summaryText = 'This is a summary.'; - const splitPoint = findCompressSplitPoint(history, 0.7); // This should be 10 - expect(splitPoint).toBe(10); // Verify split point logic - const historyToKeep = history.slice(splitPoint); // Should keep last user message - expect(historyToKeep).toEqual([ - { role: 'user', parts: [{ text: '...history 10...' }] }, - ]); - - const newCompressedHistory: Content[] = [ - { role: 'user', parts: [{ text: 'Mocked env context' }] }, - { role: 'model', parts: [{ text: 'Got it. Thanks for the context!' }] }, - { role: 'user', parts: [{ text: summaryText }] }, - { - role: 'model', - parts: [{ text: 'Got it. Thanks for the additional context!' }], - }, - ...historyToKeep, - ]; - const mockNewChat: Partial = { - getHistory: vi.fn().mockReturnValue(newCompressedHistory), - }; - client['startChat'] = vi.fn().mockImplementation(async () => { - client['chat'] = mockNewChat as GeminiChat; - return mockNewChat as GeminiChat; - }); - - // Mock the summary response from the chat - // newTokenCount = 700 - (1500 - 1000) + 50 = 700 - 500 + 50 = 250 <= 700 (success) - mockGenerateContentFn.mockResolvedValue({ - candidates: [ - { - content: { - role: 'model', - parts: [{ text: summaryText }], - }, - }, - ], - usageMetadata: { - promptTokenCount: 1500, - candidatesTokenCount: 50, - totalTokenCount: 1550, - }, - } as unknown as GenerateContentResponse); - - const initialChat = client.getChat(); - const result = await client.tryCompressChat('prompt-id-3', false); - const newChat = client.getChat(); - - expect(mockGenerateContentFn).toHaveBeenCalled(); - - // Assert that summarization happened - expect(result.compressionStatus).toBe(CompressionStatus.COMPRESSED); - expect(result.originalTokenCount).toBe(originalTokenCount); - // newTokenCount might be clamped to originalTokenCount due to tolerance logic - expect(result.newTokenCount).toBeLessThanOrEqual(originalTokenCount); - // Assert that the chat was reset - expect(newChat).not.toBe(initialChat); - - // 1. standard start context message (env) - // 2. standard canned model response - // 3. compressed summary message (user) - // 4. standard canned model response - // 5. The last user message (historyToKeep) - expect(newChat.getHistory().length).toEqual(5); - }); - - it('should always trigger summarization when force is true, regardless of token count', async () => { - // Need multiple history items so there's something to compress - const history = [ - { role: 'user', parts: [{ text: '...history 1...' }] }, - { role: 'model', parts: [{ text: '...history 2...' }] }, - { role: 'user', parts: [{ text: '...history 3...' }] }, - { role: 'model', parts: [{ text: '...history 4...' }] }, - ]; - mockGetHistory.mockReturnValue(history); - - const originalTokenCount = 100; // Well below threshold, but > estimated new count - vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue( - originalTokenCount, + const stream = client.sendMessageStream( + [{ text: 'hi' }], + new AbortController().signal, + 'prompt-auto-flip', + { type: SendMessageType.UserQuery }, ); + for await (const _ of stream) { + /* drain */ + } - // Mock summary and new chat - const summaryText = 'This is a summary.'; - const splitPoint = findCompressSplitPoint(history, 0.7); - const historyToKeep = history.slice(splitPoint); - const newCompressedHistory: Content[] = [ - { role: 'user', parts: [{ text: 'Mocked env context' }] }, - { role: 'model', parts: [{ text: 'Got it. Thanks for the context!' }] }, - { role: 'user', parts: [{ text: summaryText }] }, - { - role: 'model', - parts: [{ text: 'Got it. Thanks for the additional context!' }], - }, - ...historyToKeep, - ]; - const mockNewChat: Partial = { - getHistory: vi.fn().mockReturnValue(newCompressedHistory), - }; - client['startChat'] = vi.fn().mockImplementation(async () => { - client['chat'] = mockNewChat as GeminiChat; - return mockNewChat as GeminiChat; - }); - - // Mock the summary response from the chat - // newTokenCount = 100 - (1060 - 1000) + 20 = 100 - 60 + 20 = 60 <= 100 (success) - mockGenerateContentFn.mockResolvedValue({ - candidates: [ - { - content: { - role: 'model', - parts: [{ text: summaryText }], - }, - }, - ], - usageMetadata: { - promptTokenCount: 1060, - candidatesTokenCount: 20, - totalTokenCount: 1080, - }, - } as unknown as GenerateContentResponse); - - const initialChat = client.getChat(); - const result = await client.tryCompressChat('prompt-id-1', true); // force = true - const newChat = client.getChat(); - - expect(mockGenerateContentFn).toHaveBeenCalled(); - - expect(result.compressionStatus).toBe(CompressionStatus.COMPRESSED); - expect(result.originalTokenCount).toBe(originalTokenCount); - // newTokenCount might be clamped to originalTokenCount due to tolerance logic - expect(result.newTokenCount).toBeLessThanOrEqual(originalTokenCount); - - // Assert that the chat was reset - expect(newChat).not.toBe(initialChat); + expect(client['forceFullIdeContext']).toBe(true); }); }); describe('sendMessageStream', () => { - it('emits a compression event when the context was automatically compressed', async () => { - // Arrange - mockTurnRunFn.mockReturnValue( - (async function* () { - yield { type: 'content', value: 'Hello' }; - })(), - ); - - const compressionInfo: ChatCompressionInfo = { - compressionStatus: CompressionStatus.COMPRESSED, - originalTokenCount: 1000, - newTokenCount: 500, - }; - - vi.spyOn(client, 'tryCompressChat').mockResolvedValueOnce( - compressionInfo, - ); - - // Act - const stream = client.sendMessageStream( - [{ text: 'Hi' }], - new AbortController().signal, - 'prompt-id-1', - ); - - const events = await fromAsync(stream); - - // Assert - expect(events).toContainEqual({ - type: GeminiEventType.ChatCompressed, - value: compressionInfo, - }); - }); - - it.each([ - { - compressionStatus: - CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT, - }, - { compressionStatus: CompressionStatus.NOOP }, - ])( - 'does not emit a compression event when the status is $compressionStatus', - async ({ compressionStatus }) => { - // Arrange - const mockStream = (async function* () { - yield { type: 'content', value: 'Hello' }; - })(); - mockTurnRunFn.mockReturnValue(mockStream); - - const compressionInfo: ChatCompressionInfo = { - compressionStatus, - originalTokenCount: 1000, - newTokenCount: 500, - }; - - vi.spyOn(client, 'tryCompressChat').mockResolvedValueOnce( - compressionInfo, - ); - - // Act - const stream = client.sendMessageStream( - [{ text: 'Hi' }], - new AbortController().signal, - 'prompt-id-1', - ); - - const events = await fromAsync(stream); - - // Assert - expect(events).not.toContainEqual({ - type: GeminiEventType.ChatCompressed, - value: expect.anything(), - }); - }, - ); - it('should include editor context when ideMode is enabled', async () => { // Arrange vi.mocked(ideContextStore.get).mockReturnValue({ diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts index e163b5659..5f6156f2e 100644 --- a/packages/core/src/core/client.ts +++ b/packages/core/src/core/client.ts @@ -42,7 +42,6 @@ import { // Services import { - ChatCompressionService, COMPRESSION_PRESERVE_THRESHOLD, COMPRESSION_TOKEN_THRESHOLD, } from '../services/chatCompressionService.js'; @@ -186,12 +185,6 @@ export class GeminiClient { */ private perModelGeneratorCache = new Map>(); - /** - * At any point in this conversation, was compression triggered without - * being forced and did it fail? - */ - private hasFailedCompressionAttempt = false; - /** * Promises for pending background memory tasks (dream / extract). * Each promise resolves with a count of memory files touched (0 = nothing written). @@ -224,6 +217,9 @@ export class GeminiClient { resumedSessionData.conversation, ); await this.startChat(resumedHistory); + this.getChat().setLastPromptTokenCount( + uiTelemetryService.getLastPromptTokenCount(), + ); } else { await this.startChat(); } @@ -369,7 +365,6 @@ export class GeminiClient { async startChat(extraHistory?: Content[]): Promise { this.forceFullIdeContext = true; - this.hasFailedCompressionAttempt = false; // Clear stale cache params on session reset to prevent cross-session leakage clearCacheSafeParams(); @@ -847,14 +842,10 @@ export class GeminiClient { return new Turn(this.getChat(), prompt_id); } - const compressed = await this.tryCompressChat(prompt_id, false, signal); - - if (compressed.compressionStatus === CompressionStatus.COMPRESSED) { - yield { type: GeminiEventType.ChatCompressed, value: compressed }; - } - - // Check session token limit after compression. - // `lastPromptTokenCount` is treated as authoritative for the (possibly compressed) history; + // Auto-compaction happens inside GeminiChat.sendMessageStream and surfaces + // via the `compressed → ChatCompressed` bridge in turn.ts. Manual /compress + // still calls tryCompressChat directly for the full reset (env refresh + + // forceFullIdeContext flip). const sessionTokenLimit = this.config.getSessionTokenLimit(); if (sessionTokenLimit > 0) { const lastPromptTokenCount = uiTelemetryService.getLastPromptTokenCount(); @@ -1000,6 +991,13 @@ export class GeminiClient { await arenaAgentClient.updateStatus(); } + // Re-send a full IDE context blob on the next regular message — auto + // compaction inside chat.sendMessageStream may have summarized away + // the previous IDE-context turn. + if (event.type === GeminiEventType.ChatCompressed) { + this.forceFullIdeContext = true; + } + yield event; if (event.type === GeminiEventType.Error) { if (arenaAgentClient) { @@ -1391,58 +1389,36 @@ export class GeminiClient { return generatorPromise; } + /** + * Wrapper around {@link GeminiChat.tryCompress} that restores main-session + * startup context after successful compaction and flips the IDE full-context + * flag for the next regular message. + */ async tryCompressChat( prompt_id: string, force: boolean = false, signal?: AbortSignal, ): Promise { - const compressionService = new ChatCompressionService(); - - const { newHistory, info } = await compressionService.compress( - this.getChat(), + const info = await this.getChat().tryCompress( prompt_id, - force, this.config.getModel(), - this.config, - this.hasFailedCompressionAttempt, + force, signal, ); - - // Handle compression result if (info.compressionStatus === CompressionStatus.COMPRESSED) { - // Success: update chat with new compressed history - if (newHistory) { - const chatRecordingService = this.config.getChatRecordingService(); - chatRecordingService?.recordChatCompression({ - info, - compressedHistory: newHistory, - }); - - await this.startChat(newHistory); - // Compaction rewrites the prompt history: prior full-Read tool - // results may have been summarised away, but the FileReadCache - // still believes those reads are "in this conversation". A - // follow-up Read could then return the file_unchanged - // placeholder pointing at content the model can no longer - // retrieve from its own context. Clear the cache so post- - // compaction Reads re-emit the bytes. - debugLogger.debug('[FILE_READ_CACHE] clear after tryCompressChat'); - this.config.getFileReadCache().clear(); - uiTelemetryService.setLastPromptTokenCount(info.newTokenCount); - this.forceFullIdeContext = true; - } - } else if ( - info.compressionStatus === - CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT || - info.compressionStatus === - CompressionStatus.COMPRESSION_FAILED_EMPTY_SUMMARY - ) { - // Track failed attempts (only mark as failed if not forced) - if (!force) { - this.hasFailedCompressionAttempt = true; - } + const compressedHistory = this.getChat().getHistory(); + await this.startChat(compressedHistory); + // startChat() creates a new GeminiChat without touching FileReadCache, + // so prior read_file results that were summarised away would still + // resolve to the file_unchanged placeholder. Clear so post-compaction + // Reads re-emit bytes the model can no longer see in history. + debugLogger.debug('[FILE_READ_CACHE] clear after tryCompressChat'); + this.config.getFileReadCache().clear(); + this.getChat().setLastPromptTokenCount(info.newTokenCount); + // Re-send a full IDE context blob on the next regular message — + // compression dropped the previous context turn from history. + this.forceFullIdeContext = true; } - return info; } } diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts index 8a45bb5d4..521e37e06 100644 --- a/packages/core/src/core/geminiChat.test.ts +++ b/packages/core/src/core/geminiChat.test.ts @@ -22,6 +22,8 @@ import { StreamContentError } from './openaiContentGenerator/pipeline.js'; import type { Config } from '../config/config.js'; import { setSimulate429 } from '../utils/testUtils.js'; import { uiTelemetryService } from '../telemetry/uiTelemetry.js'; +import { CompressionStatus, type ChatCompressionInfo } from './turn.js'; +import { ChatCompressionService } from '../services/chatCompressionService.js'; // Mock fs module to prevent actual file system operations during tests const mockFileSystem = new Map(); @@ -119,6 +121,13 @@ describe('GeminiChat', async () => { getTool: vi.fn(), }), getContentGenerator: vi.fn().mockReturnValue(mockContentGenerator), + getChatCompression: vi.fn().mockReturnValue(undefined), + getHookSystem: vi.fn().mockReturnValue(undefined), + getDebugLogger: vi + .fn() + .mockReturnValue({ debug: vi.fn(), warn: vi.fn(), info: vi.fn() }), + getApprovalMode: vi.fn().mockReturnValue('default'), + getFileReadCache: vi.fn().mockReturnValue({ clear: vi.fn() }), } as unknown as Config; // Disable 429 simulation for tests @@ -1019,6 +1028,223 @@ describe('GeminiChat', async () => { }); }); + describe('auto-compression integration', () => { + function makeStreamResponse(text = 'ok') { + return (async function* () { + yield { + candidates: [ + { + content: { parts: [{ text }], role: 'model' }, + finishReason: 'STOP', + index: 0, + safetyRatings: [], + }, + ], + text: () => text, + } as unknown as GenerateContentResponse; + })(); + } + + it('releases the send-lock when auto-compression throws (no deadlock)', async () => { + const compressSpy = vi + .spyOn(ChatCompressionService.prototype, 'compress') + .mockRejectedValueOnce(new Error('compression API down')); + + // First send: compression rejects, error propagates to caller. The + // streamDoneResolver must run so this.sendPromise resolves; otherwise + // every subsequent send blocks forever. + await expect( + chat.sendMessageStream( + 'test-model', + { message: 'first' }, + 'prompt-id-deadlock-1', + ), + ).rejects.toThrow('compression API down'); + + // Second send: compress returns NOOP, request goes through. If the + // lock leaked, this await would never resolve. + compressSpy.mockResolvedValueOnce({ + newHistory: null, + info: { + originalTokenCount: 0, + newTokenCount: 0, + compressionStatus: CompressionStatus.NOOP, + }, + }); + vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue( + makeStreamResponse('second response'), + ); + + const stream = await chat.sendMessageStream( + 'test-model', + { message: 'second' }, + 'prompt-id-deadlock-2', + ); + for await (const _ of stream) { + /* consume */ + } + + expect(compressSpy).toHaveBeenCalledTimes(2); + }); + + it('seeds inherited token count via setLastPromptTokenCount', async () => { + const subagentChat = new GeminiChat(mockConfig, config, [ + { role: 'user', parts: [{ text: 'inherited' }] }, + { role: 'model', parts: [{ text: 'inherited reply' }] }, + ]); + subagentChat.setLastPromptTokenCount(123_456); + expect(subagentChat.getLastPromptTokenCount()).toBe(123_456); + + // The compression service receives the seeded count, so the threshold + // check sees the inherited size — not the constructor default of 0. + const compressSpy = vi + .spyOn(ChatCompressionService.prototype, 'compress') + .mockResolvedValue({ + newHistory: null, + info: { + originalTokenCount: 123_456, + newTokenCount: 123_456, + compressionStatus: CompressionStatus.NOOP, + }, + }); + vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue( + makeStreamResponse(), + ); + + const stream = await subagentChat.sendMessageStream( + 'test-model', + { message: 'go' }, + 'prompt-id-seed', + ); + for await (const _ of stream) { + /* consume */ + } + + expect(compressSpy).toHaveBeenCalledTimes(1); + expect(compressSpy.mock.calls[0][1].originalTokenCount).toBe(123_456); + }); + + it('yields a COMPRESSED stream event as the first event after auto-compression succeeds', async () => { + const compressedHistory: Content[] = [ + { role: 'user', parts: [{ text: 'summary' }] }, + { role: 'model', parts: [{ text: 'ok' }] }, + ]; + vi.spyOn( + ChatCompressionService.prototype, + 'compress', + ).mockResolvedValueOnce({ + newHistory: compressedHistory, + info: { + originalTokenCount: 1000, + newTokenCount: 200, + compressionStatus: CompressionStatus.COMPRESSED, + }, + }); + vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue( + makeStreamResponse('answer'), + ); + + const stream = await chat.sendMessageStream( + 'test-model', + { message: 'go' }, + 'prompt-id-yield-compressed', + ); + const events: Array<{ type: StreamEventType }> = []; + for await (const event of stream) { + events.push(event as { type: StreamEventType }); + } + + expect(events.length).toBeGreaterThan(0); + expect(events[0].type).toBe(StreamEventType.COMPRESSED); + expect( + (events[0] as { type: StreamEventType; info: ChatCompressionInfo }).info + .compressionStatus, + ).toBe(CompressionStatus.COMPRESSED); + expect( + (events[0] as { type: StreamEventType; info: ChatCompressionInfo }).info + .newTokenCount, + ).toBe(200); + }); + + it('clears hasFailedCompressionAttempt after a forced successful compression', async () => { + const compressSpy = vi.spyOn( + ChatCompressionService.prototype, + 'compress', + ); + + // Step 1: auto-compression fails — latch is set on the chat. + compressSpy.mockResolvedValueOnce({ + newHistory: null, + info: { + originalTokenCount: 100_000, + newTokenCount: 100_000, + compressionStatus: + CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT, + }, + }); + vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue( + makeStreamResponse(), + ); + const stream1 = await chat.sendMessageStream( + 'test-model', + { message: 'first' }, + 'prompt-latch-1', + ); + for await (const _ of stream1) { + /* consume */ + } + // Latch passed to service was false on this attempt; service marks it + // failed and tryCompress flips the chat's flag to true. + expect(compressSpy.mock.calls[0][1].hasFailedCompressionAttempt).toBe( + false, + ); + + // Step 2: a forced /compress succeeds. After this, the latch must + // be cleared so future auto-compressions are not suppressed. + compressSpy.mockResolvedValueOnce({ + newHistory: [ + { role: 'user', parts: [{ text: 'summary' }] }, + { role: 'model', parts: [{ text: 'ack' }] }, + ], + info: { + originalTokenCount: 100_000, + newTokenCount: 30_000, + compressionStatus: CompressionStatus.COMPRESSED, + }, + }); + await chat.tryCompress('prompt-latch-force', 'test-model', true); + // tryCompress was called with force=true, so the service got latch=true + // (the gate is `hasFailedCompressionAttempt && !force`, force overrides). + expect(compressSpy.mock.calls[1][1].hasFailedCompressionAttempt).toBe( + true, + ); + + // Step 3: next auto-compression sees the cleared latch. + compressSpy.mockResolvedValueOnce({ + newHistory: null, + info: { + originalTokenCount: 30_000, + newTokenCount: 30_000, + compressionStatus: CompressionStatus.NOOP, + }, + }); + vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue( + makeStreamResponse(), + ); + const stream2 = await chat.sendMessageStream( + 'test-model', + { message: 'second' }, + 'prompt-latch-2', + ); + for await (const _ of stream2) { + /* consume */ + } + expect(compressSpy.mock.calls[2][1].hasFailedCompressionAttempt).toBe( + false, + ); + }); + }); + describe('addHistory', () => { it('should add a new content item to the history', () => { const newContent: Content = { @@ -2425,4 +2651,132 @@ describe('GeminiChat', async () => { expect(mergedText).toBe('BCD'); }); }); + + // Compression logic is tested in chatCompressionService.test.ts; this + // suite covers per-chat state on GeminiChat: hasFailedCompressionAttempt + // stickiness, token-count mutation, history replacement, and conditional + // telemetry mirroring. + describe('tryCompress (per-chat state)', () => { + const userMsg = (text: string) => ({ + role: 'user' as const, + parts: [{ text }], + }); + const modelMsg = (text: string) => ({ + role: 'model' as const, + parts: [{ text }], + }); + + /** + * Mock a successful compression: the service returns COMPRESSED with a + * fresh history. We don't go through the real + * `config.getContentGenerator().generateContent` path here — the service + * is mocked at the boundary. + */ + function mockCompressionService( + result: 'compressed' | 'failed-inflated' | 'noop', + ) { + const compressSpy = vi.spyOn( + ChatCompressionService.prototype, + 'compress', + ); + if (result === 'compressed') { + compressSpy.mockResolvedValue({ + newHistory: [userMsg('summary'), modelMsg('ok'), userMsg('latest')], + info: { + originalTokenCount: 1000, + newTokenCount: 200, + compressionStatus: CompressionStatus.COMPRESSED, + }, + }); + } else if (result === 'failed-inflated') { + compressSpy.mockResolvedValue({ + newHistory: null, + info: { + originalTokenCount: 1000, + newTokenCount: 1100, + compressionStatus: + CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT, + }, + }); + } else { + compressSpy.mockResolvedValue({ + newHistory: null, + info: { + originalTokenCount: 0, + newTokenCount: 0, + compressionStatus: CompressionStatus.NOOP, + }, + }); + } + return compressSpy; + } + + it('replaces history and updates per-chat lastPromptTokenCount on COMPRESSED', async () => { + mockCompressionService('compressed'); + chat.setHistory([userMsg('a'), modelMsg('b'), userMsg('c')]); + + const info = await chat.tryCompress('p1', 'm1'); + + expect(info.compressionStatus).toBe(CompressionStatus.COMPRESSED); + expect(chat.getHistory()).toHaveLength(3); + expect(chat.getHistory()[0]).toEqual(userMsg('summary')); + expect(chat.getLastPromptTokenCount()).toBe(200); + }); + + it('mirrors lastPromptTokenCount to the global telemetry only when wired', async () => { + mockCompressionService('compressed'); + // chat under test was constructed with telemetryService=uiTelemetryService. + await chat.tryCompress('p2', 'm1'); + expect(uiTelemetryService.setLastPromptTokenCount).toHaveBeenCalledWith( + 200, + ); + + // A subagent-style chat with no telemetryService must NOT touch the + // global singleton (per the constructor docstring; per-chat counter + // still updates). + const subagentChat = new GeminiChat(mockConfig, config, []); + vi.mocked(uiTelemetryService.setLastPromptTokenCount).mockClear(); + mockCompressionService('compressed'); + const info = await subagentChat.tryCompress('p3', 'm1'); + expect(info.compressionStatus).toBe(CompressionStatus.COMPRESSED); + expect(subagentChat.getLastPromptTokenCount()).toBe(200); + expect(uiTelemetryService.setLastPromptTokenCount).not.toHaveBeenCalled(); + }); + + it('marks hasFailedCompressionAttempt and suppresses subsequent unforced auto-compactions', async () => { + const compressSpy = mockCompressionService('failed-inflated'); + + const first = await chat.tryCompress('p1', 'm1'); + expect(first.compressionStatus).toBe( + CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT, + ); + expect(compressSpy).toHaveBeenCalledTimes(1); + + // The next unforced call should reach the service with + // hasFailedCompressionAttempt=true; the service's threshold check then + // returns NOOP. The important thing here is that GeminiChat actually + // forwards the sticky flag. + compressSpy.mockClear(); + compressSpy.mockResolvedValue({ + newHistory: null, + info: { + originalTokenCount: 0, + newTokenCount: 0, + compressionStatus: CompressionStatus.NOOP, + }, + }); + await chat.tryCompress('p2', 'm1'); + expect(compressSpy).toHaveBeenCalledTimes(1); + expect(compressSpy.mock.calls[0][1].hasFailedCompressionAttempt).toBe( + true, + ); + }); + + it('forwards force=true to the compression service', async () => { + const compressSpy = mockCompressionService('compressed'); + + await chat.tryCompress('p1', 'm1', true); + expect(compressSpy.mock.calls[0][1].force).toBe(true); + }); + }); }); diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts index 804c143d8..6249a56db 100644 --- a/packages/core/src/core/geminiChat.ts +++ b/packages/core/src/core/geminiChat.ts @@ -31,11 +31,13 @@ import { logContentRetryFailure, } from '../telemetry/loggers.js'; import { type ChatRecordingService } from '../services/chatRecordingService.js'; +import { ChatCompressionService } from '../services/chatCompressionService.js'; import { ContentRetryEvent, ContentRetryFailureEvent, } from '../telemetry/types.js'; import type { UiTelemetryService } from '../telemetry/uiTelemetry.js'; +import { type ChatCompressionInfo, CompressionStatus } from './turn.js'; const debugLogger = createDebugLogger('QWEN_CODE_CHAT'); @@ -45,6 +47,11 @@ export enum StreamEventType { /** A signal that a retry is about to happen. The UI should discard any partial * content from the attempt that just failed. */ RETRY = 'retry', + /** Emitted once at the start of the stream when an automatic compression + * pass succeeded. Carries the compression result so callers (the main + * agent UI, subagent loop) can surface it without each call site running + * its own compaction step. */ + COMPRESSED = 'compressed', } export type StreamEvent = @@ -56,7 +63,8 @@ export type StreamEvent = * fresh restart (escalation). The UI should keep the accumulated text * buffer so the continuation appends to it. */ isContinuation?: boolean; - }; + } + | { type: StreamEventType.COMPRESSED; info: ChatCompressionInfo }; /** * Options for retrying due to invalid content from the model. @@ -299,6 +307,22 @@ export class GeminiChat { // model. private sendPromise: Promise = Promise.resolve(); + /** + * Per-chat last-prompt-token-count, populated from `usageMetadata` on each + * model response. Used by the compaction threshold check so that subagents + * (which intentionally don't write to the global telemetry singleton) can + * still make compaction decisions based on their *own* context size. + */ + private lastPromptTokenCount = 0; + + /** + * Per-chat sticky flag. After an unforced compression attempt fails (empty + * summary or inflated token count), automatic compaction is suppressed + * for the remainder of this chat to avoid burning compression API calls + * in a loop. Manual `/compress` still works (it passes `force=true`). + */ + private hasFailedCompressionAttempt = false; + /** * Creates a new GeminiChat instance. * @@ -321,6 +345,96 @@ export class GeminiChat { validateHistory(history); } + /** + * Most recent prompt-token count reported by the model for *this* chat, + * mirroring the value in {@link UiTelemetryService} for the main session. + * Subagent chats have no telemetry service wired but still need a per-chat + * count for compaction decisions, so this is always populated regardless + * of whether the global telemetry is updated. + */ + getLastPromptTokenCount(): number { + return this.lastPromptTokenCount; + } + + /** + * Seed the last-prompt-token-count for chats created with inherited + * history (forks, subagents, speculation). Without this, the auto-compress + * threshold check sees `0` and refuses to compress — so the first API call + * can 400 from oversized history. Callers pass the parent chat's + * `getLastPromptTokenCount()` here. + */ + setLastPromptTokenCount(count: number): void { + this.lastPromptTokenCount = count; + } + + /** + * Attempt to compress this chat's history. + * + * Returns the compression info regardless of outcome. On a successful + * compaction (`COMPRESSED`), this method has already mutated the chat's + * history, recorded the event to `chatRecordingService` (if wired), and + * updated both the per-chat token count and (when wired) the global + * telemetry singleton. + */ + async tryCompress( + promptId: string, + model: string, + force = false, + signal?: AbortSignal, + ): Promise { + const service = new ChatCompressionService(); + const { newHistory, info } = await service.compress(this, { + promptId, + force, + model, + config: this.config, + hasFailedCompressionAttempt: this.hasFailedCompressionAttempt, + originalTokenCount: this.lastPromptTokenCount, + signal, + }); + + if (info.compressionStatus === CompressionStatus.COMPRESSED && newHistory) { + this.chatRecordingService?.recordChatCompression({ + info, + compressedHistory: newHistory, + }); + // Auto-compaction replaces history in place — no env-context refresh + // here. Manual /compress goes through GeminiClient.tryCompressChat, + // which calls startChat() to re-prepend a fresh env snapshot. See + // GeminiClient.sendMessageStream for the rationale behind the split. + this.setHistory(newHistory); + // Compaction summarises away prior full-Read tool results, but the + // FileReadCache still treats those reads as "in this conversation". + // A follow-up Read could then return the file_unchanged placeholder + // pointing at content the model can no longer retrieve from history. + debugLogger.debug('[FILE_READ_CACHE] clear after auto tryCompress'); + this.config.getFileReadCache().clear(); + this.lastPromptTokenCount = info.newTokenCount; + // Mirror to the global singleton only when wired (main session). + // Subagents pass `telemetryService=undefined` to keep their context + // usage out of the main agent's UI counters. + this.telemetryService?.setLastPromptTokenCount(info.newTokenCount); + // Re-enable auto-compaction so a forced /compress recovers a chat + // that an earlier auto-attempt latched off. + this.hasFailedCompressionAttempt = false; + } else if ( + info.compressionStatus === + CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT || + info.compressionStatus === + CompressionStatus.COMPRESSION_FAILED_EMPTY_SUMMARY || + info.compressionStatus === + CompressionStatus.COMPRESSION_FAILED_TOKEN_COUNT_ERROR + ) { + // Track failed attempts (only mark as failed if not forced) so we + // stop spending compression-API calls on a chat that can't shrink. + if (!force) { + this.hasFailedCompressionAttempt = true; + } + } + + return info; + } + setSystemInstruction(sysInstr: string) { this.generationConfig.systemInstruction = sysInstr; } @@ -360,6 +474,23 @@ export class GeminiChat { }); this.sendPromise = streamDonePromise; + // The send-lock above is held but the generator's `finally` (which + // resolves it) has not run yet — if `tryCompress` throws, we must + // release the lock here or subsequent sends will block forever at + // `await this.sendPromise`. + let compressionInfo: ChatCompressionInfo; + try { + compressionInfo = await this.tryCompress( + prompt_id, + model, + false, + params.config?.abortSignal, + ); + } catch (error) { + streamDoneResolver!(); + throw error; + } + const userContent = createUserContent(params.message); // Add user content to history ONCE before any attempts. @@ -370,6 +501,20 @@ export class GeminiChat { const self = this; return (async function* () { try { + // Surface a successful auto-compression to the caller as the first + // event in the stream. Failed/skipped compaction attempts are silent. + // Must be inside the try so that a consumer abandoning the stream + // immediately after this event still triggers the finally below; + // otherwise `streamDoneResolver` never fires and the next send hangs. + if ( + compressionInfo.compressionStatus === CompressionStatus.COMPRESSED + ) { + yield { + type: StreamEventType.COMPRESSED, + info: compressionInfo, + }; + } + let lastError: unknown = new Error('Request failed after all retries.'); let rateLimitRetryCount = 0; let invalidStreamRetryCount = 0; @@ -890,8 +1035,14 @@ export class GeminiChat { // Some providers omit total_tokens or return 0 in streaming usage chunks. const lastPromptTokenCount = usageMetadata.totalTokenCount || usageMetadata.promptTokenCount; - if (lastPromptTokenCount && this.telemetryService) { - this.telemetryService.setLastPromptTokenCount(lastPromptTokenCount); + if (lastPromptTokenCount) { + // Always update the per-chat counter so this chat (including + // subagents) can make its own compaction decisions. + this.lastPromptTokenCount = lastPromptTokenCount; + // Mirror to the global telemetry only when wired — subagents + // pass `telemetryService=undefined` to keep their context usage + // out of the main session's UI counters. + this.telemetryService?.setLastPromptTokenCount(lastPromptTokenCount); } if (usageMetadata.cachedContentTokenCount && this.telemetryService) { this.telemetryService.setLastCachedContentTokenCount( diff --git a/packages/core/src/core/turn.test.ts b/packages/core/src/core/turn.test.ts index 92c86f9ed..6626f56c5 100644 --- a/packages/core/src/core/turn.test.ts +++ b/packages/core/src/core/turn.test.ts @@ -9,7 +9,7 @@ import type { ServerGeminiToolCallRequestEvent, ServerGeminiErrorEvent, } from './turn.js'; -import { Turn, GeminiEventType } from './turn.js'; +import { CompressionStatus, Turn, GeminiEventType } from './turn.js'; import type { GenerateContentResponse, Part, Content } from '@google/genai'; import { reportError } from '../utils/errorReporting.js'; import type { GeminiChat } from './geminiChat.js'; @@ -845,6 +845,38 @@ describe('Turn', () => { { type: GeminiEventType.Content, value: 'Success' }, ]); }); + + it('bridges a compressed stream event to a ChatCompressed event', async () => { + const compressionInfo = { + originalTokenCount: 1000, + newTokenCount: 200, + compressionStatus: CompressionStatus.COMPRESSED, + }; + const mockResponseStream = (async function* () { + yield { type: StreamEventType.COMPRESSED, info: compressionInfo }; + yield { + type: StreamEventType.CHUNK, + value: { + candidates: [{ content: { parts: [{ text: 'after' }] } }], + }, + }; + })(); + mockSendMessageStream.mockResolvedValue(mockResponseStream); + + const events = []; + for await (const event of turn.run( + 'test-model', + [], + new AbortController().signal, + )) { + events.push(event); + } + + expect(events).toEqual([ + { type: GeminiEventType.ChatCompressed, value: compressionInfo }, + { type: GeminiEventType.Content, value: 'after' }, + ]); + }); }); describe('getDebugResponses', () => { diff --git a/packages/core/src/core/turn.ts b/packages/core/src/core/turn.ts index b0971d0b6..34f78bcd0 100644 --- a/packages/core/src/core/turn.ts +++ b/packages/core/src/core/turn.ts @@ -306,6 +306,19 @@ export class Turn { continue; // Skip to the next event in the stream } + // Surface auto-compaction that fired inside chat.sendMessageStream + // as the top-level ChatCompressed event so existing UI handlers stay + // connected. This bridge is the primary path for auto-compaction + // events; manual /compress emits its own ChatCompressed in + // GeminiClient.tryCompressChat. + if (streamEvent.type === 'compressed') { + yield { + type: GeminiEventType.ChatCompressed, + value: streamEvent.info, + }; + continue; + } + // Assuming other events are chunks with a `value` property const resp = streamEvent.value as GenerateContentResponse; if (!resp) continue; // Skip if there's no response body diff --git a/packages/core/src/services/chatCompressionService.test.ts b/packages/core/src/services/chatCompressionService.test.ts index 417988331..3a3520cce 100644 --- a/packages/core/src/services/chatCompressionService.test.ts +++ b/packages/core/src/services/chatCompressionService.test.ts @@ -75,15 +75,17 @@ describe('findCompressSplitPoint', () => { expect(findCompressSplitPoint(history, 0.8)).toBe(4); }); - it('should return earlier splitpoint if no valid ones are after threshhold', () => { + it('compresses everything before the trailing in-flight functionCall', () => { const history: Content[] = [ { role: 'user', parts: [{ text: 'This is the first message.' }] }, { role: 'model', parts: [{ text: 'This is the second message.' }] }, { role: 'user', parts: [{ text: 'This is the third message.' }] }, { role: 'model', parts: [{ functionCall: { name: 'foo', args: {} } }] }, ]; - // Can't return 4 because the previous item has a function call. - expect(findCompressSplitPoint(history, 0.99)).toBe(2); + // Trailing m+fc is in-flight; no preceding (m+fc, u+fr) pair to retain, + // so the in-flight fallback compresses everything except the trailing fc. + // The kept slice starts with m+fc; callers bridge with a synthetic user. + expect(findCompressSplitPoint(history, 0.99)).toBe(3); }); it('should handle a history with only one item', () => { @@ -143,7 +145,7 @@ describe('findCompressSplitPoint', () => { expect(findCompressSplitPoint(history, 0.7)).toBe(5); }); - it('should return primary split point when tool completions have no subsequent regular user message', () => { + it('retains last K complete tool rounds when no fresh user splits past target', () => { const history: Content[] = [ { role: 'user', parts: [{ text: 'Fix this' }] }, { @@ -181,14 +183,13 @@ describe('findCompressSplitPoint', () => { parts: [{ functionCall: { name: 'write1', args: {} } }], }, ]; - // Only one non-functionResponse user message (index 0) -> lastSplitPoint=0 - // Last message has functionCall -> can't compress everything - // historyToKeep must start with a regular user message, so split at 0 - // (compress nothing) is the only valid option. - expect(findCompressSplitPoint(history, 0.7)).toBe(0); + // 2 complete (m+fc, u+fr) pairs precede the trailing fc → retain both + // pairs + trailing fc = last 5 entries; compress index 0 (the task). + // Pre-refactor this returned 0 (NOOP); now it compresses-most. + expect(findCompressSplitPoint(history, 0.7)).toBe(history.length - 5); }); - it('should prefer primary split point when tool completions yield no valid user-starting split', () => { + it('prefers compress-most over lastSplitPoint when scan finds no clean split past target', () => { const longContent = 'a'.repeat(10000); const history: Content[] = [ { role: 'user', parts: [{ text: 'Fix bug A' }] }, @@ -229,13 +230,12 @@ describe('findCompressSplitPoint', () => { parts: [{ functionCall: { name: 'write1', args: {} } }], }, ]; - // Primary split points at 0 and 2 (regular user messages before the bulky tool outputs) - // Last message has functionCall -> can't compress everything - // Should return lastSplitPoint=2 (last valid primary split point) - expect(findCompressSplitPoint(history, 0.7)).toBe(2); + // 2 complete pairs before the trailing fc → retain both + trailing = 5 + // entries kept. Pre-refactor returned lastSplitPoint=2 (compress less). + expect(findCompressSplitPoint(history, 0.7)).toBe(history.length - 5); }); - it('should still prefer primary split point when it is better', () => { + it('compresses-most via in-flight fallback when scan never crosses the target', () => { const history: Content[] = [ { role: 'user', parts: [{ text: 'msg1' }] }, { role: 'model', parts: [{ text: 'resp1' }] }, @@ -266,10 +266,93 @@ describe('findCompressSplitPoint', () => { parts: [{ functionCall: { name: 'tool2', args: {} } }], }, ]; - // Primary split points: 0, 2, 5, 7 - // Last message has functionCall -> can't compress everything - // At 0.99 fraction, lastSplitPoint should be 7 - expect(findCompressSplitPoint(history, 0.99)).toBe(7); + // The entry before the trailing fc is a fresh user (msg4), not a u+fr, + // so the pair walk stops with 0 pairs found → retain only the trailing + // fc, compress everything else. Pre-refactor returned lastSplitPoint=7. + expect(findCompressSplitPoint(history, 0.99)).toBe(history.length - 1); + }); +}); + +describe('findCompressSplitPoint — in-flight fallback', () => { + const userTask = (text: string): Content => ({ + role: 'user', + parts: [{ text }], + }); + const modelText = (text: string): Content => ({ + role: 'model', + parts: [{ text }], + }); + const modelFc = (name: string): Content => ({ + role: 'model', + parts: [{ functionCall: { name, args: {} } }], + }); + const userFr = (name: string): Content => ({ + role: 'user', + parts: [{ functionResponse: { name, response: { result: 'x' } } }], + }); + + // Subagent-shaped history at compression check time: env bootstrap, task, + // alternating tool rounds, ending in a trailing in-flight model+fc whose + // functionResponse hasn't been pushed yet. The scan finds no clean split + // past the target fraction, so the in-flight fallback decides the index. + it('compresses everything except trailing fc + most recent retainCount pairs', () => { + const history = [ + userTask('env'), + modelText('env-ack'), + userTask('task'), + modelFc('a'), + userFr('a'), + modelFc('b'), + userFr('b'), + modelFc('c'), + userFr('c'), + modelFc('d'), + userFr('d'), + modelFc('trailing'), + ]; + // Default retainCount = 2 → keep last 5 (2 pairs + trailing). + expect(findCompressSplitPoint(history, 0.7)).toBe(history.length - 5); + }); + + it('retains all pairs when fewer than retainCount exist', () => { + const history = [ + userTask('env'), + modelText('env-ack'), + userTask('task'), + modelFc('a'), + userFr('a'), + modelFc('trailing'), + ]; + // Only 1 complete pair → keep last 3 (1 pair + trailing). + expect(findCompressSplitPoint(history, 0.7)).toBe(history.length - 3); + }); + + it('retains just the trailing fc when no complete pairs precede it', () => { + const history = [ + userTask('env'), + modelText('env-ack'), + userTask('task'), + modelFc('trailing'), + ]; + // No complete pairs → keep only the trailing fc. + expect(findCompressSplitPoint(history, 0.7)).toBe(history.length - 1); + }); + + it('respects an explicit retainCount override', () => { + const history = [ + userTask('env'), + modelText('env-ack'), + userTask('task'), + modelFc('a'), + userFr('a'), + modelFc('b'), + userFr('b'), + modelFc('c'), + userFr('c'), + modelFc('trailing'), + ]; + // Override retainCount to 1 → keep last 3 (1 pair + trailing). + expect(findCompressSplitPoint(history, 0.7, 1)).toBe(history.length - 3); }); }); @@ -313,14 +396,14 @@ describe('ChatCompressionService', () => { it('should return NOOP if history is empty', async () => { vi.mocked(mockChat.getHistory).mockReturnValue([]); - const result = await service.compress( - mockChat, - mockPromptId, - false, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(result.info.compressionStatus).toBe(CompressionStatus.NOOP); expect(result.newHistory).toBeNull(); }); @@ -329,14 +412,14 @@ describe('ChatCompressionService', () => { vi.mocked(mockChat.getHistory).mockReturnValue([ { role: 'user', parts: [{ text: 'hi' }] }, ]); - const result = await service.compress( - mockChat, - mockPromptId, - false, - mockModel, - mockConfig, - true, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: true, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(result.info.compressionStatus).toBe(CompressionStatus.NOOP); expect(result.newHistory).toBeNull(); }); @@ -349,14 +432,14 @@ describe('ChatCompressionService', () => { vi.mocked(tokenLimit).mockReturnValue(1000); // Threshold is 0.7 * 1000 = 700. 600 < 700, so NOOP. - const result = await service.compress( - mockChat, - mockPromptId, - false, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(result.info.compressionStatus).toBe(CompressionStatus.NOOP); expect(result.newHistory).toBeNull(); }); @@ -377,14 +460,14 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - const result = await service.compress( - mockChat, - mockPromptId, - false, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(result.info).toMatchObject({ compressionStatus: CompressionStatus.NOOP, @@ -394,14 +477,14 @@ describe('ChatCompressionService', () => { expect(mockGenerateContent).not.toHaveBeenCalled(); expect(tokenLimit).not.toHaveBeenCalled(); - const forcedResult = await service.compress( - mockChat, - mockPromptId, - true, - mockModel, - mockConfig, - false, - ); + const forcedResult = await service.compress(mockChat, { + promptId: mockPromptId, + force: true, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(forcedResult.info).toMatchObject({ compressionStatus: CompressionStatus.NOOP, originalTokenCount: 0, @@ -438,14 +521,14 @@ describe('ChatCompressionService', () => { } as unknown as ContentGenerator); // force=true bypasses the token threshold gate so we exercise the 5% guard - const result = await service.compress( - mockChat, - mockPromptId, - true, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: true, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(result.info.compressionStatus).toBe(CompressionStatus.NOOP); expect(result.newHistory).toBeNull(); @@ -485,14 +568,14 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - const result = await service.compress( - mockChat, - mockPromptId, - false, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED); expect(result.info.newTokenCount).toBe(250); // 800 - (1600 - 1000) + 50 @@ -539,14 +622,15 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - const result = await service.compress( - mockChat, - mockPromptId, - true, // forced - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: true, + // forced + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED); expect(result.newHistory).not.toBeNull(); @@ -586,14 +670,14 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - const result = await service.compress( - mockChat, - mockPromptId, - true, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: true, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(result.info.compressionStatus).toBe( CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT, @@ -629,14 +713,14 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - const result = await service.compress( - mockChat, - mockPromptId, - false, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(result.info.compressionStatus).toBe( CompressionStatus.COMPRESSION_FAILED_TOKEN_COUNT_ERROR, @@ -668,14 +752,14 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - const result = await service.compress( - mockChat, - mockPromptId, - true, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: true, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(result.info.compressionStatus).toBe( CompressionStatus.COMPRESSION_FAILED_EMPTY_SUMMARY, @@ -707,14 +791,14 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - const result = await service.compress( - mockChat, - mockPromptId, - true, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: true, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(result.info.compressionStatus).toBe( CompressionStatus.COMPRESSION_FAILED_EMPTY_SUMMARY, @@ -749,14 +833,14 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - const result = await service.compress( - mockChat, - mockPromptId, - true, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: true, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(result.info.compressionStatus).toBe( CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT, @@ -801,14 +885,14 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - const result = await service.compress( - mockChat, - mockPromptId, - false, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); // Should still complete compression despite hook error expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED); @@ -860,14 +944,15 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - await service.compress( - mockChat, - mockPromptId, - true, // force = true -> Manual trigger - mockModel, - mockConfig, - false, - ); + await service.compress(mockChat, { + promptId: mockPromptId, + force: true, + // force = true -> Manual trigger + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(mockFirePreCompactEvent).toHaveBeenCalledWith( PreCompactTrigger.Manual, @@ -910,14 +995,15 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - await service.compress( - mockChat, - mockPromptId, - false, // force = false -> Auto trigger - mockModel, - mockConfig, - false, - ); + await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + // force = false -> Auto trigger + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(mockFirePreCompactEvent).toHaveBeenCalledWith( PreCompactTrigger.Auto, @@ -929,14 +1015,14 @@ describe('ChatCompressionService', () => { it('should not fire PreCompact hook when history is empty', async () => { vi.mocked(mockChat.getHistory).mockReturnValue([]); - const result = await service.compress( - mockChat, - mockPromptId, - true, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: true, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(result.info.compressionStatus).toBe(CompressionStatus.NOOP); expect(mockFirePreCompactEvent).not.toHaveBeenCalled(); @@ -952,14 +1038,14 @@ describe('ChatCompressionService', () => { contextPercentageThreshold: 0, }); - const result = await service.compress( - mockChat, - mockPromptId, - true, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: true, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(result.info.compressionStatus).toBe(CompressionStatus.NOOP); expect(mockFirePreCompactEvent).not.toHaveBeenCalled(); @@ -976,14 +1062,14 @@ describe('ChatCompressionService', () => { ); vi.mocked(tokenLimit).mockReturnValue(1000); - const result = await service.compress( - mockChat, - mockPromptId, - false, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(result.info.compressionStatus).toBe(CompressionStatus.NOOP); expect(mockFirePreCompactEvent).not.toHaveBeenCalled(); @@ -1027,14 +1113,14 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - const result = await service.compress( - mockChat, - mockPromptId, - false, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); // Should still complete compression despite hook error expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED); @@ -1084,14 +1170,14 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - await service.compress( - mockChat, - mockPromptId, - false, - mockModel, - mockConfig, - false, - ); + await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); // PreCompact should be called before SessionStart expect(callOrder).toEqual(['PreCompact', 'SessionStart']); @@ -1133,14 +1219,14 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - const result = await service.compress( - mockChat, - mockPromptId, - false, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); // Should still complete compression without hook expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED); @@ -1195,14 +1281,15 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - await service.compress( - mockChat, - mockPromptId, - true, // force = true -> Manual trigger - mockModel, - mockConfig, - false, - ); + await service.compress(mockChat, { + promptId: mockPromptId, + force: true, + // force = true -> Manual trigger + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(mockFirePostCompactEvent).toHaveBeenCalledWith( PostCompactTrigger.Manual, @@ -1245,14 +1332,15 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - await service.compress( - mockChat, - mockPromptId, - false, // force = false -> Auto trigger - mockModel, - mockConfig, - false, - ); + await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + // force = false -> Auto trigger + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(mockFirePostCompactEvent).toHaveBeenCalledWith( PostCompactTrigger.Auto, @@ -1292,14 +1380,14 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - const result = await service.compress( - mockChat, - mockPromptId, - true, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: true, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); expect(result.info.compressionStatus).toBe( CompressionStatus.COMPRESSION_FAILED_EMPTY_SUMMARY, @@ -1345,14 +1433,14 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - const result = await service.compress( - mockChat, - mockPromptId, - false, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); // Should still complete compression despite hook error expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED); @@ -1405,14 +1493,14 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - await service.compress( - mockChat, - mockPromptId, - false, - mockModel, - mockConfig, - false, - ); + await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); // Hooks should be called in order: PreCompact -> SessionStart -> PostCompact expect(callOrder).toEqual(['PreCompact', 'SessionStart', 'PostCompact']); @@ -1454,14 +1542,14 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - const result = await service.compress( - mockChat, - mockPromptId, - false, - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); // Should still complete compression without hook expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED); @@ -1535,14 +1623,15 @@ describe('ChatCompressionService', () => { generateContent: mockGenerateContent, } as unknown as ContentGenerator); - const result = await service.compress( - mockChat, - mockPromptId, - true, // force=true (manual /compress) - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: true, + // force=true (manual /compress) + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); // Should compress successfully — orphaned funcCall is stripped first, then // normal compression runs on the remaining history, historyToKeep is empty @@ -1558,10 +1647,12 @@ describe('ChatCompressionService', () => { expect(callArg.contents.length).toBe(history.length); // (history.length - 1) messages + 1 instruction }); - it('should NOT compress orphaned funcCall when force=false (auto-compress)', async () => { - // Auto-compress fires BEFORE the matching funcResponse is sent back to the - // model. Compressing the funcCall away would orphan the upcoming funcResponse - // and cause an API error. So force=false must NOT take this path. + it('compresses-most without orphaning when last entry is in-flight funcCall (auto-compress)', async () => { + // Auto-compress fires BEFORE the matching funcResponse is sent back to + // the model. The trailing funcCall must be retained (its response is + // coming); the in-flight fallback compresses everything safely before + // it. Pre-refactor this returned NOOP, leaving the chat to grow until + // it 400'd. const history: Content[] = [ { role: 'user', parts: [{ text: 'Fix all TypeScript errors.' }] }, { @@ -1586,7 +1677,6 @@ describe('ChatCompressionService', () => { }, ]; vi.mocked(mockChat.getHistory).mockReturnValue(history); - // Use a token count above threshold to ensure auto-compress isn't skipped vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue( 800, ); @@ -1595,23 +1685,195 @@ describe('ChatCompressionService', () => { contextWindowSize: 1000, } as unknown as ReturnType); + const mockGenerateContent = vi.fn().mockResolvedValue({ + candidates: [ + { content: { parts: [{ text: 'state snapshot summary' }] } }, + ], + usageMetadata: { + promptTokenCount: 2000, + candidatesTokenCount: 50, + totalTokenCount: 2050, + }, + } as unknown as GenerateContentResponse); + vi.mocked(mockConfig.getContentGenerator).mockReturnValue({ + generateContent: mockGenerateContent, + } as unknown as ContentGenerator); + + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); + + expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED); + expect(mockGenerateContent).toHaveBeenCalledTimes(1); + // Trailing in-flight functionCall is preserved last in the kept slice + // so the upcoming functionResponse pairs with it. + const newHistory = result.newHistory!; + const last = newHistory[newHistory.length - 1]; + expect(last.role).toBe('model'); + expect(last.parts?.some((p) => p.functionCall)).toBe(true); + // Strict role alternation throughout. + for (let i = 1; i < newHistory.length; i++) { + expect(newHistory[i].role).not.toBe(newHistory[i - 1].role); + } + }); + }); + + describe('tool-loop subagent absorption', () => { + // The fresh-user split heuristic produces a tiny compress slice when the + // history is dominated by tool rounds (every user past the task is a + // functionResponse). Without absorption, MIN_COMPRESSION_FRACTION would + // NOOP every send and the subagent eventually hits the 400 it was meant + // to avoid. + it('compresses by absorbing older tool rounds when fresh-user split is too small', async () => { + const FILLER = 'A'.repeat(20_000); + // Auto-compress fires BEFORE the next functionResponse is pushed, so + // the trailing entry is always a model+functionCall with no match yet. + // Build a history with N complete pairs followed by one trailing fc. + const buildHistory = (completePairs: number): Content[] => { + const h: Content[] = [ + { role: 'user', parts: [{ text: 'env-bootstrap' }] }, + { role: 'model', parts: [{ text: 'env-ack' }] }, + { role: 'user', parts: [{ text: 'task: explore' }] }, + ]; + for (let r = 0; r < completePairs; r++) { + h.push({ + role: 'model', + parts: [ + { text: `round ${r}: ${FILLER}` }, + { functionCall: { name: 'glob', args: { pattern: '**/*.md' } } }, + ], + }); + h.push({ + role: 'user', + parts: [ + { + functionResponse: { name: 'glob', response: { result: 'x' } }, + }, + ], + }); + } + // Trailing model+fc whose response is about to be sent. + h.push({ + role: 'model', + parts: [ + { text: `round ${completePairs}: ${FILLER}` }, + { functionCall: { name: 'glob', args: { pattern: '**/*.md' } } }, + ], + }); + return h; + }; + + // Five complete tool rounds + 1 trailing fc → 5 pairs in keep; absorbs + // 3 older pairs and retains the 2 most recent (plus the trailing fc). + vi.mocked(mockChat.getHistory).mockReturnValue(buildHistory(5)); + vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue( + 80_000, + ); + vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({ + model: 'gemini-pro', + contextWindowSize: 100_000, + } as unknown as ReturnType); + + const mockGenerateContent = vi.fn().mockResolvedValue({ + candidates: [ + { content: { parts: [{ text: 'state snapshot summary' }] } }, + ], + usageMetadata: { + promptTokenCount: 60_000, + candidatesTokenCount: 200, + totalTokenCount: 60_200, + }, + } as unknown as GenerateContentResponse); + vi.mocked(mockConfig.getContentGenerator).mockReturnValue({ + generateContent: mockGenerateContent, + } as unknown as ContentGenerator); + + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); + + expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED); + expect(result.newHistory).not.toBeNull(); + expect(mockGenerateContent).toHaveBeenCalledTimes(1); + + const newHistory = result.newHistory!; + // [summary_user, summary_ack_model, continuation_bridge_user, ...keep] + // where keep starts with the retained model+functionCall. + expect(newHistory[0].role).toBe('user'); + expect(newHistory[0].parts?.[0].text).toBe('state snapshot summary'); + expect(newHistory[1].role).toBe('model'); + expect(newHistory[2].role).toBe('user'); + expect(newHistory[2].parts?.[0].text).toMatch(/Continue/); + // Retained two complete pairs (4 entries) + trailing model+fc = 5. + expect(newHistory.slice(3)).toHaveLength(5); + expect(newHistory[3].role).toBe('model'); + expect(newHistory[3].parts?.some((p) => p.functionCall)).toBe(true); + expect(newHistory[4].role).toBe('user'); + expect(newHistory[4].parts?.some((p) => p.functionResponse)).toBe(true); + // Trailing model+fc remains last so the upcoming functionResponse pushed + // by sendMessageStream pairs with it correctly. + const last = newHistory[newHistory.length - 1]; + expect(last.role).toBe('model'); + expect(last.parts?.some((p) => p.functionCall)).toBe(true); + + // Strict role alternation throughout the new history. + for (let i = 1; i < newHistory.length; i++) { + expect(newHistory[i].role).not.toBe(newHistory[i - 1].role); + } + }); + + it('NOOPs when the keep slice has too few tool rounds to absorb', async () => { + const FILLER = 'A'.repeat(20_000); + const history: Content[] = [ + { role: 'user', parts: [{ text: 'env-bootstrap' }] }, + { role: 'model', parts: [{ text: 'env-ack' }] }, + { role: 'user', parts: [{ text: 'task' }] }, + { + role: 'model', + parts: [ + { text: FILLER }, + { functionCall: { name: 'glob', args: {} } }, + ], + }, + ]; + vi.mocked(mockChat.getHistory).mockReturnValue(history); + // Set originalTokenCount above the threshold gate (0.7 * 30000 = 21000) + // so the test actually exercises findCompressSplitPoint and the + // MIN_COMPRESSION_FRACTION decision rather than short-circuiting at + // the cheap-gate. + vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue( + 22_000, + ); + vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({ + model: 'gemini-pro', + contextWindowSize: 30_000, + } as unknown as ReturnType); + const mockGenerateContent = vi.fn(); vi.mocked(mockConfig.getContentGenerator).mockReturnValue({ generateContent: mockGenerateContent, } as unknown as ContentGenerator); - const result = await service.compress( - mockChat, - mockPromptId, - false, // force=false (auto-compress) - mockModel, - mockConfig, - false, - ); + const result = await service.compress(mockChat, { + promptId: mockPromptId, + force: false, + model: mockModel, + config: mockConfig, + hasFailedCompressionAttempt: false, + originalTokenCount: uiTelemetryService.getLastPromptTokenCount(), + }); - // Must return NOOP — compressing would orphan the upcoming funcResponse expect(result.info.compressionStatus).toBe(CompressionStatus.NOOP); - expect(result.newHistory).toBeNull(); expect(mockGenerateContent).not.toHaveBeenCalled(); }); }); diff --git a/packages/core/src/services/chatCompressionService.ts b/packages/core/src/services/chatCompressionService.ts index 876d051ac..4e6b19bd3 100644 --- a/packages/core/src/services/chatCompressionService.ts +++ b/packages/core/src/services/chatCompressionService.ts @@ -8,7 +8,6 @@ import type { Content } from '@google/genai'; import type { Config } from '../config/config.js'; import type { GeminiChat } from '../core/geminiChat.js'; import { type ChatCompressionInfo, CompressionStatus } from '../core/turn.js'; -import { uiTelemetryService } from '../telemetry/uiTelemetry.js'; import { DEFAULT_TOKEN_LIMIT } from '../core/tokenLimits.js'; import { getCompressionPrompt } from '../core/prompts.js'; import { getResponseText } from '../utils/partUtils.js'; @@ -40,15 +39,79 @@ export const COMPRESSION_PRESERVE_THRESHOLD = 0.3; */ export const MIN_COMPRESSION_FRACTION = 0.05; +/** + * When the trailing entry is an in-flight `model+functionCall` and the regular + * scan finds no clean split past the target fraction, the splitter falls back + * to compressing everything except the last few entries. This constant sets + * how many most-recent complete `(model+functionCall, user+functionResponse)` + * tool rounds are retained as working context (the trailing in-flight call is + * always retained on top of these). + */ +export const TOOL_ROUND_RETAIN_COUNT = 2; + +const hasFunctionCall = (content: Content | undefined): boolean => + !!content?.parts?.some((part) => !!part.functionCall); + +const hasFunctionResponse = (content: Content | undefined): boolean => + !!content?.parts?.some((part) => !!part.functionResponse); + +/** + * Walk backward from the trailing in-flight `model+functionCall` and return + * the index after which the most-recent `retainCount` complete tool-round + * pairs sit (plus the trailing fc itself). Used by the splitter's in-flight + * fallback path. Stops counting at the first non-pair encountered, so the + * retain count is best-effort: if there are fewer complete pairs than + * requested, all of them are retained. + */ +function splitPointRetainingTrailingPairs( + contents: Content[], + retainCount: number, +): number { + let pairsFound = 0; + let i = contents.length - 2; + while (i >= 1 && pairsFound < retainCount) { + if (hasFunctionCall(contents[i - 1]) && hasFunctionResponse(contents[i])) { + pairsFound += 1; + i -= 2; + } else { + break; + } + } + return contents.length - (2 * pairsFound + 1); +} + /** * Returns the index of the oldest item to keep when compressing. May return * contents.length which indicates that everything should be compressed. * + * The algorithm has two phases: + * + * 1. **Scan:** walk left-to-right looking for the first non-functionResponse + * user message that lands past `fraction` of total chars. That's the + * "clean" split — the kept slice starts with a fresh user prompt. + * + * 2. **Fallbacks** (no clean split found): the gate that gets us here has + * already decided we need to compress, so all three fallbacks bias toward + * *more* compression rather than less: + * + * - last entry is `model` without functionCall → compress everything. + * - last entry is `user` with functionResponse → compress everything (the + * trailing tool round is complete; no orphans). + * - last entry is `model` with functionCall (in-flight) → compress + * everything except the trailing call plus the last `retainCount` + * complete tool rounds. The kept slice may start with `model+fc`; + * callers must inject a synthetic continuation user message between + * `summary_ack_model` and the kept slice to preserve role alternation. + * + * The pre-fallback returns of `lastSplitPoint` (compress less) only happen + * for malformed histories that don't end in user/model. + * * Exported for testing purposes. */ export function findCompressSplitPoint( contents: Content[], fraction: number, + retainCount = TOOL_ROUND_RETAIN_COUNT, ): number { if (fraction <= 0 || fraction >= 1) { throw new Error('Fraction must be between 0 and 1'); @@ -58,14 +121,11 @@ export function findCompressSplitPoint( const totalCharCount = charCounts.reduce((a, b) => a + b, 0); const targetCharCount = totalCharCount * fraction; - let lastSplitPoint = 0; // 0 is always valid (compress nothing) + let lastSplitPoint = 0; let cumulativeCharCount = 0; for (let i = 0; i < contents.length; i++) { const content = contents[i]; - if ( - content.role === 'user' && - !content.parts?.some((part) => !!part.functionResponse) - ) { + if (content.role === 'user' && !hasFunctionResponse(content)) { if (cumulativeCharCount >= targetCharCount) { return i; } @@ -74,48 +134,57 @@ export function findCompressSplitPoint( cumulativeCharCount += charCounts[i]; } - // We found no split points after targetCharCount. - // Check if it's safe to compress everything. const lastContent = contents[contents.length - 1]; - if ( - lastContent?.role === 'model' && - !lastContent?.parts?.some((part) => part.functionCall) - ) { + if (lastContent?.role === 'model') { + if (!hasFunctionCall(lastContent)) return contents.length; + return splitPointRetainingTrailingPairs(contents, retainCount); + } + if (lastContent?.role === 'user' && hasFunctionResponse(lastContent)) { return contents.length; } - // Also safe to compress everything if the last message completes a tool call - // sequence (all function calls have matching responses). - if ( - lastContent?.role === 'user' && - lastContent?.parts?.some((part) => !!part.functionResponse) - ) { - return contents.length; - } - return lastSplitPoint; } +export interface CompressOptions { + promptId: string; + force: boolean; + model: string; + config: Config; + /** + * Whether a previous unforced compression attempt failed for this chat. + * Suppresses auto-compaction; manual `/compress` (force=true) overrides. + */ + hasFailedCompressionAttempt: boolean; + /** + * Most recent prompt token count for this chat. Compared against + * `threshold * contextWindowSize` for the auto-compaction gate. Callers + * source this from the per-chat counter (main session, subagents alike) — + * the service does not read or write any global telemetry. + */ + originalTokenCount: number; + signal?: AbortSignal; +} + export class ChatCompressionService { async compress( chat: GeminiChat, - promptId: string, - force: boolean, - model: string, - config: Config, - hasFailedCompressionAttempt: boolean, - signal?: AbortSignal, + opts: CompressOptions, ): Promise<{ newHistory: Content[] | null; info: ChatCompressionInfo }> { - const curatedHistory = chat.getHistory(true); + const { + promptId, + force, + model, + config, + hasFailedCompressionAttempt, + originalTokenCount, + signal, + } = opts; const threshold = config.getChatCompression()?.contextPercentageThreshold ?? COMPRESSION_TOKEN_THRESHOLD; - // Regardless of `force`, don't do anything if the history is empty. - if ( - curatedHistory.length === 0 || - threshold <= 0 || - (hasFailedCompressionAttempt && !force) - ) { + // Cheap gates first — these don't need the curated history. + if (threshold <= 0 || (hasFailedCompressionAttempt && !force)) { return { newHistory: null, info: { @@ -126,9 +195,9 @@ export class ChatCompressionService { }; } - const originalTokenCount = uiTelemetryService.getLastPromptTokenCount(); - - // Don't compress if not forced and we are under the limit. + // Don't compress if not forced and we are under the limit. This is the + // steady-state path on every send; we want to exit before paying for the + // full `getHistory(true)` clone below. if (!force) { const contextLimit = config.getContentGeneratorConfig()?.contextWindowSize ?? @@ -145,6 +214,18 @@ export class ChatCompressionService { } } + const curatedHistory = chat.getHistory(true); + if (curatedHistory.length === 0) { + return { + newHistory: null, + info: { + originalTokenCount: 0, + newTokenCount: 0, + compressionStatus: CompressionStatus.NOOP, + }, + }; + } + // Fire PreCompact hook before compression begins const hookSystem = config.getHookSystem(); if (hookSystem) { @@ -181,6 +262,10 @@ export class ChatCompressionService { const historyToCompress = historyForSplit.slice(0, splitPoint); const historyToKeep = historyForSplit.slice(splitPoint); + // The in-flight fallback path may produce a kept slice starting with + // model+functionCall; the post-summary history needs a synthetic user + // between the summary's model_ack and the kept entries. + const keepNeedsContinuationBridge = historyToKeep[0]?.role === 'model'; if (historyToCompress.length === 0) { return { @@ -196,10 +281,6 @@ export class ChatCompressionService { // Guard: if historyToCompress is too small relative to the total history, // skip compression. This prevents futile API calls where the model receives // almost no context and generates a useless "summary" that inflates tokens. - // - // Note: findCompressSplitPoint already computes charCounts internally but - // returns only the split index. We intentionally recompute here to keep - // the function signature simple; this is a minor, acceptable duplication. const compressCharCount = historyToCompress.reduce( (sum, c) => sum + JSON.stringify(c).length, 0, @@ -274,6 +355,22 @@ export class ChatCompressionService { role: 'model', parts: [{ text: 'Got it. Thanks for the additional context!' }], }, + // When the kept slice starts with model+functionCall (because + // tool-round absorption pulled the only fresh user message into + // compress), inject a synthetic continuation prompt so the joined + // history alternates correctly. + ...(keepNeedsContinuationBridge + ? [ + { + role: 'user' as const, + parts: [ + { + text: 'Continue with the prior task using the context above.', + }, + ], + }, + ] + : []), ...historyToKeep, ]; @@ -339,8 +436,6 @@ export class ChatCompressionService { }, }; } else { - uiTelemetryService.setLastPromptTokenCount(newTokenCount); - // Fire SessionStart event after successful compression try { const permissionMode = String(