From 8a1ed565a6eaf16e1194018c8a97cb9bc1998fdc Mon Sep 17 00:00:00 2001
From: tanzhenxin <tanzhenxing1987@gmail.com>
Date: Wed, 6 May 2026 14:09:07 +0800
Subject: [PATCH] fix(core): auto-compact subagent context to prevent overflow
 (#3735)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(core): auto-compact subagent context to prevent overflow

Subagent chats accumulated history without ever compacting, so a long
multi-turn run could hit "max context length exceeded" before the
compaction logic the main session uses had a chance to fire.

Move compaction down into the chat layer so both main agent and subagent
auto-compress at the configured threshold, and surface the result via a
new chat stream event that bridges into the existing ChatCompressed UI
path. The main-session wrapper still owns full /compress reset.

Closes #3664

* fix(core): address subagent compaction review feedback

Code fixes:
- Seed `lastPromptTokenCount` on subagent chats so the first-send
  threshold gate sees the inherited history's true size.
- Add `COMPRESSION_FAILED_TOKEN_COUNT_ERROR` to the fail-latch chain
  so token-counting failures stop retrying compression every send.
- Restore `FileReadCache.clear()` after compaction in both the manual
  /compress wrapper and the auto-compaction path inside GeminiChat,
  preventing post-summary `file_unchanged` placeholders from pointing
  at content the model can no longer retrieve.
- Refresh stale comment on the `compressed → ChatCompressed` bridge
  in turn.ts now that this path is the primary route, not a fallback.

Tests:
- turn.test.ts asserts the compressed → ChatCompressed bridge.
- geminiChat.test.ts asserts COMPRESSED yields as the first stream
  event after auto-compaction succeeds.
- chatCompressionService.test.ts bumps originalTokenCount above the
  cheap-gate so the NOOP test exercises findCompressSplitPoint.
- client.test.ts asserts forceFullIdeContext flips when a
  ChatCompressed event flows through sendMessageStream's loop.

* chore(core): drop redundant StreamEvent cast and document auto-compaction trade-off

- Remove the `as StreamEvent` cast at the COMPRESSED yield site — the
  literal already matches the union member.
- Add a 4-line comment at the auto-compaction setHistory point that
  points readers to GeminiClient for the env-refresh trade-off rationale,
  so readers don't have to chase the layering decision back across files.
---
 .../core/src/agents/runtime/agent-core.ts     |  20 +-
 .../src/agents/runtime/agent-headless.test.ts |   4 +
 packages/core/src/core/client.test.ts         | 815 ++++--------------
 packages/core/src/core/client.ts              |  90 +-
 packages/core/src/core/geminiChat.test.ts     | 354 ++++++++
 packages/core/src/core/geminiChat.ts          | 157 +++-
 packages/core/src/core/turn.test.ts           |  34 +-
 packages/core/src/core/turn.ts                |  13 +
 .../services/chatCompressionService.test.ts   | 794 +++++++++++------
 .../src/services/chatCompressionService.ts    | 181 +++-
 10 files changed, 1434 insertions(+), 1028 deletions(-)

diff --git a/packages/core/src/agents/runtime/agent-core.ts b/packages/core/src/agents/runtime/agent-core.ts
index 8f8202965..bb959f4d0 100644
--- a/packages/core/src/agents/runtime/agent-core.ts
+++ b/packages/core/src/agents/runtime/agent-core.ts
@@ -317,11 +317,17 @@ export class AgentCore {
     }
 
     try {
-      return new GeminiChat(
+      const chat = new GeminiChat(
         this.runtimeContext,
         generationConfig,
         startHistory,
       );
+      // Seed the per-chat token count so the auto-compaction threshold
+      // gate sees the inherited history's true size on the first send.
+      // Without this, fork subagents start at 0 and the gate NOOPs even
+      // when `startHistory` is already huge — first API call can 400.
+      chat.setLastPromptTokenCount(this.lastPromptTokenCount);
+      return chat;
     } catch (error) {
       await reportError(
         error,
@@ -540,6 +546,18 @@ export class AgentCore {
           continue;
         }
 
+        // GeminiChat already mutated its own history; surface to the debug
+        // log so subagent compactions show up alongside the main session's.
+        if (streamEvent.type === 'compressed') {
+          this.runtimeContext
+            .getDebugLogger()
+            .debug(
+              `[AGENT-COMPACT] subagent=${this.subagentId} round=${turnCounter} ` +
+                `tokens ${streamEvent.info.originalTokenCount} -> ${streamEvent.info.newTokenCount}`,
+            );
+          continue;
+        }
+
         // Handle chunk events
         if (streamEvent.type === 'chunk') {
           const resp = streamEvent.value;
diff --git a/packages/core/src/agents/runtime/agent-headless.test.ts b/packages/core/src/agents/runtime/agent-headless.test.ts
index 9f3f329a2..d54a7af18 100644
--- a/packages/core/src/agents/runtime/agent-headless.test.ts
+++ b/packages/core/src/agents/runtime/agent-headless.test.ts
@@ -280,6 +280,7 @@ describe('subagent.ts', () => {
         () =>
           ({
             sendMessageStream: mockSendMessageStream,
+            setLastPromptTokenCount: vi.fn(),
           }) as unknown as GeminiChat,
       );
 
@@ -958,6 +959,7 @@ describe('subagent.ts', () => {
           () =>
             ({
               sendMessageStream: mockSendMessageStream,
+              setLastPromptTokenCount: vi.fn(),
             }) as unknown as GeminiChat,
         );
 
@@ -997,6 +999,7 @@ describe('subagent.ts', () => {
           () =>
             ({
               sendMessageStream: mockSendMessageStream,
+              setLastPromptTokenCount: vi.fn(),
             }) as unknown as GeminiChat,
         );
 
@@ -1061,6 +1064,7 @@ describe('subagent.ts', () => {
           () =>
             ({
               sendMessageStream: mockSendMessageStream,
+              setLastPromptTokenCount: vi.fn(),
             }) as unknown as GeminiChat,
         );
 
diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts
index 93bfe771f..47a8c9a87 100644
--- a/packages/core/src/core/client.test.ts
+++ b/packages/core/src/core/client.test.ts
@@ -29,12 +29,7 @@ import type { Config } from '../config/config.js';
 import { ApprovalMode } from '../config/config.js';
 import type { ModelsConfig } from '../models/modelsConfig.js';
 import { retryWithBackoff } from '../utils/retry.js';
-import {
-  CompressionStatus,
-  GeminiEventType,
-  Turn,
-  type ChatCompressionInfo,
-} from './turn.js';
+import { CompressionStatus, GeminiEventType, Turn } from './turn.js';
 
 vi.mock('../utils/retry.js', () => ({
   retryWithBackoff: vi.fn(async (fn) => await fn()),
@@ -161,6 +156,8 @@ vi.mock('../utils/generateContentResponseUtilities', () => ({
 const mockUiTelemetryService = vi.hoisted(() => ({
   setLastPromptTokenCount: vi.fn(),
   getLastPromptTokenCount: vi.fn(),
+  reset: vi.fn(),
+  addEvent: vi.fn(),
 }));
 
 vi.mock('../telemetry/index.js', async (importOriginal) => {
@@ -264,15 +261,16 @@ describe('findCompressSplitPoint', () => {
     expect(findCompressSplitPoint(history, 0.8)).toBe(4);
   });
 
-  it('should return earlier splitpoint if no valid ones are after threshhold', () => {
+  it('compresses everything before the trailing in-flight functionCall', () => {
     const history: Content[] = [
       { role: 'user', parts: [{ text: 'This is the first message.' }] },
       { role: 'model', parts: [{ text: 'This is the second message.' }] },
       { role: 'user', parts: [{ text: 'This is the third message.' }] },
       { role: 'model', parts: [{ functionCall: {} }] },
     ];
-    // Can't return 4 because the previous item has a function call.
-    expect(findCompressSplitPoint(history, 0.99)).toBe(2);
+    // Trailing m+fc is in-flight; the in-flight fallback compresses
+    // everything except the trailing fc (no preceding pair to retain).
+    expect(findCompressSplitPoint(history, 0.99)).toBe(3);
   });
 
   it('should handle a history with only one item', () => {
@@ -446,12 +444,48 @@ describe('Gemini Client (client.ts)', () => {
     client = new GeminiClient(mockConfig);
     await client.initialize();
     vi.mocked(mockConfig.getGeminiClient).mockReturnValue(client);
+
+    // GeminiClient.sendMessageStream calls this.tryCompressChat (which now
+    // delegates to chat.tryCompress) before each turn. Most tests use a
+    // hand-rolled chat mock that doesn't implement tryCompress; default the
+    // wrapper to a NOOP so those tests don't crash. Tests that exercise
+    // compression directly (the delegation tests below, the
+    // emits-compression-event test) override this spy.
+    vi.spyOn(client, 'tryCompressChat').mockResolvedValue({
+      originalTokenCount: 0,
+      newTokenCount: 0,
+      compressionStatus: CompressionStatus.NOOP,
+    });
   });
 
   afterEach(() => {
     vi.restoreAllMocks();
   });
 
+  describe('initialize', () => {
+    it('seeds resumed chat with replayed prompt token count', async () => {
+      vi.mocked(mockConfig.getResumedSessionData).mockReturnValue({
+        conversation: {
+          sessionId: 'resumed-session-id',
+          projectHash: 'project-hash',
+          startTime: new Date(0).toISOString(),
+          lastUpdated: new Date(0).toISOString(),
+          messages: [],
+        },
+        filePath: '/test/session.jsonl',
+        lastCompletedUuid: null,
+      });
+      vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(
+        123_456,
+      );
+
+      const resumedClient = new GeminiClient(mockConfig);
+      await resumedClient.initialize();
+
+      expect(resumedClient.getChat().getLastPromptTokenCount()).toBe(123_456);
+    });
+  });
+
   describe('addHistory', () => {
     it('should call chat.addHistory with the provided content', async () => {
       const mockChat = {
@@ -642,6 +676,11 @@ describe('Gemini Client (client.ts)', () => {
       mockChat = {
         addHistory: vi.fn(),
         getHistory: vi.fn().mockReturnValue([]),
+        tryCompress: vi.fn().mockResolvedValue({
+          originalTokenCount: 0,
+          newTokenCount: 0,
+          compressionStatus: CompressionStatus.NOOP,
+        }),
       };
       client['chat'] = mockChat as GeminiChat;
     });
@@ -768,66 +807,73 @@ describe('Gemini Client (client.ts)', () => {
     });
   });
 
-  describe('tryCompressChat', () => {
-    const mockGetHistory = vi.fn();
-
+  // tryCompressChat is now a thin wrapper around GeminiChat.tryCompress.
+  // The compression logic itself is exercised in chatCompressionService.test.ts
+  // (token math, threshold checks, hook firing) and geminiChat.test.ts (history
+  // mutation, recording, hasFailedCompressionAttempt). The tests below cover
+  // only what the wrapper itself adds: argument forwarding and the IDE-context
+  // flag flip.
+  describe('tryCompressChat (delegation)', () => {
     beforeEach(() => {
-      client['chat'] = {
-        getHistory: mockGetHistory,
-        addHistory: vi.fn(),
-        setHistory: vi.fn(),
-      } as unknown as GeminiChat;
+      // The top-level beforeEach stubs tryCompressChat to NOOP for unrelated
+      // tests; restore the real implementation here so we can observe it.
+      vi.mocked(client.tryCompressChat).mockRestore();
     });
 
-    function setup({
-      chatHistory = [
-        { role: 'user', parts: [{ text: 'Long conversation' }] },
-        { role: 'model', parts: [{ text: 'Long response' }] },
-      ] as Content[],
-      originalTokenCount = 1000,
-      summaryText = 'This is a summary.',
-      // Token counts returned in usageMetadata to simulate what the API would return
-      // Default values ensure successful compression:
-      // newTokenCount = originalTokenCount - (compressionInputTokenCount - 1000) + compressionOutputTokenCount
-      // = 1000 - (1600 - 1000) + 50 = 1000 - 600 + 50 = 450 (< 1000, success)
-      compressionInputTokenCount = 1600,
-      compressionOutputTokenCount = 50,
-    } = {}) {
-      const mockOriginalChat: Partial<GeminiChat> = {
-        getHistory: vi.fn((_curated?: boolean) => chatHistory),
-        setHistory: vi.fn(),
-      };
-      client['chat'] = mockOriginalChat as GeminiChat;
+    it('forwards prompt id, model, force, and signal to chat.tryCompress', async () => {
+      const tryCompress = vi.fn().mockResolvedValue({
+        originalTokenCount: 0,
+        newTokenCount: 0,
+        compressionStatus: CompressionStatus.NOOP,
+      });
+      client['chat'] = {
+        tryCompress,
+        getHistory: vi.fn().mockReturnValue([]),
+      } as unknown as GeminiChat;
+      vi.mocked(mockConfig.getModel).mockReturnValue('the-model');
+      const signal = new AbortController().signal;
 
-      vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(
-        originalTokenCount,
-      );
+      await client.tryCompressChat('p1', true, signal);
 
-      mockGenerateContentFn.mockResolvedValue({
-        candidates: [
-          {
-            content: {
-              role: 'model',
-              parts: [{ text: summaryText }],
-            },
-          },
-        ],
-        usageMetadata: {
-          promptTokenCount: compressionInputTokenCount,
-          candidatesTokenCount: compressionOutputTokenCount,
-          totalTokenCount:
-            compressionInputTokenCount + compressionOutputTokenCount,
-        },
-      } as unknown as GenerateContentResponse);
+      expect(tryCompress).toHaveBeenCalledWith('p1', 'the-model', true, signal);
+    });
 
-      // Calculate what the new history will be
-      const splitPoint = findCompressSplitPoint(chatHistory, 0.7); // 1 - 0.3
-      const historyToKeep = chatHistory.slice(splitPoint);
+    it('flips forceFullIdeContext on a successful compression', async () => {
+      client['chat'] = {
+        tryCompress: vi.fn().mockResolvedValue({
+          originalTokenCount: 1000,
+          newTokenCount: 200,
+          compressionStatus: CompressionStatus.COMPRESSED,
+        }),
+        getHistory: vi.fn().mockReturnValue([]),
+      } as unknown as GeminiChat;
+      client['forceFullIdeContext'] = false;
 
-      // This is the history that the new chat will have.
-      // It includes the default startChat history + the extra history from tryCompressChat
-      const newCompressedHistory: Content[] = [
-        // Mocked envParts + canned response from startChat
+      await client.tryCompressChat('p2');
+
+      expect(client['forceFullIdeContext']).toBe(true);
+    });
+
+    it('re-prepends startup context and seeds the new chat after compression', async () => {
+      const compressedHistory: Content[] = [
+        { role: 'user', parts: [{ text: 'summary' }] },
+        { role: 'model', parts: [{ text: 'ok' }] },
+      ];
+      const originalChat = client.getChat();
+      vi.spyOn(originalChat, 'tryCompress').mockImplementation(async () => {
+        originalChat.setHistory(compressedHistory);
+        return {
+          originalTokenCount: 1000,
+          newTokenCount: 200,
+          compressionStatus: CompressionStatus.COMPRESSED,
+        };
+      });
+      client['forceFullIdeContext'] = false;
+
+      await client.tryCompressChat('p4');
+
+      expect(client.getChat()).not.toBe(originalChat);
+      expect(client.getHistory()).toEqual([
         {
           role: 'user',
           parts: [{ text: 'Mocked env context' }],
@@ -836,616 +882,71 @@ describe('Gemini Client (client.ts)', () => {
           role: 'model',
           parts: [{ text: 'Got it. Thanks for the context!' }],
         },
-        // extraHistory from tryCompressChat
-        {
-          role: 'user',
-          parts: [{ text: summaryText }],
-        },
-        {
-          role: 'model',
-          parts: [{ text: 'Got it. Thanks for the additional context!' }],
-        },
-        ...historyToKeep,
-      ];
+        ...compressedHistory,
+      ]);
+      expect(client.getChat().getLastPromptTokenCount()).toBe(200);
+      expect(client['forceFullIdeContext']).toBe(true);
+    });
 
-      const mockNewChat: Partial<GeminiChat> = {
-        getHistory: vi.fn().mockReturnValue(newCompressedHistory),
-        setHistory: vi.fn(),
-      };
-
-      client['startChat'] = vi.fn().mockImplementation(async () => {
-        client['chat'] = mockNewChat as GeminiChat;
-        return mockNewChat as GeminiChat;
-      });
-
-      // New token count formula: originalTokenCount - (compressionInputTokenCount - 1000) + compressionOutputTokenCount
-      const estimatedNewTokenCount = Math.max(
-        0,
-        originalTokenCount -
-          (compressionInputTokenCount - 1000) +
-          compressionOutputTokenCount,
-      );
-
-      return {
-        client,
-        mockOriginalChat,
-        mockNewChat,
-        estimatedNewTokenCount,
-      };
-    }
-
-    describe('when compression inflates the token count', () => {
-      it('allows compression to be forced/manual after a failure', async () => {
-        // Call 1 (Fails): Setup with token counts that will inflate
-        // newTokenCount = originalTokenCount - (compressionInputTokenCount - 1000) + compressionOutputTokenCount
-        // = 100 - (1010 - 1000) + 200 = 100 - 10 + 200 = 290 > 100 (inflation)
-        const longSummary = 'long summary '.repeat(100);
-        const { client, estimatedNewTokenCount: inflatedTokenCount } = setup({
-          originalTokenCount: 100,
-          summaryText: longSummary,
-          compressionInputTokenCount: 1010,
-          compressionOutputTokenCount: 200,
-        });
-        expect(inflatedTokenCount).toBeGreaterThan(100); // Ensure setup is correct
-
-        await client.tryCompressChat('prompt-id-4', false); // Fails
-
-        // Call 2 (Forced): Re-setup with token counts that will compress
-        // newTokenCount = 100 - (1100 - 1000) + 50 = 100 - 100 + 50 = 50 <= 100 (compression)
-        const shortSummary = 'short';
-        const { estimatedNewTokenCount: compressedTokenCount } = setup({
-          originalTokenCount: 100,
-          summaryText: shortSummary,
-          compressionInputTokenCount: 1100,
-          compressionOutputTokenCount: 50,
-        });
-        expect(compressedTokenCount).toBeLessThanOrEqual(100); // Ensure setup is correct
-
-        const result = await client.tryCompressChat('prompt-id-4', true); // Forced
-
-        expect(result.compressionStatus).toBe(CompressionStatus.COMPRESSED);
-        expect(result.originalTokenCount).toBe(100);
-        // newTokenCount might be clamped to originalTokenCount due to tolerance logic
-        expect(result.newTokenCount).toBeLessThanOrEqual(100);
-      });
-
-      it('yields the result even if the compression inflated the tokens', async () => {
-        // newTokenCount = 100 - (1010 - 1000) + 200 = 100 - 10 + 200 = 290 > 100 (inflation)
-        const longSummary = 'long summary '.repeat(100);
-        const { client, estimatedNewTokenCount } = setup({
-          originalTokenCount: 100,
-          summaryText: longSummary,
-          compressionInputTokenCount: 1010,
-          compressionOutputTokenCount: 200,
-        });
-        expect(estimatedNewTokenCount).toBeGreaterThan(100); // Ensure setup is correct
-
-        // Mock contextWindowSize to ensure compression is triggered
-        vi.spyOn(client['config'], 'getContentGeneratorConfig').mockReturnValue(
-          {
-            model: 'test-model',
-            apiKey: 'test-key',
-            vertexai: false,
-            authType: AuthType.USE_GEMINI,
-            contextWindowSize: 100, // Set to same as originalTokenCount to ensure threshold is exceeded
-          },
-        );
-
-        const result = await client.tryCompressChat('prompt-id-4', false);
-
-        expect(result.compressionStatus).toBe(
-          CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT,
-        );
-        expect(result.originalTokenCount).toBe(100);
-        // The newTokenCount should be higher than original since compression failed due to inflation
-        expect(result.newTokenCount).toBeGreaterThan(100);
-        // IMPORTANT: The change in client.ts means setLastPromptTokenCount is NOT called on failure
-        expect(
-          uiTelemetryService.setLastPromptTokenCount,
-        ).not.toHaveBeenCalled();
-      });
-
-      it('does not manipulate the source chat', async () => {
-        // newTokenCount = 100 - (1010 - 1000) + 200 = 100 - 10 + 200 = 290 > 100 (inflation)
-        const longSummary = 'long summary '.repeat(100);
-        const { client, mockOriginalChat, estimatedNewTokenCount } = setup({
-          originalTokenCount: 100,
-          summaryText: longSummary,
-          compressionInputTokenCount: 1010,
-          compressionOutputTokenCount: 200,
-        });
-        expect(estimatedNewTokenCount).toBeGreaterThan(100); // Ensure setup is correct
-
-        await client.tryCompressChat('prompt-id-4', false);
-
-        // On failure, the chat should NOT be replaced
-        expect(client['chat']).toBe(mockOriginalChat);
-      });
-
-      it('will not attempt to compress context after a failure', async () => {
-        // newTokenCount = 100 - (1010 - 1000) + 200 = 100 - 10 + 200 = 290 > 100 (inflation)
-        const longSummary = 'long summary '.repeat(100);
-        const { client, estimatedNewTokenCount } = setup({
-          originalTokenCount: 100,
-          summaryText: longSummary,
-          compressionInputTokenCount: 1010,
-          compressionOutputTokenCount: 200,
-        });
-        expect(estimatedNewTokenCount).toBeGreaterThan(100); // Ensure setup is correct
-
-        // Mock contextWindowSize to ensure compression is triggered
-        vi.spyOn(client['config'], 'getContentGeneratorConfig').mockReturnValue(
-          {
-            model: 'test-model',
-            apiKey: 'test-key',
-            vertexai: false,
-            authType: AuthType.USE_GEMINI,
-            contextWindowSize: 100, // Set to same as originalTokenCount to ensure threshold is exceeded
-          },
-        );
-
-        await client.tryCompressChat('prompt-id-4', false); // This fails and sets hasFailedCompressionAttempt = true
-
-        // This call should now be a NOOP
-        const result = await client.tryCompressChat('prompt-id-5', false);
-
-        // generateContent (for summary) should only have been called once
-        expect(mockGenerateContentFn).toHaveBeenCalledTimes(1);
-        expect(result).toEqual({
-          compressionStatus: CompressionStatus.NOOP,
-          newTokenCount: 0,
+    it('does not flip forceFullIdeContext when compression NOOPs', async () => {
+      client['chat'] = {
+        tryCompress: vi.fn().mockResolvedValue({
           originalTokenCount: 0,
-        });
-      });
-    });
-
-    it('should not trigger summarization if token count is below threshold', async () => {
-      const MOCKED_TOKEN_LIMIT = 1000;
-      vi.spyOn(client['config'], 'getContentGeneratorConfig').mockReturnValue({
-        model: 'test-model',
-        apiKey: 'test-key',
-        vertexai: false,
-        authType: AuthType.USE_GEMINI,
-        contextWindowSize: MOCKED_TOKEN_LIMIT,
-      });
-      mockGetHistory.mockReturnValue([
-        { role: 'user', parts: [{ text: '...history...' }] },
-      ]);
-      const originalTokenCount = MOCKED_TOKEN_LIMIT * 0.699;
-      vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(
-        originalTokenCount,
-      );
-
-      const initialChat = client.getChat();
-      const result = await client.tryCompressChat('prompt-id-2', false);
-      const newChat = client.getChat();
-
-      expect(result).toEqual({
-        compressionStatus: CompressionStatus.NOOP,
-        newTokenCount: originalTokenCount,
-        originalTokenCount,
-      });
-      expect(newChat).toBe(initialChat);
-    });
-
-    it('logs a telemetry event when compressing', async () => {
-      const { logChatCompression } = await import('../telemetry/loggers.js');
-      vi.mocked(logChatCompression).mockClear();
-
-      const MOCKED_TOKEN_LIMIT = 1000;
-      const MOCKED_CONTEXT_PERCENTAGE_THRESHOLD = 0.5;
-      vi.spyOn(client['config'], 'getContentGeneratorConfig').mockReturnValue({
-        model: 'test-model',
-        apiKey: 'test-key',
-        vertexai: false,
-        authType: AuthType.USE_GEMINI,
-        contextWindowSize: MOCKED_TOKEN_LIMIT,
-      });
-      vi.spyOn(client['config'], 'getChatCompression').mockReturnValue({
-        contextPercentageThreshold: MOCKED_CONTEXT_PERCENTAGE_THRESHOLD,
-      });
-      // Need multiple history items so there's something to compress
-      const history = [
-        { role: 'user', parts: [{ text: '...history 1...' }] },
-        { role: 'model', parts: [{ text: '...history 2...' }] },
-        { role: 'user', parts: [{ text: '...history 3...' }] },
-        { role: 'model', parts: [{ text: '...history 4...' }] },
-      ];
-      mockGetHistory.mockReturnValue(history);
-
-      // Token count needs to be ABOVE the threshold to trigger compression
-      const originalTokenCount =
-        MOCKED_TOKEN_LIMIT * MOCKED_CONTEXT_PERCENTAGE_THRESHOLD + 1;
-
-      vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(
-        originalTokenCount,
-      );
-
-      // Mock the summary response from the chat
-      // newTokenCount = 501 - (1400 - 1000) + 50 = 501 - 400 + 50 = 151 <= 501 (success)
-      const summaryText = 'This is a summary.';
-      mockGenerateContentFn.mockResolvedValue({
-        candidates: [
-          {
-            content: {
-              role: 'model',
-              parts: [{ text: summaryText }],
-            },
-          },
-        ],
-        usageMetadata: {
-          promptTokenCount: 1400,
-          candidatesTokenCount: 50,
-          totalTokenCount: 1450,
-        },
-      } as unknown as GenerateContentResponse);
-
-      // Mock startChat to complete the compression flow
-      const splitPoint = findCompressSplitPoint(history, 0.7);
-      const historyToKeep = history.slice(splitPoint);
-      const newCompressedHistory: Content[] = [
-        { role: 'user', parts: [{ text: 'Mocked env context' }] },
-        { role: 'model', parts: [{ text: 'Got it. Thanks for the context!' }] },
-        { role: 'user', parts: [{ text: summaryText }] },
-        {
-          role: 'model',
-          parts: [{ text: 'Got it. Thanks for the additional context!' }],
-        },
-        ...historyToKeep,
-      ];
-      const mockNewChat: Partial<GeminiChat> = {
-        getHistory: vi.fn().mockReturnValue(newCompressedHistory),
-      };
-      client['startChat'] = vi
-        .fn()
-        .mockResolvedValue(mockNewChat as GeminiChat);
-
-      await client.tryCompressChat('prompt-id-3', false);
-
-      expect(logChatCompression).toHaveBeenCalledWith(
-        expect.anything(),
-        expect.objectContaining({
-          tokens_before: originalTokenCount,
+          newTokenCount: 0,
+          compressionStatus: CompressionStatus.NOOP,
         }),
-      );
-      expect(uiTelemetryService.setLastPromptTokenCount).toHaveBeenCalled();
+        getHistory: vi.fn().mockReturnValue([]),
+      } as unknown as GeminiChat;
+      client['forceFullIdeContext'] = false;
+
+      await client.tryCompressChat('p3');
+
+      expect(client['forceFullIdeContext']).toBe(false);
     });
 
-    it('should trigger summarization if token count is above threshold with contextPercentageThreshold setting', async () => {
-      const MOCKED_TOKEN_LIMIT = 1000;
-      const MOCKED_CONTEXT_PERCENTAGE_THRESHOLD = 0.5;
-      vi.spyOn(client['config'], 'getContentGeneratorConfig').mockReturnValue({
-        model: 'test-model',
-        apiKey: 'test-key',
-        vertexai: false,
-        authType: AuthType.USE_GEMINI,
-        contextWindowSize: MOCKED_TOKEN_LIMIT,
+    it('flips forceFullIdeContext when ChatCompressed flows through sendMessageStream', async () => {
+      // Auto-compaction lives inside chat.sendMessageStream and surfaces via
+      // the compressed → ChatCompressed bridge in turn.ts. The flip on this
+      // path is owned by the for-await loop in client.sendMessageStream, not
+      // by tryCompressChat — so this test feeds the event in directly.
+      vi.spyOn(client, 'tryCompressChat').mockResolvedValue({
+        originalTokenCount: 0,
+        newTokenCount: 0,
+        compressionStatus: CompressionStatus.NOOP,
       });
-      vi.spyOn(client['config'], 'getChatCompression').mockReturnValue({
-        contextPercentageThreshold: MOCKED_CONTEXT_PERCENTAGE_THRESHOLD,
-      });
-      // Need multiple history items so there's something to compress
-      const history = [
-        { role: 'user', parts: [{ text: '...history 1...' }] },
-        { role: 'model', parts: [{ text: '...history 2...' }] },
-        { role: 'user', parts: [{ text: '...history 3...' }] },
-        { role: 'model', parts: [{ text: '...history 4...' }] },
-      ];
-      mockGetHistory.mockReturnValue(history);
-
-      // Token count needs to be ABOVE the threshold to trigger compression
-      const originalTokenCount =
-        MOCKED_TOKEN_LIMIT * MOCKED_CONTEXT_PERCENTAGE_THRESHOLD + 1;
-
-      vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(
-        originalTokenCount,
-      );
-
-      // Mock summary and new chat
-      const summaryText = 'This is a summary.';
-      const splitPoint = findCompressSplitPoint(history, 0.7);
-      const historyToKeep = history.slice(splitPoint);
-      const newCompressedHistory: Content[] = [
-        { role: 'user', parts: [{ text: 'Mocked env context' }] },
-        { role: 'model', parts: [{ text: 'Got it. Thanks for the context!' }] },
-        { role: 'user', parts: [{ text: summaryText }] },
-        {
-          role: 'model',
-          parts: [{ text: 'Got it. Thanks for the additional context!' }],
-        },
-        ...historyToKeep,
-      ];
-      const mockNewChat: Partial<GeminiChat> = {
-        getHistory: vi.fn().mockReturnValue(newCompressedHistory),
-      };
-      client['startChat'] = vi.fn().mockImplementation(async () => {
-        client['chat'] = mockNewChat as GeminiChat;
-        return mockNewChat as GeminiChat;
-      });
-
-      // Mock the summary response from the chat
-      // newTokenCount = 501 - (1400 - 1000) + 50 = 501 - 400 + 50 = 151 <= 501 (success)
-      mockGenerateContentFn.mockResolvedValue({
-        candidates: [
-          {
-            content: {
-              role: 'model',
-              parts: [{ text: summaryText }],
+      mockTurnRunFn.mockReturnValue(
+        (async function* () {
+          yield {
+            type: GeminiEventType.ChatCompressed,
+            value: {
+              originalTokenCount: 1000,
+              newTokenCount: 200,
+              compressionStatus: CompressionStatus.COMPRESSED,
             },
-          },
-        ],
-        usageMetadata: {
-          promptTokenCount: 1400,
-          candidatesTokenCount: 50,
-          totalTokenCount: 1450,
-        },
-      } as unknown as GenerateContentResponse);
-
-      const initialChat = client.getChat();
-      const result = await client.tryCompressChat('prompt-id-3', false);
-      const newChat = client.getChat();
-
-      expect(mockGenerateContentFn).toHaveBeenCalled();
-
-      // Assert that summarization happened
-      expect(result.compressionStatus).toBe(CompressionStatus.COMPRESSED);
-      expect(result.originalTokenCount).toBe(originalTokenCount);
-      // newTokenCount might be clamped to originalTokenCount due to tolerance logic
-      expect(result.newTokenCount).toBeLessThanOrEqual(originalTokenCount);
-
-      // Assert that the chat was reset
-      expect(newChat).not.toBe(initialChat);
-    });
-
-    it('should not compress across a function call response', async () => {
-      const MOCKED_TOKEN_LIMIT = 1000;
-      vi.spyOn(client['config'], 'getContentGeneratorConfig').mockReturnValue({
-        model: 'test-model',
-        apiKey: 'test-key',
-        vertexai: false,
-        authType: AuthType.USE_GEMINI,
-        contextWindowSize: MOCKED_TOKEN_LIMIT,
-      });
-      const history: Content[] = [
-        { role: 'user', parts: [{ text: '...history 1...' }] },
-        { role: 'model', parts: [{ text: '...history 2...' }] },
-        { role: 'user', parts: [{ text: '...history 3...' }] },
-        { role: 'model', parts: [{ text: '...history 4...' }] },
-        { role: 'user', parts: [{ text: '...history 5...' }] },
-        { role: 'model', parts: [{ text: '...history 6...' }] },
-        { role: 'user', parts: [{ text: '...history 7...' }] },
-        { role: 'model', parts: [{ text: '...history 8...' }] },
-        // Normally we would break here, but we have a function response.
-        {
-          role: 'user',
-          parts: [{ functionResponse: { name: '...history 8...' } }],
-        },
-        { role: 'model', parts: [{ text: '...history 10...' }] },
-        // Instead we will break here.
-        { role: 'user', parts: [{ text: '...history 10...' }] },
-      ];
-      mockGetHistory.mockReturnValue(history);
-
-      const originalTokenCount = 1000 * 0.7;
-      vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(
-        originalTokenCount,
+          };
+        })(),
       );
+      client['chat'] = {
+        addHistory: vi.fn(),
+        getHistory: vi.fn().mockReturnValue([]),
+      } as unknown as GeminiChat;
+      client['forceFullIdeContext'] = false;
 
-      // Mock summary and new chat
-      const summaryText = 'This is a summary.';
-      const splitPoint = findCompressSplitPoint(history, 0.7); // This should be 10
-      expect(splitPoint).toBe(10); // Verify split point logic
-      const historyToKeep = history.slice(splitPoint); // Should keep last user message
-      expect(historyToKeep).toEqual([
-        { role: 'user', parts: [{ text: '...history 10...' }] },
-      ]);
-
-      const newCompressedHistory: Content[] = [
-        { role: 'user', parts: [{ text: 'Mocked env context' }] },
-        { role: 'model', parts: [{ text: 'Got it. Thanks for the context!' }] },
-        { role: 'user', parts: [{ text: summaryText }] },
-        {
-          role: 'model',
-          parts: [{ text: 'Got it. Thanks for the additional context!' }],
-        },
-        ...historyToKeep,
-      ];
-      const mockNewChat: Partial<GeminiChat> = {
-        getHistory: vi.fn().mockReturnValue(newCompressedHistory),
-      };
-      client['startChat'] = vi.fn().mockImplementation(async () => {
-        client['chat'] = mockNewChat as GeminiChat;
-        return mockNewChat as GeminiChat;
-      });
-
-      // Mock the summary response from the chat
-      // newTokenCount = 700 - (1500 - 1000) + 50 = 700 - 500 + 50 = 250 <= 700 (success)
-      mockGenerateContentFn.mockResolvedValue({
-        candidates: [
-          {
-            content: {
-              role: 'model',
-              parts: [{ text: summaryText }],
-            },
-          },
-        ],
-        usageMetadata: {
-          promptTokenCount: 1500,
-          candidatesTokenCount: 50,
-          totalTokenCount: 1550,
-        },
-      } as unknown as GenerateContentResponse);
-
-      const initialChat = client.getChat();
-      const result = await client.tryCompressChat('prompt-id-3', false);
-      const newChat = client.getChat();
-
-      expect(mockGenerateContentFn).toHaveBeenCalled();
-
-      // Assert that summarization happened
-      expect(result.compressionStatus).toBe(CompressionStatus.COMPRESSED);
-      expect(result.originalTokenCount).toBe(originalTokenCount);
-      // newTokenCount might be clamped to originalTokenCount due to tolerance logic
-      expect(result.newTokenCount).toBeLessThanOrEqual(originalTokenCount);
-      // Assert that the chat was reset
-      expect(newChat).not.toBe(initialChat);
-
-      // 1. standard start context message (env)
-      // 2. standard canned model response
-      // 3. compressed summary message (user)
-      // 4. standard canned model response
-      // 5. The last user message (historyToKeep)
-      expect(newChat.getHistory().length).toEqual(5);
-    });
-
-    it('should always trigger summarization when force is true, regardless of token count', async () => {
-      // Need multiple history items so there's something to compress
-      const history = [
-        { role: 'user', parts: [{ text: '...history 1...' }] },
-        { role: 'model', parts: [{ text: '...history 2...' }] },
-        { role: 'user', parts: [{ text: '...history 3...' }] },
-        { role: 'model', parts: [{ text: '...history 4...' }] },
-      ];
-      mockGetHistory.mockReturnValue(history);
-
-      const originalTokenCount = 100; // Well below threshold, but > estimated new count
-      vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(
-        originalTokenCount,
+      const stream = client.sendMessageStream(
+        [{ text: 'hi' }],
+        new AbortController().signal,
+        'prompt-auto-flip',
+        { type: SendMessageType.UserQuery },
       );
+      for await (const _ of stream) {
+        /* drain */
+      }
 
-      // Mock summary and new chat
-      const summaryText = 'This is a summary.';
-      const splitPoint = findCompressSplitPoint(history, 0.7);
-      const historyToKeep = history.slice(splitPoint);
-      const newCompressedHistory: Content[] = [
-        { role: 'user', parts: [{ text: 'Mocked env context' }] },
-        { role: 'model', parts: [{ text: 'Got it. Thanks for the context!' }] },
-        { role: 'user', parts: [{ text: summaryText }] },
-        {
-          role: 'model',
-          parts: [{ text: 'Got it. Thanks for the additional context!' }],
-        },
-        ...historyToKeep,
-      ];
-      const mockNewChat: Partial<GeminiChat> = {
-        getHistory: vi.fn().mockReturnValue(newCompressedHistory),
-      };
-      client['startChat'] = vi.fn().mockImplementation(async () => {
-        client['chat'] = mockNewChat as GeminiChat;
-        return mockNewChat as GeminiChat;
-      });
-
-      // Mock the summary response from the chat
-      // newTokenCount = 100 - (1060 - 1000) + 20 = 100 - 60 + 20 = 60 <= 100 (success)
-      mockGenerateContentFn.mockResolvedValue({
-        candidates: [
-          {
-            content: {
-              role: 'model',
-              parts: [{ text: summaryText }],
-            },
-          },
-        ],
-        usageMetadata: {
-          promptTokenCount: 1060,
-          candidatesTokenCount: 20,
-          totalTokenCount: 1080,
-        },
-      } as unknown as GenerateContentResponse);
-
-      const initialChat = client.getChat();
-      const result = await client.tryCompressChat('prompt-id-1', true); // force = true
-      const newChat = client.getChat();
-
-      expect(mockGenerateContentFn).toHaveBeenCalled();
-
-      expect(result.compressionStatus).toBe(CompressionStatus.COMPRESSED);
-      expect(result.originalTokenCount).toBe(originalTokenCount);
-      // newTokenCount might be clamped to originalTokenCount due to tolerance logic
-      expect(result.newTokenCount).toBeLessThanOrEqual(originalTokenCount);
-
-      // Assert that the chat was reset
-      expect(newChat).not.toBe(initialChat);
+      expect(client['forceFullIdeContext']).toBe(true);
     });
   });
 
   describe('sendMessageStream', () => {
-    it('emits a compression event when the context was automatically compressed', async () => {
-      // Arrange
-      mockTurnRunFn.mockReturnValue(
-        (async function* () {
-          yield { type: 'content', value: 'Hello' };
-        })(),
-      );
-
-      const compressionInfo: ChatCompressionInfo = {
-        compressionStatus: CompressionStatus.COMPRESSED,
-        originalTokenCount: 1000,
-        newTokenCount: 500,
-      };
-
-      vi.spyOn(client, 'tryCompressChat').mockResolvedValueOnce(
-        compressionInfo,
-      );
-
-      // Act
-      const stream = client.sendMessageStream(
-        [{ text: 'Hi' }],
-        new AbortController().signal,
-        'prompt-id-1',
-      );
-
-      const events = await fromAsync(stream);
-
-      // Assert
-      expect(events).toContainEqual({
-        type: GeminiEventType.ChatCompressed,
-        value: compressionInfo,
-      });
-    });
-
-    it.each([
-      {
-        compressionStatus:
-          CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT,
-      },
-      { compressionStatus: CompressionStatus.NOOP },
-    ])(
-      'does not emit a compression event when the status is $compressionStatus',
-      async ({ compressionStatus }) => {
-        // Arrange
-        const mockStream = (async function* () {
-          yield { type: 'content', value: 'Hello' };
-        })();
-        mockTurnRunFn.mockReturnValue(mockStream);
-
-        const compressionInfo: ChatCompressionInfo = {
-          compressionStatus,
-          originalTokenCount: 1000,
-          newTokenCount: 500,
-        };
-
-        vi.spyOn(client, 'tryCompressChat').mockResolvedValueOnce(
-          compressionInfo,
-        );
-
-        // Act
-        const stream = client.sendMessageStream(
-          [{ text: 'Hi' }],
-          new AbortController().signal,
-          'prompt-id-1',
-        );
-
-        const events = await fromAsync(stream);
-
-        // Assert
-        expect(events).not.toContainEqual({
-          type: GeminiEventType.ChatCompressed,
-          value: expect.anything(),
-        });
-      },
-    );
-
     it('should include editor context when ideMode is enabled', async () => {
       // Arrange
       vi.mocked(ideContextStore.get).mockReturnValue({
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index e163b5659..5f6156f2e 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -42,7 +42,6 @@ import {
 
 // Services
 import {
-  ChatCompressionService,
   COMPRESSION_PRESERVE_THRESHOLD,
   COMPRESSION_TOKEN_THRESHOLD,
 } from '../services/chatCompressionService.js';
@@ -186,12 +185,6 @@ export class GeminiClient {
    */
   private perModelGeneratorCache = new Map<string, Promise<ContentGenerator>>();
 
-  /**
-   * At any point in this conversation, was compression triggered without
-   * being forced and did it fail?
-   */
-  private hasFailedCompressionAttempt = false;
-
   /**
    * Promises for pending background memory tasks (dream / extract).
    * Each promise resolves with a count of memory files touched (0 = nothing written).
@@ -224,6 +217,9 @@ export class GeminiClient {
         resumedSessionData.conversation,
       );
       await this.startChat(resumedHistory);
+      this.getChat().setLastPromptTokenCount(
+        uiTelemetryService.getLastPromptTokenCount(),
+      );
     } else {
       await this.startChat();
     }
@@ -369,7 +365,6 @@ export class GeminiClient {
 
   async startChat(extraHistory?: Content[]): Promise<GeminiChat> {
     this.forceFullIdeContext = true;
-    this.hasFailedCompressionAttempt = false;
     // Clear stale cache params on session reset to prevent cross-session leakage
     clearCacheSafeParams();
 
@@ -847,14 +842,10 @@ export class GeminiClient {
       return new Turn(this.getChat(), prompt_id);
     }
 
-    const compressed = await this.tryCompressChat(prompt_id, false, signal);
-
-    if (compressed.compressionStatus === CompressionStatus.COMPRESSED) {
-      yield { type: GeminiEventType.ChatCompressed, value: compressed };
-    }
-
-    // Check session token limit after compression.
-    // `lastPromptTokenCount` is treated as authoritative for the (possibly compressed) history;
+    // Auto-compaction happens inside GeminiChat.sendMessageStream and surfaces
+    // via the `compressed → ChatCompressed` bridge in turn.ts. Manual /compress
+    // still calls tryCompressChat directly for the full reset (env refresh +
+    // forceFullIdeContext flip).
     const sessionTokenLimit = this.config.getSessionTokenLimit();
     if (sessionTokenLimit > 0) {
       const lastPromptTokenCount = uiTelemetryService.getLastPromptTokenCount();
@@ -1000,6 +991,13 @@ export class GeminiClient {
         await arenaAgentClient.updateStatus();
       }
 
+      // Re-send a full IDE context blob on the next regular message — auto
+      // compaction inside chat.sendMessageStream may have summarized away
+      // the previous IDE-context turn.
+      if (event.type === GeminiEventType.ChatCompressed) {
+        this.forceFullIdeContext = true;
+      }
+
       yield event;
       if (event.type === GeminiEventType.Error) {
         if (arenaAgentClient) {
@@ -1391,58 +1389,36 @@ export class GeminiClient {
     return generatorPromise;
   }
 
+  /**
+   * Wrapper around {@link GeminiChat.tryCompress} that restores main-session
+   * startup context after successful compaction and flips the IDE full-context
+   * flag for the next regular message.
+   */
   async tryCompressChat(
     prompt_id: string,
     force: boolean = false,
     signal?: AbortSignal,
   ): Promise<ChatCompressionInfo> {
-    const compressionService = new ChatCompressionService();
-
-    const { newHistory, info } = await compressionService.compress(
-      this.getChat(),
+    const info = await this.getChat().tryCompress(
       prompt_id,
-      force,
       this.config.getModel(),
-      this.config,
-      this.hasFailedCompressionAttempt,
+      force,
       signal,
     );
-
-    // Handle compression result
     if (info.compressionStatus === CompressionStatus.COMPRESSED) {
-      // Success: update chat with new compressed history
-      if (newHistory) {
-        const chatRecordingService = this.config.getChatRecordingService();
-        chatRecordingService?.recordChatCompression({
-          info,
-          compressedHistory: newHistory,
-        });
-
-        await this.startChat(newHistory);
-        // Compaction rewrites the prompt history: prior full-Read tool
-        // results may have been summarised away, but the FileReadCache
-        // still believes those reads are "in this conversation". A
-        // follow-up Read could then return the file_unchanged
-        // placeholder pointing at content the model can no longer
-        // retrieve from its own context. Clear the cache so post-
-        // compaction Reads re-emit the bytes.
-        debugLogger.debug('[FILE_READ_CACHE] clear after tryCompressChat');
-        this.config.getFileReadCache().clear();
-        uiTelemetryService.setLastPromptTokenCount(info.newTokenCount);
-        this.forceFullIdeContext = true;
-      }
-    } else if (
-      info.compressionStatus ===
-        CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT ||
-      info.compressionStatus ===
-        CompressionStatus.COMPRESSION_FAILED_EMPTY_SUMMARY
-    ) {
-      // Track failed attempts (only mark as failed if not forced)
-      if (!force) {
-        this.hasFailedCompressionAttempt = true;
-      }
+      const compressedHistory = this.getChat().getHistory();
+      await this.startChat(compressedHistory);
+      // startChat() creates a new GeminiChat without touching FileReadCache,
+      // so prior read_file results that were summarised away would still
+      // resolve to the file_unchanged placeholder. Clear so post-compaction
+      // Reads re-emit bytes the model can no longer see in history.
+      debugLogger.debug('[FILE_READ_CACHE] clear after tryCompressChat');
+      this.config.getFileReadCache().clear();
+      this.getChat().setLastPromptTokenCount(info.newTokenCount);
+      // Re-send a full IDE context blob on the next regular message —
+      // compression dropped the previous context turn from history.
+      this.forceFullIdeContext = true;
     }
-
     return info;
   }
 }
diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts
index 8a45bb5d4..521e37e06 100644
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@@ -22,6 +22,8 @@ import { StreamContentError } from './openaiContentGenerator/pipeline.js';
 import type { Config } from '../config/config.js';
 import { setSimulate429 } from '../utils/testUtils.js';
 import { uiTelemetryService } from '../telemetry/uiTelemetry.js';
+import { CompressionStatus, type ChatCompressionInfo } from './turn.js';
+import { ChatCompressionService } from '../services/chatCompressionService.js';
 
 // Mock fs module to prevent actual file system operations during tests
 const mockFileSystem = new Map<string, string>();
@@ -119,6 +121,13 @@ describe('GeminiChat', async () => {
         getTool: vi.fn(),
       }),
       getContentGenerator: vi.fn().mockReturnValue(mockContentGenerator),
+      getChatCompression: vi.fn().mockReturnValue(undefined),
+      getHookSystem: vi.fn().mockReturnValue(undefined),
+      getDebugLogger: vi
+        .fn()
+        .mockReturnValue({ debug: vi.fn(), warn: vi.fn(), info: vi.fn() }),
+      getApprovalMode: vi.fn().mockReturnValue('default'),
+      getFileReadCache: vi.fn().mockReturnValue({ clear: vi.fn() }),
     } as unknown as Config;
 
     // Disable 429 simulation for tests
@@ -1019,6 +1028,223 @@ describe('GeminiChat', async () => {
     });
   });
 
+  describe('auto-compression integration', () => {
+    function makeStreamResponse(text = 'ok') {
+      return (async function* () {
+        yield {
+          candidates: [
+            {
+              content: { parts: [{ text }], role: 'model' },
+              finishReason: 'STOP',
+              index: 0,
+              safetyRatings: [],
+            },
+          ],
+          text: () => text,
+        } as unknown as GenerateContentResponse;
+      })();
+    }
+
+    it('releases the send-lock when auto-compression throws (no deadlock)', async () => {
+      const compressSpy = vi
+        .spyOn(ChatCompressionService.prototype, 'compress')
+        .mockRejectedValueOnce(new Error('compression API down'));
+
+      // First send: compression rejects, error propagates to caller. The
+      // streamDoneResolver must run so this.sendPromise resolves; otherwise
+      // every subsequent send blocks forever.
+      await expect(
+        chat.sendMessageStream(
+          'test-model',
+          { message: 'first' },
+          'prompt-id-deadlock-1',
+        ),
+      ).rejects.toThrow('compression API down');
+
+      // Second send: compress returns NOOP, request goes through. If the
+      // lock leaked, this await would never resolve.
+      compressSpy.mockResolvedValueOnce({
+        newHistory: null,
+        info: {
+          originalTokenCount: 0,
+          newTokenCount: 0,
+          compressionStatus: CompressionStatus.NOOP,
+        },
+      });
+      vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue(
+        makeStreamResponse('second response'),
+      );
+
+      const stream = await chat.sendMessageStream(
+        'test-model',
+        { message: 'second' },
+        'prompt-id-deadlock-2',
+      );
+      for await (const _ of stream) {
+        /* consume */
+      }
+
+      expect(compressSpy).toHaveBeenCalledTimes(2);
+    });
+
+    it('seeds inherited token count via setLastPromptTokenCount', async () => {
+      const subagentChat = new GeminiChat(mockConfig, config, [
+        { role: 'user', parts: [{ text: 'inherited' }] },
+        { role: 'model', parts: [{ text: 'inherited reply' }] },
+      ]);
+      subagentChat.setLastPromptTokenCount(123_456);
+      expect(subagentChat.getLastPromptTokenCount()).toBe(123_456);
+
+      // The compression service receives the seeded count, so the threshold
+      // check sees the inherited size — not the constructor default of 0.
+      const compressSpy = vi
+        .spyOn(ChatCompressionService.prototype, 'compress')
+        .mockResolvedValue({
+          newHistory: null,
+          info: {
+            originalTokenCount: 123_456,
+            newTokenCount: 123_456,
+            compressionStatus: CompressionStatus.NOOP,
+          },
+        });
+      vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue(
+        makeStreamResponse(),
+      );
+
+      const stream = await subagentChat.sendMessageStream(
+        'test-model',
+        { message: 'go' },
+        'prompt-id-seed',
+      );
+      for await (const _ of stream) {
+        /* consume */
+      }
+
+      expect(compressSpy).toHaveBeenCalledTimes(1);
+      expect(compressSpy.mock.calls[0][1].originalTokenCount).toBe(123_456);
+    });
+
+    it('yields a COMPRESSED stream event as the first event after auto-compression succeeds', async () => {
+      const compressedHistory: Content[] = [
+        { role: 'user', parts: [{ text: 'summary' }] },
+        { role: 'model', parts: [{ text: 'ok' }] },
+      ];
+      vi.spyOn(
+        ChatCompressionService.prototype,
+        'compress',
+      ).mockResolvedValueOnce({
+        newHistory: compressedHistory,
+        info: {
+          originalTokenCount: 1000,
+          newTokenCount: 200,
+          compressionStatus: CompressionStatus.COMPRESSED,
+        },
+      });
+      vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue(
+        makeStreamResponse('answer'),
+      );
+
+      const stream = await chat.sendMessageStream(
+        'test-model',
+        { message: 'go' },
+        'prompt-id-yield-compressed',
+      );
+      const events: Array<{ type: StreamEventType }> = [];
+      for await (const event of stream) {
+        events.push(event as { type: StreamEventType });
+      }
+
+      expect(events.length).toBeGreaterThan(0);
+      expect(events[0].type).toBe(StreamEventType.COMPRESSED);
+      expect(
+        (events[0] as { type: StreamEventType; info: ChatCompressionInfo }).info
+          .compressionStatus,
+      ).toBe(CompressionStatus.COMPRESSED);
+      expect(
+        (events[0] as { type: StreamEventType; info: ChatCompressionInfo }).info
+          .newTokenCount,
+      ).toBe(200);
+    });
+
+    it('clears hasFailedCompressionAttempt after a forced successful compression', async () => {
+      const compressSpy = vi.spyOn(
+        ChatCompressionService.prototype,
+        'compress',
+      );
+
+      // Step 1: auto-compression fails — latch is set on the chat.
+      compressSpy.mockResolvedValueOnce({
+        newHistory: null,
+        info: {
+          originalTokenCount: 100_000,
+          newTokenCount: 100_000,
+          compressionStatus:
+            CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT,
+        },
+      });
+      vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue(
+        makeStreamResponse(),
+      );
+      const stream1 = await chat.sendMessageStream(
+        'test-model',
+        { message: 'first' },
+        'prompt-latch-1',
+      );
+      for await (const _ of stream1) {
+        /* consume */
+      }
+      // Latch passed to service was false on this attempt; service marks it
+      // failed and tryCompress flips the chat's flag to true.
+      expect(compressSpy.mock.calls[0][1].hasFailedCompressionAttempt).toBe(
+        false,
+      );
+
+      // Step 2: a forced /compress succeeds. After this, the latch must
+      // be cleared so future auto-compressions are not suppressed.
+      compressSpy.mockResolvedValueOnce({
+        newHistory: [
+          { role: 'user', parts: [{ text: 'summary' }] },
+          { role: 'model', parts: [{ text: 'ack' }] },
+        ],
+        info: {
+          originalTokenCount: 100_000,
+          newTokenCount: 30_000,
+          compressionStatus: CompressionStatus.COMPRESSED,
+        },
+      });
+      await chat.tryCompress('prompt-latch-force', 'test-model', true);
+      // tryCompress was called with force=true, so the service got latch=true
+      // (the gate is `hasFailedCompressionAttempt && !force`, force overrides).
+      expect(compressSpy.mock.calls[1][1].hasFailedCompressionAttempt).toBe(
+        true,
+      );
+
+      // Step 3: next auto-compression sees the cleared latch.
+      compressSpy.mockResolvedValueOnce({
+        newHistory: null,
+        info: {
+          originalTokenCount: 30_000,
+          newTokenCount: 30_000,
+          compressionStatus: CompressionStatus.NOOP,
+        },
+      });
+      vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue(
+        makeStreamResponse(),
+      );
+      const stream2 = await chat.sendMessageStream(
+        'test-model',
+        { message: 'second' },
+        'prompt-latch-2',
+      );
+      for await (const _ of stream2) {
+        /* consume */
+      }
+      expect(compressSpy.mock.calls[2][1].hasFailedCompressionAttempt).toBe(
+        false,
+      );
+    });
+  });
+
   describe('addHistory', () => {
     it('should add a new content item to the history', () => {
       const newContent: Content = {
@@ -2425,4 +2651,132 @@ describe('GeminiChat', async () => {
       expect(mergedText).toBe('BCD');
     });
   });
+
+  // Compression logic is tested in chatCompressionService.test.ts; this
+  // suite covers per-chat state on GeminiChat: hasFailedCompressionAttempt
+  // stickiness, token-count mutation, history replacement, and conditional
+  // telemetry mirroring.
+  describe('tryCompress (per-chat state)', () => {
+    const userMsg = (text: string) => ({
+      role: 'user' as const,
+      parts: [{ text }],
+    });
+    const modelMsg = (text: string) => ({
+      role: 'model' as const,
+      parts: [{ text }],
+    });
+
+    /**
+     * Mock a successful compression: the service returns COMPRESSED with a
+     * fresh history. We don't go through the real
+     * `config.getContentGenerator().generateContent` path here — the service
+     * is mocked at the boundary.
+     */
+    function mockCompressionService(
+      result: 'compressed' | 'failed-inflated' | 'noop',
+    ) {
+      const compressSpy = vi.spyOn(
+        ChatCompressionService.prototype,
+        'compress',
+      );
+      if (result === 'compressed') {
+        compressSpy.mockResolvedValue({
+          newHistory: [userMsg('summary'), modelMsg('ok'), userMsg('latest')],
+          info: {
+            originalTokenCount: 1000,
+            newTokenCount: 200,
+            compressionStatus: CompressionStatus.COMPRESSED,
+          },
+        });
+      } else if (result === 'failed-inflated') {
+        compressSpy.mockResolvedValue({
+          newHistory: null,
+          info: {
+            originalTokenCount: 1000,
+            newTokenCount: 1100,
+            compressionStatus:
+              CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT,
+          },
+        });
+      } else {
+        compressSpy.mockResolvedValue({
+          newHistory: null,
+          info: {
+            originalTokenCount: 0,
+            newTokenCount: 0,
+            compressionStatus: CompressionStatus.NOOP,
+          },
+        });
+      }
+      return compressSpy;
+    }
+
+    it('replaces history and updates per-chat lastPromptTokenCount on COMPRESSED', async () => {
+      mockCompressionService('compressed');
+      chat.setHistory([userMsg('a'), modelMsg('b'), userMsg('c')]);
+
+      const info = await chat.tryCompress('p1', 'm1');
+
+      expect(info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
+      expect(chat.getHistory()).toHaveLength(3);
+      expect(chat.getHistory()[0]).toEqual(userMsg('summary'));
+      expect(chat.getLastPromptTokenCount()).toBe(200);
+    });
+
+    it('mirrors lastPromptTokenCount to the global telemetry only when wired', async () => {
+      mockCompressionService('compressed');
+      // chat under test was constructed with telemetryService=uiTelemetryService.
+      await chat.tryCompress('p2', 'm1');
+      expect(uiTelemetryService.setLastPromptTokenCount).toHaveBeenCalledWith(
+        200,
+      );
+
+      // A subagent-style chat with no telemetryService must NOT touch the
+      // global singleton (per the constructor docstring; per-chat counter
+      // still updates).
+      const subagentChat = new GeminiChat(mockConfig, config, []);
+      vi.mocked(uiTelemetryService.setLastPromptTokenCount).mockClear();
+      mockCompressionService('compressed');
+      const info = await subagentChat.tryCompress('p3', 'm1');
+      expect(info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
+      expect(subagentChat.getLastPromptTokenCount()).toBe(200);
+      expect(uiTelemetryService.setLastPromptTokenCount).not.toHaveBeenCalled();
+    });
+
+    it('marks hasFailedCompressionAttempt and suppresses subsequent unforced auto-compactions', async () => {
+      const compressSpy = mockCompressionService('failed-inflated');
+
+      const first = await chat.tryCompress('p1', 'm1');
+      expect(first.compressionStatus).toBe(
+        CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT,
+      );
+      expect(compressSpy).toHaveBeenCalledTimes(1);
+
+      // The next unforced call should reach the service with
+      // hasFailedCompressionAttempt=true; the service's threshold check then
+      // returns NOOP. The important thing here is that GeminiChat actually
+      // forwards the sticky flag.
+      compressSpy.mockClear();
+      compressSpy.mockResolvedValue({
+        newHistory: null,
+        info: {
+          originalTokenCount: 0,
+          newTokenCount: 0,
+          compressionStatus: CompressionStatus.NOOP,
+        },
+      });
+      await chat.tryCompress('p2', 'm1');
+      expect(compressSpy).toHaveBeenCalledTimes(1);
+      expect(compressSpy.mock.calls[0][1].hasFailedCompressionAttempt).toBe(
+        true,
+      );
+    });
+
+    it('forwards force=true to the compression service', async () => {
+      const compressSpy = mockCompressionService('compressed');
+
+      await chat.tryCompress('p1', 'm1', true);
+      expect(compressSpy.mock.calls[0][1].force).toBe(true);
+    });
+  });
 });
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index 804c143d8..6249a56db 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -31,11 +31,13 @@ import {
   logContentRetryFailure,
 } from '../telemetry/loggers.js';
 import { type ChatRecordingService } from '../services/chatRecordingService.js';
+import { ChatCompressionService } from '../services/chatCompressionService.js';
 import {
   ContentRetryEvent,
   ContentRetryFailureEvent,
 } from '../telemetry/types.js';
 import type { UiTelemetryService } from '../telemetry/uiTelemetry.js';
+import { type ChatCompressionInfo, CompressionStatus } from './turn.js';
 
 const debugLogger = createDebugLogger('QWEN_CODE_CHAT');
 
@@ -45,6 +47,11 @@ export enum StreamEventType {
   /** A signal that a retry is about to happen. The UI should discard any partial
    * content from the attempt that just failed. */
   RETRY = 'retry',
+  /** Emitted once at the start of the stream when an automatic compression
+   * pass succeeded. Carries the compression result so callers (the main
+   * agent UI, subagent loop) can surface it without each call site running
+   * its own compaction step. */
+  COMPRESSED = 'compressed',
 }
 
 export type StreamEvent =
@@ -56,7 +63,8 @@ export type StreamEvent =
        *  fresh restart (escalation). The UI should keep the accumulated text
        *  buffer so the continuation appends to it. */
       isContinuation?: boolean;
-    };
+    }
+  | { type: StreamEventType.COMPRESSED; info: ChatCompressionInfo };
 
 /**
  * Options for retrying due to invalid content from the model.
@@ -299,6 +307,22 @@ export class GeminiChat {
   // model.
   private sendPromise: Promise<void> = Promise.resolve();
 
+  /**
+   * Per-chat last-prompt-token-count, populated from `usageMetadata` on each
+   * model response. Used by the compaction threshold check so that subagents
+   * (which intentionally don't write to the global telemetry singleton) can
+   * still make compaction decisions based on their *own* context size.
+   */
+  private lastPromptTokenCount = 0;
+
+  /**
+   * Per-chat sticky flag. After an unforced compression attempt fails (empty
+   * summary or inflated token count), automatic compaction is suppressed
+   * for the remainder of this chat to avoid burning compression API calls
+   * in a loop. Manual `/compress` still works (it passes `force=true`).
+   */
+  private hasFailedCompressionAttempt = false;
+
   /**
    * Creates a new GeminiChat instance.
    *
@@ -321,6 +345,96 @@ export class GeminiChat {
     validateHistory(history);
   }
 
+  /**
+   * Most recent prompt-token count reported by the model for *this* chat,
+   * mirroring the value in {@link UiTelemetryService} for the main session.
+   * Subagent chats have no telemetry service wired but still need a per-chat
+   * count for compaction decisions, so this is always populated regardless
+   * of whether the global telemetry is updated.
+   */
+  getLastPromptTokenCount(): number {
+    return this.lastPromptTokenCount;
+  }
+
+  /**
+   * Seed the last-prompt-token-count for chats created with inherited
+   * history (forks, subagents, speculation). Without this, the auto-compress
+   * threshold check sees `0` and refuses to compress — so the first API call
+   * can 400 from oversized history. Callers pass the parent chat's
+   * `getLastPromptTokenCount()` here.
+   */
+  setLastPromptTokenCount(count: number): void {
+    this.lastPromptTokenCount = count;
+  }
+
+  /**
+   * Attempt to compress this chat's history.
+   *
+   * Returns the compression info regardless of outcome. On a successful
+   * compaction (`COMPRESSED`), this method has already mutated the chat's
+   * history, recorded the event to `chatRecordingService` (if wired), and
+   * updated both the per-chat token count and (when wired) the global
+   * telemetry singleton.
+   */
+  async tryCompress(
+    promptId: string,
+    model: string,
+    force = false,
+    signal?: AbortSignal,
+  ): Promise<ChatCompressionInfo> {
+    const service = new ChatCompressionService();
+    const { newHistory, info } = await service.compress(this, {
+      promptId,
+      force,
+      model,
+      config: this.config,
+      hasFailedCompressionAttempt: this.hasFailedCompressionAttempt,
+      originalTokenCount: this.lastPromptTokenCount,
+      signal,
+    });
+
+    if (info.compressionStatus === CompressionStatus.COMPRESSED && newHistory) {
+      this.chatRecordingService?.recordChatCompression({
+        info,
+        compressedHistory: newHistory,
+      });
+      // Auto-compaction replaces history in place — no env-context refresh
+      // here. Manual /compress goes through GeminiClient.tryCompressChat,
+      // which calls startChat() to re-prepend a fresh env snapshot. See
+      // GeminiClient.sendMessageStream for the rationale behind the split.
+      this.setHistory(newHistory);
+      // Compaction summarises away prior full-Read tool results, but the
+      // FileReadCache still treats those reads as "in this conversation".
+      // A follow-up Read could then return the file_unchanged placeholder
+      // pointing at content the model can no longer retrieve from history.
+      debugLogger.debug('[FILE_READ_CACHE] clear after auto tryCompress');
+      this.config.getFileReadCache().clear();
+      this.lastPromptTokenCount = info.newTokenCount;
+      // Mirror to the global singleton only when wired (main session).
+      // Subagents pass `telemetryService=undefined` to keep their context
+      // usage out of the main agent's UI counters.
+      this.telemetryService?.setLastPromptTokenCount(info.newTokenCount);
+      // Re-enable auto-compaction so a forced /compress recovers a chat
+      // that an earlier auto-attempt latched off.
+      this.hasFailedCompressionAttempt = false;
+    } else if (
+      info.compressionStatus ===
+        CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT ||
+      info.compressionStatus ===
+        CompressionStatus.COMPRESSION_FAILED_EMPTY_SUMMARY ||
+      info.compressionStatus ===
+        CompressionStatus.COMPRESSION_FAILED_TOKEN_COUNT_ERROR
+    ) {
+      // Track failed attempts (only mark as failed if not forced) so we
+      // stop spending compression-API calls on a chat that can't shrink.
+      if (!force) {
+        this.hasFailedCompressionAttempt = true;
+      }
+    }
+
+    return info;
+  }
+
   setSystemInstruction(sysInstr: string) {
     this.generationConfig.systemInstruction = sysInstr;
   }
@@ -360,6 +474,23 @@ export class GeminiChat {
     });
     this.sendPromise = streamDonePromise;
 
+    // The send-lock above is held but the generator's `finally` (which
+    // resolves it) has not run yet — if `tryCompress` throws, we must
+    // release the lock here or subsequent sends will block forever at
+    // `await this.sendPromise`.
+    let compressionInfo: ChatCompressionInfo;
+    try {
+      compressionInfo = await this.tryCompress(
+        prompt_id,
+        model,
+        false,
+        params.config?.abortSignal,
+      );
+    } catch (error) {
+      streamDoneResolver!();
+      throw error;
+    }
+
     const userContent = createUserContent(params.message);
 
     // Add user content to history ONCE before any attempts.
@@ -370,6 +501,20 @@ export class GeminiChat {
     const self = this;
     return (async function* () {
       try {
+        // Surface a successful auto-compression to the caller as the first
+        // event in the stream. Failed/skipped compaction attempts are silent.
+        // Must be inside the try so that a consumer abandoning the stream
+        // immediately after this event still triggers the finally below;
+        // otherwise `streamDoneResolver` never fires and the next send hangs.
+        if (
+          compressionInfo.compressionStatus === CompressionStatus.COMPRESSED
+        ) {
+          yield {
+            type: StreamEventType.COMPRESSED,
+            info: compressionInfo,
+          };
+        }
+
         let lastError: unknown = new Error('Request failed after all retries.');
         let rateLimitRetryCount = 0;
         let invalidStreamRetryCount = 0;
@@ -890,8 +1035,14 @@ export class GeminiChat {
         // Some providers omit total_tokens or return 0 in streaming usage chunks.
         const lastPromptTokenCount =
           usageMetadata.totalTokenCount || usageMetadata.promptTokenCount;
-        if (lastPromptTokenCount && this.telemetryService) {
-          this.telemetryService.setLastPromptTokenCount(lastPromptTokenCount);
+        if (lastPromptTokenCount) {
+          // Always update the per-chat counter so this chat (including
+          // subagents) can make its own compaction decisions.
+          this.lastPromptTokenCount = lastPromptTokenCount;
+          // Mirror to the global telemetry only when wired — subagents
+          // pass `telemetryService=undefined` to keep their context usage
+          // out of the main session's UI counters.
+          this.telemetryService?.setLastPromptTokenCount(lastPromptTokenCount);
         }
         if (usageMetadata.cachedContentTokenCount && this.telemetryService) {
           this.telemetryService.setLastCachedContentTokenCount(
diff --git a/packages/core/src/core/turn.test.ts b/packages/core/src/core/turn.test.ts
index 92c86f9ed..6626f56c5 100644
--- a/packages/core/src/core/turn.test.ts
+++ b/packages/core/src/core/turn.test.ts
@@ -9,7 +9,7 @@ import type {
   ServerGeminiToolCallRequestEvent,
   ServerGeminiErrorEvent,
 } from './turn.js';
-import { Turn, GeminiEventType } from './turn.js';
+import { CompressionStatus, Turn, GeminiEventType } from './turn.js';
 import type { GenerateContentResponse, Part, Content } from '@google/genai';
 import { reportError } from '../utils/errorReporting.js';
 import type { GeminiChat } from './geminiChat.js';
@@ -845,6 +845,38 @@ describe('Turn', () => {
         { type: GeminiEventType.Content, value: 'Success' },
       ]);
     });
+
+    it('bridges a compressed stream event to a ChatCompressed event', async () => {
+      const compressionInfo = {
+        originalTokenCount: 1000,
+        newTokenCount: 200,
+        compressionStatus: CompressionStatus.COMPRESSED,
+      };
+      const mockResponseStream = (async function* () {
+        yield { type: StreamEventType.COMPRESSED, info: compressionInfo };
+        yield {
+          type: StreamEventType.CHUNK,
+          value: {
+            candidates: [{ content: { parts: [{ text: 'after' }] } }],
+          },
+        };
+      })();
+      mockSendMessageStream.mockResolvedValue(mockResponseStream);
+
+      const events = [];
+      for await (const event of turn.run(
+        'test-model',
+        [],
+        new AbortController().signal,
+      )) {
+        events.push(event);
+      }
+
+      expect(events).toEqual([
+        { type: GeminiEventType.ChatCompressed, value: compressionInfo },
+        { type: GeminiEventType.Content, value: 'after' },
+      ]);
+    });
   });
 
   describe('getDebugResponses', () => {
diff --git a/packages/core/src/core/turn.ts b/packages/core/src/core/turn.ts
index b0971d0b6..34f78bcd0 100644
--- a/packages/core/src/core/turn.ts
+++ b/packages/core/src/core/turn.ts
@@ -306,6 +306,19 @@ export class Turn {
           continue; // Skip to the next event in the stream
         }
 
+        // Surface auto-compaction that fired inside chat.sendMessageStream
+        // as the top-level ChatCompressed event so existing UI handlers stay
+        // connected. This bridge is the primary path for auto-compaction
+        // events; manual /compress emits its own ChatCompressed in
+        // GeminiClient.tryCompressChat.
+        if (streamEvent.type === 'compressed') {
+          yield {
+            type: GeminiEventType.ChatCompressed,
+            value: streamEvent.info,
+          };
+          continue;
+        }
+
         // Assuming other events are chunks with a `value` property
         const resp = streamEvent.value as GenerateContentResponse;
         if (!resp) continue; // Skip if there's no response body
diff --git a/packages/core/src/services/chatCompressionService.test.ts b/packages/core/src/services/chatCompressionService.test.ts
index 417988331..3a3520cce 100644
--- a/packages/core/src/services/chatCompressionService.test.ts
+++ b/packages/core/src/services/chatCompressionService.test.ts
@@ -75,15 +75,17 @@ describe('findCompressSplitPoint', () => {
     expect(findCompressSplitPoint(history, 0.8)).toBe(4);
   });
 
-  it('should return earlier splitpoint if no valid ones are after threshhold', () => {
+  it('compresses everything before the trailing in-flight functionCall', () => {
     const history: Content[] = [
       { role: 'user', parts: [{ text: 'This is the first message.' }] },
       { role: 'model', parts: [{ text: 'This is the second message.' }] },
       { role: 'user', parts: [{ text: 'This is the third message.' }] },
       { role: 'model', parts: [{ functionCall: { name: 'foo', args: {} } }] },
     ];
-    // Can't return 4 because the previous item has a function call.
-    expect(findCompressSplitPoint(history, 0.99)).toBe(2);
+    // Trailing m+fc is in-flight; no preceding (m+fc, u+fr) pair to retain,
+    // so the in-flight fallback compresses everything except the trailing fc.
+    // The kept slice starts with m+fc; callers bridge with a synthetic user.
+    expect(findCompressSplitPoint(history, 0.99)).toBe(3);
   });
 
   it('should handle a history with only one item', () => {
@@ -143,7 +145,7 @@ describe('findCompressSplitPoint', () => {
     expect(findCompressSplitPoint(history, 0.7)).toBe(5);
   });
 
-  it('should return primary split point when tool completions have no subsequent regular user message', () => {
+  it('retains last K complete tool rounds when no fresh user splits past target', () => {
     const history: Content[] = [
       { role: 'user', parts: [{ text: 'Fix this' }] },
       {
@@ -181,14 +183,13 @@ describe('findCompressSplitPoint', () => {
         parts: [{ functionCall: { name: 'write1', args: {} } }],
       },
     ];
-    // Only one non-functionResponse user message (index 0) -> lastSplitPoint=0
-    // Last message has functionCall -> can't compress everything
-    // historyToKeep must start with a regular user message, so split at 0
-    // (compress nothing) is the only valid option.
-    expect(findCompressSplitPoint(history, 0.7)).toBe(0);
+    // 2 complete (m+fc, u+fr) pairs precede the trailing fc → retain both
+    // pairs + trailing fc = last 5 entries; compress index 0 (the task).
+    // Pre-refactor this returned 0 (NOOP); now it compresses-most.
+    expect(findCompressSplitPoint(history, 0.7)).toBe(history.length - 5);
   });
 
-  it('should prefer primary split point when tool completions yield no valid user-starting split', () => {
+  it('prefers compress-most over lastSplitPoint when scan finds no clean split past target', () => {
     const longContent = 'a'.repeat(10000);
     const history: Content[] = [
       { role: 'user', parts: [{ text: 'Fix bug A' }] },
@@ -229,13 +230,12 @@ describe('findCompressSplitPoint', () => {
         parts: [{ functionCall: { name: 'write1', args: {} } }],
       },
     ];
-    // Primary split points at 0 and 2 (regular user messages before the bulky tool outputs)
-    // Last message has functionCall -> can't compress everything
-    // Should return lastSplitPoint=2 (last valid primary split point)
-    expect(findCompressSplitPoint(history, 0.7)).toBe(2);
+    // 2 complete pairs before the trailing fc → retain both + trailing = 5
+    // entries kept. Pre-refactor returned lastSplitPoint=2 (compress less).
+    expect(findCompressSplitPoint(history, 0.7)).toBe(history.length - 5);
   });
 
-  it('should still prefer primary split point when it is better', () => {
+  it('compresses-most via in-flight fallback when scan never crosses the target', () => {
     const history: Content[] = [
       { role: 'user', parts: [{ text: 'msg1' }] },
       { role: 'model', parts: [{ text: 'resp1' }] },
@@ -266,10 +266,93 @@ describe('findCompressSplitPoint', () => {
         parts: [{ functionCall: { name: 'tool2', args: {} } }],
       },
     ];
-    // Primary split points: 0, 2, 5, 7
-    // Last message has functionCall -> can't compress everything
-    // At 0.99 fraction, lastSplitPoint should be 7
-    expect(findCompressSplitPoint(history, 0.99)).toBe(7);
+    // The entry before the trailing fc is a fresh user (msg4), not a u+fr,
+    // so the pair walk stops with 0 pairs found → retain only the trailing
+    // fc, compress everything else. Pre-refactor returned lastSplitPoint=7.
+    expect(findCompressSplitPoint(history, 0.99)).toBe(history.length - 1);
+  });
+});
+
+describe('findCompressSplitPoint — in-flight fallback', () => {
+  const userTask = (text: string): Content => ({
+    role: 'user',
+    parts: [{ text }],
+  });
+  const modelText = (text: string): Content => ({
+    role: 'model',
+    parts: [{ text }],
+  });
+  const modelFc = (name: string): Content => ({
+    role: 'model',
+    parts: [{ functionCall: { name, args: {} } }],
+  });
+  const userFr = (name: string): Content => ({
+    role: 'user',
+    parts: [{ functionResponse: { name, response: { result: 'x' } } }],
+  });
+
+  // Subagent-shaped history at compression check time: env bootstrap, task,
+  // alternating tool rounds, ending in a trailing in-flight model+fc whose
+  // functionResponse hasn't been pushed yet. The scan finds no clean split
+  // past the target fraction, so the in-flight fallback decides the index.
+  it('compresses everything except trailing fc + most recent retainCount pairs', () => {
+    const history = [
+      userTask('env'),
+      modelText('env-ack'),
+      userTask('task'),
+      modelFc('a'),
+      userFr('a'),
+      modelFc('b'),
+      userFr('b'),
+      modelFc('c'),
+      userFr('c'),
+      modelFc('d'),
+      userFr('d'),
+      modelFc('trailing'),
+    ];
+    // Default retainCount = 2 → keep last 5 (2 pairs + trailing).
+    expect(findCompressSplitPoint(history, 0.7)).toBe(history.length - 5);
+  });
+
+  it('retains all pairs when fewer than retainCount exist', () => {
+    const history = [
+      userTask('env'),
+      modelText('env-ack'),
+      userTask('task'),
+      modelFc('a'),
+      userFr('a'),
+      modelFc('trailing'),
+    ];
+    // Only 1 complete pair → keep last 3 (1 pair + trailing).
+    expect(findCompressSplitPoint(history, 0.7)).toBe(history.length - 3);
+  });
+
+  it('retains just the trailing fc when no complete pairs precede it', () => {
+    const history = [
+      userTask('env'),
+      modelText('env-ack'),
+      userTask('task'),
+      modelFc('trailing'),
+    ];
+    // No complete pairs → keep only the trailing fc.
+    expect(findCompressSplitPoint(history, 0.7)).toBe(history.length - 1);
+  });
+
+  it('respects an explicit retainCount override', () => {
+    const history = [
+      userTask('env'),
+      modelText('env-ack'),
+      userTask('task'),
+      modelFc('a'),
+      userFr('a'),
+      modelFc('b'),
+      userFr('b'),
+      modelFc('c'),
+      userFr('c'),
+      modelFc('trailing'),
+    ];
+    // Override retainCount to 1 → keep last 3 (1 pair + trailing).
+    expect(findCompressSplitPoint(history, 0.7, 1)).toBe(history.length - 3);
   });
 });
 
@@ -313,14 +396,14 @@ describe('ChatCompressionService', () => {
 
   it('should return NOOP if history is empty', async () => {
     vi.mocked(mockChat.getHistory).mockReturnValue([]);
-    const result = await service.compress(
-      mockChat,
-      mockPromptId,
-      false,
-      mockModel,
-      mockConfig,
-      false,
-    );
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: false,
+      model: mockModel,
+      config: mockConfig,
+      hasFailedCompressionAttempt: false,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
     expect(result.info.compressionStatus).toBe(CompressionStatus.NOOP);
     expect(result.newHistory).toBeNull();
   });
@@ -329,14 +412,14 @@ describe('ChatCompressionService', () => {
     vi.mocked(mockChat.getHistory).mockReturnValue([
       { role: 'user', parts: [{ text: 'hi' }] },
     ]);
-    const result = await service.compress(
-      mockChat,
-      mockPromptId,
-      false,
-      mockModel,
-      mockConfig,
-      true,
-    );
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: false,
+      model: mockModel,
+      config: mockConfig,
+      hasFailedCompressionAttempt: true,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
     expect(result.info.compressionStatus).toBe(CompressionStatus.NOOP);
     expect(result.newHistory).toBeNull();
   });
@@ -349,14 +432,14 @@ describe('ChatCompressionService', () => {
     vi.mocked(tokenLimit).mockReturnValue(1000);
     // Threshold is 0.7 * 1000 = 700. 600 < 700, so NOOP.
 
-    const result = await service.compress(
-      mockChat,
-      mockPromptId,
-      false,
-      mockModel,
-      mockConfig,
-      false,
-    );
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: false,
+      model: mockModel,
+      config: mockConfig,
+      hasFailedCompressionAttempt: false,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
     expect(result.info.compressionStatus).toBe(CompressionStatus.NOOP);
     expect(result.newHistory).toBeNull();
   });
@@ -377,14 +460,14 @@ describe('ChatCompressionService', () => {
       generateContent: mockGenerateContent,
     } as unknown as ContentGenerator);
 
-    const result = await service.compress(
-      mockChat,
-      mockPromptId,
-      false,
-      mockModel,
-      mockConfig,
-      false,
-    );
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: false,
+      model: mockModel,
+      config: mockConfig,
+      hasFailedCompressionAttempt: false,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
 
     expect(result.info).toMatchObject({
       compressionStatus: CompressionStatus.NOOP,
@@ -394,14 +477,14 @@ describe('ChatCompressionService', () => {
     expect(mockGenerateContent).not.toHaveBeenCalled();
     expect(tokenLimit).not.toHaveBeenCalled();
 
-    const forcedResult = await service.compress(
-      mockChat,
-      mockPromptId,
-      true,
-      mockModel,
-      mockConfig,
-      false,
-    );
+    const forcedResult = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: true,
+      model: mockModel,
+      config: mockConfig,
+      hasFailedCompressionAttempt: false,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
     expect(forcedResult.info).toMatchObject({
       compressionStatus: CompressionStatus.NOOP,
       originalTokenCount: 0,
@@ -438,14 +521,14 @@ describe('ChatCompressionService', () => {
     } as unknown as ContentGenerator);
 
     // force=true bypasses the token threshold gate so we exercise the 5% guard
-    const result = await service.compress(
-      mockChat,
-      mockPromptId,
-      true,
-      mockModel,
-      mockConfig,
-      false,
-    );
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: true,
+      model: mockModel,
+      config: mockConfig,
+      hasFailedCompressionAttempt: false,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
 
     expect(result.info.compressionStatus).toBe(CompressionStatus.NOOP);
     expect(result.newHistory).toBeNull();
@@ -485,14 +568,14 @@ describe('ChatCompressionService', () => {
       generateContent: mockGenerateContent,
     } as unknown as ContentGenerator);
 
-    const result = await service.compress(
-      mockChat,
-      mockPromptId,
-      false,
-      mockModel,
-      mockConfig,
-      false,
-    );
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: false,
+      model: mockModel,
+      config: mockConfig,
+      hasFailedCompressionAttempt: false,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
 
     expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
     expect(result.info.newTokenCount).toBe(250); // 800 - (1600 - 1000) + 50
@@ -539,14 +622,15 @@ describe('ChatCompressionService', () => {
       generateContent: mockGenerateContent,
     } as unknown as ContentGenerator);
 
-    const result = await service.compress(
-      mockChat,
-      mockPromptId,
-      true, // forced
-      mockModel,
-      mockConfig,
-      false,
-    );
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: true,
+      // forced
+      model: mockModel,
+      config: mockConfig,
+      hasFailedCompressionAttempt: false,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
 
     expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
     expect(result.newHistory).not.toBeNull();
@@ -586,14 +670,14 @@ describe('ChatCompressionService', () => {
       generateContent: mockGenerateContent,
     } as unknown as ContentGenerator);
 
-    const result = await service.compress(
-      mockChat,
-      mockPromptId,
-      true,
-      mockModel,
-      mockConfig,
-      false,
-    );
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: true,
+      model: mockModel,
+      config: mockConfig,
+      hasFailedCompressionAttempt: false,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
 
     expect(result.info.compressionStatus).toBe(
       CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT,
@@ -629,14 +713,14 @@ describe('ChatCompressionService', () => {
       generateContent: mockGenerateContent,
     } as unknown as ContentGenerator);
 
-    const result = await service.compress(
-      mockChat,
-      mockPromptId,
-      false,
-      mockModel,
-      mockConfig,
-      false,
-    );
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: false,
+      model: mockModel,
+      config: mockConfig,
+      hasFailedCompressionAttempt: false,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
 
     expect(result.info.compressionStatus).toBe(
       CompressionStatus.COMPRESSION_FAILED_TOKEN_COUNT_ERROR,
@@ -668,14 +752,14 @@ describe('ChatCompressionService', () => {
       generateContent: mockGenerateContent,
     } as unknown as ContentGenerator);
 
-    const result = await service.compress(
-      mockChat,
-      mockPromptId,
-      true,
-      mockModel,
-      mockConfig,
-      false,
-    );
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: true,
+      model: mockModel,
+      config: mockConfig,
+      hasFailedCompressionAttempt: false,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
 
     expect(result.info.compressionStatus).toBe(
       CompressionStatus.COMPRESSION_FAILED_EMPTY_SUMMARY,
@@ -707,14 +791,14 @@ describe('ChatCompressionService', () => {
       generateContent: mockGenerateContent,
     } as unknown as ContentGenerator);
 
-    const result = await service.compress(
-      mockChat,
-      mockPromptId,
-      true,
-      mockModel,
-      mockConfig,
-      false,
-    );
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: true,
+      model: mockModel,
+      config: mockConfig,
+      hasFailedCompressionAttempt: false,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
 
     expect(result.info.compressionStatus).toBe(
       CompressionStatus.COMPRESSION_FAILED_EMPTY_SUMMARY,
@@ -749,14 +833,14 @@ describe('ChatCompressionService', () => {
       generateContent: mockGenerateContent,
     } as unknown as ContentGenerator);
 
-    const result = await service.compress(
-      mockChat,
-      mockPromptId,
-      true,
-      mockModel,
-      mockConfig,
-      false,
-    );
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: true,
+      model: mockModel,
+      config: mockConfig,
+      hasFailedCompressionAttempt: false,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
 
     expect(result.info.compressionStatus).toBe(
       CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT,
@@ -801,14 +885,14 @@ describe('ChatCompressionService', () => {
       generateContent: mockGenerateContent,
     } as unknown as ContentGenerator);
 
-    const result = await service.compress(
-      mockChat,
-      mockPromptId,
-      false,
-      mockModel,
-      mockConfig,
-      false,
-    );
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: false,
+      model: mockModel,
+      config: mockConfig,
+      hasFailedCompressionAttempt: false,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
 
     // Should still complete compression despite hook error
     expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
@@ -860,14 +944,15 @@ describe('ChatCompressionService', () => {
         generateContent: mockGenerateContent,
       } as unknown as ContentGenerator);
 
-      await service.compress(
-        mockChat,
-        mockPromptId,
-        true, // force = true -> Manual trigger
-        mockModel,
-        mockConfig,
-        false,
-      );
+      await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: true,
+        // force = true -> Manual trigger
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
 
       expect(mockFirePreCompactEvent).toHaveBeenCalledWith(
         PreCompactTrigger.Manual,
@@ -910,14 +995,15 @@ describe('ChatCompressionService', () => {
         generateContent: mockGenerateContent,
       } as unknown as ContentGenerator);
 
-      await service.compress(
-        mockChat,
-        mockPromptId,
-        false, // force = false -> Auto trigger
-        mockModel,
-        mockConfig,
-        false,
-      );
+      await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: false,
+        // force = false -> Auto trigger
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
 
       expect(mockFirePreCompactEvent).toHaveBeenCalledWith(
         PreCompactTrigger.Auto,
@@ -929,14 +1015,14 @@ describe('ChatCompressionService', () => {
     it('should not fire PreCompact hook when history is empty', async () => {
       vi.mocked(mockChat.getHistory).mockReturnValue([]);
 
-      const result = await service.compress(
-        mockChat,
-        mockPromptId,
-        true,
-        mockModel,
-        mockConfig,
-        false,
-      );
+      const result = await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: true,
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
 
       expect(result.info.compressionStatus).toBe(CompressionStatus.NOOP);
       expect(mockFirePreCompactEvent).not.toHaveBeenCalled();
@@ -952,14 +1038,14 @@ describe('ChatCompressionService', () => {
         contextPercentageThreshold: 0,
       });
 
-      const result = await service.compress(
-        mockChat,
-        mockPromptId,
-        true,
-        mockModel,
-        mockConfig,
-        false,
-      );
+      const result = await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: true,
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
 
       expect(result.info.compressionStatus).toBe(CompressionStatus.NOOP);
       expect(mockFirePreCompactEvent).not.toHaveBeenCalled();
@@ -976,14 +1062,14 @@ describe('ChatCompressionService', () => {
       );
       vi.mocked(tokenLimit).mockReturnValue(1000);
 
-      const result = await service.compress(
-        mockChat,
-        mockPromptId,
-        false,
-        mockModel,
-        mockConfig,
-        false,
-      );
+      const result = await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: false,
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
 
       expect(result.info.compressionStatus).toBe(CompressionStatus.NOOP);
       expect(mockFirePreCompactEvent).not.toHaveBeenCalled();
@@ -1027,14 +1113,14 @@ describe('ChatCompressionService', () => {
         generateContent: mockGenerateContent,
       } as unknown as ContentGenerator);
 
-      const result = await service.compress(
-        mockChat,
-        mockPromptId,
-        false,
-        mockModel,
-        mockConfig,
-        false,
-      );
+      const result = await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: false,
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
 
       // Should still complete compression despite hook error
       expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
@@ -1084,14 +1170,14 @@ describe('ChatCompressionService', () => {
         generateContent: mockGenerateContent,
       } as unknown as ContentGenerator);
 
-      await service.compress(
-        mockChat,
-        mockPromptId,
-        false,
-        mockModel,
-        mockConfig,
-        false,
-      );
+      await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: false,
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
 
       // PreCompact should be called before SessionStart
       expect(callOrder).toEqual(['PreCompact', 'SessionStart']);
@@ -1133,14 +1219,14 @@ describe('ChatCompressionService', () => {
         generateContent: mockGenerateContent,
       } as unknown as ContentGenerator);
 
-      const result = await service.compress(
-        mockChat,
-        mockPromptId,
-        false,
-        mockModel,
-        mockConfig,
-        false,
-      );
+      const result = await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: false,
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
 
       // Should still complete compression without hook
       expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
@@ -1195,14 +1281,15 @@ describe('ChatCompressionService', () => {
         generateContent: mockGenerateContent,
       } as unknown as ContentGenerator);
 
-      await service.compress(
-        mockChat,
-        mockPromptId,
-        true, // force = true -> Manual trigger
-        mockModel,
-        mockConfig,
-        false,
-      );
+      await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: true,
+        // force = true -> Manual trigger
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
 
       expect(mockFirePostCompactEvent).toHaveBeenCalledWith(
         PostCompactTrigger.Manual,
@@ -1245,14 +1332,15 @@ describe('ChatCompressionService', () => {
         generateContent: mockGenerateContent,
       } as unknown as ContentGenerator);
 
-      await service.compress(
-        mockChat,
-        mockPromptId,
-        false, // force = false -> Auto trigger
-        mockModel,
-        mockConfig,
-        false,
-      );
+      await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: false,
+        // force = false -> Auto trigger
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
 
       expect(mockFirePostCompactEvent).toHaveBeenCalledWith(
         PostCompactTrigger.Auto,
@@ -1292,14 +1380,14 @@ describe('ChatCompressionService', () => {
         generateContent: mockGenerateContent,
       } as unknown as ContentGenerator);
 
-      const result = await service.compress(
-        mockChat,
-        mockPromptId,
-        true,
-        mockModel,
-        mockConfig,
-        false,
-      );
+      const result = await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: true,
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
 
       expect(result.info.compressionStatus).toBe(
         CompressionStatus.COMPRESSION_FAILED_EMPTY_SUMMARY,
@@ -1345,14 +1433,14 @@ describe('ChatCompressionService', () => {
         generateContent: mockGenerateContent,
       } as unknown as ContentGenerator);
 
-      const result = await service.compress(
-        mockChat,
-        mockPromptId,
-        false,
-        mockModel,
-        mockConfig,
-        false,
-      );
+      const result = await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: false,
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
 
       // Should still complete compression despite hook error
       expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
@@ -1405,14 +1493,14 @@ describe('ChatCompressionService', () => {
         generateContent: mockGenerateContent,
       } as unknown as ContentGenerator);
 
-      await service.compress(
-        mockChat,
-        mockPromptId,
-        false,
-        mockModel,
-        mockConfig,
-        false,
-      );
+      await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: false,
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
 
       // Hooks should be called in order: PreCompact -> SessionStart -> PostCompact
       expect(callOrder).toEqual(['PreCompact', 'SessionStart', 'PostCompact']);
@@ -1454,14 +1542,14 @@ describe('ChatCompressionService', () => {
         generateContent: mockGenerateContent,
       } as unknown as ContentGenerator);
 
-      const result = await service.compress(
-        mockChat,
-        mockPromptId,
-        false,
-        mockModel,
-        mockConfig,
-        false,
-      );
+      const result = await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: false,
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
 
       // Should still complete compression without hook
       expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
@@ -1535,14 +1623,15 @@ describe('ChatCompressionService', () => {
         generateContent: mockGenerateContent,
       } as unknown as ContentGenerator);
 
-      const result = await service.compress(
-        mockChat,
-        mockPromptId,
-        true, // force=true (manual /compress)
-        mockModel,
-        mockConfig,
-        false,
-      );
+      const result = await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: true,
+        // force=true (manual /compress)
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
 
       // Should compress successfully — orphaned funcCall is stripped first, then
       // normal compression runs on the remaining history, historyToKeep is empty
@@ -1558,10 +1647,12 @@ describe('ChatCompressionService', () => {
       expect(callArg.contents.length).toBe(history.length); // (history.length - 1) messages + 1 instruction
     });
 
-    it('should NOT compress orphaned funcCall when force=false (auto-compress)', async () => {
-      // Auto-compress fires BEFORE the matching funcResponse is sent back to the
-      // model. Compressing the funcCall away would orphan the upcoming funcResponse
-      // and cause an API error. So force=false must NOT take this path.
+    it('compresses-most without orphaning when last entry is in-flight funcCall (auto-compress)', async () => {
+      // Auto-compress fires BEFORE the matching funcResponse is sent back to
+      // the model. The trailing funcCall must be retained (its response is
+      // coming); the in-flight fallback compresses everything safely before
+      // it. Pre-refactor this returned NOOP, leaving the chat to grow until
+      // it 400'd.
       const history: Content[] = [
         { role: 'user', parts: [{ text: 'Fix all TypeScript errors.' }] },
         {
@@ -1586,7 +1677,6 @@ describe('ChatCompressionService', () => {
         },
       ];
       vi.mocked(mockChat.getHistory).mockReturnValue(history);
-      // Use a token count above threshold to ensure auto-compress isn't skipped
       vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(
         800,
       );
@@ -1595,23 +1685,195 @@ describe('ChatCompressionService', () => {
         contextWindowSize: 1000,
       } as unknown as ReturnType<typeof mockConfig.getContentGeneratorConfig>);
 
+      const mockGenerateContent = vi.fn().mockResolvedValue({
+        candidates: [
+          { content: { parts: [{ text: 'state snapshot summary' }] } },
+        ],
+        usageMetadata: {
+          promptTokenCount: 2000,
+          candidatesTokenCount: 50,
+          totalTokenCount: 2050,
+        },
+      } as unknown as GenerateContentResponse);
+      vi.mocked(mockConfig.getContentGenerator).mockReturnValue({
+        generateContent: mockGenerateContent,
+      } as unknown as ContentGenerator);
+
+      const result = await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: false,
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
+
+      expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
+      expect(mockGenerateContent).toHaveBeenCalledTimes(1);
+      // Trailing in-flight functionCall is preserved last in the kept slice
+      // so the upcoming functionResponse pairs with it.
+      const newHistory = result.newHistory!;
+      const last = newHistory[newHistory.length - 1];
+      expect(last.role).toBe('model');
+      expect(last.parts?.some((p) => p.functionCall)).toBe(true);
+      // Strict role alternation throughout.
+      for (let i = 1; i < newHistory.length; i++) {
+        expect(newHistory[i].role).not.toBe(newHistory[i - 1].role);
+      }
+    });
+  });
+
+  describe('tool-loop subagent absorption', () => {
+    // The fresh-user split heuristic produces a tiny compress slice when the
+    // history is dominated by tool rounds (every user past the task is a
+    // functionResponse). Without absorption, MIN_COMPRESSION_FRACTION would
+    // NOOP every send and the subagent eventually hits the 400 it was meant
+    // to avoid.
+    it('compresses by absorbing older tool rounds when fresh-user split is too small', async () => {
+      const FILLER = 'A'.repeat(20_000);
+      // Auto-compress fires BEFORE the next functionResponse is pushed, so
+      // the trailing entry is always a model+functionCall with no match yet.
+      // Build a history with N complete pairs followed by one trailing fc.
+      const buildHistory = (completePairs: number): Content[] => {
+        const h: Content[] = [
+          { role: 'user', parts: [{ text: 'env-bootstrap' }] },
+          { role: 'model', parts: [{ text: 'env-ack' }] },
+          { role: 'user', parts: [{ text: 'task: explore' }] },
+        ];
+        for (let r = 0; r < completePairs; r++) {
+          h.push({
+            role: 'model',
+            parts: [
+              { text: `round ${r}: ${FILLER}` },
+              { functionCall: { name: 'glob', args: { pattern: '**/*.md' } } },
+            ],
+          });
+          h.push({
+            role: 'user',
+            parts: [
+              {
+                functionResponse: { name: 'glob', response: { result: 'x' } },
+              },
+            ],
+          });
+        }
+        // Trailing model+fc whose response is about to be sent.
+        h.push({
+          role: 'model',
+          parts: [
+            { text: `round ${completePairs}: ${FILLER}` },
+            { functionCall: { name: 'glob', args: { pattern: '**/*.md' } } },
+          ],
+        });
+        return h;
+      };
+
+      // Five complete tool rounds + 1 trailing fc → 5 pairs in keep; absorbs
+      // 3 older pairs and retains the 2 most recent (plus the trailing fc).
+      vi.mocked(mockChat.getHistory).mockReturnValue(buildHistory(5));
+      vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(
+        80_000,
+      );
+      vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
+        model: 'gemini-pro',
+        contextWindowSize: 100_000,
+      } as unknown as ReturnType<typeof mockConfig.getContentGeneratorConfig>);
+
+      const mockGenerateContent = vi.fn().mockResolvedValue({
+        candidates: [
+          { content: { parts: [{ text: 'state snapshot summary' }] } },
+        ],
+        usageMetadata: {
+          promptTokenCount: 60_000,
+          candidatesTokenCount: 200,
+          totalTokenCount: 60_200,
+        },
+      } as unknown as GenerateContentResponse);
+      vi.mocked(mockConfig.getContentGenerator).mockReturnValue({
+        generateContent: mockGenerateContent,
+      } as unknown as ContentGenerator);
+
+      const result = await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: false,
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
+
+      expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
+      expect(result.newHistory).not.toBeNull();
+      expect(mockGenerateContent).toHaveBeenCalledTimes(1);
+
+      const newHistory = result.newHistory!;
+      // [summary_user, summary_ack_model, continuation_bridge_user, ...keep]
+      // where keep starts with the retained model+functionCall.
+      expect(newHistory[0].role).toBe('user');
+      expect(newHistory[0].parts?.[0].text).toBe('state snapshot summary');
+      expect(newHistory[1].role).toBe('model');
+      expect(newHistory[2].role).toBe('user');
+      expect(newHistory[2].parts?.[0].text).toMatch(/Continue/);
+      // Retained two complete pairs (4 entries) + trailing model+fc = 5.
+      expect(newHistory.slice(3)).toHaveLength(5);
+      expect(newHistory[3].role).toBe('model');
+      expect(newHistory[3].parts?.some((p) => p.functionCall)).toBe(true);
+      expect(newHistory[4].role).toBe('user');
+      expect(newHistory[4].parts?.some((p) => p.functionResponse)).toBe(true);
+      // Trailing model+fc remains last so the upcoming functionResponse pushed
+      // by sendMessageStream pairs with it correctly.
+      const last = newHistory[newHistory.length - 1];
+      expect(last.role).toBe('model');
+      expect(last.parts?.some((p) => p.functionCall)).toBe(true);
+
+      // Strict role alternation throughout the new history.
+      for (let i = 1; i < newHistory.length; i++) {
+        expect(newHistory[i].role).not.toBe(newHistory[i - 1].role);
+      }
+    });
+
+    it('NOOPs when the keep slice has too few tool rounds to absorb', async () => {
+      const FILLER = 'A'.repeat(20_000);
+      const history: Content[] = [
+        { role: 'user', parts: [{ text: 'env-bootstrap' }] },
+        { role: 'model', parts: [{ text: 'env-ack' }] },
+        { role: 'user', parts: [{ text: 'task' }] },
+        {
+          role: 'model',
+          parts: [
+            { text: FILLER },
+            { functionCall: { name: 'glob', args: {} } },
+          ],
+        },
+      ];
+      vi.mocked(mockChat.getHistory).mockReturnValue(history);
+      // Set originalTokenCount above the threshold gate (0.7 * 30000 = 21000)
+      // so the test actually exercises findCompressSplitPoint and the
+      // MIN_COMPRESSION_FRACTION decision rather than short-circuiting at
+      // the cheap-gate.
+      vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(
+        22_000,
+      );
+      vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
+        model: 'gemini-pro',
+        contextWindowSize: 30_000,
+      } as unknown as ReturnType<typeof mockConfig.getContentGeneratorConfig>);
+
       const mockGenerateContent = vi.fn();
       vi.mocked(mockConfig.getContentGenerator).mockReturnValue({
         generateContent: mockGenerateContent,
       } as unknown as ContentGenerator);
 
-      const result = await service.compress(
-        mockChat,
-        mockPromptId,
-        false, // force=false (auto-compress)
-        mockModel,
-        mockConfig,
-        false,
-      );
+      const result = await service.compress(mockChat, {
+        promptId: mockPromptId,
+        force: false,
+        model: mockModel,
+        config: mockConfig,
+        hasFailedCompressionAttempt: false,
+        originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+      });
 
-      // Must return NOOP — compressing would orphan the upcoming funcResponse
       expect(result.info.compressionStatus).toBe(CompressionStatus.NOOP);
-      expect(result.newHistory).toBeNull();
       expect(mockGenerateContent).not.toHaveBeenCalled();
     });
   });
diff --git a/packages/core/src/services/chatCompressionService.ts b/packages/core/src/services/chatCompressionService.ts
index 876d051ac..4e6b19bd3 100644
--- a/packages/core/src/services/chatCompressionService.ts
+++ b/packages/core/src/services/chatCompressionService.ts
@@ -8,7 +8,6 @@ import type { Content } from '@google/genai';
 import type { Config } from '../config/config.js';
 import type { GeminiChat } from '../core/geminiChat.js';
 import { type ChatCompressionInfo, CompressionStatus } from '../core/turn.js';
-import { uiTelemetryService } from '../telemetry/uiTelemetry.js';
 import { DEFAULT_TOKEN_LIMIT } from '../core/tokenLimits.js';
 import { getCompressionPrompt } from '../core/prompts.js';
 import { getResponseText } from '../utils/partUtils.js';
@@ -40,15 +39,79 @@ export const COMPRESSION_PRESERVE_THRESHOLD = 0.3;
  */
 export const MIN_COMPRESSION_FRACTION = 0.05;
 
+/**
+ * When the trailing entry is an in-flight `model+functionCall` and the regular
+ * scan finds no clean split past the target fraction, the splitter falls back
+ * to compressing everything except the last few entries. This constant sets
+ * how many most-recent complete `(model+functionCall, user+functionResponse)`
+ * tool rounds are retained as working context (the trailing in-flight call is
+ * always retained on top of these).
+ */
+export const TOOL_ROUND_RETAIN_COUNT = 2;
+
+const hasFunctionCall = (content: Content | undefined): boolean =>
+  !!content?.parts?.some((part) => !!part.functionCall);
+
+const hasFunctionResponse = (content: Content | undefined): boolean =>
+  !!content?.parts?.some((part) => !!part.functionResponse);
+
+/**
+ * Walk backward from the trailing in-flight `model+functionCall` and return
+ * the index after which the most-recent `retainCount` complete tool-round
+ * pairs sit (plus the trailing fc itself). Used by the splitter's in-flight
+ * fallback path. Stops counting at the first non-pair encountered, so the
+ * retain count is best-effort: if there are fewer complete pairs than
+ * requested, all of them are retained.
+ */
+function splitPointRetainingTrailingPairs(
+  contents: Content[],
+  retainCount: number,
+): number {
+  let pairsFound = 0;
+  let i = contents.length - 2;
+  while (i >= 1 && pairsFound < retainCount) {
+    if (hasFunctionCall(contents[i - 1]) && hasFunctionResponse(contents[i])) {
+      pairsFound += 1;
+      i -= 2;
+    } else {
+      break;
+    }
+  }
+  return contents.length - (2 * pairsFound + 1);
+}
+
 /**
  * Returns the index of the oldest item to keep when compressing. May return
  * contents.length which indicates that everything should be compressed.
  *
+ * The algorithm has two phases:
+ *
+ * 1. **Scan:** walk left-to-right looking for the first non-functionResponse
+ *    user message that lands past `fraction` of total chars. That's the
+ *    "clean" split — the kept slice starts with a fresh user prompt.
+ *
+ * 2. **Fallbacks** (no clean split found): the gate that gets us here has
+ *    already decided we need to compress, so all three fallbacks bias toward
+ *    *more* compression rather than less:
+ *
+ *    - last entry is `model` without functionCall → compress everything.
+ *    - last entry is `user` with functionResponse → compress everything (the
+ *      trailing tool round is complete; no orphans).
+ *    - last entry is `model` with functionCall (in-flight) → compress
+ *      everything except the trailing call plus the last `retainCount`
+ *      complete tool rounds. The kept slice may start with `model+fc`;
+ *      callers must inject a synthetic continuation user message between
+ *      `summary_ack_model` and the kept slice to preserve role alternation.
+ *
+ * The pre-fallback returns of `lastSplitPoint` (compress less) only happen
+ * for malformed histories that don't end in user/model.
+ *
  * Exported for testing purposes.
  */
 export function findCompressSplitPoint(
   contents: Content[],
   fraction: number,
+  retainCount = TOOL_ROUND_RETAIN_COUNT,
 ): number {
   if (fraction <= 0 || fraction >= 1) {
     throw new Error('Fraction must be between 0 and 1');
@@ -58,14 +121,11 @@ export function findCompressSplitPoint(
   const totalCharCount = charCounts.reduce((a, b) => a + b, 0);
   const targetCharCount = totalCharCount * fraction;
 
-  let lastSplitPoint = 0; // 0 is always valid (compress nothing)
+  let lastSplitPoint = 0;
   let cumulativeCharCount = 0;
   for (let i = 0; i < contents.length; i++) {
     const content = contents[i];
-    if (
-      content.role === 'user' &&
-      !content.parts?.some((part) => !!part.functionResponse)
-    ) {
+    if (content.role === 'user' && !hasFunctionResponse(content)) {
       if (cumulativeCharCount >= targetCharCount) {
         return i;
       }
@@ -74,48 +134,57 @@ export function findCompressSplitPoint(
     cumulativeCharCount += charCounts[i];
   }
 
-  // We found no split points after targetCharCount.
-  // Check if it's safe to compress everything.
   const lastContent = contents[contents.length - 1];
-  if (
-    lastContent?.role === 'model' &&
-    !lastContent?.parts?.some((part) => part.functionCall)
-  ) {
+  if (lastContent?.role === 'model') {
+    if (!hasFunctionCall(lastContent)) return contents.length;
+    return splitPointRetainingTrailingPairs(contents, retainCount);
+  }
+  if (lastContent?.role === 'user' && hasFunctionResponse(lastContent)) {
     return contents.length;
   }
-  // Also safe to compress everything if the last message completes a tool call
-  // sequence (all function calls have matching responses).
-  if (
-    lastContent?.role === 'user' &&
-    lastContent?.parts?.some((part) => !!part.functionResponse)
-  ) {
-    return contents.length;
-  }
-
   return lastSplitPoint;
 }
 
+export interface CompressOptions {
+  promptId: string;
+  force: boolean;
+  model: string;
+  config: Config;
+  /**
+   * Whether a previous unforced compression attempt failed for this chat.
+   * Suppresses auto-compaction; manual `/compress` (force=true) overrides.
+   */
+  hasFailedCompressionAttempt: boolean;
+  /**
+   * Most recent prompt token count for this chat. Compared against
+   * `threshold * contextWindowSize` for the auto-compaction gate. Callers
+   * source this from the per-chat counter (main session, subagents alike) —
+   * the service does not read or write any global telemetry.
+   */
+  originalTokenCount: number;
+  signal?: AbortSignal;
+}
+
 export class ChatCompressionService {
   async compress(
     chat: GeminiChat,
-    promptId: string,
-    force: boolean,
-    model: string,
-    config: Config,
-    hasFailedCompressionAttempt: boolean,
-    signal?: AbortSignal,
+    opts: CompressOptions,
   ): Promise<{ newHistory: Content[] | null; info: ChatCompressionInfo }> {
-    const curatedHistory = chat.getHistory(true);
+    const {
+      promptId,
+      force,
+      model,
+      config,
+      hasFailedCompressionAttempt,
+      originalTokenCount,
+      signal,
+    } = opts;
     const threshold =
       config.getChatCompression()?.contextPercentageThreshold ??
       COMPRESSION_TOKEN_THRESHOLD;
 
-    // Regardless of `force`, don't do anything if the history is empty.
-    if (
-      curatedHistory.length === 0 ||
-      threshold <= 0 ||
-      (hasFailedCompressionAttempt && !force)
-    ) {
+    // Cheap gates first — these don't need the curated history.
+    if (threshold <= 0 || (hasFailedCompressionAttempt && !force)) {
       return {
         newHistory: null,
         info: {
@@ -126,9 +195,9 @@ export class ChatCompressionService {
       };
     }
 
-    const originalTokenCount = uiTelemetryService.getLastPromptTokenCount();
-
-    // Don't compress if not forced and we are under the limit.
+    // Don't compress if not forced and we are under the limit. This is the
+    // steady-state path on every send; we want to exit before paying for the
+    // full `getHistory(true)` clone below.
     if (!force) {
       const contextLimit =
         config.getContentGeneratorConfig()?.contextWindowSize ??
@@ -145,6 +214,18 @@ export class ChatCompressionService {
       }
     }
 
+    const curatedHistory = chat.getHistory(true);
+    if (curatedHistory.length === 0) {
+      return {
+        newHistory: null,
+        info: {
+          originalTokenCount: 0,
+          newTokenCount: 0,
+          compressionStatus: CompressionStatus.NOOP,
+        },
+      };
+    }
+
     // Fire PreCompact hook before compression begins
     const hookSystem = config.getHookSystem();
     if (hookSystem) {
@@ -181,6 +262,10 @@ export class ChatCompressionService {
 
     const historyToCompress = historyForSplit.slice(0, splitPoint);
     const historyToKeep = historyForSplit.slice(splitPoint);
+    // The in-flight fallback path may produce a kept slice starting with
+    // model+functionCall; the post-summary history needs a synthetic user
+    // between the summary's model_ack and the kept entries.
+    const keepNeedsContinuationBridge = historyToKeep[0]?.role === 'model';
 
     if (historyToCompress.length === 0) {
       return {
@@ -196,10 +281,6 @@ export class ChatCompressionService {
     // Guard: if historyToCompress is too small relative to the total history,
     // skip compression. This prevents futile API calls where the model receives
     // almost no context and generates a useless "summary" that inflates tokens.
-    //
-    // Note: findCompressSplitPoint already computes charCounts internally but
-    // returns only the split index. We intentionally recompute here to keep
-    // the function signature simple; this is a minor, acceptable duplication.
     const compressCharCount = historyToCompress.reduce(
       (sum, c) => sum + JSON.stringify(c).length,
       0,
@@ -274,6 +355,22 @@ export class ChatCompressionService {
           role: 'model',
           parts: [{ text: 'Got it. Thanks for the additional context!' }],
         },
+        // When the kept slice starts with model+functionCall (because
+        // tool-round absorption pulled the only fresh user message into
+        // compress), inject a synthetic continuation prompt so the joined
+        // history alternates correctly.
+        ...(keepNeedsContinuationBridge
+          ? [
+              {
+                role: 'user' as const,
+                parts: [
+                  {
+                    text: 'Continue with the prior task using the context above.',
+                  },
+                ],
+              },
+            ]
+          : []),
         ...historyToKeep,
       ];
 
@@ -339,8 +436,6 @@ export class ChatCompressionService {
         },
       };
     } else {
-      uiTelemetryService.setLastPromptTokenCount(newTokenCount);
-
       // Fire SessionStart event after successful compression
       try {
         const permissionMode = String(