telemetry: track cached content tokens for accurate context calculation

- Add cachedContentTokenCount tracking in uiTelemetry service
- Collect cached_content_token_count from streaming usage metadata
- Use cached tokens instead of estimated overhead when available
- Fix messages token calculation to avoid 'messages = 0' issue

This improves context window display accuracy when using providers
that support prefix caching (e.g., DashScope).

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
pomelo-nwu 2026-03-18 17:51:50 +08:00
parent d4379d6ee6
commit 3bfe34a1dc
3 changed files with 32 additions and 2 deletions
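
For context, a minimal sketch of the telemetry side of this change. The getter names (getLastPromptTokenCount, getLastCachedContentTokenCount) appear in the diff below; the recordUsageMetadata entry point and the snake_case field names are assumptions about how the service might collect the streaming usage metadata, not the actual implementation.

// Sketch only: hypothetical shape of the uiTelemetry change.
// getLastPromptTokenCount / getLastCachedContentTokenCount match the diff;
// recordUsageMetadata and the field names are assumptions.
class UiTelemetryService {
  private lastPromptTokenCount = 0;
  private lastCachedContentTokenCount = 0;

  // Called with each streaming chunk's usage metadata.
  recordUsageMetadata(usage: {
    prompt_token_count?: number;
    cached_content_token_count?: number;
  }): void {
    if (usage.prompt_token_count !== undefined) {
      this.lastPromptTokenCount = usage.prompt_token_count;
    }
    if (usage.cached_content_token_count !== undefined) {
      this.lastCachedContentTokenCount = usage.cached_content_token_count;
    }
  }

  getLastPromptTokenCount(): number {
    return this.lastPromptTokenCount;
  }

  getLastCachedContentTokenCount(): number {
    return this.lastCachedContentTokenCount;
  }
}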

@@ -120,6 +120,10 @@ export const contextCommand: SlashCommand = {
   // Total prompt token count from API (most accurate)
   const apiTotalTokens = uiTelemetryService.getLastPromptTokenCount();
+  // Cached content token count — when available (e.g. DashScope prefix caching),
+  // represents the cached overhead (system prompt + tools). Using this gives a much
+  // more accurate "Messages" count: promptTokens - cachedTokens = actual history tokens.
+  const apiCachedTokens = uiTelemetryService.getLastCachedContentTokenCount();
   // 1. System prompt tokens (without memory, as memory is counted separately)
   const systemPromptText = getCoreSystemPrompt(undefined, modelName);
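
To make the formula in that comment concrete, a worked example with made-up numbers:

// Hypothetical numbers, purely illustrative. With DashScope-style prefix
// caching, the provider reports how many prompt tokens were served from
// cache (system prompt + tool declarations), so the remainder is history.
const promptTokens = 24_000; // apiTotalTokens from usage metadata
const cachedTokens = 18_500; // apiCachedTokens, provider-reported
const historyTokens = promptTokens - cachedTokens; // 5,500 "Messages" tokens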
@@ -302,7 +306,16 @@ export const contextCommand: SlashCommand = {
       scaledAllTools +
       displayMemoryFiles +
       Math.round(loadedBodiesTokens * overheadScale);
-    messagesTokens = Math.max(0, totalTokens - scaledOverhead);
+    // When the API reports cached content tokens (e.g. DashScope prefix caching),
+    // use them as the actual overhead indicator for a more accurate messages count.
+    // cachedTokens ≈ system prompt + tools tokens actually served from cache.
+    // This avoids the "messages = 0" problem caused by estimation overshoot.
+    if (apiCachedTokens > 0) {
+      messagesTokens = Math.max(0, totalTokens - apiCachedTokens);
+    } else {
+      messagesTokens = Math.max(0, totalTokens - scaledOverhead);
+    }
     freeSpace = Math.max(
       0,
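
The new branch boils down to choosing the overhead source. A sketch of the same logic as a pure function (the command computes this inline):

// Prefer provider-reported cached tokens over the local estimate when present.
function computeMessagesTokens(
  totalTokens: number,
  apiCachedTokens: number,
  scaledOverhead: number,
): number {
  const overhead = apiCachedTokens > 0 ? apiCachedTokens : scaledOverhead;
  return Math.max(0, totalTokens - overhead);
}

Guarding on apiCachedTokens > 0 keeps providers without prefix caching on the estimated-overhead path, so their context display is unchanged.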