telemetry: track cached content tokens for accurate context calculation

- Add cachedContentTokenCount tracking in uiTelemetry service
- Collect cached_content_token_count from streaming usage metadata
- Use cached tokens instead of estimated overhead when available
- Fix messages token calculation to avoid 'messages = 0' issue

This improves context window display accuracy when using providers
that support prefix caching (e.g., DashScope).

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
pomelo-nwu 2026-03-18 17:51:50 +08:00
parent d4379d6ee6
commit 3bfe34a1dc
3 changed files with 32 additions and 2 deletions
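
For context, a minimal sketch of the telemetry side of this change. The getter names (getLastPromptTokenCount, getLastCachedContentTokenCount) appear in the diff below; the recordUsageMetadata entry point and the snake_case field names are assumptions about how the service might collect the streaming usage metadata, not the actual implementation.

// Sketch only: hypothetical shape of the uiTelemetry change.
// getLastPromptTokenCount / getLastCachedContentTokenCount match the diff;
// recordUsageMetadata and the field names are assumptions.
class UiTelemetryService {
  private lastPromptTokenCount = 0;
  private lastCachedContentTokenCount = 0;

  // Called with each streaming chunk's usage metadata.
  recordUsageMetadata(usage: {
    prompt_token_count?: number;
    cached_content_token_count?: number;
  }): void {
    if (usage.prompt_token_count !== undefined) {
      this.lastPromptTokenCount = usage.prompt_token_count;
    }
    if (usage.cached_content_token_count !== undefined) {
      this.lastCachedContentTokenCount = usage.cached_content_token_count;
    }
  }

  getLastPromptTokenCount(): number {
    return this.lastPromptTokenCount;
  }

  getLastCachedContentTokenCount(): number {
    return this.lastCachedContentTokenCount;
  }
}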

@@ -120,6 +120,10 @@ export const contextCommand: SlashCommand = {
   // Total prompt token count from API (most accurate)
   const apiTotalTokens = uiTelemetryService.getLastPromptTokenCount();
+  // Cached content token count — when available (e.g. DashScope prefix caching),
+  // represents the cached overhead (system prompt + tools). Using this gives a much
+  // more accurate "Messages" count: promptTokens - cachedTokens = actual history tokens.
+  const apiCachedTokens = uiTelemetryService.getLastCachedContentTokenCount();
   // 1. System prompt tokens (without memory, as memory is counted separately)
   const systemPromptText = getCoreSystemPrompt(undefined, modelName);
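
To make the formula in that comment concrete, a worked example with made-up numbers:

// Hypothetical numbers, purely illustrative. With DashScope-style prefix
// caching, the provider reports how many prompt tokens were served from
// cache (system prompt + tool declarations), so the remainder is history.
const promptTokens = 24_000; // apiTotalTokens from usage metadata
const cachedTokens = 18_500; // apiCachedTokens, provider-reported
const historyTokens = promptTokens - cachedTokens; // 5,500 "Messages" tokens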
@@ -302,7 +306,16 @@ export const contextCommand: SlashCommand = {
       scaledAllTools +
       displayMemoryFiles +
       Math.round(loadedBodiesTokens * overheadScale);
-    messagesTokens = Math.max(0, totalTokens - scaledOverhead);
+    // When the API reports cached content tokens (e.g. DashScope prefix caching),
+    // use them as the actual overhead indicator for a more accurate messages count.
+    // cachedTokens ≈ system prompt + tools tokens actually served from cache.
+    // This avoids the "messages = 0" problem caused by estimation overshoot.
+    if (apiCachedTokens > 0) {
+      messagesTokens = Math.max(0, totalTokens - apiCachedTokens);
+    } else {
+      messagesTokens = Math.max(0, totalTokens - scaledOverhead);
+    }
     freeSpace = Math.max(
       0,
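
The new branch boils down to choosing the overhead source. A sketch of the same logic as a pure function (the command computes this inline):

// Prefer provider-reported cached tokens over the local estimate when present.
function computeMessagesTokens(
  totalTokens: number,
  apiCachedTokens: number,
  scaledOverhead: number,
): number {
  const overhead = apiCachedTokens > 0 ? apiCachedTokens : scaledOverhead;
  return Math.max(0, totalTokens - overhead);
}

Guarding on apiCachedTokens > 0 keeps providers without prefix caching on the estimated-overhead path, so their context display is unchanged.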