mirror of
https://github.com/QwenLM/qwen-code.git
synced 2026-04-30 12:40:44 +00:00
telemetry: track cached content tokens for accurate context calculation
- Add cachedContentTokenCount tracking in uiTelemetry service
- Collect cached_content_token_count from streaming usage metadata
- Use cached tokens instead of estimated overhead when available
- Fix messages token calculation to avoid 'messages = 0' issue

This improves context window display accuracy when using providers that support prefix caching (e.g., DashScope).

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
This commit is contained in:
parent
d4379d6ee6
commit
3bfe34a1dc
3 changed files with 32 additions and 2 deletions
|
|
@@ -120,6 +120,10 @@ export const contextCommand: SlashCommand = {
|
|||
|
||||
// Total prompt token count from API (most accurate)
|
||||
const apiTotalTokens = uiTelemetryService.getLastPromptTokenCount();
|
||||
// Cached content token count — when available (e.g. DashScope prefix caching),
|
||||
// represents the cached overhead (system prompt + tools). Using this gives a much
|
||||
// more accurate "Messages" count: promptTokens - cachedTokens = actual history tokens.
|
||||
const apiCachedTokens = uiTelemetryService.getLastCachedContentTokenCount();
|
||||
|
||||
// 1. System prompt tokens (without memory, as memory is counted separately)
|
||||
const systemPromptText = getCoreSystemPrompt(undefined, modelName);
|
||||
|
|
@@ -302,7 +306,16 @@ export const contextCommand: SlashCommand = {
|
|||
scaledAllTools +
|
||||
displayMemoryFiles +
|
||||
Math.round(loadedBodiesTokens * overheadScale);
|
||||
messagesTokens = Math.max(0, totalTokens - scaledOverhead);
|
||||
|
||||
// When the API reports cached content tokens (e.g. DashScope prefix caching),
|
||||
// use them as the actual overhead indicator for a more accurate messages count.
|
||||
// cachedTokens ≈ system prompt + tools tokens actually served from cache.
|
||||
// This avoids the "messages = 0" problem caused by estimation overshoot.
|
||||
if (apiCachedTokens > 0) {
|
||||
messagesTokens = Math.max(0, totalTokens - apiCachedTokens);
|
||||
} else {
|
||||
messagesTokens = Math.max(0, totalTokens - scaledOverhead);
|
||||
}
|
||||
|
||||
freeSpace = Math.max(
|
||||
0,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue