feat(cli): display real-time token consumption during streaming (#2742) (#3329)

* feat(cli): display real-time token consumption during streaming (#2742)

Show ↓/↑ token count in the spinner during model execution:
- ↓ when receiving content, ↑ when waiting for API response (sketched below)
- Accumulates across the whole turn (tool calls don't reset)
- Includes agent/subagent token consumption
- Uses useAnimationFrame hook (50ms polling) to avoid flickering
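
A rough sketch of the display logic (illustrative only, not the literal
implementation; spinnerTokenText is a hypothetical helper, and the real
rendering in LoadingIndicator uses formatTokenCount):

    // Estimate tokens from accumulated output characters (~4 chars per token)
    // and pick the direction arrow from the receiving/waiting state.
    function spinnerTokenText(
      streamedChars: number, // output chars accumulated this turn
      isReceivingContent: boolean, // true while content deltas arrive
    ): string {
      const tokens = Math.round(streamedChars / 4);
      if (tokens === 0) return '';
      const arrow = isReceivingContent ? '↓' : '↑';
      return ` · ${arrow} ${tokens} tokens`;
    }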

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>

* fix: address review feedback for real-time token display

- Replace unsafe type assertion with proper type guard in Composer
- Fix license header in useAnimationFrame.ts to match project standard
- Clarify tokenCount is replaced (not accumulated) per USAGE_METADATA event
- Use multi-line JSDoc format for isReceivingContent prop
- Improve re-sync comment in useAnimationFrame hook
- Revert unrelated streamingState dep change in AppContainer

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>

* fix(core): use output-only tokens and accumulate across subagent rounds

Subagent token display had two bugs:
- Used totalTokenCount (input+output) instead of candidatesTokenCount
  (output-only), causing mixed units when aggregated with main stream
- Overwrote tokenCount per round instead of accumulating, so multi-round
  subagents only showed the last round's count (see the sketch below)
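
In usage-metadata terms the fix amounts to this sketch (field names follow
the Gemini usage metadata; the surrounding wiring is hypothetical):

    // Before: tokenCount = usageMetadata.totalTokenCount  (input+output, overwritten per round)
    // After: accumulate output-only tokens across subagent rounds
    tokenCount += usageMetadata?.candidatesTokenCount ?? 0;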

Co-Authored-By: Qwen-Coder <noreply@qwen.ai>

* fix(cli): smooth token counter animation and include tool args

Interpolate displayed token count toward the real value (3/frame for
small gaps, ~20% for medium, 50 for large) so chunked arrivals like
tool-call args no longer cause visible jumps. Also accumulate tool
call args JSON length into the streaming estimate, matching Claude
Code's input_json_delta handling.
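
For example, a displayed count trailing the real value by 150 tokens falls in
the medium band and advances by round(150 × 0.2) = 30 per frame; a 500-token
gap advances 50 per frame, while a gap of 12 crawls up 3 at a time, so the
counter converges over a few frames instead of jumping.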

Co-Authored-By: Qwen-Coder <noreply@alibabacloud.com>

* fix(cli): scope token animation re-renders to LoadingIndicator

The 50ms useAnimationFrame poll lived in Composer, causing its entire
subtree (InputPrompt, Footer, KeyboardShortcuts) to reconcile 20×/sec
during streaming. Combined with the spinner and streamed text deltas,
ink redrew enough lines to produce visible terminal flicker.

Move the animation hook into LoadingIndicator so only that component
re-renders per frame, and slow polling to 100ms to match the spinner
cadence.

Co-Authored-By: Qwen-Coder <noreply@alibabacloud.com>

* fix: address review nits on token display

1. AgentResultDisplay.tokenCount jsdoc said "(input + output)" but the
   value has been output-only since d393f23df — update the comment so it
   matches the implementation.
2. useAnimationFrame held the previous turn's count in state until the
   next interval tick, briefly flashing stale numbers when a new turn
   reset the ref to 0. Snap displayRef down synchronously on render and
   return Math.min(displayValue, ref.current) so the reset is reflected
   immediately; the interval tick still catches state up afterward.

Co-Authored-By: Qwen-Coder <noreply@alibabacloud.com>

---------

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
Co-authored-by: Qwen-Coder <noreply@qwen.ai>
Co-authored-by: Qwen-Coder <noreply@alibabacloud.com>
Commit c25136f0ef (parent 07bd5c41cb), authored by qqqys on 2026-04-21 17:01:40 +08:00 and committed by GitHub (GPG key ID B5690EEEBB952194; no known key found for this signature in the database).
10 changed files with 255 additions and 13 deletions


@@ -761,6 +761,8 @@ export const AppContainer = (props: AppContainerProps) => {
activePtyId,
loopDetectionConfirmationRequest,
pendingToolCalls,
streamingResponseLengthRef,
isReceivingContent,
} = useGeminiStream(
config.getGeminiClient(),
historyManager.history,
@@ -2117,6 +2119,9 @@ export const AppContainer = (props: AppContainerProps) => {
isFeedbackDialogOpen,
// Per-task token tracking
taskStartTokens,
// Real-time token display
streamingResponseLengthRef,
isReceivingContent,
// Prompt suggestion
promptSuggestion,
dismissPromptSuggestion,
@@ -2225,6 +2230,9 @@ export const AppContainer = (props: AppContainerProps) => {
isFeedbackDialogOpen,
// Per-task token tracking
taskStartTokens,
// Real-time token display
streamingResponseLengthRef,
isReceivingContent,
// Prompt suggestion
promptSuggestion,
dismissPromptSuggestion,


@@ -112,6 +112,9 @@ const createMockUIState = (overrides: Partial<UIState> = {}): UIState =>
nightly: false,
isTrustedFolder: true,
taskStartTokens: 0,
streamingResponseLengthRef: { current: 0 },
isReceivingContent: false,
pendingGeminiHistoryItems: [],
...overrides,
}) as UIState;


@@ -15,7 +15,7 @@ import { useUIState } from '../contexts/UIStateContext.js';
import { useUIActions } from '../contexts/UIActionsContext.js';
import { useVimMode } from '../contexts/VimModeContext.js';
import { useConfig } from '../contexts/ConfigContext.js';
import { StreamingState } from '../types.js';
import { StreamingState, type HistoryItemToolGroup } from '../types.js';
import { ConfigInitDisplay } from '../components/ConfigInitDisplay.js';
import { FeedbackDialog } from '../FeedbackDialog.js';
import { t } from '../../i18n/index.js';
@@ -27,17 +27,40 @@ export const Composer = () => {
const uiActions = useUIActions();
const { vimEnabled } = useVimMode();
const { showAutoAcceptIndicator, sessionStats, taskStartTokens } = uiState;
const {
showAutoAcceptIndicator,
streamingResponseLengthRef,
isReceivingContent,
} = uiState;
const tokens = Object.values(sessionStats.metrics?.models ?? {}).reduce(
(acc, model) => ({
prompt: acc.prompt + (model.tokens?.prompt ?? 0),
candidates: acc.candidates + (model.tokens?.candidates ?? 0),
}),
{ prompt: 0, candidates: 0 },
);
// Real-time token animation is performed inside LoadingIndicator itself, so
// the 100ms polling only re-renders that one component — keeping InputPrompt
// and Footer static avoids terminal flicker during streaming.
const isStreaming =
uiState.streamingState === StreamingState.Responding ||
uiState.streamingState === StreamingState.WaitingForConfirmation;
const taskTokens = tokens.candidates - taskStartTokens;
// Aggregate agent tool tokens from executing tool calls. Only changes when
// a subagent reports progress, so it doesn't drive the animation loop.
let agentTokens = 0;
for (const item of uiState.pendingGeminiHistoryItems ?? []) {
if (item.type === 'tool_group') {
const toolGroup = item as HistoryItemToolGroup;
for (const tool of toolGroup.tools) {
const display = tool.resultDisplay;
if (
typeof display === 'object' &&
display !== null &&
'type' in display &&
display.type === 'task_execution' &&
'tokenCount' in display &&
typeof display.tokenCount === 'number'
) {
agentTokens += display.tokenCount;
}
}
}
}
// State for keyboard shortcuts display toggle
const [showShortcuts, setShowShortcuts] = useState(false);
@@ -74,7 +97,10 @@ export const Composer = () => {
: uiState.currentLoadingPhrase
}
elapsedTime={uiState.elapsedTime}
candidatesTokens={taskTokens}
candidatesTokens={agentTokens}
streamingCharsRef={streamingResponseLengthRef}
isStreaming={isStreaming}
isReceivingContent={isReceivingContent}
/>
)}


@@ -374,5 +374,29 @@ describe('<LoadingIndicator />', () => {
const output = lastFrame();
expect(output).toContain('(5s · ↓ 5.4k tokens · esc to cancel)');
});
it('should show ↑ arrow when waiting for API response', () => {
const { lastFrame } = renderWithContext(
<LoadingIndicator
{...defaultProps}
candidatesTokens={500}
isReceivingContent={false}
/>,
StreamingState.Responding,
);
const output = lastFrame();
expect(output).toContain('↑ 500 tokens');
expect(output).not.toContain('↓');
});
it('should show ↓ arrow when receiving content (default)', () => {
const { lastFrame } = renderWithContext(
<LoadingIndicator {...defaultProps} candidatesTokens={500} />,
StreamingState.Responding,
);
const output = lastFrame();
expect(output).toContain('↓ 500 tokens');
expect(output).not.toContain('↑');
});
});
});


@@ -6,6 +6,7 @@
import type { ThoughtSummary } from '@qwen-code/qwen-code-core';
import type React from 'react';
import { useRef } from 'react';
import { Box, Text } from 'ink';
import { theme } from '../semantic-colors.js';
import { useStreamingContext } from '../contexts/StreamingContext.js';
@@ -13,6 +14,7 @@ import { StreamingState } from '../types.js';
import { GeminiRespondingSpinner } from './GeminiRespondingSpinner.js';
import { formatDuration, formatTokenCount } from '../utils/formatters.js';
import { useTerminalSize } from '../hooks/useTerminalSize.js';
import { useAnimationFrame } from '../hooks/useAnimationFrame.js';
import { isNarrowWidth } from '../utils/isNarrowWidth.js';
import { t } from '../../i18n/index.js';
@@ -22,6 +24,21 @@ interface LoadingIndicatorProps {
rightContent?: React.ReactNode;
thought?: ThoughtSummary | null;
candidatesTokens?: number;
/**
* Live-updating character counter for the streaming response. When provided
* together with `isStreaming`, the indicator animates a token estimate
* (chars / 4) internally, so the animation never re-renders `Composer` or
* the input prompt.
*/
streamingCharsRef?: React.RefObject<number>;
/** Whether to poll `streamingCharsRef` (true during Responding/WaitingForConfirmation). */
isStreaming?: boolean;
/**
* True when receiving content (shows ↓ arrow), false when waiting for API
* response (shows ↑ arrow).
* @default true
*/
isReceivingContent?: boolean;
}
export const LoadingIndicator: React.FC<LoadingIndicatorProps> = ({
@@ -30,25 +47,40 @@ export const LoadingIndicator: React.FC<LoadingIndicatorProps> = ({
rightContent,
thought,
candidatesTokens,
streamingCharsRef,
isStreaming,
isReceivingContent = true,
}) => {
const streamingState = useStreamingContext();
const { columns: terminalWidth } = useTerminalSize();
const isNarrow = isNarrowWidth(terminalWidth);
// Animate the streaming-chars counter locally so only this component
// re-renders on each animation frame (100ms ≈ spinner cadence). Siblings
// like InputPrompt / Footer stay static, which eliminates terminal flicker
// during streaming output.
const fallbackRef = useRef(0);
const animatedChars = useAnimationFrame(
streamingCharsRef ?? fallbackRef,
streamingCharsRef && isStreaming ? 100 : null,
);
if (streamingState === StreamingState.Idle) {
return null;
}
const primaryText = thought?.subject || currentLoadingPhrase;
const outputTokens = candidatesTokens ?? 0;
const streamingTokens = streamingCharsRef ? Math.round(animatedChars / 4) : 0;
const outputTokens = (candidatesTokens ?? 0) + streamingTokens;
const showTokens = !isNarrow && outputTokens > 0;
const tokenArrow = isReceivingContent ? '↓' : '↑';
const timeStr =
elapsedTime < 60 ? `${elapsedTime}s` : formatDuration(elapsedTime * 1000);
const tokenStr = showTokens
? ` · ${formatTokenCount(outputTokens)} tokens`
? ` · ${tokenArrow} ${formatTokenCount(outputTokens)} tokens`
: '';
const cancelAndTimerContent =


@@ -144,6 +144,10 @@ export interface UIState {
isFeedbackDialogOpen: boolean;
// Per-task token tracking
taskStartTokens: number;
// Real-time token display: ref to streaming output char length (polled, not state)
streamingResponseLengthRef: React.RefObject<number>;
// True = receiving content (↓), false = waiting for API response (↑)
isReceivingContent: boolean;
// Prompt suggestion
promptSuggestion: string | null;
/** Dismiss prompt suggestion (clears state, aborts speculation) */


@@ -0,0 +1,96 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { useEffect, useRef, useState } from 'react';
/**
* Hook that polls a ref at a fixed interval and smoothly animates the
* displayed value toward the real value. This avoids jarring jumps when
* large chunks of characters arrive at once (e.g. tool call args JSON).
*
* Animation rules (matching Claude Code's SpinnerAnimationRow):
* - Gap < 70: increment by 3 per frame
* - Gap 70–200: increment by ~20% of gap per frame
* - Gap > 200: increment by 50 per frame
*
* When the real value decreases (e.g. ref reset to 0), the displayed
* value snaps immediately; animation only applies to increases.
*
* Pass `null` as intervalMs to pause polling entirely.
*
* @param watchRef - The ref to poll for changes.
* @param intervalMs - How often to check (ms), or null to pause.
* @returns The smoothly animated value.
*/
export function useAnimationFrame(
watchRef: React.RefObject<number>,
intervalMs: number | null = 50,
): number {
const [displayValue, setDisplayValue] = useState(() => watchRef.current);
const displayRef = useRef(watchRef.current);
const targetRef = useRef(watchRef.current);
// Snap down synchronously on render when the external ref drops below the
// last displayed value (e.g. ref reset to 0 at the start of a new turn).
// Without this, the previous turn's count would briefly flash before the
// next interval tick fires. Idempotent under StrictMode double-render.
const currentTarget = watchRef.current;
if (currentTarget < displayRef.current) {
displayRef.current = currentTarget;
targetRef.current = currentTarget;
}
useEffect(() => {
if (intervalMs === null) return;
// Re-sync when the interval resumes or the ref changed externally
// (e.g. ref reset to 0 at new turn start while paused).
const current = watchRef.current;
if (current !== targetRef.current) {
targetRef.current = current;
// Snap down immediately (reset), animate up
if (current < displayRef.current) {
displayRef.current = current;
setDisplayValue(current);
}
}
const id = setInterval(() => {
const realValue = watchRef.current;
targetRef.current = realValue;
// Snap down immediately on reset
if (realValue < displayRef.current) {
displayRef.current = realValue;
setDisplayValue(realValue);
return;
}
const gap = realValue - displayRef.current;
if (gap <= 0) return;
// Smooth interpolation: small gaps crawl, large gaps leap
let increment: number;
if (gap < 70) {
increment = 3;
} else if (gap <= 200) {
increment = Math.max(3, Math.round(gap * 0.2));
} else {
increment = 50;
}
const next = Math.min(displayRef.current + increment, realValue);
displayRef.current = next;
setDisplayValue(next);
}, intervalMs);
return () => clearInterval(id);
}, [watchRef, intervalMs]);
// Return the lower of state vs current ref so a freshly reset ref is
// reflected immediately, before setDisplayValue catches up next tick.
return Math.min(displayValue, currentTarget);
}
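
A minimal consumer sketch for the new hook (charsRef, isStreaming, and
estimatedTokens are hypothetical names; the real wiring lives in
LoadingIndicator above):

    // Inside a component: poll charsRef every 100ms while streaming;
    // pass null as the interval to pause polling entirely.
    const charsRef = useRef(0);
    const animatedChars = useAnimationFrame(charsRef, isStreaming ? 100 : null);
    const estimatedTokens = Math.round(animatedChars / 4);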


@@ -241,6 +241,12 @@ export const useGeminiStream = (
const processedMemoryToolsRef = useRef<Set<string>>(new Set());
const submitPromptOnCompleteRef = useRef<(() => Promise<void>) | null>(null);
const modelOverrideRef = useRef<string | undefined>(undefined);
// --- Real-time token display ---
// Accumulates output character count across the whole turn (not per API call).
// Uses a ref to avoid re-renders on every text_delta.
const streamingResponseLengthRef = useRef(0);
// Tracks whether we are receiving content (↓) or waiting for API (↑).
const [isReceivingContent, setIsReceivingContent] = useState(false);
const {
startNewPrompt,
getPromptCount,
@@ -671,6 +677,9 @@ export const useGeminiStream = (
// Prevents additional output after a user initiated cancel.
return '';
}
// Track output chars for real-time token estimation & mark as receiving.
streamingResponseLengthRef.current += eventValue.length;
setIsReceivingContent(true);
let newGeminiMessageBuffer = currentGeminiMessageBuffer + eventValue;
if (
pendingHistoryItemRef.current?.type !== 'gemini' &&
@@ -1138,6 +1147,14 @@ export const useGeminiStream = (
break;
case ServerGeminiEventType.ToolCallRequest:
toolCallRequests.push(event.value);
// Count tool call args JSON toward token estimation (matches
// Claude Code's input_json_delta handling).
try {
const argsJson = JSON.stringify(event.value.args);
streamingResponseLengthRef.current += argsJson.length;
} catch {
// Best-effort — don't block on serialization errors
}
break;
case ServerGeminiEventType.UserCancelled:
handleUserCancelledEvent(userMessageTimestamp);
@@ -1386,6 +1403,13 @@ export const useGeminiStream = (
setIsResponding(true);
setInitError(null);
// Entering "requesting" phase — no content yet for this API call.
setIsReceivingContent(false);
// Reset char counter only on new user queries; tool-result continuations
// keep accumulating so the token count only goes up within a turn.
if (submitType !== SendMessageType.ToolResult) {
streamingResponseLengthRef.current = 0;
}
try {
// Emit user message to dual output sidecar (if enabled).
@@ -1977,5 +2001,7 @@ export const useGeminiStream = (
handleApprovalModeChange,
activePtyId,
loopDetectionConfirmationRequest,
streamingResponseLengthRef,
isReceivingContent,
};
};