feat(cli): display real-time token consumption during streaming (#2742) (#3329)

* feat(cli): display real-time token consumption during streaming (#2742)

Show ↓/↑ token count in the spinner during model execution:
- ↓ when receiving content, ↑ when waiting for API response (sketched below)
- Accumulates across the whole turn (tool calls don't reset)
- Includes agent/subagent token consumption
- Uses useAnimationFrame hook (50ms polling) to avoid flickering
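
A rough sketch of the display logic (illustrative only, not the literal
implementation; spinnerTokenText is a hypothetical helper, and the real
rendering in LoadingIndicator uses formatTokenCount):

    // Estimate tokens from accumulated output characters (~4 chars per token)
    // and pick the direction arrow from the receiving/waiting state.
    function spinnerTokenText(
      streamedChars: number, // output chars accumulated this turn
      isReceivingContent: boolean, // true while content deltas arrive
    ): string {
      const tokens = Math.round(streamedChars / 4);
      if (tokens === 0) return '';
      const arrow = isReceivingContent ? '↓' : '↑';
      return ` · ${arrow} ${tokens} tokens`;
    }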

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>

* fix: address review feedback for real-time token display

- Replace unsafe type assertion with proper type guard in Composer
- Fix license header in useAnimationFrame.ts to match project standard
- Clarify tokenCount is replaced (not accumulated) per USAGE_METADATA event
- Use multi-line JSDoc format for isReceivingContent prop
- Improve re-sync comment in useAnimationFrame hook
- Revert unrelated streamingState dep change in AppContainer

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>

* fix(core): use output-only tokens and accumulate across subagent rounds

Subagent token display had two bugs:
- Used totalTokenCount (input+output) instead of candidatesTokenCount
  (output-only), causing mixed units when aggregated with main stream
- Overwrote tokenCount per round instead of accumulating, so multi-round
  subagents only showed the last round's count (see the sketch below)
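
In usage-metadata terms the fix amounts to this sketch (field names follow
the Gemini usage metadata; the surrounding wiring is hypothetical):

    // Before: tokenCount = usageMetadata.totalTokenCount  (input+output, overwritten per round)
    // After: accumulate output-only tokens across subagent rounds
    tokenCount += usageMetadata?.candidatesTokenCount ?? 0;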

Co-Authored-By: Qwen-Coder <noreply@qwen.ai>

* fix(cli): smooth token counter animation and include tool args

Interpolate displayed token count toward the real value (3/frame for
small gaps, ~20% for medium, 50 for large) so chunked arrivals like
tool-call args no longer cause visible jumps. Also accumulate tool
call args JSON length into the streaming estimate, matching Claude
Code's input_json_delta handling.
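
For example, a displayed count trailing the real value by 150 tokens falls in
the medium band and advances by round(150 × 0.2) = 30 per frame; a 500-token
gap advances 50 per frame, while a gap of 12 crawls up 3 at a time, so the
counter converges over a few frames instead of jumping.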

Co-Authored-By: Qwen-Coder <noreply@alibabacloud.com>

* fix(cli): scope token animation re-renders to LoadingIndicator

The 50ms useAnimationFrame poll lived in Composer, causing its entire
subtree (InputPrompt, Footer, KeyboardShortcuts) to reconcile 20×/sec
during streaming. Combined with the spinner and streamed text deltas,
ink redrew enough lines to produce visible terminal flicker.

Move the animation hook into LoadingIndicator so only that component
re-renders per frame, and slow polling to 100ms to match the spinner
cadence.

Co-Authored-By: Qwen-Coder <noreply@alibabacloud.com>

* fix: address review nits on token display

1. AgentResultDisplay.tokenCount jsdoc said "(input + output)" but the
   value has been output-only since d393f23df — update the comment so it
   matches the implementation.
2. useAnimationFrame held the previous turn's count in state until the
   next interval tick, briefly flashing stale numbers when a new turn
   reset the ref to 0. Snap displayRef down synchronously on render and
   return Math.min(displayValue, ref.current) so the reset is reflected
   immediately; the interval tick still catches state up afterward.

Co-Authored-By: Qwen-Coder <noreply@alibabacloud.com>

---------

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
Co-authored-by: Qwen-Coder <noreply@qwen.ai>
Co-authored-by: Qwen-Coder <noreply@alibabacloud.com>
Commit c25136f0ef (parent 07bd5c41cb), authored by qqqys on 2026-04-21 17:01:40 +08:00 and committed by GitHub (GPG key ID B5690EEEBB952194; no known key found for this signature in the database).
10 changed files with 255 additions and 13 deletions


@@ -761,6 +761,8 @@ export const AppContainer = (props: AppContainerProps) => {
activePtyId,
loopDetectionConfirmationRequest,
pendingToolCalls,
streamingResponseLengthRef,
isReceivingContent,
} = useGeminiStream(
config.getGeminiClient(),
historyManager.history,
@@ -2117,6 +2119,9 @@ export const AppContainer = (props: AppContainerProps) => {
isFeedbackDialogOpen,
// Per-task token tracking
taskStartTokens,
// Real-time token display
streamingResponseLengthRef,
isReceivingContent,
// Prompt suggestion
promptSuggestion,
dismissPromptSuggestion,
@@ -2225,6 +2230,9 @@ export const AppContainer = (props: AppContainerProps) => {
isFeedbackDialogOpen,
// Per-task token tracking
taskStartTokens,
// Real-time token display
streamingResponseLengthRef,
isReceivingContent,
// Prompt suggestion
promptSuggestion,
dismissPromptSuggestion,


@@ -112,6 +112,9 @@ const createMockUIState = (overrides: Partial<UIState> = {}): UIState =>
nightly: false,
isTrustedFolder: true,
taskStartTokens: 0,
streamingResponseLengthRef: { current: 0 },
isReceivingContent: false,
pendingGeminiHistoryItems: [],
...overrides,
}) as UIState;


@@ -15,7 +15,7 @@ import { useUIState } from '../contexts/UIStateContext.js';
import { useUIActions } from '../contexts/UIActionsContext.js';
import { useVimMode } from '../contexts/VimModeContext.js';
import { useConfig } from '../contexts/ConfigContext.js';
import { StreamingState } from '../types.js';
import { StreamingState, type HistoryItemToolGroup } from '../types.js';
import { ConfigInitDisplay } from '../components/ConfigInitDisplay.js';
import { FeedbackDialog } from '../FeedbackDialog.js';
import { t } from '../../i18n/index.js';
@@ -27,17 +27,40 @@ export const Composer = () => {
const uiActions = useUIActions();
const { vimEnabled } = useVimMode();
const { showAutoAcceptIndicator, sessionStats, taskStartTokens } = uiState;
const {
showAutoAcceptIndicator,
streamingResponseLengthRef,
isReceivingContent,
} = uiState;
const tokens = Object.values(sessionStats.metrics?.models ?? {}).reduce(
(acc, model) => ({
prompt: acc.prompt + (model.tokens?.prompt ?? 0),
candidates: acc.candidates + (model.tokens?.candidates ?? 0),
}),
{ prompt: 0, candidates: 0 },
);
// Real-time token animation is performed inside LoadingIndicator itself, so
// the 100ms polling only re-renders that one component — keeping InputPrompt
// and Footer static avoids terminal flicker during streaming.
const isStreaming =
uiState.streamingState === StreamingState.Responding ||
uiState.streamingState === StreamingState.WaitingForConfirmation;
const taskTokens = tokens.candidates - taskStartTokens;
// Aggregate agent tool tokens from executing tool calls. Only changes when
// a subagent reports progress, so it doesn't drive the animation loop.
let agentTokens = 0;
for (const item of uiState.pendingGeminiHistoryItems ?? []) {
if (item.type === 'tool_group') {
const toolGroup = item as HistoryItemToolGroup;
for (const tool of toolGroup.tools) {
const display = tool.resultDisplay;
if (
typeof display === 'object' &&
display !== null &&
'type' in display &&
display.type === 'task_execution' &&
'tokenCount' in display &&
typeof display.tokenCount === 'number'
) {
agentTokens += display.tokenCount;
}
}
}
}
// State for keyboard shortcuts display toggle
const [showShortcuts, setShowShortcuts] = useState(false);
@@ -74,7 +97,10 @@ export const Composer = () => {
: uiState.currentLoadingPhrase
}
elapsedTime={uiState.elapsedTime}
candidatesTokens={taskTokens}
candidatesTokens={agentTokens}
streamingCharsRef={streamingResponseLengthRef}
isStreaming={isStreaming}
isReceivingContent={isReceivingContent}
/>
)}


@@ -374,5 +374,29 @@ describe('<LoadingIndicator />', () => {
const output = lastFrame();
expect(output).toContain('(5s · ↓ 5.4k tokens · esc to cancel)');
});
it('should show ↑ arrow when waiting for API response', () => {
const { lastFrame } = renderWithContext(
<LoadingIndicator
{...defaultProps}
candidatesTokens={500}
isReceivingContent={false}
/>,
StreamingState.Responding,
);
const output = lastFrame();
expect(output).toContain('↑ 500 tokens');
expect(output).not.toContain('↓');
});
it('should show ↓ arrow when receiving content (default)', () => {
const { lastFrame } = renderWithContext(
<LoadingIndicator {...defaultProps} candidatesTokens={500} />,
StreamingState.Responding,
);
const output = lastFrame();
expect(output).toContain('↓ 500 tokens');
expect(output).not.toContain('↑');
});
});
});


@@ -6,6 +6,7 @@
import type { ThoughtSummary } from '@qwen-code/qwen-code-core';
import type React from 'react';
import { useRef } from 'react';
import { Box, Text } from 'ink';
import { theme } from '../semantic-colors.js';
import { useStreamingContext } from '../contexts/StreamingContext.js';
@@ -13,6 +14,7 @@ import { StreamingState } from '../types.js';
import { GeminiRespondingSpinner } from './GeminiRespondingSpinner.js';
import { formatDuration, formatTokenCount } from '../utils/formatters.js';
import { useTerminalSize } from '../hooks/useTerminalSize.js';
import { useAnimationFrame } from '../hooks/useAnimationFrame.js';
import { isNarrowWidth } from '../utils/isNarrowWidth.js';
import { t } from '../../i18n/index.js';
@@ -22,6 +24,21 @@ interface LoadingIndicatorProps {
rightContent?: React.ReactNode;
thought?: ThoughtSummary | null;
candidatesTokens?: number;
/**
* Live-updating character counter for the streaming response. When provided
* together with `isStreaming`, the indicator animates a token estimate
* (chars / 4) internally, so the animation never re-renders `Composer` or
* the input prompt.
*/
streamingCharsRef?: React.RefObject<number>;
/** Whether to poll `streamingCharsRef` (true during Responding/WaitingForConfirmation). */
isStreaming?: boolean;
/**
* True when receiving content (shows ↓ arrow), false when waiting for API
* response (shows ↑ arrow).
* @default true
*/
isReceivingContent?: boolean;
}
export const LoadingIndicator: React.FC<LoadingIndicatorProps> = ({
@@ -30,25 +47,40 @@ export const LoadingIndicator: React.FC<LoadingIndicatorProps> = ({
rightContent,
thought,
candidatesTokens,
streamingCharsRef,
isStreaming,
isReceivingContent = true,
}) => {
const streamingState = useStreamingContext();
const { columns: terminalWidth } = useTerminalSize();
const isNarrow = isNarrowWidth(terminalWidth);
// Animate the streaming-chars counter locally so only this component
// re-renders on each animation frame (100ms ≈ spinner cadence). Siblings
// like InputPrompt / Footer stay static, which eliminates terminal flicker
// during streaming output.
const fallbackRef = useRef(0);
const animatedChars = useAnimationFrame(
streamingCharsRef ?? fallbackRef,
streamingCharsRef && isStreaming ? 100 : null,
);
if (streamingState === StreamingState.Idle) {
return null;
}
const primaryText = thought?.subject || currentLoadingPhrase;
const outputTokens = candidatesTokens ?? 0;
const streamingTokens = streamingCharsRef ? Math.round(animatedChars / 4) : 0;
const outputTokens = (candidatesTokens ?? 0) + streamingTokens;
const showTokens = !isNarrow && outputTokens > 0;
const tokenArrow = isReceivingContent ? '↓' : '↑';
const timeStr =
elapsedTime < 60 ? `${elapsedTime}s` : formatDuration(elapsedTime * 1000);
const tokenStr = showTokens
? ` · ${formatTokenCount(outputTokens)} tokens`
? ` · ${tokenArrow} ${formatTokenCount(outputTokens)} tokens`
: '';
const cancelAndTimerContent =


@@ -144,6 +144,10 @@ export interface UIState {
isFeedbackDialogOpen: boolean;
// Per-task token tracking
taskStartTokens: number;
// Real-time token display: ref to streaming output char length (polled, not state)
streamingResponseLengthRef: React.RefObject<number>;
// True = receiving content (↓), false = waiting for API response (↑)
isReceivingContent: boolean;
// Prompt suggestion
promptSuggestion: string | null;
/** Dismiss prompt suggestion (clears state, aborts speculation) */


@@ -0,0 +1,96 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { useEffect, useRef, useState } from 'react';
/**
* Hook that polls a ref at a fixed interval and smoothly animates the
* displayed value toward the real value. This avoids jarring jumps when
* large chunks of characters arrive at once (e.g. tool call args JSON).
*
* Animation rules (matching Claude Code's SpinnerAnimationRow):
* - Gap < 70: increment by 3 per frame
* - Gap 70–200: increment by ~20% of gap per frame
* - Gap > 200: increment by 50 per frame
*
* When the real value decreases (e.g. ref reset to 0), the displayed
* value snaps immediately; animation only applies to increases.
*
* Pass `null` as intervalMs to pause polling entirely.
*
* @param watchRef - The ref to poll for changes.
* @param intervalMs - How often to check (ms), or null to pause.
* @returns The smoothly animated value.
*/
export function useAnimationFrame(
watchRef: React.RefObject<number>,
intervalMs: number | null = 50,
): number {
const [displayValue, setDisplayValue] = useState(() => watchRef.current);
const displayRef = useRef(watchRef.current);
const targetRef = useRef(watchRef.current);
// Snap down synchronously on render when the external ref drops below the
// last displayed value (e.g. ref reset to 0 at the start of a new turn).
// Without this, the previous turn's count would briefly flash before the
// next interval tick fires. Idempotent under StrictMode double-render.
const currentTarget = watchRef.current;
if (currentTarget < displayRef.current) {
displayRef.current = currentTarget;
targetRef.current = currentTarget;
}
useEffect(() => {
if (intervalMs === null) return;
// Re-sync when the interval resumes or the ref changed externally
// (e.g. ref reset to 0 at new turn start while paused).
const current = watchRef.current;
if (current !== targetRef.current) {
targetRef.current = current;
// Snap down immediately (reset), animate up
if (current < displayRef.current) {
displayRef.current = current;
setDisplayValue(current);
}
}
const id = setInterval(() => {
const realValue = watchRef.current;
targetRef.current = realValue;
// Snap down immediately on reset
if (realValue < displayRef.current) {
displayRef.current = realValue;
setDisplayValue(realValue);
return;
}
const gap = realValue - displayRef.current;
if (gap <= 0) return;
// Smooth interpolation: small gaps crawl, large gaps leap
let increment: number;
if (gap < 70) {
increment = 3;
} else if (gap <= 200) {
increment = Math.max(3, Math.round(gap * 0.2));
} else {
increment = 50;
}
const next = Math.min(displayRef.current + increment, realValue);
displayRef.current = next;
setDisplayValue(next);
}, intervalMs);
return () => clearInterval(id);
}, [watchRef, intervalMs]);
// Return the lower of state vs current ref so a freshly reset ref is
// reflected immediately, before setDisplayValue catches up next tick.
return Math.min(displayValue, currentTarget);
}
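
A minimal consumer sketch for the new hook (charsRef, isStreaming, and
estimatedTokens are hypothetical names; the real wiring lives in
LoadingIndicator above):

    // Inside a component: poll charsRef every 100ms while streaming;
    // pass null as the interval to pause polling entirely.
    const charsRef = useRef(0);
    const animatedChars = useAnimationFrame(charsRef, isStreaming ? 100 : null);
    const estimatedTokens = Math.round(animatedChars / 4);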


@@ -241,6 +241,12 @@ export const useGeminiStream = (
const processedMemoryToolsRef = useRef<Set<string>>(new Set());
const submitPromptOnCompleteRef = useRef<(() => Promise<void>) | null>(null);
const modelOverrideRef = useRef<string | undefined>(undefined);
// --- Real-time token display ---
// Accumulates output character count across the whole turn (not per API call).
// Uses a ref to avoid re-renders on every text_delta.
const streamingResponseLengthRef = useRef(0);
// Tracks whether we are receiving content (↓) or waiting for API (↑).
const [isReceivingContent, setIsReceivingContent] = useState(false);
const {
startNewPrompt,
getPromptCount,
@@ -671,6 +677,9 @@ export const useGeminiStream = (
// Prevents additional output after a user initiated cancel.
return '';
}
// Track output chars for real-time token estimation & mark as receiving.
streamingResponseLengthRef.current += eventValue.length;
setIsReceivingContent(true);
let newGeminiMessageBuffer = currentGeminiMessageBuffer + eventValue;
if (
pendingHistoryItemRef.current?.type !== 'gemini' &&
@@ -1138,6 +1147,14 @@ export const useGeminiStream = (
break;
case ServerGeminiEventType.ToolCallRequest:
toolCallRequests.push(event.value);
// Count tool call args JSON toward token estimation (matches
// Claude Code's input_json_delta handling).
try {
const argsJson = JSON.stringify(event.value.args);
streamingResponseLengthRef.current += argsJson.length;
} catch {
// Best-effort — don't block on serialization errors
}
break;
case ServerGeminiEventType.UserCancelled:
handleUserCancelledEvent(userMessageTimestamp);
@@ -1386,6 +1403,13 @@ export const useGeminiStream = (
setIsResponding(true);
setInitError(null);
// Entering "requesting" phase — no content yet for this API call.
setIsReceivingContent(false);
// Reset char counter only on new user queries; tool-result continuations
// keep accumulating so the token count only goes up within a turn.
if (submitType !== SendMessageType.ToolResult) {
streamingResponseLengthRef.current = 0;
}
try {
// Emit user message to dual output sidecar (if enabled).
@@ -1977,5 +2001,7 @@ export const useGeminiStream = (
handleApprovalModeChange,
activePtyId,
loopDetectionConfirmationRequest,
streamingResponseLengthRef,
isReceivingContent,
};
};