refactor(llm): make LLM.Usage a fully-additive contract

Defines a single invariant for `LLM.Usage`: every field is non-negative and every meaningful aggregate is a *sum*, never a difference. Total billable input = inputTokens + cacheReadInputTokens + cacheWriteInputTokens. Total billable output = outputTokens + reasoningTokens. Adding two non-negatives cannot underflow, so consumers can no longer reproduce the underflow-then-clamp bug class fixed by #26620.

Each protocol mapper now enforces the contract at the provider boundary via `ProviderShared.subtractTokens`, which clamps with `Math.max(0, …)` for defense against provider bugs:

- OpenAI Chat / Responses: pull `cached_tokens` out of `prompt_tokens` / `input_tokens`; pull `reasoning_tokens` out of `completion_tokens` / `output_tokens`. The provider's `total_tokens` is preserved verbatim.
- Gemini: pull `cachedContentTokenCount` out of `promptTokenCount`. Gemini already splits visible candidates from thoughts.
- Bedrock: pull `cacheReadInputTokens` and `cacheWriteInputTokens` out of `inputTokens`, matching AWS prompt-caching docs.
- Anthropic: already non-overlapping per the Messages API; pass through.

Adds `Usage.totalInput` / `Usage.totalOutput` helpers for callers that want the merged view, and a regression test covering the clamp behavior.

The reasoning underflow fixed in #26620 was the most visible symptom of a broader semantic inconsistency in this package: providers also disagreed on whether `inputTokens` includes cache reads (Anthropic excluded; OpenAI/Gemini/Bedrock included), which would silently double-subtract the moment v2 wired LLM.Usage into Session.getUsage. Normalizing now, pre-integration, closes both holes in one move.
2026-05-30 20:44:31 +00:00 · 2026-05-10 13:07:58 -04:00 · 2026-05-10 13:07:58 -04:00 · b9451175a6
commit b9451175a6
parent 9c8da69196
11 changed files with 175 additions and 20 deletions
--- a/packages/llm/src/protocols/anthropic-messages.ts
+++ b/packages/llm/src/protocols/anthropic-messages.ts
@ -364,6 +364,14 @@ const mapFinishReason = (reason: string | null | undefined): FinishReason => {
  return "unknown"
 }

+// Anthropic already reports input/cache-read/cache-write as separate
+// non-overlapping categories per the Messages API docs, so the additive
+// `LLM.Usage` contract is satisfied by direct pass-through. Extended
+// thinking tokens are *not* broken out by Anthropic — they're billed as
+// part of `output_tokens`, so `outputTokens` here may include reasoning
+// the same way OpenAI's `output_tokens` does pre-normalization. This is
+// a documented limitation of the Anthropic API surface, not a contract
+// violation.
 const mapUsage = (usage: AnthropicUsage | undefined): Usage | undefined => {
  if (!usage) return undefined
  return new Usage({
--- a/packages/llm/src/protocols/bedrock-converse.ts
+++ b/packages/llm/src/protocols/bedrock-converse.ts
@ -363,12 +363,21 @@ const mapFinishReason = (reason: string): FinishReason => {
  return "unknown"
 }

+// Per the Bedrock prompt-caching docs, Converse `inputTokens` is treated as
+// the total prompt, cache-read and cache-write tokens included; each subtotal
+// is pulled out here so the additive `LLM.Usage` contract holds. NOTE(review):
+// confirm on a live response that `inputTokens` does not already exclude
+// cache tokens. Bedrock has no separate reasoning count on current models.
 const mapUsage = (usage: BedrockUsageSchema | undefined): Usage | undefined => {
  if (!usage) return undefined
+  const inputTokens = ProviderShared.subtractTokens(
+    ProviderShared.subtractTokens(usage.inputTokens, usage.cacheReadInputTokens),
+    usage.cacheWriteInputTokens,
+  )
  return new Usage({
-    inputTokens: usage.inputTokens,
+    inputTokens,
    outputTokens: usage.outputTokens,
-    totalTokens: ProviderShared.totalTokens(usage.inputTokens, usage.outputTokens, usage.totalTokens),
+    totalTokens: ProviderShared.totalTokens(inputTokens, usage.outputTokens, usage.totalTokens),
    cacheReadInputTokens: usage.cacheReadInputTokens,
    cacheWriteInputTokens: usage.cacheWriteInputTokens,
    native: usage,
--- a/packages/llm/src/protocols/gemini.ts
+++ b/packages/llm/src/protocols/gemini.ts
@ -281,14 +281,21 @@ const fromRequest = Effect.fn("Gemini.fromRequest")(function* (request: LLMReque
 // =============================================================================
 // Stream Parsing
 // =============================================================================
+// Gemini reports `promptTokenCount` as the total prompt with cached
+// content included, but `candidatesTokenCount` already excludes
+// `thoughtsTokenCount` (visible vs reasoning are separate). Pull the
+// cached portion out at the boundary so the additive `LLM.Usage` contract
+// holds across providers.
 const mapUsage = (usage: GeminiUsage | undefined) => {
  if (!usage) return undefined
+  const cached = usage.cachedContentTokenCount
+  const inputTokens = ProviderShared.subtractTokens(usage.promptTokenCount, cached)
  return new Usage({
-    inputTokens: usage.promptTokenCount,
+    inputTokens,
    outputTokens: usage.candidatesTokenCount,
    reasoningTokens: usage.thoughtsTokenCount,
-    cacheReadInputTokens: usage.cachedContentTokenCount,
-    totalTokens: ProviderShared.totalTokens(usage.promptTokenCount, usage.candidatesTokenCount, usage.totalTokenCount),
+    cacheReadInputTokens: cached,
+    totalTokens: ProviderShared.totalTokens(inputTokens, usage.candidatesTokenCount, usage.totalTokenCount),
    native: usage,
  })
 }
--- a/packages/llm/src/protocols/openai-chat.ts
+++ b/packages/llm/src/protocols/openai-chat.ts
@ -290,14 +290,23 @@ const mapFinishReason = (reason: string | null | undefined): FinishReason => {
  return "unknown"
 }

+// OpenAI Chat reports `prompt_tokens` as the total prompt (cached tokens
+// included) and `completion_tokens` as the total output (reasoning tokens
+// included). The additive `LLM.Usage` contract pulls each subtotal out at
+// the boundary so consumers never subtract — eliminating the underflow
+// class addressed by opencode#26620.
 const mapUsage = (usage: OpenAIChatEvent["usage"]): Usage | undefined => {
  if (!usage) return undefined
+  const cached = usage.prompt_tokens_details?.cached_tokens
+  const reasoning = usage.completion_tokens_details?.reasoning_tokens
+  const inputTokens = ProviderShared.subtractTokens(usage.prompt_tokens, cached)
+  const outputTokens = ProviderShared.subtractTokens(usage.completion_tokens, reasoning)
  return new Usage({
-    inputTokens: usage.prompt_tokens,
-    outputTokens: usage.completion_tokens,
-    reasoningTokens: usage.completion_tokens_details?.reasoning_tokens,
-    cacheReadInputTokens: usage.prompt_tokens_details?.cached_tokens,
-    totalTokens: ProviderShared.totalTokens(usage.prompt_tokens, usage.completion_tokens, usage.total_tokens),
+    inputTokens,
+    outputTokens,
+    reasoningTokens: reasoning,
+    cacheReadInputTokens: cached,
+    totalTokens: ProviderShared.totalTokens(inputTokens, outputTokens, usage.total_tokens),
    native: usage,
  })
 }
--- a/packages/llm/src/protocols/openai-responses.ts
+++ b/packages/llm/src/protocols/openai-responses.ts
@ -276,14 +276,22 @@ const fromRequest = Effect.fn("OpenAIResponses.fromRequest")(function* (request:
 // =============================================================================
 // Stream Parsing
 // =============================================================================
+// OpenAI Responses reports `input_tokens` as the total prompt (cached
+// included) and `output_tokens` as the total output (reasoning included).
+// The additive `LLM.Usage` contract pulls each subtotal out at the boundary
+// so consumers never subtract.
 const mapUsage = (usage: OpenAIResponsesUsage | null | undefined) => {
  if (!usage) return undefined
+  const cached = usage.input_tokens_details?.cached_tokens
+  const reasoning = usage.output_tokens_details?.reasoning_tokens
+  const inputTokens = ProviderShared.subtractTokens(usage.input_tokens, cached)
+  const outputTokens = ProviderShared.subtractTokens(usage.output_tokens, reasoning)
  return new Usage({
-    inputTokens: usage.input_tokens,
-    outputTokens: usage.output_tokens,
-    reasoningTokens: usage.output_tokens_details?.reasoning_tokens,
-    cacheReadInputTokens: usage.input_tokens_details?.cached_tokens,
-    totalTokens: ProviderShared.totalTokens(usage.input_tokens, usage.output_tokens, usage.total_tokens),
+    inputTokens,
+    outputTokens,
+    reasoningTokens: reasoning,
+    cacheReadInputTokens: cached,
+    totalTokens: ProviderShared.totalTokens(inputTokens, outputTokens, usage.total_tokens),
    native: usage,
  })
 }
--- a/packages/llm/src/protocols/shared.ts
+++ b/packages/llm/src/protocols/shared.ts
@ -42,6 +42,13 @@ export interface ToolAccumulator {
 * supplied total; otherwise falls back to `inputTokens + outputTokens` only
 * when at least one is defined. Returns `undefined` when neither input nor
 * output is known so routes don't publish a misleading `0`.
+ *
+ * Under the additive `LLM.Usage` contract, `inputTokens` and `outputTokens`
+ * are the non-cached input and visible output only. The provider-supplied
+ * `total` is the source of truth when present; the computed fallback
+ * under-counts cache and reasoning by design and exists mainly so
+ * Anthropic-style providers (which don't surface a total) still get a
+ * sensible aggregate on the input + output axes.
 */
 export const totalTokens = (
  inputTokens: number | undefined,
@ -53,6 +60,28 @@ export const totalTokens = (
  return (inputTokens ?? 0) + (outputTokens ?? 0)
 }

+/**
+ * Subtract `subtrahend` from `total`, clamping to zero if the provider
+ * reports a nonsensical breakdown (e.g. `cached_tokens > prompt_tokens`).
+ * Used by protocol mappers to enforce the additive `LLM.Usage` contract:
+ * each provider's "inclusive" subtotals (cached, reasoning) are pulled out
+ * of the parent count at the boundary so downstream consumers never have to
+ * subtract — eliminating the underflow class of bug where a clamped
+ * difference would silently store the wrong value.
+ *
+ * If `total` is `undefined`, returns `undefined` (we don't fabricate
+ * counts). If `subtrahend` is `undefined`, returns `total` unchanged. The
+ * provider-native breakdown stays available on `Usage.native` for debugging.
+ */
+export const subtractTokens = (
+  total: number | undefined,
+  subtrahend: number | undefined,
+): number | undefined => {
+  if (total === undefined) return undefined
+  if (subtrahend === undefined) return total
+  return Math.max(0, total - subtrahend)
+}
+
 export const eventError = (route: string, message: string, raw?: string) =>
  new LLMError({
    module: "ProviderShared",
--- a/packages/llm/src/schema/events.ts
+++ b/packages/llm/src/schema/events.ts
@ -3,6 +3,38 @@ import { ContentBlockID, FinishReason, ProtocolID, ProviderMetadata, ResponseID,
 import { ModelRef } from "./options"
 import { ToolResultValue } from "./messages"

+/**
+ * Token usage reported by an LLM provider, normalized to a fully-additive
+ * contract so consumers never have to subtract.
+ *
+ * **Field semantics** (each non-negative; missing means "not reported"):
+ *
+ * - `inputTokens` — non-cached input tokens (the "fresh" prompt portion).
+ * - `cacheReadInputTokens` — input tokens served from cache.
+ * - `cacheWriteInputTokens` — input tokens written to cache.
+ * - `outputTokens` — visible output tokens (text + tool calls).
+ * - `reasoningTokens` — hidden reasoning / thinking tokens.
+ * - `totalTokens` — provider-supplied total, or `inputTokens + outputTokens`
+ *   (excluding cache and reasoning) as a fallback (see `ProviderShared.totalTokens`).
+ * - `native` — the provider's raw usage payload, preserved for debugging.
+ *
+ * **Invariant**: every aggregate of interest is a *sum*, never a difference.
+ * Total billable input = `inputTokens + cacheReadInputTokens +
+ * cacheWriteInputTokens`. Total billable output = `outputTokens +
+ * reasoningTokens`. Adding two non-negatives cannot underflow, so consumers
+ * cannot reproduce the underflow-then-clamp bug class where a stored
+ * negative gets rejected by a strict schema later.
+ *
+ * Each protocol mapper enforces this contract at the provider boundary.
+ * Providers that report cache or reasoning as subsets of input/output
+ * (OpenAI Chat/Responses, Gemini, Bedrock) have those subsets pulled out
+ * once via `ProviderShared.subtractTokens`, with `Math.max(0, …)` clamping
+ * for defense against provider bugs. Providers that already report
+ * separately (Anthropic) pass through. Where a provider doesn't surface a
+ * category at all (e.g. Anthropic does not break out extended-thinking
+ * tokens), the corresponding field is `undefined` and the parent count
+ * carries the combined total — a documented limitation of that API.
+ */
 export class Usage extends Schema.Class<Usage>("LLM.Usage")({
  inputTokens: Schema.optional(Schema.Number),
  outputTokens: Schema.optional(Schema.Number),
@ -13,6 +45,24 @@ export class Usage extends Schema.Class<Usage>("LLM.Usage")({
  native: Schema.optional(Schema.Record(Schema.String, Schema.Unknown)),
 }) {}

+export namespace Usage {
+  type InputFields = Pick<Usage, "inputTokens" | "cacheReadInputTokens" | "cacheWriteInputTokens">
+  type OutputFields = Pick<Usage, "outputTokens" | "reasoningTokens">
+
+  /**
+   * Sum of every input-side category: non-cached input + cache reads +
+   * cache writes. Unreported fields count as 0; cannot underflow under the additive contract.
+   */
+  export const totalInput = (usage: InputFields) =>
+    (usage.inputTokens ?? 0) + (usage.cacheReadInputTokens ?? 0) + (usage.cacheWriteInputTokens ?? 0)
+
+  /**
+   * Sum of every output-side category: visible output + reasoning.
+   * Unreported fields count as 0; cannot underflow under the additive contract.
+   */
+  export const totalOutput = (usage: OutputFields) => (usage.outputTokens ?? 0) + (usage.reasoningTokens ?? 0)
+}
+
 export const RequestStart = Schema.Struct({
  type: Schema.tag("request-start"),
  id: ResponseID,
--- a/packages/llm/test/provider/gemini.test.ts
+++ b/packages/llm/test/provider/gemini.test.ts
@ -197,7 +197,10 @@ describe("Gemini route", () => {
      expect(response.text).toBe("Hello!")
      expect(response.reasoning).toBe("thinking")
      expect(response.usage).toMatchObject({
-        inputTokens: 5,
+        // Additive contract: promptTokenCount=5 includes 1 cached, so
+        // inputTokens=4 + cacheReadInputTokens=1. Gemini already splits
+        // candidates from thoughts, so outputTokens=2 + reasoningTokens=1.
+        inputTokens: 4,
        outputTokens: 2,
        reasoningTokens: 1,
        cacheReadInputTokens: 1,
@ -211,7 +214,7 @@ describe("Gemini route", () => {
          type: "request-finish",
          reason: "stop",
          usage: {
-            inputTokens: 5,
+            inputTokens: 4,
            outputTokens: 2,
            reasoningTokens: 1,
            cacheReadInputTokens: 1,
--- a/packages/llm/test/provider/openai-chat.test.ts
+++ b/packages/llm/test/provider/openai-chat.test.ts
@ -231,7 +231,10 @@ describe("OpenAI Chat route", () => {
          type: "request-finish",
          reason: "stop",
          usage: {
-            inputTokens: 5,
+            // Additive contract: prompt_tokens=5 includes 1 cached, so
+            // inputTokens=4 (non-cached) + cacheReadInputTokens=1.
+            // completion_tokens=2 includes 0 reasoning, so outputTokens=2.
+            inputTokens: 4,
            outputTokens: 2,
            reasoningTokens: 0,
            cacheReadInputTokens: 1,
--- a/packages/llm/test/provider/openai-responses.test.ts
+++ b/packages/llm/test/provider/openai-responses.test.ts
@ -343,7 +343,10 @@ describe("OpenAI Responses route", () => {
          reason: "stop",
          providerMetadata: { openai: { responseId: "resp_1", serviceTier: "default" } },
          usage: {
-            inputTokens: 5,
+            // Additive contract: input_tokens=5 includes 1 cached, so
+            // inputTokens=4 + cacheReadInputTokens=1.
+            // output_tokens=2 includes 0 reasoning, so outputTokens=2.
+            inputTokens: 4,
            outputTokens: 2,
            reasoningTokens: 0,
            cacheReadInputTokens: 1,
--- a/packages/llm/test/schema.test.ts
+++ b/packages/llm/test/schema.test.ts
@ -1,6 +1,7 @@
 import { describe, expect, test } from "bun:test"
 import { Schema } from "effect"
-import { ContentPart, LLMEvent, LLMRequest, ModelID, ModelLimits, ModelRef, ProviderID } from "../src/schema"
+import { ContentPart, LLMEvent, LLMRequest, ModelID, ModelLimits, ModelRef, ProviderID, Usage } from "../src/schema"
+import { ProviderShared } from "../src/protocols/shared"

 const model = new ModelRef({
  id: ModelID.make("fake-model"),
@ -48,3 +49,28 @@ describe("llm schema", () => {
    expect(ContentPart.guards.media({ type: "text", text: "hi" })).toBe(false)
  })
 })
+
+describe("LLM.Usage additive contract", () => {
+  test("subtractTokens clamps non-sensical breakdowns to zero", () => {
+    // Defense against a provider reporting cached_tokens > prompt_tokens or
+    // reasoning_tokens > completion_tokens. The clamp prevents the negative
+    // values that triggered opencode#26620 from ever entering the pipeline.
+    expect(ProviderShared.subtractTokens(5, 3)).toBe(2)
+    expect(ProviderShared.subtractTokens(5, 10)).toBe(0)
+    expect(ProviderShared.subtractTokens(5, undefined)).toBe(5)
+    expect(ProviderShared.subtractTokens(undefined, 3)).toBeUndefined()
+    expect(ProviderShared.subtractTokens(undefined, undefined)).toBeUndefined()
+  })
+
+  test("totalInput sums every input-side category", () => {
+    expect(Usage.totalInput(new Usage({ inputTokens: 10, cacheReadInputTokens: 3, cacheWriteInputTokens: 2 }))).toBe(15)
+    expect(Usage.totalInput(new Usage({ inputTokens: 10 }))).toBe(10)
+    expect(Usage.totalInput(new Usage({}))).toBe(0)
+  })
+
+  test("totalOutput sums every output-side category", () => {
+    expect(Usage.totalOutput(new Usage({ outputTokens: 7, reasoningTokens: 4 }))).toBe(11)
+    expect(Usage.totalOutput(new Usage({ outputTokens: 7 }))).toBe(7)
+    expect(Usage.totalOutput(new Usage({}))).toBe(0)
+  })
+})