refactor(llm): make LLM.Usage a fully-additive contract

Defines a single invariant for `LLM.Usage`: every field is non-negative
and every meaningful aggregate is a *sum*, never a difference. Total
billable input = inputTokens + cacheReadInputTokens + cacheWriteInputTokens.
Total billable output = outputTokens + reasoningTokens. Adding two
non-negatives cannot underflow, so consumers can no longer reproduce the
underflow-then-clamp bug class fixed by #26620.

Each protocol mapper now enforces the contract at the provider boundary
via `ProviderShared.subtractTokens`, which clamps with `Math.max(0, …)`
for defense against provider bugs:

- OpenAI Chat / Responses: pull `cached_tokens` out of `prompt_tokens` /
  `input_tokens`; pull `reasoning_tokens` out of `completion_tokens` /
  `output_tokens`. The provider's `total_tokens` is preserved verbatim.
- Gemini: pull `cachedContentTokenCount` out of `promptTokenCount`.
  Gemini already splits visible candidates from thoughts.
- Bedrock: pull `cacheReadInputTokens` and `cacheWriteInputTokens` out of
  `inputTokens`, matching AWS prompt-caching docs.
- Anthropic: already non-overlapping per the Messages API; pass through.

Adds `Usage.totalInput` / `Usage.totalOutput` helpers for callers that
want the merged view, and a regression test covering the clamp behavior.

The reasoning underflow fixed in #26620 was the most visible symptom of
a broader semantic inconsistency in this package: providers also disagreed
on whether `inputTokens` includes cache reads (Anthropic excluded;
OpenAI/Gemini/Bedrock included), which would silently double-subtract
the moment v2 wired LLM.Usage into Session.getUsage. Normalizing now,
pre-integration, closes both holes in one move.
This commit is contained in:
Kit Langton 2026-05-10 13:07:58 -04:00
parent 9c8da69196
commit b9451175a6
11 changed files with 175 additions and 20 deletions

View file

@ -364,6 +364,14 @@ const mapFinishReason = (reason: string | null | undefined): FinishReason => {
return "unknown"
}
// Anthropic already reports input/cache-read/cache-write as separate
// non-overlapping categories per the Messages API docs, so the additive
// `LLM.Usage` contract is satisfied by direct pass-through. Extended
// thinking tokens are *not* broken out by Anthropic — they're billed as
// part of `output_tokens`, so `outputTokens` here may include reasoning
// the same way OpenAI's `output_tokens` does pre-normalization. This is
// a documented limitation of the Anthropic API surface, not a contract
// violation.
const mapUsage = (usage: AnthropicUsage | undefined): Usage | undefined => {
if (!usage) return undefined
return new Usage({

View file

@ -363,12 +363,21 @@ const mapFinishReason = (reason: string): FinishReason => {
return "unknown"
}
// AWS Bedrock Converse reports `inputTokens` as the total prompt with
// cached and cache-write tokens included (per the Bedrock prompt-caching
// docs). Pull each subtotal out at the boundary so the additive
// `LLM.Usage` contract holds. Bedrock does not separately report
// reasoning tokens for any current model.
const mapUsage = (usage: BedrockUsageSchema | undefined): Usage | undefined => {
if (!usage) return undefined
const inputTokens = ProviderShared.subtractTokens(
ProviderShared.subtractTokens(usage.inputTokens, usage.cacheReadInputTokens),
usage.cacheWriteInputTokens,
)
return new Usage({
inputTokens: usage.inputTokens,
inputTokens,
outputTokens: usage.outputTokens,
totalTokens: ProviderShared.totalTokens(usage.inputTokens, usage.outputTokens, usage.totalTokens),
totalTokens: ProviderShared.totalTokens(inputTokens, usage.outputTokens, usage.totalTokens),
cacheReadInputTokens: usage.cacheReadInputTokens,
cacheWriteInputTokens: usage.cacheWriteInputTokens,
native: usage,

View file

@ -281,14 +281,21 @@ const fromRequest = Effect.fn("Gemini.fromRequest")(function* (request: LLMReque
// =============================================================================
// Stream Parsing
// =============================================================================
// Gemini reports `promptTokenCount` as the total prompt with cached
// content included, but `candidatesTokenCount` already excludes
// `thoughtsTokenCount` (visible vs reasoning are separate). Pull the
// cached portion out at the boundary so the additive `LLM.Usage` contract
// holds across providers.
const mapUsage = (usage: GeminiUsage | undefined) => {
if (!usage) return undefined
const cached = usage.cachedContentTokenCount
const inputTokens = ProviderShared.subtractTokens(usage.promptTokenCount, cached)
return new Usage({
inputTokens: usage.promptTokenCount,
inputTokens,
outputTokens: usage.candidatesTokenCount,
reasoningTokens: usage.thoughtsTokenCount,
cacheReadInputTokens: usage.cachedContentTokenCount,
totalTokens: ProviderShared.totalTokens(usage.promptTokenCount, usage.candidatesTokenCount, usage.totalTokenCount),
cacheReadInputTokens: cached,
totalTokens: ProviderShared.totalTokens(inputTokens, usage.candidatesTokenCount, usage.totalTokenCount),
native: usage,
})
}

View file

@ -290,14 +290,23 @@ const mapFinishReason = (reason: string | null | undefined): FinishReason => {
return "unknown"
}
// OpenAI Chat reports `prompt_tokens` as the total prompt (cached tokens
// included) and `completion_tokens` as the total output (reasoning tokens
// included). The additive `LLM.Usage` contract pulls each subtotal out at
// the boundary so consumers never subtract — eliminating the underflow
// class addressed by opencode#26620.
const mapUsage = (usage: OpenAIChatEvent["usage"]): Usage | undefined => {
if (!usage) return undefined
const cached = usage.prompt_tokens_details?.cached_tokens
const reasoning = usage.completion_tokens_details?.reasoning_tokens
const inputTokens = ProviderShared.subtractTokens(usage.prompt_tokens, cached)
const outputTokens = ProviderShared.subtractTokens(usage.completion_tokens, reasoning)
return new Usage({
inputTokens: usage.prompt_tokens,
outputTokens: usage.completion_tokens,
reasoningTokens: usage.completion_tokens_details?.reasoning_tokens,
cacheReadInputTokens: usage.prompt_tokens_details?.cached_tokens,
totalTokens: ProviderShared.totalTokens(usage.prompt_tokens, usage.completion_tokens, usage.total_tokens),
inputTokens,
outputTokens,
reasoningTokens: reasoning,
cacheReadInputTokens: cached,
totalTokens: ProviderShared.totalTokens(inputTokens, outputTokens, usage.total_tokens),
native: usage,
})
}

View file

@ -276,14 +276,22 @@ const fromRequest = Effect.fn("OpenAIResponses.fromRequest")(function* (request:
// =============================================================================
// Stream Parsing
// =============================================================================
// OpenAI Responses reports `input_tokens` as the total prompt (cached
// included) and `output_tokens` as the total output (reasoning included).
// The additive `LLM.Usage` contract pulls each subtotal out at the boundary
// so consumers never subtract.
const mapUsage = (usage: OpenAIResponsesUsage | null | undefined) => {
if (!usage) return undefined
const cached = usage.input_tokens_details?.cached_tokens
const reasoning = usage.output_tokens_details?.reasoning_tokens
const inputTokens = ProviderShared.subtractTokens(usage.input_tokens, cached)
const outputTokens = ProviderShared.subtractTokens(usage.output_tokens, reasoning)
return new Usage({
inputTokens: usage.input_tokens,
outputTokens: usage.output_tokens,
reasoningTokens: usage.output_tokens_details?.reasoning_tokens,
cacheReadInputTokens: usage.input_tokens_details?.cached_tokens,
totalTokens: ProviderShared.totalTokens(usage.input_tokens, usage.output_tokens, usage.total_tokens),
inputTokens,
outputTokens,
reasoningTokens: reasoning,
cacheReadInputTokens: cached,
totalTokens: ProviderShared.totalTokens(inputTokens, outputTokens, usage.total_tokens),
native: usage,
})
}

View file

@ -42,6 +42,13 @@ export interface ToolAccumulator {
* supplied total; otherwise falls back to `inputTokens + outputTokens` only
* when at least one is defined. Returns `undefined` when neither input nor
* output is known so routes don't publish a misleading `0`.
*
* Under the additive `LLM.Usage` contract, `inputTokens` and `outputTokens`
* are the non-cached input and visible output only. The provider-supplied
* `total` is the source of truth when present; the computed fallback
* under-counts cache and reasoning by design and exists mainly so
* Anthropic-style providers (which don't surface a total) still get a
* sensible aggregate on the input + output axes.
*/
export const totalTokens = (
inputTokens: number | undefined,
@ -53,6 +60,28 @@ export const totalTokens = (
return (inputTokens ?? 0) + (outputTokens ?? 0)
}
/**
 * Removes an "inclusive" subtotal from its parent token count, never going
 * below zero.
 *
 * Protocol mappers call this at the provider boundary to uphold the additive
 * `LLM.Usage` contract: when a provider folds cached or reasoning tokens into
 * the parent count, the subset is pulled out once here so downstream
 * consumers only ever add. The clamp covers nonsensical provider breakdowns
 * such as `cached_tokens > prompt_tokens`, where the raw difference would be
 * negative.
 *
 * @param total - The provider's inclusive parent count, if it reported one.
 * @param subtrahend - The subset to remove from `total`, if it reported one.
 * @returns `undefined` when `total` is `undefined` (no count is fabricated);
 *   `total` unchanged when `subtrahend` is `undefined`; otherwise
 *   `total - subtrahend`, clamped to a minimum of `0`.
 */
export const subtractTokens = (
  total: number | undefined,
  subtrahend: number | undefined,
): number | undefined =>
  // With either side unreported there is nothing to subtract: hand back
  // `total` as-is, which is `undefined` when the parent count is the missing one.
  total === undefined || subtrahend === undefined ? total : Math.max(0, total - subtrahend)
export const eventError = (route: string, message: string, raw?: string) =>
new LLMError({
module: "ProviderShared",

View file

@ -3,6 +3,38 @@ import { ContentBlockID, FinishReason, ProtocolID, ProviderMetadata, ResponseID,
import { ModelRef } from "./options"
import { ToolResultValue } from "./messages"
/**
* Token usage reported by an LLM provider, normalized to a fully-additive
* contract so consumers never have to subtract.
*
* **Field semantics** (each non-negative; missing means "not reported"):
*
* - `inputTokens` — non-cached input tokens (the "fresh" prompt portion).
* - `cacheReadInputTokens` — input tokens served from cache.
* - `cacheWriteInputTokens` — input tokens written to cache.
* - `outputTokens` — visible output tokens (text + tool calls).
* - `reasoningTokens` — hidden reasoning / thinking tokens.
* - `totalTokens` — provider-supplied total, or sum of input + output as a
* fallback (see `ProviderShared.totalTokens`).
* - `native` — the provider's raw usage payload, preserved for debugging.
*
* **Invariant**: every aggregate of interest is a *sum*, never a difference.
* Total billable input = `inputTokens + cacheReadInputTokens +
* cacheWriteInputTokens`. Total billable output = `outputTokens +
* reasoningTokens`. Adding two non-negatives cannot underflow, so consumers
* cannot reproduce the underflow-then-clamp bug class where a stored
* negative gets rejected by a strict schema later.
*
* Each protocol mapper enforces this contract at the provider boundary.
* Providers that report cache or reasoning as subsets of input/output
* (OpenAI Chat/Responses, Gemini, Bedrock) have those subsets pulled out
* once via `ProviderShared.subtractTokens`, with `Math.max(0, …)` clamping
* for defense against provider bugs. Providers that already report
* separately (Anthropic) pass through. Where a provider doesn't surface a
* category at all (e.g. Anthropic does not break out extended-thinking
* tokens), the corresponding field is `undefined` and the parent count
* carries the combined total — a documented limitation of that API.
*/
export class Usage extends Schema.Class<Usage>("LLM.Usage")({
inputTokens: Schema.optional(Schema.Number),
outputTokens: Schema.optional(Schema.Number),
@ -13,6 +45,24 @@ export class Usage extends Schema.Class<Usage>("LLM.Usage")({
native: Schema.optional(Schema.Record(Schema.String, Schema.Unknown)),
}) {}
export namespace Usage {
  type InputFields = Pick<Usage, "inputTokens" | "cacheReadInputTokens" | "cacheWriteInputTokens">
  type OutputFields = Pick<Usage, "outputTokens" | "reasoningTokens">

  /**
   * Merged input-side view: non-cached input plus cache reads plus cache
   * writes. Any category that is not reported counts as `0`, so the result is
   * always a number and is built purely by addition.
   */
  export const totalInput = (usage: InputFields) => {
    const fresh = usage.inputTokens ?? 0
    const cacheReads = usage.cacheReadInputTokens ?? 0
    const cacheWrites = usage.cacheWriteInputTokens ?? 0
    return fresh + cacheReads + cacheWrites
  }

  /**
   * Merged output-side view: visible output plus reasoning. Any category that
   * is not reported counts as `0`, so the result is always a number and is
   * built purely by addition.
   */
  export const totalOutput = (usage: OutputFields) => {
    const visible = usage.outputTokens ?? 0
    const reasoning = usage.reasoningTokens ?? 0
    return visible + reasoning
  }
}
export const RequestStart = Schema.Struct({
type: Schema.tag("request-start"),
id: ResponseID,

View file

@ -197,7 +197,10 @@ describe("Gemini route", () => {
expect(response.text).toBe("Hello!")
expect(response.reasoning).toBe("thinking")
expect(response.usage).toMatchObject({
inputTokens: 5,
// Additive contract: promptTokenCount=5 includes 1 cached, so
// inputTokens=4 + cacheReadInputTokens=1. Gemini already splits
// candidates from thoughts, so outputTokens=2 + reasoningTokens=1.
inputTokens: 4,
outputTokens: 2,
reasoningTokens: 1,
cacheReadInputTokens: 1,
@ -211,7 +214,7 @@ describe("Gemini route", () => {
type: "request-finish",
reason: "stop",
usage: {
inputTokens: 5,
inputTokens: 4,
outputTokens: 2,
reasoningTokens: 1,
cacheReadInputTokens: 1,

View file

@ -231,7 +231,10 @@ describe("OpenAI Chat route", () => {
type: "request-finish",
reason: "stop",
usage: {
inputTokens: 5,
// Additive contract: prompt_tokens=5 includes 1 cached, so
// inputTokens=4 (non-cached) + cacheReadInputTokens=1.
// completion_tokens=2 includes 0 reasoning, so outputTokens=2.
inputTokens: 4,
outputTokens: 2,
reasoningTokens: 0,
cacheReadInputTokens: 1,

View file

@ -343,7 +343,10 @@ describe("OpenAI Responses route", () => {
reason: "stop",
providerMetadata: { openai: { responseId: "resp_1", serviceTier: "default" } },
usage: {
inputTokens: 5,
// Additive contract: input_tokens=5 includes 1 cached, so
// inputTokens=4 + cacheReadInputTokens=1.
// output_tokens=2 includes 0 reasoning, so outputTokens=2.
inputTokens: 4,
outputTokens: 2,
reasoningTokens: 0,
cacheReadInputTokens: 1,

View file

@ -1,6 +1,7 @@
import { describe, expect, test } from "bun:test"
import { Schema } from "effect"
import { ContentPart, LLMEvent, LLMRequest, ModelID, ModelLimits, ModelRef, ProviderID } from "../src/schema"
import { ContentPart, LLMEvent, LLMRequest, ModelID, ModelLimits, ModelRef, ProviderID, Usage } from "../src/schema"
import { ProviderShared } from "../src/protocols/shared"
const model = new ModelRef({
id: ModelID.make("fake-model"),
@ -48,3 +49,28 @@ describe("llm schema", () => {
expect(ContentPart.guards.media({ type: "text", text: "hi" })).toBe(false)
})
})
describe("LLM.Usage additive contract", () => {
  test("subtractTokens clamps non-sensical breakdowns to zero", () => {
    // A provider may report a subset larger than its parent (cached_tokens >
    // prompt_tokens, reasoning_tokens > completion_tokens). The clamp keeps
    // the negative values behind opencode#26620 out of the pipeline.
    const withKnownTotal = [
      [5, 3, 2],
      [5, 10, 0],
      [5, undefined, 5],
    ] as const
    for (const [total, subtrahend, expected] of withKnownTotal) {
      expect(ProviderShared.subtractTokens(total, subtrahend)).toBe(expected)
    }

    // An unreported parent count stays unreported, whatever the subset says.
    for (const subtrahend of [3, undefined]) {
      expect(ProviderShared.subtractTokens(undefined, subtrahend)).toBeUndefined()
    }
  })

  test("totalInput sums every input-side category", () => {
    const everyCategory = new Usage({ inputTokens: 10, cacheReadInputTokens: 3, cacheWriteInputTokens: 2 })
    const freshOnly = new Usage({ inputTokens: 10 })
    const nothingReported = new Usage({})

    expect(Usage.totalInput(everyCategory)).toBe(15)
    expect(Usage.totalInput(freshOnly)).toBe(10)
    expect(Usage.totalInput(nothingReported)).toBe(0)
  })

  test("totalOutput sums every output-side category", () => {
    const everyCategory = new Usage({ outputTokens: 7, reasoningTokens: 4 })
    const visibleOnly = new Usage({ outputTokens: 7 })
    const nothingReported = new Usage({})

    expect(Usage.totalOutput(everyCategory)).toBe(11)
    expect(Usage.totalOutput(visibleOnly)).toBe(7)
    expect(Usage.totalOutput(nothingReported)).toBe(0)
  })
})
})