diff --git a/packages/llm/src/protocols/anthropic-messages.ts b/packages/llm/src/protocols/anthropic-messages.ts index fba785373d..afef20f1fb 100644 --- a/packages/llm/src/protocols/anthropic-messages.ts +++ b/packages/llm/src/protocols/anthropic-messages.ts @@ -364,6 +364,14 @@ const mapFinishReason = (reason: string | null | undefined): FinishReason => { return "unknown" } +// Anthropic already reports input/cache-read/cache-write as separate +// non-overlapping categories per the Messages API docs, so the additive +// `LLM.Usage` contract is satisfied by direct pass-through. Extended +// thinking tokens are *not* broken out by Anthropic — they're billed as +// part of `output_tokens`, so `outputTokens` here may include reasoning +// the same way OpenAI's `output_tokens` does pre-normalization. This is +// a documented limitation of the Anthropic API surface, not a contract +// violation. const mapUsage = (usage: AnthropicUsage | undefined): Usage | undefined => { if (!usage) return undefined return new Usage({ diff --git a/packages/llm/src/protocols/bedrock-converse.ts b/packages/llm/src/protocols/bedrock-converse.ts index 260ee612cd..80620d3463 100644 --- a/packages/llm/src/protocols/bedrock-converse.ts +++ b/packages/llm/src/protocols/bedrock-converse.ts @@ -363,12 +363,18 @@ const mapFinishReason = (reason: string): FinishReason => { return "unknown" } +// AWS Bedrock Converse reports `inputTokens` exclusive of cache activity: +// `cacheReadInputTokens` and `cacheWriteInputTokens` are separate, +// non-overlapping counts (the same shape as Anthropic). The additive +// `LLM.Usage` contract is therefore met by direct pass-through; subtracting +// the cache counts would clamp fresh input toward zero on every cache hit. +// Bedrock does not separately report reasoning tokens for any current model. 
const mapUsage = (usage: BedrockUsageSchema | undefined): Usage | undefined => { if (!usage) return undefined return new Usage({ inputTokens: usage.inputTokens, outputTokens: usage.outputTokens, totalTokens: ProviderShared.totalTokens(usage.inputTokens, usage.outputTokens, usage.totalTokens), cacheReadInputTokens: usage.cacheReadInputTokens, cacheWriteInputTokens: usage.cacheWriteInputTokens, native: usage, diff --git a/packages/llm/src/protocols/gemini.ts b/packages/llm/src/protocols/gemini.ts index 140da521a5..fbb03d1fd8 100644 --- a/packages/llm/src/protocols/gemini.ts +++ b/packages/llm/src/protocols/gemini.ts @@ -281,14 +281,21 @@ const fromRequest = Effect.fn("Gemini.fromRequest")(function* (request: LLMReque // ============================================================================= // Stream Parsing // ============================================================================= +// Gemini reports `promptTokenCount` as the total prompt with cached +// content included, but `candidatesTokenCount` already excludes +// `thoughtsTokenCount` (visible vs reasoning are separate). Pull the +// cached portion out at the boundary so the additive `LLM.Usage` contract +// holds across providers. 
const mapUsage = (usage: GeminiUsage | undefined) => { if (!usage) return undefined + const cached = usage.cachedContentTokenCount + const inputTokens = ProviderShared.subtractTokens(usage.promptTokenCount, cached) return new Usage({ - inputTokens: usage.promptTokenCount, + inputTokens, outputTokens: usage.candidatesTokenCount, reasoningTokens: usage.thoughtsTokenCount, - cacheReadInputTokens: usage.cachedContentTokenCount, - totalTokens: ProviderShared.totalTokens(usage.promptTokenCount, usage.candidatesTokenCount, usage.totalTokenCount), + cacheReadInputTokens: cached, + totalTokens: ProviderShared.totalTokens(inputTokens, usage.candidatesTokenCount, usage.totalTokenCount), native: usage, }) } diff --git a/packages/llm/src/protocols/openai-chat.ts b/packages/llm/src/protocols/openai-chat.ts index 5d42c0a4e9..09165d502d 100644 --- a/packages/llm/src/protocols/openai-chat.ts +++ b/packages/llm/src/protocols/openai-chat.ts @@ -290,14 +290,23 @@ const mapFinishReason = (reason: string | null | undefined): FinishReason => { return "unknown" } +// OpenAI Chat reports `prompt_tokens` as the total prompt (cached tokens +// included) and `completion_tokens` as the total output (reasoning tokens +// included). The additive `LLM.Usage` contract pulls each subtotal out at +// the boundary so consumers never subtract — eliminating the underflow +// class addressed by opencode#26620. 
const mapUsage = (usage: OpenAIChatEvent["usage"]): Usage | undefined => { if (!usage) return undefined + const cached = usage.prompt_tokens_details?.cached_tokens + const reasoning = usage.completion_tokens_details?.reasoning_tokens + const inputTokens = ProviderShared.subtractTokens(usage.prompt_tokens, cached) + const outputTokens = ProviderShared.subtractTokens(usage.completion_tokens, reasoning) return new Usage({ - inputTokens: usage.prompt_tokens, - outputTokens: usage.completion_tokens, - reasoningTokens: usage.completion_tokens_details?.reasoning_tokens, - cacheReadInputTokens: usage.prompt_tokens_details?.cached_tokens, - totalTokens: ProviderShared.totalTokens(usage.prompt_tokens, usage.completion_tokens, usage.total_tokens), + inputTokens, + outputTokens, + reasoningTokens: reasoning, + cacheReadInputTokens: cached, + totalTokens: ProviderShared.totalTokens(inputTokens, outputTokens, usage.total_tokens), native: usage, }) } diff --git a/packages/llm/src/protocols/openai-responses.ts b/packages/llm/src/protocols/openai-responses.ts index 14dc32130c..6a0918efb8 100644 --- a/packages/llm/src/protocols/openai-responses.ts +++ b/packages/llm/src/protocols/openai-responses.ts @@ -276,14 +276,22 @@ const fromRequest = Effect.fn("OpenAIResponses.fromRequest")(function* (request: // ============================================================================= // Stream Parsing // ============================================================================= +// OpenAI Responses reports `input_tokens` as the total prompt (cached +// included) and `output_tokens` as the total output (reasoning included). +// The additive `LLM.Usage` contract pulls each subtotal out at the boundary +// so consumers never subtract. 
const mapUsage = (usage: OpenAIResponsesUsage | null | undefined) => { if (!usage) return undefined + const cached = usage.input_tokens_details?.cached_tokens + const reasoning = usage.output_tokens_details?.reasoning_tokens + const inputTokens = ProviderShared.subtractTokens(usage.input_tokens, cached) + const outputTokens = ProviderShared.subtractTokens(usage.output_tokens, reasoning) return new Usage({ - inputTokens: usage.input_tokens, - outputTokens: usage.output_tokens, - reasoningTokens: usage.output_tokens_details?.reasoning_tokens, - cacheReadInputTokens: usage.input_tokens_details?.cached_tokens, - totalTokens: ProviderShared.totalTokens(usage.input_tokens, usage.output_tokens, usage.total_tokens), + inputTokens, + outputTokens, + reasoningTokens: reasoning, + cacheReadInputTokens: cached, + totalTokens: ProviderShared.totalTokens(inputTokens, outputTokens, usage.total_tokens), native: usage, }) } diff --git a/packages/llm/src/protocols/shared.ts b/packages/llm/src/protocols/shared.ts index c931353998..79e019097e 100644 --- a/packages/llm/src/protocols/shared.ts +++ b/packages/llm/src/protocols/shared.ts @@ -42,6 +42,13 @@ export interface ToolAccumulator { * supplied total; otherwise falls back to `inputTokens + outputTokens` only * when at least one is defined. Returns `undefined` when neither input nor * output is known so routes don't publish a misleading `0`. + * + * Under the additive `LLM.Usage` contract, `inputTokens` and `outputTokens` + * are the non-cached input and visible output only. The provider-supplied + * `total` is the source of truth when present; the computed fallback + * under-counts cache and reasoning by design and exists mainly so + * Anthropic-style providers (which don't surface a total) still get a + * sensible aggregate on the input + output axes. */ export const totalTokens = ( inputTokens: number | undefined, @@ -53,6 +60,28 @@ export const totalTokens = ( return (inputTokens ?? 0) + (outputTokens ?? 
0) } +/** + * Subtract `subtrahend` from `total`, clamping to zero if the provider + * reports a non-sensical breakdown (e.g. `cached_tokens > prompt_tokens`). + * Used by protocol mappers to enforce the additive `LLM.Usage` contract: + * each provider's "inclusive" subtotals (cached, reasoning) are pulled out + * of the parent count at the boundary so downstream consumers never have to + * subtract — eliminating the underflow class of bug where a clamped + * difference would silently store the wrong value. + * + * If `total` is `undefined`, returns `undefined` (we don't fabricate + * counts). If `subtrahend` is `undefined`, returns `total` unchanged. The + * provider-native breakdown stays available on `Usage.native` for debugging. + */ +export const subtractTokens = ( + total: number | undefined, + subtrahend: number | undefined, +): number | undefined => { + if (total === undefined) return undefined + if (subtrahend === undefined) return total + return Math.max(0, total - subtrahend) +} + export const eventError = (route: string, message: string, raw?: string) => new LLMError({ module: "ProviderShared", diff --git a/packages/llm/src/schema/events.ts b/packages/llm/src/schema/events.ts index d0befe246e..6c7d91fe43 100644 --- a/packages/llm/src/schema/events.ts +++ b/packages/llm/src/schema/events.ts @@ -3,6 +3,38 @@ import { ContentBlockID, FinishReason, ProtocolID, ProviderMetadata, ResponseID, import { ModelRef } from "./options" import { ToolResultValue } from "./messages" +/** + * Token usage reported by an LLM provider, normalized to a fully-additive + * contract so consumers never have to subtract. + * + * **Field semantics** (each non-negative; missing means "not reported"): + * + * - `inputTokens` — non-cached input tokens (the "fresh" prompt portion). + * - `cacheReadInputTokens` — input tokens served from cache. + * - `cacheWriteInputTokens` — input tokens written to cache. + * - `outputTokens` — visible output tokens (text + tool calls). 
+ * - `reasoningTokens` — hidden reasoning / thinking tokens. + * - `totalTokens` — provider-supplied total, or sum of input + output as a + * fallback (see `ProviderShared.totalTokens`). + * - `native` — the provider's raw usage payload, preserved for debugging. + * + * **Invariant**: every aggregate of interest is a *sum*, never a difference. + * Total billable input = `inputTokens + cacheReadInputTokens + + * cacheWriteInputTokens`. Total billable output = `outputTokens + + * reasoningTokens`. Adding two non-negatives cannot underflow, so consumers + * cannot reproduce the underflow-then-clamp bug class where a stored + * negative gets rejected by a strict schema later. + * + * Each protocol mapper enforces this contract at the provider boundary. + * Providers that report cache or reasoning as subsets of input/output + * (e.g. OpenAI Chat/Responses, Gemini) have those subsets pulled out + * once via `ProviderShared.subtractTokens`, with `Math.max(0, …)` clamping + * for defense against provider bugs. Providers that already report + * separately (e.g. Anthropic) pass through. Where a provider doesn't surface a + * category at all (e.g. Anthropic does not break out extended-thinking + * tokens), the corresponding field is `undefined` and the parent count + * carries the combined total — a documented limitation of that API. + */ export class Usage extends Schema.Class<Usage>("LLM.Usage")({ inputTokens: Schema.optional(Schema.Number), outputTokens: Schema.optional(Schema.Number), @@ -13,6 +45,24 @@ export class Usage extends Schema.Class<Usage>("LLM.Usage")({ native: Schema.optional(Schema.Record(Schema.String, Schema.Unknown)), }) {} +export namespace Usage { + type InputFields = Pick<Usage, "inputTokens" | "cacheReadInputTokens" | "cacheWriteInputTokens"> + type OutputFields = Pick<Usage, "outputTokens" | "reasoningTokens"> + + /** + * Sum of every input-side category: non-cached input + cache reads + + * cache writes. Monotonic; cannot underflow under the additive contract. + */ + export const totalInput = (usage: InputFields) => + (usage.inputTokens ?? 0) + (usage.cacheReadInputTokens ?? 
0) + (usage.cacheWriteInputTokens ?? 0) + + /** + * Sum of every output-side category: visible output + reasoning. + * Monotonic; cannot underflow under the additive contract. + */ + export const totalOutput = (usage: OutputFields) => (usage.outputTokens ?? 0) + (usage.reasoningTokens ?? 0) +} + export const RequestStart = Schema.Struct({ type: Schema.tag("request-start"), id: ResponseID, diff --git a/packages/llm/test/provider/gemini.test.ts b/packages/llm/test/provider/gemini.test.ts index 9de4e0dc25..55d77a4e85 100644 --- a/packages/llm/test/provider/gemini.test.ts +++ b/packages/llm/test/provider/gemini.test.ts @@ -197,7 +197,10 @@ describe("Gemini route", () => { expect(response.text).toBe("Hello!") expect(response.reasoning).toBe("thinking") expect(response.usage).toMatchObject({ - inputTokens: 5, + // Additive contract: promptTokenCount=5 includes 1 cached, so + // inputTokens=4 + cacheReadInputTokens=1. Gemini already splits + // candidates from thoughts, so outputTokens=2 + reasoningTokens=1. + inputTokens: 4, outputTokens: 2, reasoningTokens: 1, cacheReadInputTokens: 1, @@ -211,7 +214,7 @@ describe("Gemini route", () => { type: "request-finish", reason: "stop", usage: { - inputTokens: 5, + inputTokens: 4, outputTokens: 2, reasoningTokens: 1, cacheReadInputTokens: 1, diff --git a/packages/llm/test/provider/openai-chat.test.ts b/packages/llm/test/provider/openai-chat.test.ts index 8b0dfc2894..1938580f3b 100644 --- a/packages/llm/test/provider/openai-chat.test.ts +++ b/packages/llm/test/provider/openai-chat.test.ts @@ -231,7 +231,10 @@ describe("OpenAI Chat route", () => { type: "request-finish", reason: "stop", usage: { - inputTokens: 5, + // Additive contract: prompt_tokens=5 includes 1 cached, so + // inputTokens=4 (non-cached) + cacheReadInputTokens=1. + // completion_tokens=2 includes 0 reasoning, so outputTokens=2. 
+ inputTokens: 4, outputTokens: 2, reasoningTokens: 0, cacheReadInputTokens: 1, diff --git a/packages/llm/test/provider/openai-responses.test.ts b/packages/llm/test/provider/openai-responses.test.ts index 5141b44cc2..8f232854a9 100644 --- a/packages/llm/test/provider/openai-responses.test.ts +++ b/packages/llm/test/provider/openai-responses.test.ts @@ -343,7 +343,10 @@ describe("OpenAI Responses route", () => { reason: "stop", providerMetadata: { openai: { responseId: "resp_1", serviceTier: "default" } }, usage: { - inputTokens: 5, + // Additive contract: input_tokens=5 includes 1 cached, so + // inputTokens=4 + cacheReadInputTokens=1. + // output_tokens=2 includes 0 reasoning, so outputTokens=2. + inputTokens: 4, outputTokens: 2, reasoningTokens: 0, cacheReadInputTokens: 1, diff --git a/packages/llm/test/schema.test.ts b/packages/llm/test/schema.test.ts index 46eb85b075..7ef3247f8b 100644 --- a/packages/llm/test/schema.test.ts +++ b/packages/llm/test/schema.test.ts @@ -1,6 +1,7 @@ import { describe, expect, test } from "bun:test" import { Schema } from "effect" -import { ContentPart, LLMEvent, LLMRequest, ModelID, ModelLimits, ModelRef, ProviderID } from "../src/schema" +import { ContentPart, LLMEvent, LLMRequest, ModelID, ModelLimits, ModelRef, ProviderID, Usage } from "../src/schema" +import { ProviderShared } from "../src/protocols/shared" const model = new ModelRef({ id: ModelID.make("fake-model"), @@ -48,3 +49,28 @@ describe("llm schema", () => { expect(ContentPart.guards.media({ type: "text", text: "hi" })).toBe(false) }) }) + +describe("LLM.Usage additive contract", () => { + test("subtractTokens clamps non-sensical breakdowns to zero", () => { + // Defense against a provider reporting cached_tokens > prompt_tokens or + // reasoning_tokens > completion_tokens. The clamp prevents the negative + // values that triggered opencode#26620 from ever entering the pipeline. 
+ expect(ProviderShared.subtractTokens(5, 3)).toBe(2) + expect(ProviderShared.subtractTokens(5, 10)).toBe(0) + expect(ProviderShared.subtractTokens(5, undefined)).toBe(5) + expect(ProviderShared.subtractTokens(undefined, 3)).toBeUndefined() + expect(ProviderShared.subtractTokens(undefined, undefined)).toBeUndefined() + }) + + test("totalInput sums every input-side category", () => { + expect(Usage.totalInput(new Usage({ inputTokens: 10, cacheReadInputTokens: 3, cacheWriteInputTokens: 2 }))).toBe(15) + expect(Usage.totalInput(new Usage({ inputTokens: 10 }))).toBe(10) + expect(Usage.totalInput(new Usage({}))).toBe(0) + }) + + test("totalOutput sums every output-side category", () => { + expect(Usage.totalOutput(new Usage({ outputTokens: 7, reasoningTokens: 4 }))).toBe(11) + expect(Usage.totalOutput(new Usage({ outputTokens: 7 }))).toBe(7) + expect(Usage.totalOutput(new Usage({}))).toBe(0) + }) +})