mirror of
https://github.com/anomalyco/opencode.git
synced 2026-05-30 20:44:31 +00:00
refactor(llm): make LLM.Usage a fully-additive contract
Defines a single invariant for `LLM.Usage`: every field is non-negative and every meaningful aggregate is a *sum*, never a difference.

- Total billable input = inputTokens + cacheReadInputTokens + cacheWriteInputTokens.
- Total billable output = outputTokens + reasoningTokens.

Adding two non-negatives cannot underflow, so consumers can no longer reproduce the underflow-then-clamp bug class fixed by #26620.

Each protocol mapper now enforces the contract at the provider boundary via `ProviderShared.subtractTokens`, which clamps with `Math.max(0, …)` for defense against provider bugs:

- OpenAI Chat / Responses: pull `cached_tokens` out of `prompt_tokens` / `input_tokens`; pull `reasoning_tokens` out of `completion_tokens` / `output_tokens`. The provider's `total_tokens` is preserved verbatim.
- Gemini: pull `cachedContentTokenCount` out of `promptTokenCount`. Gemini already splits visible candidates from thoughts.
- Bedrock: pull `cacheReadInputTokens` and `cacheWriteInputTokens` out of `inputTokens`, matching AWS prompt-caching docs.
- Anthropic: already non-overlapping per the Messages API; pass through.

Adds `Usage.totalInput` / `Usage.totalOutput` helpers for callers that want the merged view, and a regression test covering the clamp behavior.

The reasoning underflow fixed in #26620 was the most visible symptom of a broader semantic inconsistency in this package: providers also disagreed on whether `inputTokens` includes cache reads (Anthropic excluded them; OpenAI/Gemini/Bedrock included them), which would silently double-subtract the moment v2 wired LLM.Usage into Session.getUsage. Normalizing now, pre-integration, closes both holes in one move.
This commit is contained in:
parent
9c8da69196
commit
b9451175a6
11 changed files with 175 additions and 20 deletions
|
|
@ -364,6 +364,14 @@ const mapFinishReason = (reason: string | null | undefined): FinishReason => {
|
|||
return "unknown"
|
||||
}
|
||||
|
||||
// Anthropic already reports input/cache-read/cache-write as separate
|
||||
// non-overlapping categories per the Messages API docs, so the additive
|
||||
// `LLM.Usage` contract is satisfied by direct pass-through. Extended
|
||||
// thinking tokens are *not* broken out by Anthropic — they're billed as
|
||||
// part of `output_tokens`, so `outputTokens` here may include reasoning
|
||||
// the same way OpenAI's `output_tokens` does pre-normalization. This is
|
||||
// a documented limitation of the Anthropic API surface, not a contract
|
||||
// violation.
|
||||
const mapUsage = (usage: AnthropicUsage | undefined): Usage | undefined => {
|
||||
if (!usage) return undefined
|
||||
return new Usage({
|
||||
|
|
|
|||
|
|
@ -363,12 +363,21 @@ const mapFinishReason = (reason: string): FinishReason => {
|
|||
return "unknown"
|
||||
}
|
||||
|
||||
// AWS Bedrock Converse reports `inputTokens` as the total prompt with
|
||||
// cached and cache-write tokens included (per the Bedrock prompt-caching
|
||||
// docs). Pull each subtotal out at the boundary so the additive
|
||||
// `LLM.Usage` contract holds. Bedrock does not separately report
|
||||
// reasoning tokens for any current model.
|
||||
const mapUsage = (usage: BedrockUsageSchema | undefined): Usage | undefined => {
|
||||
if (!usage) return undefined
|
||||
const inputTokens = ProviderShared.subtractTokens(
|
||||
ProviderShared.subtractTokens(usage.inputTokens, usage.cacheReadInputTokens),
|
||||
usage.cacheWriteInputTokens,
|
||||
)
|
||||
return new Usage({
|
||||
inputTokens: usage.inputTokens,
|
||||
inputTokens,
|
||||
outputTokens: usage.outputTokens,
|
||||
totalTokens: ProviderShared.totalTokens(usage.inputTokens, usage.outputTokens, usage.totalTokens),
|
||||
totalTokens: ProviderShared.totalTokens(inputTokens, usage.outputTokens, usage.totalTokens),
|
||||
cacheReadInputTokens: usage.cacheReadInputTokens,
|
||||
cacheWriteInputTokens: usage.cacheWriteInputTokens,
|
||||
native: usage,
|
||||
|
|
|
|||
|
|
@ -281,14 +281,21 @@ const fromRequest = Effect.fn("Gemini.fromRequest")(function* (request: LLMReque
|
|||
// =============================================================================
|
||||
// Stream Parsing
|
||||
// =============================================================================
|
||||
// Maps Gemini usage metadata onto the additive `LLM.Usage` contract.
//
// Gemini reports `promptTokenCount` as the total prompt with cached content
// included, so the cached portion is pulled out once here and reported
// separately as `cacheReadInputTokens`. `candidatesTokenCount` already
// excludes `thoughtsTokenCount` (visible vs reasoning are separate), so both
// pass through untouched.
//
// Each field appears exactly once in the literal: duplicate keys are a
// compile error in TypeScript and the earlier entry would be dead anyway
// (the last key wins at runtime).
//
// Returns `undefined` when the provider sent no usage payload.
const mapUsage = (usage: GeminiUsage | undefined): Usage | undefined => {
  if (!usage) return undefined
  const cached = usage.cachedContentTokenCount
  // Non-cached ("fresh") prompt tokens only.
  const inputTokens = ProviderShared.subtractTokens(usage.promptTokenCount, cached)
  return new Usage({
    inputTokens,
    outputTokens: usage.candidatesTokenCount,
    reasoningTokens: usage.thoughtsTokenCount,
    cacheReadInputTokens: cached,
    // `totalTokenCount` is handed to the shared helper together with the
    // normalized input/output counts.
    totalTokens: ProviderShared.totalTokens(inputTokens, usage.candidatesTokenCount, usage.totalTokenCount),
    native: usage,
  })
}
|
||||
|
|
|
|||
|
|
@ -290,14 +290,23 @@ const mapFinishReason = (reason: string | null | undefined): FinishReason => {
|
|||
return "unknown"
|
||||
}
|
||||
|
||||
// Maps OpenAI Chat Completions usage onto the additive `LLM.Usage` contract.
//
// OpenAI Chat reports `prompt_tokens` as the total prompt (cached tokens
// included) and `completion_tokens` as the total output (reasoning tokens
// included). Each subtotal is pulled out once at this boundary so consumers
// never subtract — eliminating the underflow class addressed by
// opencode#26620.
//
// Each field appears exactly once in the literal: duplicate keys are a
// compile error in TypeScript and the earlier entry would be dead anyway
// (the last key wins at runtime).
//
// Returns `undefined` when the event carries no usage payload.
const mapUsage = (usage: OpenAIChatEvent["usage"]): Usage | undefined => {
  if (!usage) return undefined
  const cached = usage.prompt_tokens_details?.cached_tokens
  const reasoning = usage.completion_tokens_details?.reasoning_tokens
  // Non-cached prompt tokens and visible (non-reasoning) output tokens.
  const inputTokens = ProviderShared.subtractTokens(usage.prompt_tokens, cached)
  const outputTokens = ProviderShared.subtractTokens(usage.completion_tokens, reasoning)
  return new Usage({
    inputTokens,
    outputTokens,
    reasoningTokens: reasoning,
    cacheReadInputTokens: cached,
    // `total_tokens` is handed to the shared helper together with the
    // normalized input/output counts.
    totalTokens: ProviderShared.totalTokens(inputTokens, outputTokens, usage.total_tokens),
    native: usage,
  })
}
|
||||
|
|
|
|||
|
|
@ -276,14 +276,22 @@ const fromRequest = Effect.fn("OpenAIResponses.fromRequest")(function* (request:
|
|||
// =============================================================================
|
||||
// Stream Parsing
|
||||
// =============================================================================
|
||||
// Maps OpenAI Responses usage onto the additive `LLM.Usage` contract.
//
// OpenAI Responses reports `input_tokens` as the total prompt (cached
// included) and `output_tokens` as the total output (reasoning included).
// Each subtotal is pulled out once at this boundary so consumers never
// subtract.
//
// Each field appears exactly once in the literal: duplicate keys are a
// compile error in TypeScript and the earlier entry would be dead anyway
// (the last key wins at runtime).
//
// Returns `undefined` when usage is `null` or missing.
const mapUsage = (usage: OpenAIResponsesUsage | null | undefined): Usage | undefined => {
  if (!usage) return undefined
  const cached = usage.input_tokens_details?.cached_tokens
  const reasoning = usage.output_tokens_details?.reasoning_tokens
  // Non-cached prompt tokens and visible (non-reasoning) output tokens.
  const inputTokens = ProviderShared.subtractTokens(usage.input_tokens, cached)
  const outputTokens = ProviderShared.subtractTokens(usage.output_tokens, reasoning)
  return new Usage({
    inputTokens,
    outputTokens,
    reasoningTokens: reasoning,
    cacheReadInputTokens: cached,
    // `total_tokens` is handed to the shared helper together with the
    // normalized input/output counts.
    totalTokens: ProviderShared.totalTokens(inputTokens, outputTokens, usage.total_tokens),
    native: usage,
  })
}
|
||||
|
|
|
|||
|
|
@ -42,6 +42,13 @@ export interface ToolAccumulator {
|
|||
* supplied total; otherwise falls back to `inputTokens + outputTokens` only
|
||||
* when at least one is defined. Returns `undefined` when neither input nor
|
||||
* output is known so routes don't publish a misleading `0`.
|
||||
*
|
||||
* Under the additive `LLM.Usage` contract, `inputTokens` and `outputTokens`
|
||||
* are the non-cached input and visible output only. The provider-supplied
|
||||
* `total` is the source of truth when present; the computed fallback
|
||||
* under-counts cache and reasoning by design and exists mainly so
|
||||
* Anthropic-style providers (which don't surface a total) still get a
|
||||
* sensible aggregate on the input + output axes.
|
||||
*/
|
||||
export const totalTokens = (
|
||||
inputTokens: number | undefined,
|
||||
|
|
@ -53,6 +60,28 @@ export const totalTokens = (
|
|||
return (inputTokens ?? 0) + (outputTokens ?? 0)
|
||||
}
|
||||
|
||||
/**
 * Subtract `subtrahend` from `total`, clamping so the result always lies in
 * `[0, max(0, total)]` even when the provider reports a nonsensical
 * breakdown (e.g. `cached_tokens > prompt_tokens`, or a negative count).
 *
 * Used by protocol mappers to enforce the additive `LLM.Usage` contract:
 * each provider's "inclusive" subtotals (cached, reasoning) are pulled out
 * of the parent count at the boundary so downstream consumers never have to
 * subtract — eliminating the underflow class of bug where a clamped
 * difference would silently store the wrong value.
 *
 * The provider-native breakdown stays available on `Usage.native` for
 * debugging.
 *
 * @param total - The provider's inclusive parent count. Negative values are
 *   treated as `0`, since every `LLM.Usage` field must be non-negative.
 * @param subtrahend - The subset to pull out of `total`. Negative values are
 *   treated as `0` so they cannot inflate the remainder above `total`.
 * @returns `undefined` when `total` is `undefined` (counts are never
 *   fabricated); the clamped `total` when `subtrahend` is `undefined`;
 *   otherwise the clamped difference.
 */
export const subtractTokens = (
  total: number | undefined,
  subtrahend: number | undefined,
): number | undefined => {
  if (total === undefined) return undefined
  // A negative parent count already violates the non-negative invariant.
  const parent = Math.max(0, total)
  if (subtrahend === undefined) return parent
  // Clamp the subset first: subtracting a negative would add tokens.
  return Math.max(0, parent - Math.max(0, subtrahend))
}
|
||||
|
||||
export const eventError = (route: string, message: string, raw?: string) =>
|
||||
new LLMError({
|
||||
module: "ProviderShared",
|
||||
|
|
|
|||
|
|
@ -3,6 +3,38 @@ import { ContentBlockID, FinishReason, ProtocolID, ProviderMetadata, ResponseID,
|
|||
import { ModelRef } from "./options"
|
||||
import { ToolResultValue } from "./messages"
|
||||
|
||||
/**
|
||||
* Token usage reported by an LLM provider, normalized to a fully-additive
|
||||
* contract so consumers never have to subtract.
|
||||
*
|
||||
* **Field semantics** (each non-negative; missing means "not reported"):
|
||||
*
|
||||
* - `inputTokens` — non-cached input tokens (the "fresh" prompt portion).
|
||||
* - `cacheReadInputTokens` — input tokens served from cache.
|
||||
* - `cacheWriteInputTokens` — input tokens written to cache.
|
||||
* - `outputTokens` — visible output tokens (text + tool calls).
|
||||
* - `reasoningTokens` — hidden reasoning / thinking tokens.
|
||||
* - `totalTokens` — provider-supplied total, or sum of input + output as a
|
||||
* fallback (see `ProviderShared.totalTokens`).
|
||||
* - `native` — the provider's raw usage payload, preserved for debugging.
|
||||
*
|
||||
* **Invariant**: every aggregate of interest is a *sum*, never a difference.
|
||||
* Total billable input = `inputTokens + cacheReadInputTokens +
|
||||
* cacheWriteInputTokens`. Total billable output = `outputTokens +
|
||||
* reasoningTokens`. Adding two non-negatives cannot underflow, so consumers
|
||||
* cannot reproduce the underflow-then-clamp bug class where a stored
|
||||
* negative gets rejected by a strict schema later.
|
||||
*
|
||||
* Each protocol mapper enforces this contract at the provider boundary.
|
||||
* Providers that report cache or reasoning as subsets of input/output
|
||||
* (OpenAI Chat/Responses, Gemini, Bedrock) have those subsets pulled out
|
||||
* once via `ProviderShared.subtractTokens`, with `Math.max(0, …)` clamping
|
||||
* for defense against provider bugs. Providers that already report
|
||||
* separately (Anthropic) pass through. Where a provider doesn't surface a
|
||||
* category at all (e.g. Anthropic does not break out extended-thinking
|
||||
* tokens), the corresponding field is `undefined` and the parent count
|
||||
* carries the combined total — a documented limitation of that API.
|
||||
*/
|
||||
export class Usage extends Schema.Class<Usage>("LLM.Usage")({
|
||||
inputTokens: Schema.optional(Schema.Number),
|
||||
outputTokens: Schema.optional(Schema.Number),
|
||||
|
|
@ -13,6 +45,24 @@ export class Usage extends Schema.Class<Usage>("LLM.Usage")({
|
|||
native: Schema.optional(Schema.Record(Schema.String, Schema.Unknown)),
|
||||
}) {}
|
||||
|
||||
export namespace Usage {
  type InputFields = Pick<Usage, "inputTokens" | "cacheReadInputTokens" | "cacheWriteInputTokens">
  type OutputFields = Pick<Usage, "outputTokens" | "reasoningTokens">

  /**
   * Merged input-side view: fresh (non-cached) input plus cache reads plus
   * cache writes. Any category that is not reported counts as zero, so the
   * result is always a plain sum and never a difference.
   */
  export const totalInput = (usage: InputFields) => {
    const fresh = usage.inputTokens ?? 0
    const cacheReads = usage.cacheReadInputTokens ?? 0
    const cacheWrites = usage.cacheWriteInputTokens ?? 0
    return fresh + cacheReads + cacheWrites
  }

  /**
   * Merged output-side view: visible output plus reasoning. Any category
   * that is not reported counts as zero, so the result is always a plain
   * sum and never a difference.
   */
  export const totalOutput = (usage: OutputFields) => {
    const visible = usage.outputTokens ?? 0
    const reasoning = usage.reasoningTokens ?? 0
    return visible + reasoning
  }
}
|
||||
|
||||
export const RequestStart = Schema.Struct({
|
||||
type: Schema.tag("request-start"),
|
||||
id: ResponseID,
|
||||
|
|
|
|||
|
|
@ -197,7 +197,10 @@ describe("Gemini route", () => {
|
|||
expect(response.text).toBe("Hello!")
|
||||
expect(response.reasoning).toBe("thinking")
|
||||
expect(response.usage).toMatchObject({
|
||||
inputTokens: 5,
|
||||
// Additive contract: promptTokenCount=5 includes 1 cached, so
|
||||
// inputTokens=4 + cacheReadInputTokens=1. Gemini already splits
|
||||
// candidates from thoughts, so outputTokens=2 + reasoningTokens=1.
|
||||
inputTokens: 4,
|
||||
outputTokens: 2,
|
||||
reasoningTokens: 1,
|
||||
cacheReadInputTokens: 1,
|
||||
|
|
@ -211,7 +214,7 @@ describe("Gemini route", () => {
|
|||
type: "request-finish",
|
||||
reason: "stop",
|
||||
usage: {
|
||||
inputTokens: 5,
|
||||
inputTokens: 4,
|
||||
outputTokens: 2,
|
||||
reasoningTokens: 1,
|
||||
cacheReadInputTokens: 1,
|
||||
|
|
|
|||
|
|
@ -231,7 +231,10 @@ describe("OpenAI Chat route", () => {
|
|||
type: "request-finish",
|
||||
reason: "stop",
|
||||
usage: {
|
||||
inputTokens: 5,
|
||||
// Additive contract: prompt_tokens=5 includes 1 cached, so
|
||||
// inputTokens=4 (non-cached) + cacheReadInputTokens=1.
|
||||
// completion_tokens=2 includes 0 reasoning, so outputTokens=2.
|
||||
inputTokens: 4,
|
||||
outputTokens: 2,
|
||||
reasoningTokens: 0,
|
||||
cacheReadInputTokens: 1,
|
||||
|
|
|
|||
|
|
@ -343,7 +343,10 @@ describe("OpenAI Responses route", () => {
|
|||
reason: "stop",
|
||||
providerMetadata: { openai: { responseId: "resp_1", serviceTier: "default" } },
|
||||
usage: {
|
||||
inputTokens: 5,
|
||||
// Additive contract: input_tokens=5 includes 1 cached, so
|
||||
// inputTokens=4 + cacheReadInputTokens=1.
|
||||
// output_tokens=2 includes 0 reasoning, so outputTokens=2.
|
||||
inputTokens: 4,
|
||||
outputTokens: 2,
|
||||
reasoningTokens: 0,
|
||||
cacheReadInputTokens: 1,
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import { describe, expect, test } from "bun:test"
|
||||
import { Schema } from "effect"
|
||||
import { ContentPart, LLMEvent, LLMRequest, ModelID, ModelLimits, ModelRef, ProviderID } from "../src/schema"
|
||||
import { ContentPart, LLMEvent, LLMRequest, ModelID, ModelLimits, ModelRef, ProviderID, Usage } from "../src/schema"
|
||||
import { ProviderShared } from "../src/protocols/shared"
|
||||
|
||||
const model = new ModelRef({
|
||||
id: ModelID.make("fake-model"),
|
||||
|
|
@ -48,3 +49,28 @@ describe("llm schema", () => {
|
|||
expect(ContentPart.guards.media({ type: "text", text: "hi" })).toBe(false)
|
||||
})
|
||||
})
|
||||
|
||||
describe("LLM.Usage additive contract", () => {
  test("subtractTokens clamps non-sensical breakdowns to zero", () => {
    // Guards against a provider reporting cached_tokens > prompt_tokens or
    // reasoning_tokens > completion_tokens: the clamp keeps the negative
    // values behind opencode#26620 out of the pipeline entirely.
    // Each row is [total, subtrahend, expected].
    const rows: ReadonlyArray<readonly [number | undefined, number | undefined, number | undefined]> = [
      [5, 3, 2],
      [5, 10, 0],
      [5, undefined, 5],
      [undefined, 3, undefined],
      [undefined, undefined, undefined],
    ]
    for (const [total, subtrahend, expected] of rows) {
      expect(ProviderShared.subtractTokens(total, subtrahend)).toBe(expected)
    }
  })

  test("totalInput sums every input-side category", () => {
    const everyCategory = new Usage({ inputTokens: 10, cacheReadInputTokens: 3, cacheWriteInputTokens: 2 })
    const freshOnly = new Usage({ inputTokens: 10 })
    const nothingReported = new Usage({})

    expect(Usage.totalInput(everyCategory)).toBe(15)
    expect(Usage.totalInput(freshOnly)).toBe(10)
    expect(Usage.totalInput(nothingReported)).toBe(0)
  })

  test("totalOutput sums every output-side category", () => {
    const visibleAndReasoning = new Usage({ outputTokens: 7, reasoningTokens: 4 })
    const visibleOnly = new Usage({ outputTokens: 7 })
    const nothingReported = new Usage({})

    expect(Usage.totalOutput(visibleAndReasoning)).toBe(11)
    expect(Usage.totalOutput(visibleOnly)).toBe(7)
    expect(Usage.totalOutput(nothingReported)).toBe(0)
  })
})
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue