🔄 synced local 'skyvern/' with remote 'skyvern/'

andrewneilson 2026-04-26 16:39:22 +00:00
parent 9e567c5ab8
commit f8833c05b9
3 changed files with 24 additions and 22 deletions


@@ -159,10 +159,8 @@ class CopilotContext(AgentContext):
     consecutive_tool_tracker: list[str] = field(default_factory=list)
     tool_activity: list[dict[str, Any]] = field(default_factory=list)
-    # Token usage summed from raw_responses after each streamed run. None
-    # until the first response that carries a usage object — some providers
-    # (notably non-OpenAI streaming routes) omit usage entirely, and we want
-    # eval cost grading to see "no data" rather than "0 tokens".
+    # ``None`` until usage is observed; ``0`` only when a provider explicitly
+    # reported zero. Distinct values let cost grading flag missing telemetry.
     total_tokens_used: int | None = None
     input_tokens_used: int | None = None
     output_tokens_used: int | None = None
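
The three-way distinction the new comment encodes is what downstream cost grading branches on. A minimal sketch of a consumer, assuming only the counter fields above; the grade_cost helper and its labels are hypothetical, not part of this commit:

    # Hypothetical grader-side check; only the field type is from the diff.
    def grade_cost(total_tokens_used: int | None) -> str:
        if total_tokens_used is None:
            return "telemetry missing"   # no provider ever attached a usage object
        if total_tokens_used == 0:
            return "zero tokens (explicitly reported)"
        return f"{total_tokens_used} tokens used"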


@@ -959,28 +959,29 @@ class _SendTrackingStream:
 def _accumulate_usage(result: RunResultStreaming, ctx: Any) -> None:
-    """Sum actual token usage from raw_responses onto the context.
+    """Sum the SDK's per-iteration usage into ``ctx``.
 
     Called per enforcement iteration in a ``finally:`` so pre-overflow
     response tokens are still counted even when ``stream_to_sse`` raises.
 
-    First observed usage flips the counters from ``None`` to ``0``; if no
-    response on this stream carries a usage object the counters stay
-    ``None``, which the eval surfaces as "telemetry missing" rather than
-    "ran for free".
+    The SDK aggregates usage into ``context_wrapper.usage`` before tool
+    execution, so prior-turn tokens survive a mid-tool abort; each
+    ``Runner.run_streamed`` call gets a fresh wrapper, so totals must
+    accumulate on ``ctx`` across iterations rather than overwrite.
     """
     if not hasattr(ctx, "total_tokens_used"):
         return
-    for resp in getattr(result, "raw_responses", []) or []:
-        usage = getattr(resp, "usage", None)
-        if usage is None:
-            continue
-        if ctx.total_tokens_used is None:
-            ctx.total_tokens_used = 0
-            ctx.input_tokens_used = 0
-            ctx.output_tokens_used = 0
-        ctx.total_tokens_used += getattr(usage, "total_tokens", 0) or 0
-        ctx.input_tokens_used += getattr(usage, "input_tokens", 0) or 0
-        ctx.output_tokens_used += getattr(usage, "output_tokens", 0) or 0
+    usage = getattr(getattr(result, "context_wrapper", None), "usage", None)
+    if usage is None:
+        return
+    input_tokens = getattr(usage, "input_tokens", 0) or 0
+    output_tokens = getattr(usage, "output_tokens", 0) or 0
+    total_tokens = getattr(usage, "total_tokens", 0) or 0
+    if not (input_tokens or output_tokens or total_tokens):
+        return
+    ctx.input_tokens_used = (ctx.input_tokens_used or 0) + input_tokens
+    ctx.output_tokens_used = (ctx.output_tokens_used or 0) + output_tokens
+    ctx.total_tokens_used = (ctx.total_tokens_used or 0) + total_tokens
 
 
 async def _run_streamed_with_deadline(
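
As a rough sketch of the call shape the docstring describes; the surrounding loop, agent, turn_input, and the exact stream_to_sse signature are assumptions, not code from this commit:

    # Per enforcement iteration: Runner.run_streamed returns a RunResultStreaming
    # without awaiting, and usage lands on result.context_wrapper as it streams.
    result = Runner.run_streamed(agent, input=turn_input, context=ctx)
    try:
        await stream_to_sse(result)     # may raise mid-tool on overflow
    finally:
        _accumulate_usage(result, ctx)  # pre-overflow tokens still counted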


@@ -169,9 +169,12 @@ def resolve_model_config(llm_api_handler: Any) -> tuple[str, RunConfig, str, bool]:
     if "timeout" not in extra_args:
         extra_args["timeout"] = settings.LLM_CONFIG_TIMEOUT
+    # ``include_usage=True`` gates ``stream_options={"include_usage": True}`` on
+    # streamed chat-completions; without it the final chunk omits token usage.
     model_settings = ModelSettings(
         temperature=config.temperature,
         max_tokens=config.max_completion_tokens or config.max_tokens,
+        include_usage=True,
         extra_body=extra_body or None,
         extra_args=extra_args or None,
         extra_headers=extra_headers,
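
For reference, the wire-level behavior the new comment describes, expressed directly against the OpenAI chat-completions streaming API; a sketch for illustration (the model name is arbitrary), not code from this commit:

    from openai import OpenAI

    client = OpenAI()
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "hi"}],
        stream=True,
        stream_options={"include_usage": True},  # what include_usage=True gates
    )
    usage = None
    for chunk in stream:
        # content chunks carry usage=None; only the final chunk has it set
        if chunk.usage is not None:
            usage = chunk.usage
    print(usage.total_tokens if usage else "no usage reported")

Without the stream_options flag, that final usage-bearing chunk is never sent, which is exactly the "ran for free" ambiguity the counter semantics in the first hunk exist to surface.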