mirror of
https://github.com/Skyvern-AI/skyvern.git
synced 2026-04-28 03:30:10 +00:00
🔄 synced local 'skyvern/' with remote 'skyvern/'
This commit is contained in:
parent
9e567c5ab8
commit
f8833c05b9
3 changed files with 24 additions and 22 deletions
|
|
@@ -159,10 +159,8 @@ class CopilotContext(AgentContext):
|
|||
consecutive_tool_tracker: list[str] = field(default_factory=list)
|
||||
tool_activity: list[dict[str, Any]] = field(default_factory=list)
|
||||
|
||||
# Token usage summed from raw_responses after each streamed run. None
|
||||
# until the first response that carries a usage object — some providers
|
||||
# (notably non-OpenAI streaming routes) omit usage entirely, and we want
|
||||
# eval cost grading to see "no data" rather than "0 tokens".
|
||||
# ``None`` until usage is observed, so cost grading can flag missing telemetry.
|
||||
# NOTE(review): ``_accumulate_usage`` skips all-zero usage, so a reported ``0`` also stays ``None``.
|
||||
total_tokens_used: int | None = None
|
||||
input_tokens_used: int | None = None
|
||||
output_tokens_used: int | None = None
|
||||
|
|
|
|||
|
|
@@ -959,28 +959,29 @@ class _SendTrackingStream:
|
|||
|
||||
|
||||
def _accumulate_usage(result: RunResultStreaming, ctx: Any) -> None:
|
||||
"""Sum actual token usage from raw_responses onto the context.
|
||||
"""Sum the SDK's per-iteration usage into ``ctx``.
|
||||
|
||||
Called per enforcement iteration in a ``finally:`` so pre-overflow
|
||||
response tokens are still counted even when ``stream_to_sse`` raises.
|
||||
First observed usage flips the counters from ``None`` to ``0``; if no
|
||||
response on this stream carries a usage object the counters stay
|
||||
``None``, which the eval surfaces as "telemetry missing" rather than
|
||||
"ran for free".
|
||||
The SDK aggregates usage into ``context_wrapper.usage`` before tool execution,
|
||||
so prior-turn tokens survive a mid-tool abort; each ``Runner.run_streamed``
|
||||
call gets a fresh wrapper, so totals must accumulate on ``ctx`` across
|
||||
iterations rather than overwrite.
|
||||
"""
|
||||
if not hasattr(ctx, "total_tokens_used"):
|
||||
return
|
||||
for resp in getattr(result, "raw_responses", []) or []:
|
||||
usage = getattr(resp, "usage", None)
|
||||
if usage is None:
|
||||
continue
|
||||
if ctx.total_tokens_used is None:
|
||||
ctx.total_tokens_used = 0
|
||||
ctx.input_tokens_used = 0
|
||||
ctx.output_tokens_used = 0
|
||||
ctx.total_tokens_used += getattr(usage, "total_tokens", 0) or 0
|
||||
ctx.input_tokens_used += getattr(usage, "input_tokens", 0) or 0
|
||||
ctx.output_tokens_used += getattr(usage, "output_tokens", 0) or 0
|
||||
usage = getattr(getattr(result, "context_wrapper", None), "usage", None)
|
||||
if usage is None:
|
||||
return
|
||||
|
||||
input_tokens = getattr(usage, "input_tokens", 0) or 0
|
||||
output_tokens = getattr(usage, "output_tokens", 0) or 0
|
||||
total_tokens = getattr(usage, "total_tokens", 0) or 0
|
||||
|
||||
if not (input_tokens or output_tokens or total_tokens):
|
||||
return
|
||||
|
||||
ctx.input_tokens_used = (ctx.input_tokens_used or 0) + input_tokens
|
||||
ctx.output_tokens_used = (ctx.output_tokens_used or 0) + output_tokens
|
||||
ctx.total_tokens_used = (ctx.total_tokens_used or 0) + total_tokens
|
||||
|
||||
|
||||
async def _run_streamed_with_deadline(
|
||||
|
|
|
|||
|
|
@@ -169,9 +169,12 @@ def resolve_model_config(llm_api_handler: Any) -> tuple[str, RunConfig, str, boo
|
|||
if "timeout" not in extra_args:
|
||||
extra_args["timeout"] = settings.LLM_CONFIG_TIMEOUT
|
||||
|
||||
# ``include_usage=True`` gates ``stream_options={"include_usage": True}`` on
|
||||
# streamed chat-completions; without it the final chunk omits token usage.
|
||||
model_settings = ModelSettings(
|
||||
temperature=config.temperature,
|
||||
max_tokens=config.max_completion_tokens or config.max_tokens,
|
||||
include_usage=True,
|
||||
extra_body=extra_body or None,
|
||||
extra_args=extra_args or None,
|
||||
extra_headers=extra_headers,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue