fix: populate workflow_run.script_run with script identity + complete ai_fallback_triggered coverage (#5635)

This commit is contained in:
pedrohsdb 2026-04-23 18:37:18 -07:00 committed by GitHub
parent 71316dbc01
commit 37f10704e9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 499 additions and 2 deletions

View file

@ -9582,6 +9582,16 @@
"type": "boolean",
"title": "Ai Fallback Triggered",
"default": false
},
"script_id": {
"type": "string",
"nullable": true,
"title": "Script Id"
},
"script_revision_id": {
"type": "string",
"nullable": true,
"title": "Script Revision Id"
}
},
"type": "object",

View file

@ -14353,6 +14353,14 @@
"type": "boolean",
"title": "Ai Fallback Triggered",
"default": false
},
"script_id": {
"anyOf": [{"type": "string"}, {"type": "null"}],
"title": "Script Id"
},
"script_revision_id": {
"anyOf": [{"type": "string"}, {"type": "null"}],
"title": "Script Revision Id"
}
},
"type": "object",

View file

@ -8,6 +8,8 @@ from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel
class ScriptRunResponse(UniversalBaseModel):
ai_fallback_triggered: typing.Optional[bool] = None
script_id: typing.Optional[str] = None
script_revision_id: typing.Optional[str] = None
if IS_PYDANTIC_V2:
model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2

View file

@ -51,6 +51,33 @@ from skyvern.schemas.runs import ProxyLocationInput, RunType
LOG = structlog.get_logger()
def _merge_script_run(
existing: dict | None,
ai_fallback_triggered: bool | None,
script_id: str | None,
script_revision_id: str | None,
) -> dict:
"""Merge-on-write semantics for `workflow_runs.script_run`.
Callers update different facets of `script_run` at different points in a
run's lifecycle — setup time writes script identity, mid-execution fallback
writes `ai_fallback_triggered=True`. A replace-based update would clobber
whichever facet the caller didn't touch, so merge preserves the other.
Pure function for testability (see `tests/unit/db/
test_workflow_runs_script_run_merge.py`). None-valued params are skipped;
non-None params overwrite the corresponding key in the merged dict.
"""
merged = dict(existing or {})
if ai_fallback_triggered is not None:
merged["ai_fallback_triggered"] = ai_fallback_triggered
if script_id is not None:
merged["script_id"] = script_id
if script_revision_id is not None:
merged["script_revision_id"] = script_revision_id
return merged
class WorkflowRunsRepository(BaseRepository):
"""Database operations for workflow runs."""
@ -178,6 +205,8 @@ class WorkflowRunsRepository(BaseRepository):
failure_reason: str | None = None,
webhook_failure_reason: str | None = None,
ai_fallback_triggered: bool | None = None,
script_id: str | None = None,
script_revision_id: str | None = None,
job_id: str | None = None,
run_with: str | None = None,
sequential_key: str | None = None,
@ -212,8 +241,13 @@ class WorkflowRunsRepository(BaseRepository):
workflow_run.failure_reason = failure_reason
if webhook_failure_reason is not None:
workflow_run.webhook_failure_reason = webhook_failure_reason
if ai_fallback_triggered is not None:
workflow_run.script_run = {"ai_fallback_triggered": ai_fallback_triggered}
if ai_fallback_triggered is not None or script_id is not None or script_revision_id is not None:
workflow_run.script_run = _merge_script_run(
existing=workflow_run.script_run,
ai_fallback_triggered=ai_fallback_triggered,
script_id=script_id,
script_revision_id=script_revision_id,
)
if job_id:
workflow_run.job_id = job_id
if run_with:

View file

@ -1524,6 +1524,12 @@ class WorkflowService:
empty_blocks_detected=script is not None and is_script_run and not script_blocks_by_label,
)
if script_mode_active and script is not None:
# Regression-locked by tests/unit/workflow/test_mark_script_run_loaded.py
# ::test_mark_script_run_loaded_calls_update_with_script_identity.
# If you modify this branch, update that test.
await self._mark_script_run_loaded(workflow_run_id, script)
if block_labels and len(block_labels):
blocks: list[BlockTypeVar] = []
all_labels = {block.label: block for block in all_blocks}
@ -2283,6 +2289,33 @@ class WorkflowService:
organization_id=organization_id,
browser_session_id=browser_session_id,
)
# Record that this run experienced a script → AI fallback if
# the agent execution we just ran was a consequence of a failed
# script attempt. The gate correctly excludes:
# - ai_fallback=False kept-the-failure (execute_safe not reached)
# - requires_agent / uncached / disable_cache / non-cacheable
# routes (valid_to_run_code is False for these)
# - agent-only workflows (is_script_run=False → valid_to_run_code=False)
# and correctly covers all three script-failure modes: script-
# block failed, script threw, and script-ran-but-no-block-found.
# Complements the task-block AI-fallback writers in `services/script_service.py`
# writers which handle a separate task-block AI-fallback surface.
# Perf: a fallback-heavy run issues N writes for N fallbacks.
# Typical runs have 0-3. `_merge_script_run` is idempotent at the
# DB layer. If observed latency regresses, add a context-scoped
# already-flipped cache (tracked separately).
await self._mark_script_fallback_triggered(
workflow_run_id=workflow_run_id,
# `valid_to_run_code` is computed as a chain of `and`s that
# includes `block.label` (str | None), so its static type is
# `Literal[''] | bool` when block.label is an empty string.
# The helper treats it as a pure truthiness gate; `bool(...)`
# narrows the type cleanly for mypy without changing runtime
# semantics.
valid_to_run_code=bool(valid_to_run_code),
block_executed_with_code=block_executed_with_code,
block_label=block.label,
)
# Update fallback episode with agent actions for both success and failure.
# Failed fallbacks are kept for triage — the reviewer will determine
@ -6019,3 +6052,88 @@ class WorkflowService:
if workflow_run.run_with is not None:
return workflow_run.run_with == "code"
return workflow.run_with == "code"
async def _mark_script_run_loaded(self, workflow_run_id: str, script: Script) -> None:
    """Persist the identity of the cached script loaded for this run.

    Written into `workflow_run.script_run` at workflow setup time so API
    consumers can detect cache use. `ai_fallback_triggered` is initialized
    to False; if a fallback fires mid-execution, the other writers
    (`services/script_service.py` and `_mark_script_fallback_triggered`
    below) merge `ai_fallback_triggered=True` on top without clobbering
    identity, via the merge-on-write behavior in `update_workflow_run`.

    Semantic: `script_run != null` after this runs means "a cached script
    was loaded for this run at setup time." It does NOT imply that every
    (or any) block actually executed from that cache — `block_labels`
    filtering, `requires_agent`, `disable_cache`, and non-cacheable block
    types can still route individual blocks to AI. See `ScriptRunResponse`
    docstrings for the full semantic.

    Best-effort write (matching `_mark_script_fallback_triggered`): a
    transient DB error must not abort workflow setup, because `script_run`
    is reporting state for API consumers, not load-bearing for the run's
    own execution.
    """
    script_identity = {
        "script_id": script.script_id,
        "script_revision_id": script.script_revision_id,
    }
    try:
        await app.DATABASE.workflow_runs.update_workflow_run(
            workflow_run_id=workflow_run_id,
            ai_fallback_triggered=False,
            **script_identity,
        )
    except Exception:
        LOG.warning(
            "Failed to mark script_run loaded at workflow setup",
            workflow_run_id=workflow_run_id,
            exc_info=True,
            **script_identity,
        )
async def _mark_script_fallback_triggered(
    self,
    workflow_run_id: str,
    valid_to_run_code: bool,
    block_executed_with_code: bool,
    block_label: str | None,
) -> None:
    """Flip `ai_fallback_triggered=True` on the run iff the just-executed
    agent block was a script → AI fallback (not an always-agent route).

    Gate semantics:
    - `valid_to_run_code=True` — we attempted script execution for this
      block. False rules out always-agent routes (requires_agent,
      disable_cache, uncached, non-cacheable block types, agent-only
      workflows).
    - `block_executed_with_code=False` — the script didn't succeed. True
      means the script ran cleanly; no fallback occurred; no flag flip.

    A True/False combination therefore means "we tried script, it failed,
    we then ran agent" — the precise definition of a mid-execution
    script → AI fallback.

    Caller must only invoke this AFTER `block.execute_safe` actually ran
    (i.e., the fallback agent execution happened). Calling before would
    risk false positives in the `ai_fallback=False` kept-the-failure case
    where `execute_safe` is never reached.

    Best-effort write: a transient DB error on the flag flip must not
    abort downstream block post-processing. Regression-locked by
    `tests/unit/workflow/test_mark_script_fallback_triggered.py`.
    """
    script_was_attempted = valid_to_run_code
    script_succeeded = block_executed_with_code
    # De Morgan form of "not (attempted and not succeeded)": only a
    # tried-and-failed script attempt counts as a fallback.
    if not script_was_attempted or script_succeeded:
        return
    try:
        await app.DATABASE.workflow_runs.update_workflow_run(
            workflow_run_id=workflow_run_id,
            ai_fallback_triggered=True,
        )
    except Exception:
        LOG.warning(
            "Failed to mark ai_fallback_triggered after script→AI fallback",
            workflow_run_id=workflow_run_id,
            block_label=block_label,
            exc_info=True,
        )

View file

@ -590,8 +590,30 @@ class BlockRunRequest(WorkflowRunRequest):
class ScriptRunResponse(BaseModel):
    """API reporting surface for `workflow_run.script_run`.

    Describes whether a cached script was loaded for the run and whether a
    script → AI fallback fired mid-execution. Reporting state only — not
    load-bearing for the run's own execution.
    """

    # True iff a fallback fired during this run, flipping at least one
    # block's execution from cached script to the agent. Writers: the two
    # `services/script_service.py` fallback paths (script-block failure +
    # conditional-agent episode) and the `_execute_single_block` script-
    # failure path. `False` here does NOT imply "no AI execution" — blocks
    # that were ALWAYS-agent (via `requires_agent`, `disable_cache`, or
    # non-cacheable block types) never create a fallback episode and don't
    # flip this flag. For per-block routing ground truth, consult the
    # `Block execution mode resolved` log emitted at per-block execution
    # time in `skyvern/forge/sdk/workflow/service.py`.
    ai_fallback_triggered: bool = False
    # Identity of the cached script that was loaded for this run at
    # workflow setup time. Non-null iff a script was loaded. Does NOT
    # imply that every (or any) block actually executed from that cache —
    # per-block `block_labels` filtering, `requires_agent`, `disable_cache`,
    # or non-cacheable block types can still route individual blocks to AI.
    # Populated by the server-side execution path (workflow/service.py) and
    # the local CLI entrypoint (services/script_service.run_script). None
    # on rows written by older code paths that only recorded
    # `ai_fallback_triggered`.
    script_id: str | None = None
    script_revision_id: str | None = None
class UploadFileResponse(BaseModel):
s3_uri: str = Field(description="S3 URI where the file was uploaded")

View file

@ -2537,6 +2537,8 @@ async def run_script(
workflow_run = await app.DATABASE.workflow_runs.update_workflow_run(
workflow_run_id=workflow_run_id,
ai_fallback_triggered=False,
script_id=script_id,
script_revision_id=script_revision_id,
)
context.workflow_run_id = workflow_run_id
context.organization_id = organization_id

View file

View file

@ -0,0 +1,127 @@
"""Regression guards for the `_merge_script_run` helper in
`workflow_runs` repository.
Merge-on-write is load-bearing: callers update different facets of
`workflow_run.script_run` at different points in a run's lifecycle
(setup time writes script identity; mid-execution fallback writes
`ai_fallback_triggered=True`). Without the merge, the second write
clobbers whichever facet wasn't touched, and consumers that read the
`script_run` API field see stale/missing data.
These tests lock in the three invariants:
1. First write into a null column produces the full populated dict.
2. A subsequent partial update preserves unrelated keys.
3. Calls with all nullable params as None are documented no-ops
(enforced at the caller; this file documents the expectation).
"""
from __future__ import annotations
from skyvern.forge.sdk.db.repositories.workflow_runs import _merge_script_run
def test_merge_script_run_first_write_creates_full_dict() -> None:
    """A server-side code-mode run initializes `script_run` from null:
    calling the helper with all three fields yields all three keys."""
    merged = _merge_script_run(
        existing=None,
        ai_fallback_triggered=False,
        script_id="s_abc",
        script_revision_id="sr_xyz",
    )
    expected = {
        "script_id": "s_abc",
        "script_revision_id": "sr_xyz",
        "ai_fallback_triggered": False,
    }
    assert merged == expected
def test_merge_script_run_fallback_flip_preserves_identity() -> None:
    """Regression guard for the original motivating bug: after
    `_mark_script_run_loaded` writes identity at setup, a later
    fallback-flip update to `ai_fallback_triggered=True` must NOT
    clobber `script_id` / `script_revision_id`. A replace-based
    update (pre-this-PR behavior) would have emitted
    `{"ai_fallback_triggered": True}` only, breaking consumers that
    read script identity from the API."""
    loaded_at_setup = {
        "ai_fallback_triggered": False,
        "script_id": "s_abc",
        "script_revision_id": "sr_xyz",
    }
    flipped = _merge_script_run(
        existing=loaded_at_setup,
        ai_fallback_triggered=True,
        script_id=None,
        script_revision_id=None,
    )
    assert flipped == {**loaded_at_setup, "ai_fallback_triggered": True}
def test_merge_script_run_later_identity_write_preserves_fallback_flag() -> None:
    """Symmetric case: if `ai_fallback_triggered` was written first
    (e.g., by a writer that doesn't know about identity yet), a later
    identity write must not clobber the fallback bool."""
    merged = _merge_script_run(
        existing={"ai_fallback_triggered": True},
        ai_fallback_triggered=None,
        script_id="s_abc",
        script_revision_id="sr_xyz",
    )
    expected = {
        "ai_fallback_triggered": True,
        "script_id": "s_abc",
        "script_revision_id": "sr_xyz",
    }
    assert merged == expected
def test_merge_script_run_ignores_none_params() -> None:
    """A call with all three params as None must return the existing dict
    unchanged. The gate in `update_workflow_run` prevents invoking
    `_merge_script_run` in this case, but the helper's own behavior
    must be a no-op for defense in depth."""
    before = {"ai_fallback_triggered": True, "script_id": "s_abc"}
    after = _merge_script_run(
        existing=before,
        ai_fallback_triggered=None,
        script_id=None,
        script_revision_id=None,
    )
    assert after == before
def test_merge_script_run_false_fallback_is_written_not_skipped() -> None:
    """Guard against the `if ai_fallback_triggered:` pitfall — `False`
    is a meaningful value, not a skip signal. Only `None` should skip."""
    merged = _merge_script_run(
        existing={"ai_fallback_triggered": True},
        ai_fallback_triggered=False,
        script_id=None,
        script_revision_id=None,
    )
    assert merged == {"ai_fallback_triggered": False}
def test_merge_script_run_empty_dict_existing_same_as_none() -> None:
    """Legacy rows may have `script_run = {}` (empty dict) instead of
    null. Merge treats them identically — the helper uses `or {}`, so
    both produce the same starting point."""
    from_none, from_empty = (
        _merge_script_run(
            existing=start,
            ai_fallback_triggered=False,
            script_id="s_abc",
            script_revision_id=None,
        )
        for start in (None, {})
    )
    assert from_none == from_empty

View file

@ -0,0 +1,113 @@
"""Regression guards for `_mark_script_fallback_triggered` — flips
`ai_fallback_triggered=True` on the run iff a real scriptAI fallback
occurred.
Without this helper, several real fallback paths in `_execute_single_block`
would never flip the run-level flag (see CORR-5/CORR-6/CORR-7 in the
debate). The gate semantics are:
- `valid_to_run_code=True` script execution was attempted for this
block. False rules out always-agent routes.
- `block_executed_with_code=False` the script attempt didn't succeed.
True means clean script execution; no fallback; no flip.
Together those two booleans are the exact condition under which the
agent `execute_safe` call we just ran constituted a scriptAI fallback.
"""
from __future__ import annotations
from unittest.mock import AsyncMock, patch
import pytest
from skyvern.forge.sdk.workflow.service import WorkflowService
@pytest.mark.asyncio
async def test_flips_flag_when_script_attempted_and_failed() -> None:
    """The happy-path fallback case: script was attempted and failed, then
    the agent ran. The run-level flag must flip."""
    update_patch = patch(
        "skyvern.forge.sdk.workflow.service.app.DATABASE.workflow_runs.update_workflow_run",
        new_callable=AsyncMock,
    )
    with update_patch as update_mock:
        await WorkflowService()._mark_script_fallback_triggered(
            workflow_run_id="wr_test",
            valid_to_run_code=True,
            block_executed_with_code=False,
            block_label="some_block",
        )
    update_mock.assert_awaited_once_with(
        workflow_run_id="wr_test",
        ai_fallback_triggered=True,
    )
@pytest.mark.asyncio
async def test_does_not_flip_when_script_was_not_attempted() -> None:
    """Always-agent route: `valid_to_run_code=False` means this block was
    never going to run as script (requires_agent, disable_cache, uncached,
    non-cacheable block type, or pure agent workflow). The agent execution
    is NOT a fallback; it's the intended execution mode. The flag must not
    flip — else always-agent blocks in otherwise code-mode runs would
    falsely claim 'a fallback happened'."""
    update_patch = patch(
        "skyvern.forge.sdk.workflow.service.app.DATABASE.workflow_runs.update_workflow_run",
        new_callable=AsyncMock,
    )
    with update_patch as update_mock:
        await WorkflowService()._mark_script_fallback_triggered(
            workflow_run_id="wr_test",
            valid_to_run_code=False,
            block_executed_with_code=False,
            block_label="agent_only_block",
        )
    update_mock.assert_not_awaited()
@pytest.mark.asyncio
async def test_does_not_flip_when_script_succeeded() -> None:
    """`block_executed_with_code=True` means the cached script ran cleanly.
    No fallback occurred, so no DB write may be issued."""
    update_patch = patch(
        "skyvern.forge.sdk.workflow.service.app.DATABASE.workflow_runs.update_workflow_run",
        new_callable=AsyncMock,
    )
    with update_patch as update_mock:
        await WorkflowService()._mark_script_fallback_triggered(
            workflow_run_id="wr_test",
            valid_to_run_code=True,
            block_executed_with_code=True,
            block_label="cached_block",
        )
    update_mock.assert_not_awaited()
@pytest.mark.asyncio
async def test_swallows_db_error_with_warning_log() -> None:
    """A transient DB error on the flag flip must not abort block
    post-processing. The helper wraps the write in try/except so the
    caller's downstream logic (fallback-episode enrichment, etc.) runs
    to completion regardless. Asserts both the no-raise contract AND
    the warning-log-was-emitted contract."""
    failing_update = patch(
        "skyvern.forge.sdk.workflow.service.app.DATABASE.workflow_runs.update_workflow_run",
        new_callable=AsyncMock,
        side_effect=RuntimeError("db transient"),
    )
    warning_patch = patch("skyvern.forge.sdk.workflow.service.LOG.warning")
    with failing_update, warning_patch as warning_mock:
        # Must not raise.
        await WorkflowService()._mark_script_fallback_triggered(
            workflow_run_id="wr_test",
            valid_to_run_code=True,
            block_executed_with_code=False,
            block_label="some_block",
        )
    warning_mock.assert_called_once()
    # The first positional arg is the structlog event name.
    event_name = warning_mock.call_args.args[0]
    assert "ai_fallback_triggered" in event_name

View file

@ -0,0 +1,61 @@
"""Regression guard for _mark_script_run_loaded — populates script_run on
server-side code-mode runs so the API can detect cache use.
Before this helper existed, `workflow_run.script_run` was null on every
Temporal run even when `execution_mode=code` was resolved at
service.py:1508. See SKY-* / PR #10522 for context.
"""
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from skyvern.forge.sdk.workflow.service import WorkflowService
from skyvern.schemas.scripts import Script
@pytest.mark.asyncio
async def test_mark_script_run_loaded_calls_update_with_script_identity() -> None:
    """The helper must issue exactly one update carrying the script
    identity plus the `ai_fallback_triggered=False` initialization. The
    call site in `_execute_workflow_blocks` routes through this helper, so
    this test pins down the shape of what hits the DB on every
    cached-code run."""
    fake_script = MagicMock(spec=Script, script_id="s_abc", script_revision_id="sr_xyz")
    update_patch = patch(
        "skyvern.forge.sdk.workflow.service.app.DATABASE.workflow_runs.update_workflow_run",
        new_callable=AsyncMock,
    )
    with update_patch as update_mock:
        await WorkflowService()._mark_script_run_loaded("wr_test", fake_script)
    update_mock.assert_awaited_once_with(
        workflow_run_id="wr_test",
        ai_fallback_triggered=False,
        script_id="s_abc",
        script_revision_id="sr_xyz",
    )
@pytest.mark.asyncio
async def test_mark_script_run_loaded_swallows_db_error_with_warning() -> None:
    """A transient DB error on the setup-time metadata write must not abort
    workflow setup. `script_run` is reporting state for API consumers —
    not load-bearing for the run's own execution — so a failure here
    should degrade to a warning, not a hard stop. This mirrors the
    try/except in `_mark_script_fallback_triggered`."""
    fake_script = MagicMock(spec=Script, script_id="s_abc", script_revision_id="sr_xyz")
    failing_update = patch(
        "skyvern.forge.sdk.workflow.service.app.DATABASE.workflow_runs.update_workflow_run",
        new_callable=AsyncMock,
        side_effect=RuntimeError("db transient"),
    )
    warning_patch = patch("skyvern.forge.sdk.workflow.service.LOG.warning")
    with failing_update, warning_patch as warning_mock:
        # Must not raise.
        await WorkflowService()._mark_script_run_loaded("wr_test", fake_script)
    warning_mock.assert_called_once()
    assert "script_run" in warning_mock.call_args.args[0]