feat(SKY-8879) copilot-stack/06: MCP tools surface + orphan-task cancellation (#5517)

This commit is contained in:
Andrew Neilson 2026-04-15 21:20:16 -07:00 committed by GitHub
parent d58ea46163
commit faa2b233cb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
16 changed files with 2588 additions and 23 deletions

View file

@ -0,0 +1,96 @@
"""Tests for the action-sequence fingerprint compute and the hard-abort
short-circuit in ``_tool_loop_error``.
The streak counter that drives the abort is owned by
``failure_tracking.update_repeated_failure_state`` (stack 03). These tests
cover only the tools.py-side behavior:
- ``compute_action_sequence_fingerprint`` is stable across runs that fire
the same action shape (independent of reasoning text / status).
- ``compute_action_sequence_fingerprint`` distinguishes different sequences.
- ``_tool_loop_error`` returns a hard-abort message when the streak crosses
``REPEATED_ACTION_STREAK_ABORT_AT`` for a block-running tool.
- The hard abort does NOT fire for non-block-running tools, regardless of
streak height.
"""
from __future__ import annotations
from types import SimpleNamespace
from skyvern.forge.sdk.copilot.failure_tracking import compute_action_sequence_fingerprint
from skyvern.forge.sdk.copilot.tools import (
REPEATED_ACTION_STREAK_ABORT_AT,
_tool_loop_error,
)
def _ctx_with_streak(streak: int) -> SimpleNamespace:
    """Build a minimal fake tool-loop context carrying the given streak count."""
    ctx = SimpleNamespace()
    ctx.consecutive_tool_tracker = []
    ctx.repeated_action_fingerprint_streak_count = streak
    return ctx
def test_compute_action_sequence_fingerprint_stable_for_same_sequence() -> None:
    """Two traces with the same (action, element) shape fingerprint identically,
    no matter how the reasoning text differs."""
    shape = [
        ("input_text", "elem-name"),
        ("input_text", "elem-email"),
        ("click", "elem-submit"),
    ]
    reasonings_a = ["r1", "r2", "r3"]
    reasonings_b = ["different_text", "other", "third"]
    trace_a = [
        {"action": action, "element": element, "reasoning": why, "status": "failed"}
        for (action, element), why in zip(shape, reasonings_a)
    ]
    trace_b = [
        {"action": action, "element": element, "reasoning": why, "status": "failed"}
        for (action, element), why in zip(shape, reasonings_b)
    ]
    fp_a = compute_action_sequence_fingerprint([{"action_trace": trace_a}])
    fp_b = compute_action_sequence_fingerprint([{"action_trace": trace_b}])
    assert fp_a is not None
    # Reasoning / status are excluded from the fingerprint on purpose — only
    # the (action, element) shape matters for detecting a retry loop.
    assert fp_a == fp_b
def test_compute_action_sequence_fingerprint_none_when_trace_missing() -> None:
    """No runs, a run without a trace, and an empty trace all yield ``None``."""
    for runs in ([], [{"status": "completed"}], [{"action_trace": []}]):
        assert compute_action_sequence_fingerprint(runs) is None
def test_compute_action_sequence_fingerprint_distinguishes_different_sequences() -> None:
    """Changing either the action or the element changes the fingerprint."""
    single_step_shapes = (
        ("click", "btn-a"),
        ("click", "btn-b"),
        ("input_text", "btn-a"),
    )
    fp_click_a, fp_click_b, fp_input_a = (
        compute_action_sequence_fingerprint(
            [{"action_trace": [{"action": action, "element": element}]}]
        )
        for action, element in single_step_shapes
    )
    # All three one-step shapes must produce pairwise-distinct fingerprints.
    assert fp_click_a != fp_click_b
    assert fp_click_a != fp_input_a
    assert fp_click_b != fp_input_a
def test_tool_loop_error_fires_hard_abort_on_block_running_tools_when_streak_high() -> None:
    """At the abort threshold, both block-running tools report the hard abort."""
    ctx = _ctx_with_streak(REPEATED_ACTION_STREAK_ABORT_AT)
    for tool_name in ("run_blocks_and_collect_debug", "update_and_run_blocks"):
        message = _tool_loop_error(ctx, tool_name)
        assert message is not None
        assert "Repeated-action abort" in message
def test_tool_loop_error_does_not_fire_for_non_block_running_tools() -> None:
    """Planning/metadata tools keep their existing loop-detection behavior and
    ignore the block-run streak, even well above the abort threshold."""
    ctx = _ctx_with_streak(REPEATED_ACTION_STREAK_ABORT_AT + 5)
    for tool_name in ("update_workflow", "list_credentials", "get_run_results"):
        assert _tool_loop_error(ctx, tool_name) is None
def test_tool_loop_error_does_not_fire_below_threshold() -> None:
    """One short of the threshold, a block-running tool is not aborted."""
    below_threshold_ctx = _ctx_with_streak(REPEATED_ACTION_STREAK_ABORT_AT - 1)
    assert _tool_loop_error(below_threshold_ctx, "run_blocks_and_collect_debug") is None

View file

@ -0,0 +1,148 @@
"""Tests for the copilot orphan-workflow cancellation helpers.
Covers:
- ``_cancel_run_task_if_not_final`` cancels ``run_task`` and writes the
conditional cancel exactly once.
- A SUCCESS path (run_task completes on its own) still calls the conditional
cancel, but because the row is terminal the real helper would be a no-op.
- An SDK-realistic ``asyncio.wait_for`` timeout around the tool coroutine does
not leave ``run_task`` running in the background.
"""
from __future__ import annotations
import asyncio
from typing import Any
import pytest
from skyvern.forge.sdk.copilot.tools import _cancel_run_task_if_not_final
class _FakeService:
    """Stand-in for the workflow service that records conditional-cancel calls.

    Set ``raise_on_mark`` to make the mark call fail, simulating a secondary
    error (e.g. a DB outage) during cleanup.
    """

    def __init__(self) -> None:
        # Workflow-run ids passed to the conditional cancel, in call order.
        self.mark_calls: list[str] = []
        # When set, raised by mark_workflow_run_as_canceled_if_not_final.
        self.raise_on_mark: Exception | None = None

    async def mark_workflow_run_as_canceled_if_not_final(
        self,
        workflow_run_id: str,
    ) -> Any:
        self.mark_calls.append(workflow_run_id)
        error = self.raise_on_mark
        if error is not None:
            raise error
        return None
@pytest.mark.asyncio
async def test_cancel_helper_cancels_task_and_writes_conditional_cancel(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The helper cancels an in-flight run_task and records exactly one
    conditional cancel for the workflow run."""
    from skyvern.forge import app as forge_app

    service = _FakeService()
    monkeypatch.setattr(forge_app, "WORKFLOW_SERVICE", service)

    async def never_finishes() -> None:
        await asyncio.sleep(60)

    run_task = asyncio.create_task(never_finishes())
    await _cancel_run_task_if_not_final(run_task, workflow_run_id="wr_1")

    assert run_task.cancelled() or run_task.done()
    assert service.mark_calls == ["wr_1"]
@pytest.mark.asyncio
async def test_cancel_helper_does_not_raise_on_mark_failure(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Secondary errors during cleanup are logged, not propagated — otherwise
    they would replace the original timeout/cancellation surface."""
    from skyvern.forge import app as forge_app

    failing_service = _FakeService()
    failing_service.raise_on_mark = RuntimeError("DB is down")
    monkeypatch.setattr(forge_app, "WORKFLOW_SERVICE", failing_service)

    async def never_finishes() -> None:
        await asyncio.sleep(60)

    run_task = asyncio.create_task(never_finishes())
    # Must not raise despite the mark raising.
    await _cancel_run_task_if_not_final(run_task, workflow_run_id="wr_2")
@pytest.mark.asyncio
async def test_cancel_helper_handles_already_completed_run_task(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """When run_task has already finished (natural completion), the helper
    should still issue the conditional cancel — it is a no-op at the DB layer
    if the row is already terminal, so the result is harmless."""
    from skyvern.forge import app as forge_app

    service = _FakeService()
    monkeypatch.setattr(forge_app, "WORKFLOW_SERVICE", service)

    async def finishes_immediately() -> None:
        return

    run_task = asyncio.create_task(finishes_immediately())
    await run_task

    await _cancel_run_task_if_not_final(run_task, workflow_run_id="wr_3")
    assert service.mark_calls == ["wr_3"]
@pytest.mark.asyncio
async def test_sdk_style_wait_for_timeout_does_not_leak_background_work(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Exercises the production failure mode: the OpenAI Agents SDK wraps the
    tool coroutine in ``asyncio.wait_for(..., timeout=N)`` and cancels it on
    timeout. Our CancelledError branch must cancel ``run_task`` through the
    helper so no orphan work is left behind."""
    from skyvern.forge import app as forge_app

    service = _FakeService()
    monkeypatch.setattr(forge_app, "WORKFLOW_SERVICE", service)
    # Lets the assertions reach the task created inside tool_body.
    run_task_ref: dict[str, asyncio.Task] = {}
    workflow_work_completed = asyncio.Event()

    async def tool_body() -> None:
        async def workflow_body() -> None:
            try:
                await asyncio.sleep(60)
            except asyncio.CancelledError:
                # Proof that cancellation actually reached the workflow task.
                workflow_work_completed.set()
                raise

        run_task = asyncio.create_task(workflow_body())
        run_task_ref["run_task"] = run_task
        try:
            # Simulate the inner poll loop.
            while True:
                await asyncio.sleep(0.05)
        except asyncio.CancelledError:
            try:
                # shield keeps the helper running even while this coroutine
                # is being torn down by wait_for's cancellation.
                await asyncio.shield(_cancel_run_task_if_not_final(run_task, workflow_run_id="wr_sdk"))
            except asyncio.CancelledError:
                # Detached fallback mirror of the production path.
                fallback = asyncio.ensure_future(_cancel_run_task_if_not_final(run_task, workflow_run_id="wr_sdk"))
                await asyncio.wait_for(asyncio.shield(fallback), timeout=5.0)
            # Re-raise so wait_for observes the cancellation as a timeout.
            raise

    tool_task = asyncio.ensure_future(tool_body())
    with pytest.raises(asyncio.TimeoutError):
        await asyncio.wait_for(tool_task, timeout=0.2)
    # Workflow's CancelledError handler should have fired via our helper.
    await asyncio.wait_for(workflow_work_completed.wait(), timeout=1.0)
    assert "run_task" in run_task_ref
    assert run_task_ref["run_task"].cancelled() or run_task_ref["run_task"].done()
    assert service.mark_calls == ["wr_sdk"]

View file

@ -6,10 +6,14 @@
- ``execute_workflow_webhook`` returns cleanly when the workflow row has been
soft-deleted mid-run — it must not raise ``WorkflowNotFound`` from the
cleanup path.
- The cancellation-safe finalize pattern used in ``execute_workflow``'s outer
``finally`` runs ``_finalize_workflow_run_status`` via ``asyncio.shield``
so an outer cancel mid-body still restores the real terminal status.
"""
from __future__ import annotations
import asyncio
from unittest.mock import AsyncMock, MagicMock
import pytest
@ -113,3 +117,89 @@ async def test_build_status_response_uses_filter_deleted_false_when_allowed(
by_wpid.assert_awaited_once()
assert by_wpid.call_args.kwargs["filter_deleted"] is False
@pytest.mark.asyncio
async def test_shielded_finalize_runs_when_outer_cancelled_mid_body() -> None:
    """Contract test for the ``execute_workflow`` cancellation-safe pattern:
    when the try body is cancelled after ``pre_finally_status`` is captured,
    the outer ``finally`` must still run ``_finalize_workflow_run_status``
    via ``asyncio.shield`` so the row ends up terminal rather than stuck as
    transient ``running``. Mirrors the structure of
    ``WorkflowService.execute_workflow``; if anyone removes the ``shield`` or
    moves finalize back into the try, this test breaks.
    """
    finalize_calls: list[WorkflowRunStatus] = []
    clean_up_called = False
    body_entered = asyncio.Event()

    async def finalize(status: WorkflowRunStatus) -> None:
        # Simulate a non-trivial DB write so shield cancellation-protection
        # matters rather than being invisible.
        await asyncio.sleep(0.05)
        finalize_calls.append(status)

    async def clean_up() -> None:
        nonlocal clean_up_called
        clean_up_called = True

    async def simulated_execute_workflow() -> None:
        pre_finally_status: WorkflowRunStatus | None = None
        try:
            # Captured before the cancel lands — the state finalize must restore.
            pre_finally_status = WorkflowRunStatus.failed
            body_entered.set()
            # Simulate the finally-block execution phase that our copilot
            # cancel lands inside of.
            await asyncio.sleep(10)
        finally:
            if pre_finally_status is not None:
                try:
                    # shield lets the write complete even mid-cancellation.
                    await asyncio.shield(finalize(pre_finally_status))
                except Exception:
                    pass
            await clean_up()

    task = asyncio.create_task(simulated_execute_workflow())
    # Wait until pre_finally_status is captured, then cancel mid-body.
    await body_entered.wait()
    task.cancel()
    with pytest.raises(asyncio.CancelledError):
        await task
    assert finalize_calls == [WorkflowRunStatus.failed], (
        "shielded finalize must run with the captured pre_finally_status"
    )
    assert clean_up_called, "clean_up_workflow must still run in the outer finally"
@pytest.mark.asyncio
async def test_shielded_finalize_skipped_when_pre_finally_status_unset() -> None:
    """If cancellation lands before block execution captures
    ``pre_finally_status``, there's no intended terminal state to restore —
    the outer ``finally`` must skip finalize, not call it with ``None``.
    """
    finalize_called = False

    async def write_final_status(status: WorkflowRunStatus) -> None:
        nonlocal finalize_called
        finalize_called = True

    async def simulated_execute_workflow() -> None:
        pre_finally_status: WorkflowRunStatus | None = None
        try:
            # Cancel lands here, before a terminal status is ever captured.
            await asyncio.sleep(10)
            pre_finally_status = WorkflowRunStatus.failed  # pragma: no cover
        finally:
            if pre_finally_status is not None:
                await asyncio.shield(write_final_status(pre_finally_status))

    workflow_task = asyncio.create_task(simulated_execute_workflow())
    await asyncio.sleep(0)  # let the task enter its body
    workflow_task.cancel()
    with pytest.raises(asyncio.CancelledError):
        await workflow_task
    assert not finalize_called, "finalize must not run when pre_finally_status is unset"

View file

@ -16,6 +16,7 @@ import json
import yaml
from skyvern.forge.sdk.routes.workflow_copilot import _process_workflow_yaml
from skyvern.utils.yaml_loader import safe_load_no_dates
ISO_BLOB = """
@ -75,3 +76,40 @@ def test_safe_load_no_dates_preserves_other_implicit_types() -> None:
assert parsed["a_bool"] is True
assert parsed["a_null"] is None
assert parsed["a_list"] == [1, 2, 3]
def test_process_workflow_yaml_keeps_json_parameter_iso_strings() -> None:
    """ISO-8601 timestamps inside a JSON workflow parameter's default value
    must survive YAML processing as plain strings, not datetime objects."""
    workflow_yaml = """
title: Test
workflow_definition:
  parameters:
    - parameter_type: workflow
      key: payload
      workflow_parameter_type: json
      default_value:
        id: "12345"
        metadata:
          created_at: 2023-10-27T10:00:00Z
          updated_at: 2023-10-28T14:30:00Z
  blocks:
    - block_type: navigation
      label: step1
      url: https://example.com
      title: Step 1
      navigation_goal: Open the page
"""
    workflow = _process_workflow_yaml(
        workflow_id="wf-123",
        workflow_permanent_id="wfp-123",
        organization_id="org-123",
        workflow_yaml=workflow_yaml,
    )
    parameter = workflow.get_parameter("payload")
    assert parameter is not None
    default_value = parameter.default_value
    assert default_value is not None
    metadata = default_value["metadata"]
    # The date-safe loader must leave ISO timestamps as strings.
    for key, expected in (
        ("created_at", "2023-10-27T10:00:00Z"),
        ("updated_at", "2023-10-28T14:30:00Z"),
    ):
        value = metadata[key]
        assert isinstance(value, str)
        assert value == expected