mirror of
https://github.com/Skyvern-AI/skyvern.git
synced 2026-04-28 03:30:10 +00:00
feat(SKY-8879) copilot-stack/06: MCP tools surface + orphan-task cancellation (#5517)
This commit is contained in:
parent
d58ea46163
commit
faa2b233cb
16 changed files with 2588 additions and 23 deletions
96
tests/unit/test_copilot_action_fingerprint.py
Normal file
96
tests/unit/test_copilot_action_fingerprint.py
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
"""Tests for the action-sequence fingerprint compute and the hard-abort
|
||||
short-circuit in ``_tool_loop_error``.
|
||||
|
||||
The streak counter that drives the abort is owned by
|
||||
``failure_tracking.update_repeated_failure_state`` (stack 03). These tests
|
||||
cover only the tools.py-side behavior:
|
||||
|
||||
- ``compute_action_sequence_fingerprint`` is stable across runs that fire
|
||||
the same action shape (independent of reasoning text / status).
|
||||
- ``compute_action_sequence_fingerprint`` distinguishes different sequences.
|
||||
- ``_tool_loop_error`` returns a hard-abort message when the streak crosses
|
||||
``REPEATED_ACTION_STREAK_ABORT_AT`` for a block-running tool.
|
||||
- The hard abort does NOT fire for non-block-running tools, regardless of
|
||||
streak height.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from types import SimpleNamespace
|
||||
|
||||
from skyvern.forge.sdk.copilot.failure_tracking import compute_action_sequence_fingerprint
|
||||
from skyvern.forge.sdk.copilot.tools import (
|
||||
REPEATED_ACTION_STREAK_ABORT_AT,
|
||||
_tool_loop_error,
|
||||
)
|
||||
|
||||
|
||||
def _ctx_with_streak(streak: int) -> SimpleNamespace:
|
||||
return SimpleNamespace(
|
||||
consecutive_tool_tracker=[],
|
||||
repeated_action_fingerprint_streak_count=streak,
|
||||
)
|
||||
|
||||
|
||||
def test_compute_action_sequence_fingerprint_stable_for_same_sequence() -> None:
    """Two runs with the same (action, element) shape but different reasoning
    text must produce identical fingerprints."""
    shape = [
        ("input_text", "elem-name"),
        ("input_text", "elem-email"),
        ("click", "elem-submit"),
    ]
    first_trace = [
        {"action": action, "element": element, "reasoning": reasoning, "status": "failed"}
        for (action, element), reasoning in zip(shape, ["r1", "r2", "r3"])
    ]
    second_trace = [
        {"action": action, "element": element, "reasoning": reasoning, "status": "failed"}
        for (action, element), reasoning in zip(shape, ["different_text", "other", "third"])
    ]
    first_fp = compute_action_sequence_fingerprint([{"action_trace": first_trace}])
    second_fp = compute_action_sequence_fingerprint([{"action_trace": second_trace}])
    assert first_fp is not None
    # Only the (action, element) shape feeds the fingerprint — reasoning and
    # status are deliberately excluded so retry loops are still detected.
    assert first_fp == second_fp
|
||||
|
||||
|
||||
def test_compute_action_sequence_fingerprint_none_when_trace_missing() -> None:
    """Inputs with no usable action trace yield no fingerprint at all."""
    no_trace_inputs = (
        [],
        [{"status": "completed"}],
        [{"action_trace": []}],
    )
    for runs in no_trace_inputs:
        assert compute_action_sequence_fingerprint(runs) is None
|
||||
|
||||
|
||||
def test_compute_action_sequence_fingerprint_distinguishes_different_sequences() -> None:
    """Changing either the element or the action must change the fingerprint."""
    single_step_traces = (
        [{"action": "click", "element": "btn-a"}],
        [{"action": "click", "element": "btn-b"}],
        [{"action": "input_text", "element": "btn-a"}],
    )
    fp_click_a, fp_click_b, fp_input_a = (
        compute_action_sequence_fingerprint([{"action_trace": trace}])
        for trace in single_step_traces
    )
    # All three single-step sequences must be pairwise distinct.
    assert fp_click_a != fp_click_b
    assert fp_click_a != fp_input_a
    assert fp_click_b != fp_input_a
|
||||
|
||||
|
||||
def test_tool_loop_error_fires_hard_abort_on_block_running_tools_when_streak_high() -> None:
    """At the abort threshold, both block-running tools surface the
    repeated-action hard abort message."""
    ctx = _ctx_with_streak(REPEATED_ACTION_STREAK_ABORT_AT)

    for tool_name in ("run_blocks_and_collect_debug", "update_and_run_blocks"):
        message = _tool_loop_error(ctx, tool_name)
        assert message is not None
        assert "Repeated-action abort" in message
|
||||
|
||||
|
||||
def test_tool_loop_error_does_not_fire_for_non_block_running_tools() -> None:
    """A streak well past the threshold must not abort planning/metadata
    tools — only block-running tools participate in the repeated-action
    abort; the others keep their existing loop-detection behavior."""
    ctx = _ctx_with_streak(REPEATED_ACTION_STREAK_ABORT_AT + 5)

    for tool_name in ("update_workflow", "list_credentials", "get_run_results"):
        assert _tool_loop_error(ctx, tool_name) is None
|
||||
|
||||
|
||||
def test_tool_loop_error_does_not_fire_below_threshold() -> None:
    """One short of the abort threshold must not trigger the hard abort,
    even for a block-running tool."""
    context = _ctx_with_streak(REPEATED_ACTION_STREAK_ABORT_AT - 1)
    assert _tool_loop_error(context, "run_blocks_and_collect_debug") is None
|
||||
148
tests/unit/test_copilot_cancel_helpers.py
Normal file
148
tests/unit/test_copilot_cancel_helpers.py
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
"""Tests for the copilot orphan-workflow cancellation helpers.
|
||||
|
||||
Covers:
|
||||
- ``_cancel_run_task_if_not_final`` cancels ``run_task`` and writes the
|
||||
conditional cancel exactly once.
|
||||
- A SUCCESS path (run_task completes on its own) still calls the conditional
|
||||
cancel, but because the row is terminal the real helper would be a no-op.
|
||||
- An SDK-realistic ``asyncio.wait_for`` timeout around the tool coroutine does
|
||||
not leave ``run_task`` running in the background.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from skyvern.forge.sdk.copilot.tools import _cancel_run_task_if_not_final
|
||||
|
||||
|
||||
class _FakeService:
|
||||
def __init__(self) -> None:
|
||||
self.mark_calls: list[str] = []
|
||||
self.raise_on_mark: Exception | None = None
|
||||
|
||||
async def mark_workflow_run_as_canceled_if_not_final(
|
||||
self,
|
||||
workflow_run_id: str,
|
||||
) -> Any:
|
||||
self.mark_calls.append(workflow_run_id)
|
||||
if self.raise_on_mark is not None:
|
||||
raise self.raise_on_mark
|
||||
return None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_cancel_helper_cancels_task_and_writes_conditional_cancel(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Happy path: the helper cancels a still-running task and issues exactly
    one conditional cancel for the workflow run."""
    from skyvern.forge import app as forge_app

    fake_service = _FakeService()
    monkeypatch.setattr(forge_app, "WORKFLOW_SERVICE", fake_service)

    async def never_finishes() -> None:
        await asyncio.sleep(60)

    run_task = asyncio.create_task(never_finishes())

    await _cancel_run_task_if_not_final(run_task, workflow_run_id="wr_1")

    assert run_task.cancelled() or run_task.done()
    assert fake_service.mark_calls == ["wr_1"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_cancel_helper_does_not_raise_on_mark_failure(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Cleanup errors must be swallowed (logged, not propagated) — otherwise
    they would replace the original timeout/cancellation surface."""
    from skyvern.forge import app as forge_app

    fake_service = _FakeService()
    fake_service.raise_on_mark = RuntimeError("DB is down")
    monkeypatch.setattr(forge_app, "WORKFLOW_SERVICE", fake_service)

    async def never_finishes() -> None:
        await asyncio.sleep(60)

    run_task = asyncio.create_task(never_finishes())

    # The helper must complete even though the conditional cancel raises.
    await _cancel_run_task_if_not_final(run_task, workflow_run_id="wr_2")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_cancel_helper_handles_already_completed_run_task(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A run task that already finished naturally still gets the conditional
    cancel issued; the DB-layer helper is a no-op when the row is already
    terminal, so the extra call is harmless."""
    from skyvern.forge import app as forge_app

    fake_service = _FakeService()
    monkeypatch.setattr(forge_app, "WORKFLOW_SERVICE", fake_service)

    async def finishes_immediately() -> None:
        return

    run_task = asyncio.create_task(finishes_immediately())
    await run_task

    await _cancel_run_task_if_not_final(run_task, workflow_run_id="wr_3")
    assert fake_service.mark_calls == ["wr_3"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_sdk_style_wait_for_timeout_does_not_leak_background_work(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Exercises the production failure mode: the OpenAI Agents SDK wraps the
    tool coroutine in ``asyncio.wait_for(..., timeout=N)`` and cancels it on
    timeout. Our CancelledError branch must cancel ``run_task`` through the
    helper so no orphan work is left behind."""
    from skyvern.forge import app as forge_app

    service = _FakeService()
    monkeypatch.setattr(forge_app, "WORKFLOW_SERVICE", service)

    # Lets the outer assertions reach the task created inside tool_body.
    run_task_ref: dict[str, asyncio.Task] = {}
    # Set from workflow_body's CancelledError handler — proves the inner
    # workflow actually observed the cancellation.
    workflow_work_completed = asyncio.Event()

    async def tool_body() -> None:
        async def workflow_body() -> None:
            try:
                await asyncio.sleep(60)
            except asyncio.CancelledError:
                # Record that cancellation reached the workflow, then re-raise
                # so the task still ends up cancelled.
                workflow_work_completed.set()
                raise

        run_task = asyncio.create_task(workflow_body())
        run_task_ref["run_task"] = run_task
        try:
            # Simulate the inner poll loop.
            while True:
                await asyncio.sleep(0.05)
        except asyncio.CancelledError:
            try:
                # Shield the cleanup so a second cancel aimed at tool_body
                # does not abort the helper mid-flight.
                await asyncio.shield(_cancel_run_task_if_not_final(run_task, workflow_run_id="wr_sdk"))
            except asyncio.CancelledError:
                # Detached fallback mirror of the production path.
                fallback = asyncio.ensure_future(_cancel_run_task_if_not_final(run_task, workflow_run_id="wr_sdk"))
                await asyncio.wait_for(asyncio.shield(fallback), timeout=5.0)
            # Propagate the cancellation after cleanup, as the SDK expects.
            raise

    tool_task = asyncio.ensure_future(tool_body())
    # Mirror the SDK: wait_for cancels tool_task and raises TimeoutError.
    with pytest.raises(asyncio.TimeoutError):
        await asyncio.wait_for(tool_task, timeout=0.2)

    # Workflow's CancelledError handler should have fired via our helper.
    await asyncio.wait_for(workflow_work_completed.wait(), timeout=1.0)

    assert "run_task" in run_task_ref
    assert run_task_ref["run_task"].cancelled() or run_task_ref["run_task"].done()
    # Exactly one conditional cancel was written for the orphaned run.
    assert service.mark_calls == ["wr_sdk"]
|
||||
|
|
@ -6,10 +6,14 @@
|
|||
- ``execute_workflow_webhook`` returns cleanly when the workflow row has been
|
||||
soft-deleted mid-run — it must not raise ``WorkflowNotFound`` from the
|
||||
cleanup path.
|
||||
- The cancellation-safe finalize pattern used in ``execute_workflow``'s outer
|
||||
``finally`` runs ``_finalize_workflow_run_status`` via ``asyncio.shield``
|
||||
so an outer cancel mid-body still restores the real terminal status.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
|
@ -113,3 +117,89 @@ async def test_build_status_response_uses_filter_deleted_false_when_allowed(
|
|||
|
||||
by_wpid.assert_awaited_once()
|
||||
assert by_wpid.call_args.kwargs["filter_deleted"] is False
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_shielded_finalize_runs_when_outer_cancelled_mid_body() -> None:
    """Contract test for the ``execute_workflow`` cancellation-safe pattern:
    when the try body is cancelled after ``pre_finally_status`` is captured,
    the outer ``finally`` must still run ``_finalize_workflow_run_status``
    via ``asyncio.shield`` so the row ends up terminal rather than stuck as
    transient ``running``. Mirrors the structure of
    ``WorkflowService.execute_workflow``; if anyone removes the ``shield`` or
    moves finalize back into the try, this test breaks.
    """

    # Statuses passed to finalize, in call order.
    finalize_calls: list[WorkflowRunStatus] = []
    clean_up_called = False
    # Signals that the simulated body has captured pre_finally_status, so the
    # outer cancel lands at the intended point.
    body_entered = asyncio.Event()

    async def finalize(status: WorkflowRunStatus) -> None:
        # Simulate a non-trivial DB write so shield cancellation-protection
        # matters rather than being invisible.
        await asyncio.sleep(0.05)
        finalize_calls.append(status)

    async def clean_up() -> None:
        nonlocal clean_up_called
        clean_up_called = True

    async def simulated_execute_workflow() -> None:
        pre_finally_status: WorkflowRunStatus | None = None
        try:
            pre_finally_status = WorkflowRunStatus.failed
            body_entered.set()
            # Simulate the finally-block execution phase that our copilot
            # cancel lands inside of.
            await asyncio.sleep(10)
        finally:
            if pre_finally_status is not None:
                try:
                    # Shield keeps the finalize write alive even though the
                    # surrounding task has been cancelled.
                    await asyncio.shield(finalize(pre_finally_status))
                except Exception:
                    pass
            await clean_up()

    task = asyncio.create_task(simulated_execute_workflow())
    await body_entered.wait()
    task.cancel()

    with pytest.raises(asyncio.CancelledError):
        await task

    assert finalize_calls == [WorkflowRunStatus.failed], (
        "shielded finalize must run with the captured pre_finally_status"
    )
    assert clean_up_called, "clean_up_workflow must still run in the outer finally"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_shielded_finalize_skipped_when_pre_finally_status_unset() -> None:
    """If cancellation lands before block execution captures
    ``pre_finally_status``, there's no intended terminal state to restore —
    the outer ``finally`` must skip finalize, not call it with ``None``.
    """

    finalize_called = False

    async def finalize(status: WorkflowRunStatus) -> None:
        nonlocal finalize_called
        finalize_called = True

    async def simulated_execute_workflow() -> None:
        pre_finally_status: WorkflowRunStatus | None = None
        try:
            # Cancellation arrives during this sleep, before the status below
            # is ever captured.
            await asyncio.sleep(10)
            pre_finally_status = WorkflowRunStatus.failed  # pragma: no cover
        finally:
            if pre_finally_status is not None:
                await asyncio.shield(finalize(pre_finally_status))

    task = asyncio.create_task(simulated_execute_workflow())
    await asyncio.sleep(0)  # let the task enter its body
    task.cancel()

    with pytest.raises(asyncio.CancelledError):
        await task

    assert not finalize_called, "finalize must not run when pre_finally_status is unset"
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ import json
|
|||
|
||||
import yaml
|
||||
|
||||
from skyvern.forge.sdk.routes.workflow_copilot import _process_workflow_yaml
|
||||
from skyvern.utils.yaml_loader import safe_load_no_dates
|
||||
|
||||
ISO_BLOB = """
|
||||
|
|
@ -75,3 +76,40 @@ def test_safe_load_no_dates_preserves_other_implicit_types() -> None:
|
|||
assert parsed["a_bool"] is True
|
||||
assert parsed["a_null"] is None
|
||||
assert parsed["a_list"] == [1, 2, 3]
|
||||
|
||||
|
||||
def test_process_workflow_yaml_keeps_json_parameter_iso_strings() -> None:
    """ISO-8601 timestamps inside a JSON workflow parameter's default value
    must survive YAML processing as plain strings, not datetime objects.

    NOTE(review): the YAML literal's indentation was reconstructed from a
    mangled source — confirm nesting against the original file.
    """
    workflow = _process_workflow_yaml(
        workflow_id="wf-123",
        workflow_permanent_id="wfp-123",
        organization_id="org-123",
        workflow_yaml="""
title: Test
workflow_definition:
  parameters:
    - parameter_type: workflow
      key: payload
      workflow_parameter_type: json
      default_value:
        id: "12345"
        metadata:
          created_at: 2023-10-27T10:00:00Z
          updated_at: 2023-10-28T14:30:00Z
  blocks:
    - block_type: navigation
      label: step1
      url: https://example.com
      title: Step 1
      navigation_goal: Open the page
""",
    )

    parameter = workflow.get_parameter("payload")
    assert parameter is not None
    assert parameter.default_value is not None

    metadata = parameter.default_value["metadata"]
    # Timestamps stay strings — the safe loader must not coerce them.
    for field in ("created_at", "updated_at"):
        assert isinstance(metadata[field], str)
    assert metadata["created_at"] == "2023-10-27T10:00:00Z"
    assert metadata["updated_at"] == "2023-10-28T14:30:00Z"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue