feat(SKY-8879) copilot-stack/06: MCP tools surface + orphan-task cancellation (#5517)

This commit is contained in:
Andrew Neilson 2026-04-15 21:20:16 -07:00 committed by GitHub
parent d58ea46163
commit faa2b233cb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
16 changed files with 2588 additions and 23 deletions

View file

@ -0,0 +1,96 @@
"""Tests for the action-sequence fingerprint compute and the hard-abort
short-circuit in ``_tool_loop_error``.
The streak counter that drives the abort is owned by
``failure_tracking.update_repeated_failure_state`` (stack 03). These tests
cover only the tools.py-side behavior:
- ``compute_action_sequence_fingerprint`` is stable across runs that fire
the same action shape (independent of reasoning text / status).
- ``compute_action_sequence_fingerprint`` distinguishes different sequences.
- ``_tool_loop_error`` returns a hard-abort message when the streak crosses
``REPEATED_ACTION_STREAK_ABORT_AT`` for a block-running tool.
- The hard abort does NOT fire for non-block-running tools, regardless of
streak height.
"""
from __future__ import annotations
from types import SimpleNamespace
from skyvern.forge.sdk.copilot.failure_tracking import compute_action_sequence_fingerprint
from skyvern.forge.sdk.copilot.tools import (
REPEATED_ACTION_STREAK_ABORT_AT,
_tool_loop_error,
)
def _ctx_with_streak(streak: int) -> SimpleNamespace:
    """Build a minimal fake tool-loop context carrying the given streak count."""
    ctx = SimpleNamespace()
    ctx.consecutive_tool_tracker = []
    ctx.repeated_action_fingerprint_streak_count = streak
    return ctx
def test_compute_action_sequence_fingerprint_stable_for_same_sequence() -> None:
    """Two traces with the same (action, element) shape fingerprint identically,
    no matter how the reasoning text differs."""
    shape = [
        ("input_text", "elem-name"),
        ("input_text", "elem-email"),
        ("click", "elem-submit"),
    ]
    reasonings_a = ["r1", "r2", "r3"]
    reasonings_b = ["different_text", "other", "third"]
    trace_a = [
        {"action": action, "element": element, "reasoning": why, "status": "failed"}
        for (action, element), why in zip(shape, reasonings_a)
    ]
    trace_b = [
        {"action": action, "element": element, "reasoning": why, "status": "failed"}
        for (action, element), why in zip(shape, reasonings_b)
    ]
    fp_a = compute_action_sequence_fingerprint([{"action_trace": trace_a}])
    fp_b = compute_action_sequence_fingerprint([{"action_trace": trace_b}])
    assert fp_a is not None
    # Reasoning / status are excluded from the fingerprint on purpose — only
    # the (action, element) shape matters for detecting a retry loop.
    assert fp_a == fp_b
def test_compute_action_sequence_fingerprint_none_when_trace_missing() -> None:
    """No runs, a run without a trace, and an empty trace all yield ``None``."""
    for runs in ([], [{"status": "completed"}], [{"action_trace": []}]):
        assert compute_action_sequence_fingerprint(runs) is None
def test_compute_action_sequence_fingerprint_distinguishes_different_sequences() -> None:
    """Changing either the action or the element changes the fingerprint."""
    single_step_shapes = (
        ("click", "btn-a"),
        ("click", "btn-b"),
        ("input_text", "btn-a"),
    )
    fp_click_a, fp_click_b, fp_input_a = (
        compute_action_sequence_fingerprint(
            [{"action_trace": [{"action": action, "element": element}]}]
        )
        for action, element in single_step_shapes
    )
    # All three one-step shapes must produce pairwise-distinct fingerprints.
    assert fp_click_a != fp_click_b
    assert fp_click_a != fp_input_a
    assert fp_click_b != fp_input_a
def test_tool_loop_error_fires_hard_abort_on_block_running_tools_when_streak_high() -> None:
    """At the abort threshold, both block-running tools report the hard abort."""
    ctx = _ctx_with_streak(REPEATED_ACTION_STREAK_ABORT_AT)
    for tool_name in ("run_blocks_and_collect_debug", "update_and_run_blocks"):
        message = _tool_loop_error(ctx, tool_name)
        assert message is not None
        assert "Repeated-action abort" in message
def test_tool_loop_error_does_not_fire_for_non_block_running_tools() -> None:
    """Planning/metadata tools keep their existing loop-detection behavior and
    ignore the block-run streak, even well above the abort threshold."""
    ctx = _ctx_with_streak(REPEATED_ACTION_STREAK_ABORT_AT + 5)
    for tool_name in ("update_workflow", "list_credentials", "get_run_results"):
        assert _tool_loop_error(ctx, tool_name) is None
def test_tool_loop_error_does_not_fire_below_threshold() -> None:
    """One short of the threshold, a block-running tool is not aborted."""
    below_threshold_ctx = _ctx_with_streak(REPEATED_ACTION_STREAK_ABORT_AT - 1)
    assert _tool_loop_error(below_threshold_ctx, "run_blocks_and_collect_debug") is None

View file

@ -0,0 +1,148 @@
"""Tests for the copilot orphan-workflow cancellation helpers.
Covers:
- ``_cancel_run_task_if_not_final`` cancels ``run_task`` and writes the
conditional cancel exactly once.
- A SUCCESS path (run_task completes on its own) still calls the conditional
cancel, but because the row is terminal the real helper would be a no-op.
- An SDK-realistic ``asyncio.wait_for`` timeout around the tool coroutine does
not leave ``run_task`` running in the background.
"""
from __future__ import annotations
import asyncio
from typing import Any
import pytest
from skyvern.forge.sdk.copilot.tools import _cancel_run_task_if_not_final
class _FakeService:
    """Stand-in for the workflow service that records conditional-cancel calls.

    Set ``raise_on_mark`` to make the mark call fail, simulating a secondary
    error (e.g. a DB outage) during cleanup.
    """

    def __init__(self) -> None:
        # Workflow-run ids passed to the conditional cancel, in call order.
        self.mark_calls: list[str] = []
        # When set, raised by mark_workflow_run_as_canceled_if_not_final.
        self.raise_on_mark: Exception | None = None

    async def mark_workflow_run_as_canceled_if_not_final(
        self,
        workflow_run_id: str,
    ) -> Any:
        self.mark_calls.append(workflow_run_id)
        error = self.raise_on_mark
        if error is not None:
            raise error
        return None
@pytest.mark.asyncio
async def test_cancel_helper_cancels_task_and_writes_conditional_cancel(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The helper cancels an in-flight run_task and records exactly one
    conditional cancel for the workflow run."""
    from skyvern.forge import app as forge_app

    service = _FakeService()
    monkeypatch.setattr(forge_app, "WORKFLOW_SERVICE", service)

    async def never_finishes() -> None:
        await asyncio.sleep(60)

    run_task = asyncio.create_task(never_finishes())
    await _cancel_run_task_if_not_final(run_task, workflow_run_id="wr_1")

    assert run_task.cancelled() or run_task.done()
    assert service.mark_calls == ["wr_1"]
@pytest.mark.asyncio
async def test_cancel_helper_does_not_raise_on_mark_failure(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Secondary errors during cleanup are logged, not propagated — otherwise
    they would replace the original timeout/cancellation surface."""
    from skyvern.forge import app as forge_app

    failing_service = _FakeService()
    failing_service.raise_on_mark = RuntimeError("DB is down")
    monkeypatch.setattr(forge_app, "WORKFLOW_SERVICE", failing_service)

    async def never_finishes() -> None:
        await asyncio.sleep(60)

    run_task = asyncio.create_task(never_finishes())
    # Must not raise despite the mark raising.
    await _cancel_run_task_if_not_final(run_task, workflow_run_id="wr_2")
@pytest.mark.asyncio
async def test_cancel_helper_handles_already_completed_run_task(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """When run_task has already finished (natural completion), the helper
    should still issue the conditional cancel — it is a no-op at the DB layer
    if the row is already terminal, so the result is harmless."""
    from skyvern.forge import app as forge_app

    service = _FakeService()
    monkeypatch.setattr(forge_app, "WORKFLOW_SERVICE", service)

    async def finishes_immediately() -> None:
        return

    run_task = asyncio.create_task(finishes_immediately())
    await run_task

    await _cancel_run_task_if_not_final(run_task, workflow_run_id="wr_3")
    assert service.mark_calls == ["wr_3"]
@pytest.mark.asyncio
async def test_sdk_style_wait_for_timeout_does_not_leak_background_work(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Exercises the production failure mode: the OpenAI Agents SDK wraps the
    tool coroutine in ``asyncio.wait_for(..., timeout=N)`` and cancels it on
    timeout. Our CancelledError branch must cancel ``run_task`` through the
    helper so no orphan work is left behind."""
    from skyvern.forge import app as forge_app

    service = _FakeService()
    monkeypatch.setattr(forge_app, "WORKFLOW_SERVICE", service)
    # Lets the assertions reach the task created inside tool_body.
    run_task_ref: dict[str, asyncio.Task] = {}
    workflow_work_completed = asyncio.Event()

    async def tool_body() -> None:
        async def workflow_body() -> None:
            try:
                await asyncio.sleep(60)
            except asyncio.CancelledError:
                # Proof that cancellation actually reached the workflow task.
                workflow_work_completed.set()
                raise

        run_task = asyncio.create_task(workflow_body())
        run_task_ref["run_task"] = run_task
        try:
            # Simulate the inner poll loop.
            while True:
                await asyncio.sleep(0.05)
        except asyncio.CancelledError:
            try:
                # shield keeps the helper running even while this coroutine
                # is being torn down by wait_for's cancellation.
                await asyncio.shield(_cancel_run_task_if_not_final(run_task, workflow_run_id="wr_sdk"))
            except asyncio.CancelledError:
                # Detached fallback mirror of the production path.
                fallback = asyncio.ensure_future(_cancel_run_task_if_not_final(run_task, workflow_run_id="wr_sdk"))
                await asyncio.wait_for(asyncio.shield(fallback), timeout=5.0)
            # Re-raise so wait_for observes the cancellation as a timeout.
            raise

    tool_task = asyncio.ensure_future(tool_body())
    with pytest.raises(asyncio.TimeoutError):
        await asyncio.wait_for(tool_task, timeout=0.2)
    # Workflow's CancelledError handler should have fired via our helper.
    await asyncio.wait_for(workflow_work_completed.wait(), timeout=1.0)
    assert "run_task" in run_task_ref
    assert run_task_ref["run_task"].cancelled() or run_task_ref["run_task"].done()
    assert service.mark_calls == ["wr_sdk"]

View file

@ -6,10 +6,14 @@
- ``execute_workflow_webhook`` returns cleanly when the workflow row has been
soft-deleted mid-run — it must not raise ``WorkflowNotFound`` from the
cleanup path.
- The cancellation-safe finalize pattern used in ``execute_workflow``'s outer
``finally`` runs ``_finalize_workflow_run_status`` via ``asyncio.shield``
so an outer cancel mid-body still restores the real terminal status.
"""
from __future__ import annotations
import asyncio
from unittest.mock import AsyncMock, MagicMock
import pytest
@ -113,3 +117,89 @@ async def test_build_status_response_uses_filter_deleted_false_when_allowed(
by_wpid.assert_awaited_once()
assert by_wpid.call_args.kwargs["filter_deleted"] is False
@pytest.mark.asyncio
async def test_shielded_finalize_runs_when_outer_cancelled_mid_body() -> None:
    """Contract test for the ``execute_workflow`` cancellation-safe pattern:
    when the try body is cancelled after ``pre_finally_status`` is captured,
    the outer ``finally`` must still run ``_finalize_workflow_run_status``
    via ``asyncio.shield`` so the row ends up terminal rather than stuck as
    transient ``running``. Mirrors the structure of
    ``WorkflowService.execute_workflow``; if anyone removes the ``shield`` or
    moves finalize back into the try, this test breaks.
    """
    finalize_calls: list[WorkflowRunStatus] = []
    clean_up_called = False
    body_entered = asyncio.Event()

    async def finalize(status: WorkflowRunStatus) -> None:
        # Simulate a non-trivial DB write so shield cancellation-protection
        # matters rather than being invisible.
        await asyncio.sleep(0.05)
        finalize_calls.append(status)

    async def clean_up() -> None:
        nonlocal clean_up_called
        clean_up_called = True

    async def simulated_execute_workflow() -> None:
        pre_finally_status: WorkflowRunStatus | None = None
        try:
            # Captured before the cancel lands — the state finalize must restore.
            pre_finally_status = WorkflowRunStatus.failed
            body_entered.set()
            # Simulate the finally-block execution phase that our copilot
            # cancel lands inside of.
            await asyncio.sleep(10)
        finally:
            if pre_finally_status is not None:
                try:
                    # shield lets the write complete even mid-cancellation.
                    await asyncio.shield(finalize(pre_finally_status))
                except Exception:
                    pass
            await clean_up()

    task = asyncio.create_task(simulated_execute_workflow())
    # Wait until pre_finally_status is captured, then cancel mid-body.
    await body_entered.wait()
    task.cancel()
    with pytest.raises(asyncio.CancelledError):
        await task
    assert finalize_calls == [WorkflowRunStatus.failed], (
        "shielded finalize must run with the captured pre_finally_status"
    )
    assert clean_up_called, "clean_up_workflow must still run in the outer finally"
@pytest.mark.asyncio
async def test_shielded_finalize_skipped_when_pre_finally_status_unset() -> None:
    """If cancellation lands before block execution captures
    ``pre_finally_status``, there's no intended terminal state to restore —
    the outer ``finally`` must skip finalize, not call it with ``None``.
    """
    finalize_called = False

    async def write_final_status(status: WorkflowRunStatus) -> None:
        nonlocal finalize_called
        finalize_called = True

    async def simulated_execute_workflow() -> None:
        pre_finally_status: WorkflowRunStatus | None = None
        try:
            # Cancel lands here, before a terminal status is ever captured.
            await asyncio.sleep(10)
            pre_finally_status = WorkflowRunStatus.failed  # pragma: no cover
        finally:
            if pre_finally_status is not None:
                await asyncio.shield(write_final_status(pre_finally_status))

    workflow_task = asyncio.create_task(simulated_execute_workflow())
    await asyncio.sleep(0)  # let the task enter its body
    workflow_task.cancel()
    with pytest.raises(asyncio.CancelledError):
        await workflow_task
    assert not finalize_called, "finalize must not run when pre_finally_status is unset"

View file

@ -16,6 +16,7 @@ import json
import yaml
from skyvern.forge.sdk.routes.workflow_copilot import _process_workflow_yaml
from skyvern.utils.yaml_loader import safe_load_no_dates
ISO_BLOB = """
@ -75,3 +76,40 @@ def test_safe_load_no_dates_preserves_other_implicit_types() -> None:
assert parsed["a_bool"] is True
assert parsed["a_null"] is None
assert parsed["a_list"] == [1, 2, 3]
def test_process_workflow_yaml_keeps_json_parameter_iso_strings() -> None:
    """ISO-8601 timestamps inside a JSON workflow parameter's default value
    must survive YAML processing as plain strings, not datetime objects."""
    workflow_yaml = """
title: Test
workflow_definition:
  parameters:
    - parameter_type: workflow
      key: payload
      workflow_parameter_type: json
      default_value:
        id: "12345"
        metadata:
          created_at: 2023-10-27T10:00:00Z
          updated_at: 2023-10-28T14:30:00Z
  blocks:
    - block_type: navigation
      label: step1
      url: https://example.com
      title: Step 1
      navigation_goal: Open the page
"""
    workflow = _process_workflow_yaml(
        workflow_id="wf-123",
        workflow_permanent_id="wfp-123",
        organization_id="org-123",
        workflow_yaml=workflow_yaml,
    )
    parameter = workflow.get_parameter("payload")
    assert parameter is not None
    default_value = parameter.default_value
    assert default_value is not None
    metadata = default_value["metadata"]
    # The date-safe loader must leave ISO timestamps as strings.
    for key, expected in (
        ("created_at", "2023-10-27T10:00:00Z"),
        ("updated_at", "2023-10-28T14:30:00Z"),
    ):
        value = metadata[key]
        assert isinstance(value, str)
        assert value == expected