Skyvern/tests/unit/test_copilot_watchdog.py

414 lines
16 KiB
Python

"""Tests for the SKY-9163 progress-based watchdog inside
``_run_blocks_and_collect_debug``.
The full function does too much setup (prepare_workflow, execute_workflow,
parameter-binding invariants) to unit-test end-to-end cheaply. Instead we
target the isolated watchdog surface:
- ``_progress_marker`` — marker stability and field-change sensitivity.
- ``_read_progress_sources`` — correct delegation + graceful handling of
DB failures.
- ``_watchdog_error_message`` — the regression-guard strings (no
"timed out", reconciliation-instruction, per-reason body).
Those three are where the SKY-9163 correctness properties live:
1. A stale marker must be exactly equal across two polls when nothing
changed in the DB (otherwise the watchdog would false-reset on every
poll, making stagnation detection impossible).
2. Any change in ``run.status`` / ``run.modified_at`` / ``step_ts`` /
``block_ts`` must produce a new marker (otherwise the watchdog would
false-trip on a progressing run).
3. The error messages must not read as retry-invites — that was the
original bug. "timed out" / "likely stuck repeating failing actions"
are the exact phrases the LLM used to read as "try again".
"""
from __future__ import annotations
from datetime import datetime, timezone
from types import SimpleNamespace
from typing import Any
import pytest
from skyvern.forge.sdk.copilot.tools import (
RUN_BLOCKS_SAFETY_CEILING_SECONDS,
RUN_BLOCKS_STAGNATION_WINDOW_SECONDS,
_any_quiet_block_requested,
_progress_marker,
_read_progress_sources,
_tool_loop_error,
_watchdog_error_message,
)
def _fake_run(status: str = "running", modified_at: datetime | None = None) -> Any:
"""A bare-minimum stand-in for ``WorkflowRun`` — the marker only reads
``.status`` and ``.modified_at``.
"""
return SimpleNamespace(
status=status,
modified_at=modified_at or datetime(2026, 4, 21, 12, 0, 0, tzinfo=timezone.utc),
browser_session_id=None,
)
# ---------------------------------------------------------------------------
# _progress_marker: stability + per-field sensitivity.
# ---------------------------------------------------------------------------
def test_progress_marker_stable_for_identical_inputs() -> None:
"""If the DB reports identical values on two successive polls, the marker
must compare equal. A marker that drifts on repeated reads would make the
stagnation window unreachable."""
run = _fake_run()
step_ts = datetime(2026, 4, 21, 12, 0, 30, tzinfo=timezone.utc)
block_ts = datetime(2026, 4, 21, 12, 0, 31, tzinfo=timezone.utc)
m1 = _progress_marker(run, step_ts, block_ts)
m2 = _progress_marker(run, step_ts, block_ts)
assert m1 == m2
def test_progress_marker_changes_on_run_status() -> None:
run1 = _fake_run(status="running")
run2 = _fake_run(status="queued")
assert _progress_marker(run1, None, None) != _progress_marker(run2, None, None)
def test_progress_marker_changes_on_run_modified_at() -> None:
run1 = _fake_run(modified_at=datetime(2026, 4, 21, 12, 0, 0, tzinfo=timezone.utc))
run2 = _fake_run(modified_at=datetime(2026, 4, 21, 12, 0, 1, tzinfo=timezone.utc))
assert _progress_marker(run1, None, None) != _progress_marker(run2, None, None)
def test_progress_marker_changes_on_step_ts() -> None:
run = _fake_run()
t1 = datetime(2026, 4, 21, 12, 0, 0, tzinfo=timezone.utc)
t2 = datetime(2026, 4, 21, 12, 0, 5, tzinfo=timezone.utc)
assert _progress_marker(run, t1, None) != _progress_marker(run, t2, None)
def test_progress_marker_changes_on_block_ts() -> None:
run = _fake_run()
t1 = datetime(2026, 4, 21, 12, 0, 0, tzinfo=timezone.utc)
t2 = datetime(2026, 4, 21, 12, 0, 5, tzinfo=timezone.utc)
assert _progress_marker(run, None, t1) != _progress_marker(run, None, t2)
def test_progress_marker_tolerates_none_run() -> None:
"""A transient DB read failure can return ``run=None``. The marker must
still be hashable and comparable."""
m_none = _progress_marker(None, None, None)
assert m_none == (None, None, None, None)
# Two consecutive failed reads produce equal markers → stagnation clock
# keeps ticking (the right behavior when we can't confirm progress).
assert _progress_marker(None, None, None) == _progress_marker(None, None, None)
# ---------------------------------------------------------------------------
# _read_progress_sources: delegation + graceful DB-failure handling.
# ---------------------------------------------------------------------------
class _FakeTasksRepo:
def __init__(
self,
*,
step_ts: datetime | None = None,
block_ts: datetime | None = None,
raise_on_call: Exception | None = None,
) -> None:
self.step_ts = step_ts
self.block_ts = block_ts
self.raise_on_call = raise_on_call
self.call_count = 0
async def get_workflow_run_progress_timestamps(
self,
*,
workflow_run_id: str,
organization_id: str | None = None,
) -> tuple[datetime | None, datetime | None]:
self.call_count += 1
if self.raise_on_call is not None:
raise self.raise_on_call
return self.step_ts, self.block_ts
class _FakeWorkflowRunsRepo:
def __init__(self, run: Any | None = None, raise_on_call: Exception | None = None) -> None:
self.run = run
self.raise_on_call = raise_on_call
async def get_workflow_run(
self,
*,
workflow_run_id: str,
organization_id: str,
) -> Any:
if self.raise_on_call is not None:
raise self.raise_on_call
return self.run
class _FakeDatabase:
def __init__(self, tasks: _FakeTasksRepo, workflow_runs: _FakeWorkflowRunsRepo) -> None:
self.tasks = tasks
self.workflow_runs = workflow_runs
class _FakeCtx:
organization_id = "o_test"
@pytest.mark.asyncio
async def test_read_progress_sources_returns_run_and_timestamps(
monkeypatch: pytest.MonkeyPatch,
) -> None:
from skyvern.forge import app as forge_app
run = _fake_run()
step_ts = datetime(2026, 4, 21, 12, 0, 10, tzinfo=timezone.utc)
block_ts = datetime(2026, 4, 21, 12, 0, 11, tzinfo=timezone.utc)
db = _FakeDatabase(
tasks=_FakeTasksRepo(step_ts=step_ts, block_ts=block_ts),
workflow_runs=_FakeWorkflowRunsRepo(run=run),
)
monkeypatch.setattr(forge_app, "DATABASE", db)
read_run, read_step_ts, read_block_ts = await _read_progress_sources(_FakeCtx(), "wr_1")
assert read_run is run
assert read_step_ts == step_ts
assert read_block_ts == block_ts
@pytest.mark.asyncio
async def test_read_progress_sources_swallows_workflow_run_errors(
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""A DB read failure on the workflow-run row must not crash the watchdog —
``_safe_read_workflow_run`` returns None and the poll continues."""
from skyvern.forge import app as forge_app
db = _FakeDatabase(
tasks=_FakeTasksRepo(step_ts=None, block_ts=None),
workflow_runs=_FakeWorkflowRunsRepo(raise_on_call=RuntimeError("DB flake")),
)
monkeypatch.setattr(forge_app, "DATABASE", db)
read_run, read_step_ts, read_block_ts = await _read_progress_sources(_FakeCtx(), "wr_1")
assert read_run is None
assert read_step_ts is None
assert read_block_ts is None
@pytest.mark.asyncio
async def test_read_progress_sources_swallows_progress_timestamps_errors(
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""A DB read failure on the aggregate timestamps must also not crash — the
caller still gets the run (if readable) and ``None`` for the timestamps.
"""
from skyvern.forge import app as forge_app
run = _fake_run()
db = _FakeDatabase(
tasks=_FakeTasksRepo(raise_on_call=RuntimeError("aggregate query failed")),
workflow_runs=_FakeWorkflowRunsRepo(run=run),
)
monkeypatch.setattr(forge_app, "DATABASE", db)
read_run, read_step_ts, read_block_ts = await _read_progress_sources(_FakeCtx(), "wr_1")
assert read_run is run
assert read_step_ts is None
assert read_block_ts is None
# ---------------------------------------------------------------------------
# _watchdog_error_message: the regression-guard strings.
# ---------------------------------------------------------------------------
class _ErrorCtx:
"""Minimal ``AgentContext`` stand-in for the error-message path — only
``browser_session_id`` is read, and only by ``_fallback_page_info``."""
organization_id = "o_test"
browser_session_id = None
@pytest.mark.asyncio
async def test_stagnation_error_message_does_not_invite_retry() -> None:
"""The exact SKY-9163 bug: the old copy said "likely stuck repeating
failing actions" which the LLM read as "try again". The stagnation
message must explicitly discourage retry."""
msg = await _watchdog_error_message("stagnation", _ErrorCtx(), "wr_test", _fake_run())
assert "timed out" not in msg.lower()
assert "likely stuck repeating" not in msg.lower()
assert str(RUN_BLOCKS_STAGNATION_WINDOW_SECONDS) in msg
assert "Run ID: wr_test" in msg
assert "get_run_results" in msg
assert "Do NOT re-invoke block-running tools" in msg
@pytest.mark.asyncio
async def test_ceiling_error_message_advises_splitting() -> None:
"""The ceiling path is rare (a runaway run that keeps making progress
past 20 min). Its error must tell the LLM to split the workflow, not
retry — a longer run won't fit either."""
msg = await _watchdog_error_message("ceiling", _ErrorCtx(), "wr_test", _fake_run())
assert "timed out" not in msg.lower()
assert str(RUN_BLOCKS_SAFETY_CEILING_SECONDS) in msg
assert "split" in msg.lower()
assert "Run ID: wr_test" in msg
assert "get_run_results" in msg
@pytest.mark.asyncio
async def test_task_exit_unfinalized_message_reports_last_observed_status() -> None:
"""When ``execute_workflow`` naturally exits but the row isn't terminal,
the error must name the last-observed status so the LLM has a concrete
anchor for the follow-up ``get_run_results`` call."""
run = _fake_run(status="running")
msg = await _watchdog_error_message("task_exit_unfinalized", _ErrorCtx(), "wr_test", run)
assert "timed out" not in msg.lower()
assert "last observed status: running" in msg
assert "Run ID: wr_test" in msg
assert "get_run_results" in msg
@pytest.mark.asyncio
async def test_task_exit_unfinalized_message_tolerates_unreadable_run() -> None:
"""If the post-drain reread also fails (``run is None``), the message must
still be well-formed and mention the unreadable state rather than
crashing on a ``None.status`` access."""
msg = await _watchdog_error_message("task_exit_unfinalized", _ErrorCtx(), "wr_test", None)
assert "unreadable" in msg.lower()
assert "Run ID: wr_test" in msg
assert "get_run_results" in msg
# ---------------------------------------------------------------------------
# _any_quiet_block_requested: stagnation bypass for block types that
# legitimately do long-silent work. Without this bypass, a WAIT block with
# wait_sec >= 90, a slow TEXT_PROMPT LLM call, or a HumanInteractionBlock
# pausing for user input would be falsely reported as stagnation and the
# tool would cancel a healthy run.
# ---------------------------------------------------------------------------
def _workflow_with_block_types(*type_value_label_pairs: tuple[str, str]) -> Any:
"""Build a minimal `last_workflow`-shaped object that
``_any_quiet_block_requested`` can walk. Each pair is
``(block_type_value, label)`` — e.g. ``("wait", "pause1")``.
"""
blocks = [
SimpleNamespace(label=label, block_type=SimpleNamespace(value=block_type_value))
for block_type_value, label in type_value_label_pairs
]
definition = SimpleNamespace(blocks=blocks)
return SimpleNamespace(workflow_definition=definition)
def test_any_quiet_block_requested_wait() -> None:
ctx = SimpleNamespace(last_workflow=_workflow_with_block_types(("wait", "pause1")))
assert _any_quiet_block_requested(ctx, ["pause1"]) is True
def test_any_quiet_block_requested_text_prompt() -> None:
ctx = SimpleNamespace(last_workflow=_workflow_with_block_types(("text_prompt", "prompt1")))
assert _any_quiet_block_requested(ctx, ["prompt1"]) is True
def test_any_quiet_block_requested_human_interaction() -> None:
ctx = SimpleNamespace(last_workflow=_workflow_with_block_types(("human_interaction", "wait_for_user")))
assert _any_quiet_block_requested(ctx, ["wait_for_user"]) is True
def test_any_quiet_block_requested_mixed_requested_labels_match_quiet_one() -> None:
"""When multiple blocks are requested, having any one quiet type is
enough to disable stagnation for the whole invocation."""
ctx = SimpleNamespace(
last_workflow=_workflow_with_block_types(
("navigation", "nav1"),
("wait", "pause1"),
("extraction", "extract1"),
)
)
assert _any_quiet_block_requested(ctx, ["nav1", "pause1", "extract1"]) is True
def test_any_quiet_block_requested_only_task_blocks_returns_false() -> None:
"""The normal case: task-heavy workflows produce regular step writes.
Stagnation is safe to enable."""
ctx = SimpleNamespace(
last_workflow=_workflow_with_block_types(
("navigation", "nav1"),
("extraction", "extract1"),
)
)
assert _any_quiet_block_requested(ctx, ["nav1", "extract1"]) is False
def test_any_quiet_block_requested_label_not_in_requested_ignored() -> None:
"""A WAIT block defined in the workflow but not requested in this
invocation must not disable stagnation."""
ctx = SimpleNamespace(
last_workflow=_workflow_with_block_types(
("wait", "not_requested_pause"),
("navigation", "requested_nav"),
)
)
assert _any_quiet_block_requested(ctx, ["requested_nav"]) is False
def test_any_quiet_block_requested_no_workflow_returns_false() -> None:
"""Defensive: no workflow loaded → no bypass. The loop will use its
default stagnation behavior (safe for the common case)."""
ctx = SimpleNamespace(last_workflow=None)
assert _any_quiet_block_requested(ctx, ["anything"]) is False
def test_any_quiet_block_requested_empty_labels_returns_false() -> None:
ctx = SimpleNamespace(last_workflow=_workflow_with_block_types(("wait", "pause1")))
assert _any_quiet_block_requested(ctx, None) is False
assert _any_quiet_block_requested(ctx, []) is False
# ---------------------------------------------------------------------------
# Reconciliation guard message: regression guard on "timed out" phrasing.
# The guard itself is tested in test_copilot_cancel_helpers.py; this test is
# specifically about the LLM-facing STRING, which previously said "timed out"
# and read as a retry-invite when combined with LLM priors.
# ---------------------------------------------------------------------------
def test_reconciliation_guard_message_does_not_say_timed_out() -> None:
"""The reconciliation guard message is what the LLM reads on the *next*
block-running tool call after a watchdog exit. The stagnation/ceiling/
unfinalized error messages all purge "timed out"; the guard message must
match, otherwise the phrase leaks right back in and the regression is
only cosmetic."""
ctx = SimpleNamespace(
consecutive_tool_tracker=[],
repeated_action_fingerprint_streak_count=0,
last_test_non_retriable_nav_error=None,
pending_reconciliation_run_id="wr_guarded",
)
msg = _tool_loop_error(ctx, "update_and_run_blocks")
assert isinstance(msg, str)
assert "timed out" not in msg.lower()
assert "wr_guarded" in msg
assert "get_run_results" in msg