# Skyvern/tests/unit/test_copilot_failure_tracking.py
#
# 282 lines
# 11 KiB
# Python

from __future__ import annotations
from types import SimpleNamespace
from typing import Any
import pytest
from skyvern.forge.sdk.copilot.failure_tracking import (
compute_failure_signature,
compute_frontier_fingerprint,
update_repeated_failure_state,
)
class _Block:
def __init__(self, label: str, **config: Any) -> None:
self.label = label
self._config = config
def model_dump(self, **_: Any) -> dict[str, Any]:
return {"label": self.label, **{k: v for k, v in self._config.items() if v is not None}}
def _make_workflow(blocks: list[_Block]) -> Any:
definition = SimpleNamespace(blocks=blocks)
return SimpleNamespace(workflow_definition=definition)
def _make_ctx(
*,
suspicious: bool = False,
failure_reason: str | None = "Timeout on element",
frontier_label: str = "extract",
executed_labels: list[str] | None = None,
workflow: Any = None,
last_signature: str | None = None,
last_fingerprint: str = "",
streak: int = 0,
nudge_streak: int = 0,
) -> SimpleNamespace:
return SimpleNamespace(
last_test_suspicious_success=suspicious,
last_test_failure_reason=failure_reason,
last_frontier_start_label=frontier_label,
last_executed_block_labels=executed_labels or ["open", "extract"],
last_workflow=workflow,
last_failure_signature=last_signature,
last_frontier_fingerprint=last_fingerprint,
repeated_failure_streak_count=streak,
repeated_failure_nudge_emitted_at_streak=nudge_streak,
)
def test_signature_stable_for_same_inputs() -> None:
    """Two calls with equal arguments must yield the same non-``None`` signature."""
    first, second = (
        compute_failure_signature("extract", "Timeout", [{"category": "NETWORK_ERROR"}], False)
        for _ in range(2)
    )
    assert first is not None
    assert first == second
def test_signature_differs_on_reason_change() -> None:
    """Changing only the failure reason must change the signature."""
    timeout_signature = compute_failure_signature("extract", "Timeout", None, False)
    denied_signature = compute_failure_signature("extract", "Permission denied", None, False)
    assert timeout_signature != denied_signature
def test_signature_collapses_parameter_binding_error() -> None:
    """PARAMETER_BINDING_ERROR failures must share one signature across key names."""
    # The failure reason for this category embeds the offending key names, so
    # the reason text differs per key; the signatures must still be equal for
    # repeated failures to be counted as repeats.
    reasons = (
        "missing parameter 'x' at path foo.bar",
        "missing parameter 'y' at path baz.qux",
    )
    signature_x, signature_y = (
        compute_failure_signature("block_a", reason, [{"category": "PARAMETER_BINDING_ERROR"}], False)
        for reason in reasons
    )
    assert signature_x == signature_y
def test_fingerprint_empty_without_workflow_definition() -> None:
    """Without a workflow definition the frontier fingerprint is the empty string."""
    fingerprint = compute_frontier_fingerprint(["open"], None)
    assert fingerprint == ""
def test_fingerprint_changes_when_block_config_changes() -> None:
    """Blocks with the same label but different config must fingerprint differently."""
    definition_a = _make_workflow([_Block("open", url="https://a.test")]).workflow_definition
    definition_b = _make_workflow([_Block("open", url="https://b.test")]).workflow_definition
    fingerprint_a = compute_frontier_fingerprint(["open"], definition_a)
    fingerprint_b = compute_frontier_fingerprint(["open"], definition_b)
    assert fingerprint_a != fingerprint_b
def test_streak_increments_when_signature_and_fingerprint_repeat() -> None:
    """A failure matching the stored signature and fingerprint bumps the streak."""
    workflow = _make_workflow([_Block("open", url="x"), _Block("extract", schema="s")])
    # Seed the context with exactly what this run is expected to compute, so
    # the update sees a repeat of the previous failure.
    expected_fingerprint = compute_frontier_fingerprint(["open", "extract"], workflow.workflow_definition)
    expected_signature = compute_failure_signature("extract", "Timeout on element", None, False)
    ctx = _make_ctx(
        workflow=workflow,
        last_signature=expected_signature,
        last_fingerprint=expected_fingerprint,
        streak=1,
    )

    failed_result = {"ok": False, "data": {}}
    update_repeated_failure_state(ctx, failed_result)

    assert ctx.repeated_failure_streak_count == 2
    assert ctx.last_failure_signature == expected_signature
    assert ctx.last_frontier_fingerprint == expected_fingerprint
def test_streak_resets_on_meaningful_success() -> None:
    """An ``ok`` result clears the streak, the stored signature and the nudge marker."""
    ctx = _make_ctx(
        workflow=_make_workflow([_Block("open", url="x")]),
        executed_labels=["open"],
        last_signature="prior",
        last_fingerprint="prior",
        streak=4,
        nudge_streak=3,
    )

    ok_result = {"ok": True, "data": {}}
    update_repeated_failure_state(ctx, ok_result)

    assert ctx.repeated_failure_streak_count == 0
    assert ctx.last_failure_signature is None
    assert ctx.repeated_failure_nudge_emitted_at_streak == 0
@pytest.mark.parametrize(
    "case_kwargs",
    [
        pytest.param(
            {
                "failure_reason": "Totally new error",
                "last_signature": "extract|old-reason|",
                "last_fingerprint_matches": True,
            },
            id="new_signature",
        ),
        pytest.param(
            {
                "failure_reason": "Timeout on element",
                "last_signature_matches": True,
                "last_fingerprint": "different-fingerprint-from-prior-run",
            },
            id="new_fingerprint",
        ),
    ],
)
def test_streak_and_nudge_reset_on_change(case_kwargs: dict[str, Any]) -> None:
    """A failure that differs from the stored one restarts the streak at 1.

    Each case keeps one of (signature, frontier fingerprint) equal to the
    stored value and changes the other. In both cases the streak is asserted
    to become 1 and the nudge marker to be cleared.

    Args:
        case_kwargs: Case description. ``failure_reason`` is always present.
            ``last_signature_matches`` / ``last_fingerprint_matches`` request
            that the stored value equal the one computed in this test;
            otherwise the literal ``last_signature`` / ``last_fingerprint``
            entry is used.
    """
    wf = _make_workflow([_Block("open", url="x"), _Block("extract", schema="s")])
    fingerprint = compute_frontier_fingerprint(["open", "extract"], wf.workflow_definition)
    signature = compute_failure_signature("extract", "Timeout on element", None, False)

    # Read the flags with ``get`` instead of ``pop``. pytest hands the very same
    # dict object to every invocation of a case, so popping a flag would make a
    # re-run of the case fall through to the missing literal key (KeyError).
    if case_kwargs.get("last_signature_matches", False):
        last_signature = signature
    else:
        last_signature = case_kwargs["last_signature"]
    if case_kwargs.get("last_fingerprint_matches", False):
        last_fingerprint = fingerprint
    else:
        last_fingerprint = case_kwargs["last_fingerprint"]

    ctx = _make_ctx(
        workflow=wf,
        failure_reason=case_kwargs["failure_reason"],
        last_signature=last_signature,
        last_fingerprint=last_fingerprint,
        streak=3,
        nudge_streak=2,
    )
    update_repeated_failure_state(ctx, {"ok": False, "data": {}})
    assert ctx.repeated_failure_streak_count == 1
    assert ctx.repeated_failure_nudge_emitted_at_streak == 0
# --------------------------------------------------------------------------- #
# Action-sequence fingerprint streak #
# --------------------------------------------------------------------------- #
def _make_action_ctx(
    *,
    pending_fingerprint: str | None,
    last_fingerprint: str | None = None,
    streak: int = 0,
    failure_reason: str | None = "fail",
    last_signature: str | None = "sig_prior",
    last_frontier_fingerprint: str = "fp_prior",
) -> SimpleNamespace:
    """Ctx builder for the action-sequence tests.

    Starts from ``_make_ctx`` with a single ``open`` block and then adds the
    three action-sequence attributes. This mirrors what
    ``_run_blocks_and_collect_debug`` sets on CopilotContext before the
    ``update_repeated_failure_state`` call.
    """
    ctx = _make_ctx(
        workflow=_make_workflow([_Block("open", url="x")]),
        failure_reason=failure_reason,
        executed_labels=["open"],
        last_signature=last_signature,
        last_fingerprint=last_frontier_fingerprint,
    )
    # Attributes specific to the action-sequence streak tracking.
    action_state = {
        "pending_action_sequence_fingerprint": pending_fingerprint,
        "last_action_sequence_fingerprint": last_fingerprint,
        "repeated_action_fingerprint_streak_count": streak,
    }
    for attribute, value in action_state.items():
        setattr(ctx, attribute, value)
    return ctx
def test_action_fingerprint_streak_increments_on_repeat() -> None:
    """Repeating the same action fingerprint on failing runs grows the streak by one each time."""
    ctx = _make_action_ctx(pending_fingerprint="fp_1")

    # First failing run: streak starts at 1 and the fingerprint is recorded.
    update_repeated_failure_state(ctx, {"ok": False, "data": {}})
    assert ctx.repeated_action_fingerprint_streak_count == 1
    assert ctx.last_action_sequence_fingerprint == "fp_1"

    # Second and third failing runs present the same pending fingerprint again.
    for expected_streak in (2, 3):
        ctx.pending_action_sequence_fingerprint = "fp_1"
        update_repeated_failure_state(ctx, {"ok": False, "data": {}})
        assert ctx.repeated_action_fingerprint_streak_count == expected_streak
def test_action_fingerprint_streak_resets_when_fingerprint_changes() -> None:
    """A different action fingerprint on a failing run restarts the streak at 1."""
    ctx = _make_action_ctx(pending_fingerprint="fp_1", last_fingerprint="fp_1", streak=2)
    # This run produced a different action sequence than the stored one.
    ctx.pending_action_sequence_fingerprint = "fp_2"

    failed_result = {"ok": False, "data": {}}
    update_repeated_failure_state(ctx, failed_result)

    assert ctx.repeated_action_fingerprint_streak_count == 1
    assert ctx.last_action_sequence_fingerprint == "fp_2"
def test_action_fingerprint_streak_preserved_on_transient_empty_trace() -> None:
    """A run with no action trace must not wipe an in-progress streak.

    Such a run (e.g. no failed blocks had a task_id) can occur between two
    fingerprint-matching runs; if it erased the streak, a single empty trace
    would mask a real loop.
    """
    ctx = _make_action_ctx(pending_fingerprint=None, last_fingerprint="fp_1", streak=1)

    failed_result = {"ok": False, "data": {}}
    update_repeated_failure_state(ctx, failed_result)

    assert ctx.repeated_action_fingerprint_streak_count == 1
    # With pending set to None the stored fingerprint is cleared, so the next
    # run's fingerprint is compared against None, the correct "no match" result.
    assert ctx.last_action_sequence_fingerprint is None
def test_action_fingerprint_streak_resets_on_meaningful_success() -> None:
    """An ``ok`` run zeroes the action streak and promotes pending to last."""
    ctx = _make_action_ctx(pending_fingerprint="fp_1", last_fingerprint="fp_1", streak=2)
    # Describe a clean, non-suspicious success on the context.
    ctx.last_test_suspicious_success = False
    ctx.last_test_failure_reason = None

    ok_result = {"ok": True, "data": {}}
    update_repeated_failure_state(ctx, ok_result)

    assert ctx.repeated_action_fingerprint_streak_count == 0
    # The pending fingerprint moves into "last" so that the next failure has
    # something to compare against; pending itself is emptied.
    assert ctx.last_action_sequence_fingerprint == "fp_1"
    assert ctx.pending_action_sequence_fingerprint is None
def test_action_fingerprint_streak_independent_of_frontier_streak() -> None:
    """The action-sequence streak keeps counting when the failure reason changes.

    The frontier-based streak needs the signature (derived from failure
    reason / categories) to match, so it resets when the reason text changes.
    The action-sequence streak depends only on the shape of the actions.
    """
    workflow = _make_workflow([_Block("open", url="x")])
    initial_state = {
        "last_test_suspicious_success": False,
        "last_test_failure_reason": "Validation failed: name is required",
        "last_frontier_start_label": "open",
        "last_executed_block_labels": ["open"],
        "last_workflow": workflow,
        "last_failure_signature": None,
        "last_frontier_fingerprint": compute_frontier_fingerprint(["open"], workflow.workflow_definition),
        "repeated_failure_streak_count": 0,
        "repeated_failure_nudge_emitted_at_streak": 0,
        "pending_action_sequence_fingerprint": "fp_same",
        "last_action_sequence_fingerprint": None,
        "repeated_action_fingerprint_streak_count": 0,
    }
    ctx = SimpleNamespace(**initial_state)

    update_repeated_failure_state(ctx, {"ok": False, "data": {}})
    assert ctx.repeated_action_fingerprint_streak_count == 1

    # Second run: same action fingerprint but new failure-reason text, so the
    # frontier streak restarts while the action streak carries on.
    ctx.last_test_failure_reason = "Validation failed: email is required"
    ctx.pending_action_sequence_fingerprint = "fp_same"
    update_repeated_failure_state(ctx, {"ok": False, "data": {}})
    assert ctx.repeated_action_fingerprint_streak_count == 2
    assert ctx.repeated_failure_streak_count == 1