mirror of
https://github.com/Skyvern-AI/skyvern.git
synced 2026-04-28 03:30:10 +00:00
282 lines
11 KiB
Python
282 lines
11 KiB
Python
from __future__ import annotations
|
|
|
|
from types import SimpleNamespace
|
|
from typing import Any
|
|
|
|
import pytest
|
|
|
|
from skyvern.forge.sdk.copilot.failure_tracking import (
|
|
compute_failure_signature,
|
|
compute_frontier_fingerprint,
|
|
update_repeated_failure_state,
|
|
)
|
|
|
|
|
|
class _Block:
    """Minimal stand-in for a workflow block: a ``label`` plus free-form config."""

    def __init__(self, label: str, **config: Any) -> None:
        # Keep the raw keyword config so ``model_dump`` can replay it later.
        self._config = config
        self.label = label

    def model_dump(self, **_: Any) -> dict[str, Any]:
        """Return the label followed by every config entry whose value is not ``None``.

        Any keyword arguments are accepted and ignored.
        """
        dumped: dict[str, Any] = {"label": self.label}
        for key, value in self._config.items():
            if value is not None:
                dumped[key] = value
        return dumped
|
|
|
|
|
|
def _make_workflow(blocks: list[_Block]) -> Any:
    """Wrap *blocks* in a namespace that exposes ``workflow_definition.blocks``."""
    return SimpleNamespace(workflow_definition=SimpleNamespace(blocks=blocks))
|
|
|
|
|
|
def _make_ctx(
    *,
    suspicious: bool = False,
    failure_reason: str | None = "Timeout on element",
    frontier_label: str = "extract",
    executed_labels: list[str] | None = None,
    workflow: Any = None,
    last_signature: str | None = None,
    last_fingerprint: str = "",
    streak: int = 0,
    nudge_streak: int = 0,
) -> SimpleNamespace:
    """Build a namespace carrying the context attributes used by the tests in this file.

    Each keyword argument is stored under the corresponding ``last_*`` /
    ``repeated_failure_*`` attribute. ``executed_labels`` falls back to
    ``["open", "extract"]`` only when it is ``None``.
    """
    # Compare against None instead of relying on truthiness, so an explicitly
    # empty list is passed through rather than replaced by the default.
    labels = ["open", "extract"] if executed_labels is None else executed_labels
    return SimpleNamespace(
        last_test_suspicious_success=suspicious,
        last_test_failure_reason=failure_reason,
        last_frontier_start_label=frontier_label,
        last_executed_block_labels=labels,
        last_workflow=workflow,
        last_failure_signature=last_signature,
        last_frontier_fingerprint=last_fingerprint,
        repeated_failure_streak_count=streak,
        repeated_failure_nudge_emitted_at_streak=nudge_streak,
    )
|
|
|
|
|
|
def test_signature_stable_for_same_inputs() -> None:
    """Two calls with equal arguments must return the same, non-``None`` signature."""
    first = compute_failure_signature("extract", "Timeout", [{"category": "NETWORK_ERROR"}], False)
    second = compute_failure_signature("extract", "Timeout", [{"category": "NETWORK_ERROR"}], False)

    assert first == second
    assert first is not None
|
|
|
|
|
|
def test_signature_differs_on_reason_change() -> None:
    """Changing only the failure reason must change the signature."""
    timeout_signature = compute_failure_signature("extract", "Timeout", None, False)
    denied_signature = compute_failure_signature("extract", "Permission denied", None, False)

    assert timeout_signature != denied_signature
|
|
|
|
|
|
def test_signature_collapses_parameter_binding_error() -> None:
    """PARAMETER_BINDING_ERROR failures hash alike even when the reason text differs."""
    # The failure_reason of a PARAMETER_BINDING_ERROR carries the offending key
    # names; runs that differ only in those names must still count as repeats.
    reasons = (
        "missing parameter 'x' at path foo.bar",
        "missing parameter 'y' at path baz.qux",
    )
    signature_x, signature_y = (
        compute_failure_signature("block_a", reason, [{"category": "PARAMETER_BINDING_ERROR"}], False)
        for reason in reasons
    )

    assert signature_x == signature_y
|
|
|
|
|
|
def test_fingerprint_empty_without_workflow_definition() -> None:
    """With no workflow definition the fingerprint is the empty string."""
    fingerprint = compute_frontier_fingerprint(["open"], None)

    assert fingerprint == ""
|
|
|
|
|
|
def test_fingerprint_changes_when_block_config_changes() -> None:
    """Two workflows whose ``open`` block differs only by URL fingerprint differently."""
    workflow_a = _make_workflow([_Block("open", url="https://a.test")])
    workflow_b = _make_workflow([_Block("open", url="https://b.test")])

    fingerprint_a = compute_frontier_fingerprint(["open"], workflow_a.workflow_definition)
    fingerprint_b = compute_frontier_fingerprint(["open"], workflow_b.workflow_definition)

    assert fingerprint_a != fingerprint_b
|
|
|
|
|
|
def test_streak_increments_when_signature_and_fingerprint_repeat() -> None:
    """A failing run matching the stored signature and fingerprint bumps the streak from 1 to 2."""
    workflow = _make_workflow([_Block("open", url="x"), _Block("extract", schema="s")])
    prior_fingerprint = compute_frontier_fingerprint(["open", "extract"], workflow.workflow_definition)
    prior_signature = compute_failure_signature("extract", "Timeout on element", None, False)
    ctx = _make_ctx(
        workflow=workflow,
        last_signature=prior_signature,
        last_fingerprint=prior_fingerprint,
        streak=1,
    )

    failed_result = {"ok": False, "data": {}}
    update_repeated_failure_state(ctx, failed_result)

    assert ctx.repeated_failure_streak_count == 2
    # The stored signature and fingerprint are unchanged by the repeat.
    assert ctx.last_failure_signature == prior_signature
    assert ctx.last_frontier_fingerprint == prior_fingerprint
|
|
|
|
|
|
def test_streak_resets_on_meaningful_success() -> None:
    """An ``ok`` result zeroes the streak and nudge marker and clears the signature."""
    prior_state = {
        "executed_labels": ["open"],
        "last_signature": "prior",
        "last_fingerprint": "prior",
        "streak": 4,
        "nudge_streak": 3,
    }
    ctx = _make_ctx(workflow=_make_workflow([_Block("open", url="x")]), **prior_state)

    success_result = {"ok": True, "data": {}}
    update_repeated_failure_state(ctx, success_result)

    assert ctx.repeated_failure_streak_count == 0
    assert ctx.last_failure_signature is None
    assert ctx.repeated_failure_nudge_emitted_at_streak == 0
|
|
|
|
|
|
@pytest.mark.parametrize(
    "case_kwargs",
    [
        pytest.param(
            {
                "failure_reason": "Totally new error",
                "last_signature": "extract|old-reason|",
                "last_fingerprint_matches": True,
            },
            id="new_signature",
        ),
        pytest.param(
            {
                "failure_reason": "Timeout on element",
                "last_signature_matches": True,
                "last_fingerprint": "different-fingerprint-from-prior-run",
            },
            id="new_fingerprint",
        ),
    ],
)
def test_streak_and_nudge_reset_on_change(case_kwargs: dict[str, Any]) -> None:
    """A failure whose signature or fingerprint differs from the stored one restarts the streak.

    Each case keeps one of the two stored values equal to the freshly computed
    one (``*_matches`` flag) and supplies a literal for the other. After the
    update the streak must be 1 and the nudge marker 0.
    """
    wf = _make_workflow([_Block("open", url="x"), _Block("extract", schema="s")])
    fingerprint = compute_frontier_fingerprint(["open", "extract"], wf.workflow_definition)
    signature = compute_failure_signature("extract", "Timeout on element", None, False)

    # Read the flags with ``get`` rather than ``pop``: pytest passes the very
    # same dict object on every invocation of a case, so removing the flag
    # would make a repeated invocation fall through to a missing key.
    if case_kwargs.get("last_signature_matches", False):
        last_signature = signature
    else:
        last_signature = case_kwargs["last_signature"]

    if case_kwargs.get("last_fingerprint_matches", False):
        last_fingerprint = fingerprint
    else:
        last_fingerprint = case_kwargs["last_fingerprint"]

    ctx = _make_ctx(
        workflow=wf,
        failure_reason=case_kwargs["failure_reason"],
        last_signature=last_signature,
        last_fingerprint=last_fingerprint,
        streak=3,
        nudge_streak=2,
    )
    update_repeated_failure_state(ctx, {"ok": False, "data": {}})

    assert ctx.repeated_failure_streak_count == 1
    assert ctx.repeated_failure_nudge_emitted_at_streak == 0
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Action-sequence fingerprint streak #
|
|
# --------------------------------------------------------------------------- #
|
|
|
|
|
|
def _make_action_ctx(
    *,
    pending_fingerprint: str | None,
    last_fingerprint: str | None = None,
    streak: int = 0,
    failure_reason: str | None = "fail",
    last_signature: str | None = "sig_prior",
    last_frontier_fingerprint: str = "fp_prior",
) -> SimpleNamespace:
    """Context builder for the action-sequence tests.

    Mirrors what ``_run_blocks_and_collect_debug`` sets on CopilotContext
    before the ``update_repeated_failure_state`` call: a ``_make_ctx``
    namespace for a single ``open`` block, extended with the three
    action-sequence attributes.
    """
    single_block_workflow = _make_workflow([_Block("open", url="x")])
    ctx = _make_ctx(
        workflow=single_block_workflow,
        failure_reason=failure_reason,
        executed_labels=["open"],
        last_signature=last_signature,
        last_fingerprint=last_frontier_fingerprint,
    )

    # Action-sequence state is not part of ``_make_ctx``; attach it here.
    ctx.pending_action_sequence_fingerprint = pending_fingerprint
    ctx.last_action_sequence_fingerprint = last_fingerprint
    ctx.repeated_action_fingerprint_streak_count = streak
    return ctx
|
|
|
|
|
|
def test_action_fingerprint_streak_increments_on_repeat() -> None:
    """Three failing runs with the same pending fingerprint count 1, 2, 3."""
    failed_result_factory = lambda: {"ok": False, "data": {}}  # noqa: E731 - fresh payload per call

    ctx = _make_action_ctx(pending_fingerprint="fp_1")
    update_repeated_failure_state(ctx, failed_result_factory())
    assert ctx.repeated_action_fingerprint_streak_count == 1
    assert ctx.last_action_sequence_fingerprint == "fp_1"

    # Re-arm the pending fingerprint before each further run, as the first run did.
    for expected_streak in (2, 3):
        ctx.pending_action_sequence_fingerprint = "fp_1"
        update_repeated_failure_state(ctx, failed_result_factory())
        assert ctx.repeated_action_fingerprint_streak_count == expected_streak
|
|
|
|
|
|
def test_action_fingerprint_streak_resets_when_fingerprint_changes() -> None:
    """A failing run with a new action fingerprint restarts the streak at 1."""
    ctx = _make_action_ctx(pending_fingerprint="fp_1", last_fingerprint="fp_1", streak=2)

    # This run produced a different action sequence than the stored one.
    ctx.pending_action_sequence_fingerprint = "fp_2"
    failed_result = {"ok": False, "data": {}}
    update_repeated_failure_state(ctx, failed_result)

    assert ctx.repeated_action_fingerprint_streak_count == 1
    assert ctx.last_action_sequence_fingerprint == "fp_2"
|
|
|
|
|
|
def test_action_fingerprint_streak_preserved_on_transient_empty_trace() -> None:
    """A run without an action trace must not wipe an in-progress streak.

    Such a run (e.g. no failed block had a task_id) can fall between two
    fingerprint-matching runs; if it erased the streak, one empty trace would
    mask a real loop.
    """
    ctx = _make_action_ctx(pending_fingerprint=None, last_fingerprint="fp_1", streak=1)

    failed_result = {"ok": False, "data": {}}
    update_repeated_failure_state(ctx, failed_result)

    assert ctx.repeated_action_fingerprint_streak_count == 1
    # With pending set to None the stored fingerprint is cleared, so the next
    # run's fingerprint is compared against None — the correct "no match".
    assert ctx.last_action_sequence_fingerprint is None
|
|
|
|
|
|
def test_action_fingerprint_streak_resets_on_meaningful_success() -> None:
    """A clean ``ok`` run zeroes the action streak and moves pending into last."""
    ctx = _make_action_ctx(pending_fingerprint="fp_1", last_fingerprint="fp_1", streak=2)
    # Shape the context like a clean success: no failure reason, not suspicious.
    ctx.last_test_failure_reason = None
    ctx.last_test_suspicious_success = False

    success_result = {"ok": True, "data": {}}
    update_repeated_failure_state(ctx, success_result)

    assert ctx.repeated_action_fingerprint_streak_count == 0
    # Pending is promoted to last on success so the next failure can compare against it.
    assert ctx.last_action_sequence_fingerprint == "fp_1"
    assert ctx.pending_action_sequence_fingerprint is None
|
|
|
|
|
|
def test_action_fingerprint_streak_independent_of_frontier_streak() -> None:
    """The action-sequence streak keeps counting while the failure reason text changes.

    The frontier-based streak needs the signature (derived from failure
    reason / categories) to match and therefore resets; the action-sequence
    streak depends only on the action shape.
    """
    workflow = _make_workflow([_Block("open", url="x")])
    frontier_fingerprint = compute_frontier_fingerprint(["open"], workflow.workflow_definition)
    initial_state: dict[str, Any] = {
        "last_test_suspicious_success": False,
        "last_test_failure_reason": "Validation failed: name is required",
        "last_frontier_start_label": "open",
        "last_executed_block_labels": ["open"],
        "last_workflow": workflow,
        "last_failure_signature": None,
        "last_frontier_fingerprint": frontier_fingerprint,
        "repeated_failure_streak_count": 0,
        "repeated_failure_nudge_emitted_at_streak": 0,
        "pending_action_sequence_fingerprint": "fp_same",
        "last_action_sequence_fingerprint": None,
        "repeated_action_fingerprint_streak_count": 0,
    }
    ctx = SimpleNamespace(**initial_state)

    update_repeated_failure_state(ctx, {"ok": False, "data": {}})
    assert ctx.repeated_action_fingerprint_streak_count == 1

    # Second run: same action fingerprint, different failure-reason text.
    # The frontier streak restarts while the action streak keeps counting.
    ctx.last_test_failure_reason = "Validation failed: email is required"
    ctx.pending_action_sequence_fingerprint = "fp_same"
    update_repeated_failure_state(ctx, {"ok": False, "data": {}})

    assert ctx.repeated_action_fingerprint_streak_count == 2
    assert ctx.repeated_failure_streak_count == 1
|