# Skyvern/tests/unit/test_copilot_failure_tracking.py

from __future__ import annotations

from types import SimpleNamespace
from typing import Any

import pytest

from skyvern.forge.sdk.copilot.failure_tracking import (
    compute_failure_signature,
    compute_frontier_fingerprint,
    update_repeated_failure_state,
)


class _Block:
    def __init__(self, label: str, **config: Any) -> None:
        self.label = label
        self._config = config

    def model_dump(self, **_: Any) -> dict[str, Any]:
        return {"label": self.label, **{k: v for k, v in self._config.items() if v is not None}}


def _make_workflow(blocks: list[_Block]) -> Any:
    definition = SimpleNamespace(blocks=blocks)
    return SimpleNamespace(workflow_definition=definition)


def _make_ctx(
    *,
    suspicious: bool = False,
    failure_reason: str | None = "Timeout on element",
    frontier_label: str = "extract",
    executed_labels: list[str] | None = None,
    workflow: Any = None,
    last_signature: str | None = None,
    last_fingerprint: str = "",
    streak: int = 0,
    nudge_streak: int = 0,
) -> SimpleNamespace:
    return SimpleNamespace(
        last_test_suspicious_success=suspicious,
        last_test_failure_reason=failure_reason,
        last_frontier_start_label=frontier_label,
        last_executed_block_labels=executed_labels or ["open", "extract"],
        last_workflow=workflow,
        last_failure_signature=last_signature,
        last_frontier_fingerprint=last_fingerprint,
        repeated_failure_streak_count=streak,
        repeated_failure_nudge_emitted_at_streak=nudge_streak,
    )


def test_signature_stable_for_same_inputs() -> None:
    """Identical inputs yield the same, non-None failure signature."""

    def _sign() -> Any:
        # Fresh argument objects on every call, so equality cannot come from
        # the two calls sharing one list instance.
        return compute_failure_signature("extract", "Timeout", [{"category": "NETWORK_ERROR"}], False)

    first = _sign()
    second = _sign()

    assert first == second
    assert first is not None


def test_signature_differs_on_reason_change() -> None:
    """A different failure reason at the same block yields a different signature."""
    timeout_sig, denied_sig = (
        compute_failure_signature("extract", reason, None, False) for reason in ("Timeout", "Permission denied")
    )
    assert timeout_sig != denied_sig


def test_signature_collapses_parameter_binding_error() -> None:
    """PARAMETER_BINDING_ERROR failures share a signature regardless of key names."""
    # The failure_reason of a PARAMETER_BINDING_ERROR embeds the offending key
    # names; signatures must match across different keys so repeats still count.
    reasons = (
        "missing parameter 'x' at path foo.bar",
        "missing parameter 'y' at path baz.qux",
    )
    signatures = [
        compute_failure_signature("block_a", reason, [{"category": "PARAMETER_BINDING_ERROR"}], False)
        for reason in reasons
    ]
    assert signatures[0] == signatures[1]


def test_fingerprint_empty_without_workflow_definition() -> None:
    """With no workflow definition the fingerprint is the empty string."""
    fingerprint = compute_frontier_fingerprint(["open"], None)
    assert fingerprint == ""


def test_fingerprint_changes_when_block_config_changes() -> None:
    """Same label with a different block config produces a different fingerprint."""

    def _fingerprint_for(url: str) -> Any:
        workflow = _make_workflow([_Block("open", url=url)])
        return compute_frontier_fingerprint(["open"], workflow.workflow_definition)

    assert _fingerprint_for("https://a.test") != _fingerprint_for("https://b.test")


def test_streak_increments_when_signature_and_fingerprint_repeat() -> None:
    """A failure matching the stored signature and fingerprint bumps the streak."""
    workflow = _make_workflow([_Block("open", url="x"), _Block("extract", schema="s")])
    prior_fingerprint = compute_frontier_fingerprint(["open", "extract"], workflow.workflow_definition)
    prior_signature = compute_failure_signature("extract", "Timeout on element", None, False)
    failed_result = {"ok": False, "data": {}}

    # The context already holds the signature/fingerprint this run will
    # produce, with one prior occurrence recorded.
    ctx = _make_ctx(workflow=workflow, last_signature=prior_signature, last_fingerprint=prior_fingerprint, streak=1)
    update_repeated_failure_state(ctx, failed_result)

    assert ctx.repeated_failure_streak_count == 2
    assert ctx.last_failure_signature == prior_signature
    assert ctx.last_frontier_fingerprint == prior_fingerprint


def test_streak_resets_on_meaningful_success() -> None:
    """An ``ok`` result clears the streak, the stored signature and the nudge marker."""
    prior_state = {
        "last_signature": "prior",
        "last_fingerprint": "prior",
        "streak": 4,
        "nudge_streak": 3,
    }
    ctx = _make_ctx(
        workflow=_make_workflow([_Block("open", url="x")]),
        executed_labels=["open"],
        **prior_state,
    )

    update_repeated_failure_state(ctx, {"ok": True, "data": {}})

    assert ctx.repeated_failure_streak_count == 0
    assert ctx.last_failure_signature is None
    assert ctx.repeated_failure_nudge_emitted_at_streak == 0


@pytest.mark.parametrize(
    "case_kwargs",
    [
        pytest.param(
            {
                "failure_reason": "Totally new error",
                "last_signature": "extract|old-reason|",
                "last_fingerprint_matches": True,
            },
            id="new_signature",
        ),
        pytest.param(
            {
                "failure_reason": "Timeout on element",
                "last_signature_matches": True,
                "last_fingerprint": "different-fingerprint-from-prior-run",
            },
            id="new_fingerprint",
        ),
    ],
)
def test_streak_and_nudge_reset_on_change(case_kwargs: dict[str, Any]) -> None:
    """Streak restarts at 1 and the nudge marker clears when either key changes.

    Each case describes the state stored from the previous run:

    * ``new_signature`` — the stored fingerprint matches this run but the
      stored signature does not.
    * ``new_fingerprint`` — the stored signature matches this run but the
      stored fingerprint does not.

    A ``*_matches`` flag in ``case_kwargs`` means "use the value computed for
    this run"; otherwise the literal ``last_signature`` / ``last_fingerprint``
    entry is used.
    """
    wf = _make_workflow([_Block("open", url="x"), _Block("extract", schema="s")])
    fingerprint = compute_frontier_fingerprint(["open", "extract"], wf.workflow_definition)
    signature = compute_failure_signature("extract", "Timeout on element", None, False)

    # Read with .get() rather than .pop(): pytest passes parametrized values
    # to the test without copying, so popping keys would corrupt the case for
    # any later invocation that reuses the same dict.
    if case_kwargs.get("last_signature_matches", False):
        last_signature = signature
    else:
        last_signature = case_kwargs["last_signature"]

    if case_kwargs.get("last_fingerprint_matches", False):
        last_fingerprint = fingerprint
    else:
        last_fingerprint = case_kwargs["last_fingerprint"]

    ctx = _make_ctx(
        workflow=wf,
        failure_reason=case_kwargs["failure_reason"],
        last_signature=last_signature,
        last_fingerprint=last_fingerprint,
        streak=3,
        nudge_streak=2,
    )
    update_repeated_failure_state(ctx, {"ok": False, "data": {}})

    assert ctx.repeated_failure_streak_count == 1
    assert ctx.repeated_failure_nudge_emitted_at_streak == 0


# --------------------------------------------------------------------------- #
# Action-sequence fingerprint streak                                          #
# --------------------------------------------------------------------------- #


def _make_action_ctx(
    *,
    pending_fingerprint: str | None,
    last_fingerprint: str | None = None,
    streak: int = 0,
    failure_reason: str | None = "fail",
    last_signature: str | None = "sig_prior",
    last_frontier_fingerprint: str = "fp_prior",
) -> SimpleNamespace:
    """Ctx builder specific to the action-sequence tests.

    Starts from ``_make_ctx`` with a single-block (``"open"``) workflow, then
    attaches the three action-sequence attributes. This mirrors what
    ``_run_blocks_and_collect_debug`` sets on CopilotContext before the
    ``update_repeated_failure_state`` call.
    """
    single_block_workflow = _make_workflow([_Block("open", url="x")])
    base_kwargs: dict[str, Any] = {
        "workflow": single_block_workflow,
        "failure_reason": failure_reason,
        "executed_labels": ["open"],
        "last_signature": last_signature,
        # ``last_fingerprint`` on _make_ctx is the *frontier* fingerprint;
        # this helper's own ``last_fingerprint`` is the action-sequence one.
        "last_fingerprint": last_frontier_fingerprint,
    }
    action_ctx = _make_ctx(**base_kwargs)

    action_ctx.pending_action_sequence_fingerprint = pending_fingerprint
    action_ctx.last_action_sequence_fingerprint = last_fingerprint
    action_ctx.repeated_action_fingerprint_streak_count = streak
    return action_ctx


def test_action_fingerprint_streak_increments_on_repeat() -> None:
    """Three failing runs with the same action fingerprint count 1, 2, 3."""
    failed_result_args = {"ok": False, "data": {}}

    ctx = _make_action_ctx(pending_fingerprint="fp_1")
    update_repeated_failure_state(ctx, dict(failed_result_args))
    assert ctx.repeated_action_fingerprint_streak_count == 1
    assert ctx.last_action_sequence_fingerprint == "fp_1"

    # Each later run re-arms the pending fingerprint before updating.
    for expected_streak in (2, 3):
        ctx.pending_action_sequence_fingerprint = "fp_1"
        update_repeated_failure_state(ctx, {"ok": False, "data": {}})
        assert ctx.repeated_action_fingerprint_streak_count == expected_streak


def test_action_fingerprint_streak_resets_when_fingerprint_changes() -> None:
    """A new action fingerprint restarts the streak at 1 and becomes the stored one."""
    ctx = _make_action_ctx(pending_fingerprint="fp_1", last_fingerprint="fp_1", streak=2)

    # This run produced a different action sequence from the stored one.
    ctx.pending_action_sequence_fingerprint = "fp_2"
    failed_result = {"ok": False, "data": {}}
    update_repeated_failure_state(ctx, failed_result)

    assert ctx.last_action_sequence_fingerprint == "fp_2"
    assert ctx.repeated_action_fingerprint_streak_count == 1


def test_action_fingerprint_streak_preserved_on_transient_empty_trace() -> None:
    """An empty action trace must not wipe an in-progress streak.

    A run may produce no action trace at all (e.g. no failed block had a
    task_id). If that single run sat between two fingerprint-matching runs and
    erased the streak, it would mask a real loop.
    """
    ctx = _make_action_ctx(pending_fingerprint=None, last_fingerprint="fp_1", streak=1)
    failed_result = {"ok": False, "data": {}}

    update_repeated_failure_state(ctx, failed_result)

    # The streak count survives the empty run.
    assert ctx.repeated_action_fingerprint_streak_count == 1
    # The stored fingerprint is cleared because pending was None, so the next
    # run's fingerprint compares against None — the correct "no match" result.
    assert ctx.last_action_sequence_fingerprint is None


def test_action_fingerprint_streak_resets_on_meaningful_success() -> None:
    """A successful run zeroes the action streak and promotes pending to last."""
    ctx = _make_action_ctx(pending_fingerprint="fp_1", last_fingerprint="fp_1", streak=2)
    # Shape the context like a clean success: no failure reason, not suspicious.
    ctx.last_test_suspicious_success = False
    ctx.last_test_failure_reason = None

    update_repeated_failure_state(ctx, {"ok": True, "data": {}})

    assert ctx.repeated_action_fingerprint_streak_count == 0
    # Pending was moved into last, so the next failure has something to compare with.
    assert ctx.last_action_sequence_fingerprint == "fp_1"
    assert ctx.pending_action_sequence_fingerprint is None


def test_action_fingerprint_streak_independent_of_frontier_streak() -> None:
    """The action-sequence streak keeps counting when the failure text changes.

    The frontier-based streak needs the signature (derived from the failure
    reason / categories) to match, so a new reason resets it. The
    action-sequence streak looks only at the action shape, so it keeps
    climbing across the same two runs.
    """
    workflow = _make_workflow([_Block("open", url="x")])
    ctx = _make_ctx(
        failure_reason="Validation failed: name is required",
        frontier_label="open",
        executed_labels=["open"],
        workflow=workflow,
        last_signature=None,
        last_fingerprint=compute_frontier_fingerprint(["open"], workflow.workflow_definition),
    )
    ctx.pending_action_sequence_fingerprint = "fp_same"
    ctx.last_action_sequence_fingerprint = None
    ctx.repeated_action_fingerprint_streak_count = 0

    # First failing run.
    update_repeated_failure_state(ctx, {"ok": False, "data": {}})
    assert ctx.repeated_action_fingerprint_streak_count == 1

    # Second failing run: same action shape, new failure-reason text. The
    # frontier streak resets while the action streak continues.
    ctx.pending_action_sequence_fingerprint = "fp_same"
    ctx.last_test_failure_reason = "Validation failed: email is required"
    update_repeated_failure_state(ctx, {"ok": False, "data": {}})
    assert ctx.repeated_action_fingerprint_streak_count == 2
    assert ctx.repeated_failure_streak_count == 1