# Skyvern/tests/unit/test_copilot_non_retriable_nav.py

"""Tests for non-retriable navigation error handling in the copilot layer.

Covers SKY-9136: when the browser layer raises ``FailedToNavigateToUrl`` with
a DNS / cert / SSL / invalid-URL pattern (``SKIP_INNER_NAV_RETRY_ERRORS``),
the copilot must surface the real error instead of "Unknown error", must not
keep retrying, and must fail deterministically even if the model tries to
narrate a completion.
"""

from __future__ import annotations

from types import SimpleNamespace

import pytest

from skyvern.forge.sdk.copilot.context import CopilotContext
from skyvern.forge.sdk.copilot.enforcement import (
    POST_NON_RETRIABLE_NAV_ERROR_STOP_NUDGE,
    CopilotNonRetriableNavError,
    _check_enforcement,
    _extract_url_from_nav_error,
    _maybe_raise_non_retriable_nav,
    _needs_failed_test_nudge,
    _needs_repeated_null_data_nudge,
    _needs_suspicious_success_nudge,
    _non_retriable_nav_error_nudge,
    _repeated_frontier_failure_nudge,
)
from skyvern.forge.sdk.copilot.tools import (
    _detect_non_retriable_nav_error,
    _record_run_blocks_result,
    _record_workflow_update_result,
    _tool_loop_error,
)

# Canned failure reasons shared by the tests below. The first two carry
# net::ERR_* codes that the tests expect to be detected as non-retriable; the
# last one is an ordinary failure that the tests expect to be left alone.
_DNS_FAILURE_REASON = (
    "Failed to navigate to url https://www.example.invalid/path. "
    "Error message: net::ERR_NAME_NOT_RESOLVED"
)
_CERT_FAILURE_REASON = (
    "Failed to navigate to url https://expired.example. "
    "Error message: net::ERR_CERT_DATE_INVALID"
)
_GENERIC_FAILURE_REASON = "Timeout waiting for element #submit"


def _fresh_context() -> CopilotContext:
    """Build a minimal ``CopilotContext`` with placeholder identifiers.

    The stream is an empty ``SimpleNamespace`` stand-in, hence the type
    ignore on that argument.
    """
    placeholder_stream = SimpleNamespace()
    context = CopilotContext(
        organization_id="o",
        workflow_id="w",
        workflow_permanent_id="wp",
        workflow_yaml="",
        browser_session_id=None,
        stream=placeholder_stream,  # type: ignore[arg-type]
    )
    return context


# ---------------------------------------------------------------------------
# _detect_non_retriable_nav_error
# ---------------------------------------------------------------------------


def test_detect_matches_dns_error_in_block_failure_reason() -> None:
    """A DNS failure recorded on a block is returned verbatim by the detector."""
    failed_block = {"failure_reason": _DNS_FAILURE_REASON}
    payload = {"ok": False, "data": {"blocks": [failed_block]}}
    detected = _detect_non_retriable_nav_error(payload)
    assert detected == _DNS_FAILURE_REASON


def test_detect_matches_cert_error_in_run_level_failure_reason() -> None:
    """A certificate failure on the run-level ``failure_reason`` is detected even with no blocks."""
    run_data = {"failure_reason": _CERT_FAILURE_REASON, "blocks": []}
    payload = {"ok": False, "data": run_data}
    detected = _detect_non_retriable_nav_error(payload)
    assert detected == _CERT_FAILURE_REASON


def test_detect_matches_invalid_url_error() -> None:
    """A block failure carrying ``net::ERR_INVALID_URL`` is detected."""
    invalid_url = "Failed to navigate to url not-a-url. Error message: net::ERR_INVALID_URL"
    failed_block = {"failure_reason": invalid_url}
    payload = {"ok": False, "data": {"blocks": [failed_block]}}
    detected = _detect_non_retriable_nav_error(payload)
    assert detected == invalid_url


def test_detect_matches_name_resolution_failed() -> None:
    """A reason mentioning ``net::ERR_NAME_RESOLUTION_FAILED`` is detected."""
    reason = "net::ERR_NAME_RESOLUTION_FAILED happened mid-flight"
    failed_block = {"failure_reason": reason}
    payload = {"ok": False, "data": {"blocks": [failed_block]}}
    detected = _detect_non_retriable_nav_error(payload)
    assert detected == reason


def test_detect_matches_ssl_error() -> None:
    """A reason mentioning ``net::ERR_SSL_PROTOCOL_ERROR`` is detected."""
    reason = "SSL error: net::ERR_SSL_PROTOCOL_ERROR"
    failed_block = {"failure_reason": reason}
    payload = {"ok": False, "data": {"blocks": [failed_block]}}
    detected = _detect_non_retriable_nav_error(payload)
    assert detected == reason


def test_detect_returns_none_for_generic_failure() -> None:
    """An ordinary (non-navigation) block failure is not flagged."""
    failed_block = {"failure_reason": _GENERIC_FAILURE_REASON}
    payload = {"ok": False, "data": {"blocks": [failed_block]}}
    detected = _detect_non_retriable_nav_error(payload)
    assert detected is None


def test_detect_returns_none_for_missing_data() -> None:
    """A result with no ``data`` key yields ``None``."""
    payload = {"ok": False}
    detected = _detect_non_retriable_nav_error(payload)
    assert detected is None


def test_detect_returns_none_for_empty_blocks() -> None:
    """An empty block list with no run-level reason yields ``None``."""
    payload = {"ok": False, "data": {"blocks": []}}
    detected = _detect_non_retriable_nav_error(payload)
    assert detected is None


def test_detect_prefers_run_level_over_block_level() -> None:
    """When both levels match, the run-level reason is the one returned.

    The run-level reason is treated as the authoritative aggregate failure
    the workflow service recorded.
    """
    run_data = {
        "failure_reason": _DNS_FAILURE_REASON,
        "blocks": [{"failure_reason": _CERT_FAILURE_REASON}],
    }
    payload = {"ok": False, "data": run_data}
    detected = _detect_non_retriable_nav_error(payload)
    assert detected == _DNS_FAILURE_REASON


# ---------------------------------------------------------------------------
# _record_run_blocks_result — context flag plumbing
# ---------------------------------------------------------------------------


def test_record_sets_flag_on_dns_failure() -> None:
    """Recording a DNS-failed run stores the reason on the context and marks the test failed."""
    context = _fresh_context()
    context.test_after_update_done = True
    dns_failed_run = {
        "ok": False,
        "data": {"blocks": [{"failure_reason": _DNS_FAILURE_REASON}]},
    }
    _record_run_blocks_result(context, dns_failed_run)
    assert context.last_test_non_retriable_nav_error == _DNS_FAILURE_REASON
    assert context.last_test_ok is False


def test_record_leaves_flag_none_for_generic_failure() -> None:
    """Recording an ordinary failure leaves the non-retriable flag unset."""
    context = _fresh_context()
    context.test_after_update_done = True
    generic_failed_run = {
        "ok": False,
        "data": {"blocks": [{"failure_reason": _GENERIC_FAILURE_REASON}]},
    }
    _record_run_blocks_result(context, generic_failed_run)
    assert context.last_test_non_retriable_nav_error is None


def test_record_clears_flag_on_every_call() -> None:
    """Stale state from a prior run must not leak into the next run's context."""
    context = _fresh_context()
    context.test_after_update_done = True
    # Pretend an earlier run left a non-retriable error behind.
    context.last_test_non_retriable_nav_error = "stale DNS error"
    generic_failed_run = {
        "ok": False,
        "data": {"blocks": [{"failure_reason": _GENERIC_FAILURE_REASON}]},
    }
    _record_run_blocks_result(context, generic_failed_run)
    assert context.last_test_non_retriable_nav_error is None


def test_workflow_update_clears_non_retriable_flag_and_signature_latch() -> None:
    """A successful workflow update resets the non-retriable flag and its latch.

    Codex review P2-2: after a DNS-failed run, if the agent edits the
    workflow (e.g. fixing the URL), the stale flag must not survive the
    edit. Otherwise an exhausted POST_UPDATE_NUDGE on the new draft would
    raise CopilotNonRetriableNavError with the OLD run's error message,
    telling the user to verify a URL they just corrected.
    """
    context = _fresh_context()
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON
    context.non_retriable_nav_error_last_emitted_signature = "dns_signature_123"
    context.last_test_ok = False
    context.workflow_yaml = "updated yaml"

    successful_update = {
        "ok": True,
        "data": {"block_count": 2},
        "_workflow": SimpleNamespace(workflow_id="wf_new"),
    }
    _record_workflow_update_result(context, successful_update)

    assert context.last_test_non_retriable_nav_error is None
    assert context.non_retriable_nav_error_last_emitted_signature is None
    # The other per-test fields are reset too (pre-existing behavior).
    assert context.last_test_ok is None
    assert context.last_test_failure_reason is None


def test_workflow_update_does_not_clear_flag_on_failed_update() -> None:
    """Only a SUCCESSFUL update invalidates prior test state.

    A failed validation attempt must leave the non-retriable flag and the
    signature latch as they were.
    """
    context = _fresh_context()
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON
    context.non_retriable_nav_error_last_emitted_signature = "dns_signature_123"
    context.last_test_ok = False

    failed_update = {"ok": False, "error": "validation failed"}
    _record_workflow_update_result(context, failed_update)

    assert context.last_test_non_retriable_nav_error == _DNS_FAILURE_REASON
    assert context.non_retriable_nav_error_last_emitted_signature == "dns_signature_123"


def test_record_clears_signature_latch_on_meaningful_success() -> None:
    """A real success clears the latch so the stop nudge can fire again.

    Without this, a new bad URL failing later in the same session could not
    re-trigger the stop nudge.
    """
    context = _fresh_context()
    context.test_after_update_done = True
    context.non_retriable_nav_error_last_emitted_signature = "some previous signature"

    completed_extraction = {
        "label": "extract",
        "block_type": "extraction",
        "status": "completed",
        "extracted_data": [{"name": "widget", "price": 1.0}],
    }
    successful_run = {"ok": True, "data": {"blocks": [completed_extraction]}}
    _record_run_blocks_result(context, successful_run)

    assert context.non_retriable_nav_error_last_emitted_signature is None


# ---------------------------------------------------------------------------
# _non_retriable_nav_error_nudge — signature latch
# ---------------------------------------------------------------------------


def test_nudge_helper_fires_first_time() -> None:
    """With the flag set and no latch, the helper returns the stop nudge and a signature."""
    context = _fresh_context()
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON
    outcome = _non_retriable_nav_error_nudge(context)
    assert outcome is not None
    message, emitted_signature = outcome
    assert message == POST_NON_RETRIABLE_NAV_ERROR_STOP_NUDGE
    # The signature must be non-empty.
    assert emitted_signature


def test_nudge_helper_does_not_refire_same_signature() -> None:
    """Once the emitted signature is latched, the same error yields no nudge."""
    context = _fresh_context()
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON
    initial = _non_retriable_nav_error_nudge(context)
    assert initial is not None
    # Latch the signature from the first emission.
    context.non_retriable_nav_error_last_emitted_signature = initial[1]
    repeat = _non_retriable_nav_error_nudge(context)
    assert repeat is None


def test_nudge_helper_refires_on_different_signature() -> None:
    """A different non-retriable error re-fires the stop nudge.

    Scenario: the user corrects from one bad URL to another bad URL in the
    same session without a successful run in between.
    """
    context = _fresh_context()
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON
    dns_outcome = _non_retriable_nav_error_nudge(context)
    assert dns_outcome is not None
    context.non_retriable_nav_error_last_emitted_signature = dns_outcome[1]

    # Swap in a different failure; the latched signature no longer matches.
    context.last_test_non_retriable_nav_error = _CERT_FAILURE_REASON
    cert_outcome = _non_retriable_nav_error_nudge(context)
    assert cert_outcome is not None
    assert cert_outcome[1] != dns_outcome[1]


def test_nudge_helper_returns_none_when_flag_unset() -> None:
    """A fresh context with no recorded nav error produces no nudge."""
    context = _fresh_context()
    outcome = _non_retriable_nav_error_nudge(context)
    assert outcome is None


# ---------------------------------------------------------------------------
# _check_enforcement — branch ordering and latching
# ---------------------------------------------------------------------------


def test_check_enforcement_returns_non_retriable_nudge_before_failed_test() -> None:
    """The stop nudge wins over POST_FAILED_TEST_NUDGE when the flag is set.

    The context is put in the state that would normally trigger the
    failed-test nudge (last test failed, ``test_after_update_done=True``).
    """
    context = _fresh_context()
    context.test_after_update_done = True
    context.last_test_ok = False
    context.last_test_failure_reason = _DNS_FAILURE_REASON
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON

    emitted = _check_enforcement(context)

    assert emitted == POST_NON_RETRIABLE_NAV_ERROR_STOP_NUDGE
    # The emission is latched for future calls with the same signature.
    assert context.non_retriable_nav_error_last_emitted_signature


def test_check_enforcement_one_shot_per_signature() -> None:
    """The stop nudge is emitted once per signature, then enforcement goes quiet."""
    context = _fresh_context()
    context.test_after_update_done = True
    context.last_test_ok = False
    context.last_test_failure_reason = _DNS_FAILURE_REASON
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON

    first_pass = _check_enforcement(context)
    assert first_pass == POST_NON_RETRIABLE_NAV_ERROR_STOP_NUDGE

    # Same signature on the next iteration: the result is None, and it does
    # NOT fall through to POST_FAILED_TEST_NUDGE thanks to competing-branch
    # suppression.
    second_pass = _check_enforcement(context)
    assert second_pass is None


def test_check_enforcement_pre_empts_post_navigate_nudge() -> None:
    """The stop nudge beats the navigate-hygiene nudge.

    Codex review P2-1: a navigate-hygiene state (``navigate_called=True``,
    ``observation_after_navigate=False``) must not steal the nudge slot when
    a non-retriable nav error is also present. POST_NAVIGATE_NUDGE tells the
    agent to observe the page, which does not resolve a DNS failure and
    merely delays the deterministic stop by at least one iteration.
    """
    context = _fresh_context()
    context.navigate_called = True
    context.observation_after_navigate = False
    context.navigate_enforcement_done = False
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON

    emitted = _check_enforcement(context)

    assert emitted == POST_NON_RETRIABLE_NAV_ERROR_STOP_NUDGE


def test_check_enforcement_pre_empts_post_update_nudge() -> None:
    """The stop nudge beats the update-without-test nudge.

    Similar P2-1 case: an update-without-test state must not pre-empt the
    terminal stop. In practice P2-2 clears the flag on update, but if a race
    leaves the flag set, the stop nudge still wins.
    """
    context = _fresh_context()
    context.update_workflow_called = True
    context.test_after_update_done = False
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON

    emitted = _check_enforcement(context)

    assert emitted == POST_NON_RETRIABLE_NAV_ERROR_STOP_NUDGE


# ---------------------------------------------------------------------------
# Competing-branch suppression — each failure helper
# ---------------------------------------------------------------------------


def test_needs_failed_test_nudge_suppressed_when_flag_set() -> None:
    """The failed-test nudge stays silent while a non-retriable nav error is recorded."""
    context = _fresh_context()
    context.test_after_update_done = True
    context.last_test_ok = False
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON
    should_nudge = _needs_failed_test_nudge(context)
    assert should_nudge is False


def test_needs_failed_test_nudge_still_fires_without_flag() -> None:
    """Without the non-retriable flag, a failed test still asks for the failed-test nudge."""
    context = _fresh_context()
    context.test_after_update_done = True
    context.last_test_ok = False
    should_nudge = _needs_failed_test_nudge(context)
    assert should_nudge is True


def test_needs_suspicious_success_nudge_suppressed_when_flag_set() -> None:
    """The suspicious-success nudge stays silent while a non-retriable nav error is recorded."""
    context = _fresh_context()
    context.last_test_suspicious_success = True
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON
    should_nudge = _needs_suspicious_success_nudge(context)
    assert should_nudge is False


def test_needs_repeated_null_data_nudge_suppressed_when_flag_set() -> None:
    """The repeated-null-data nudge stays silent while a non-retriable nav error is recorded."""
    context = _fresh_context()
    context.last_test_suspicious_success = True
    context.null_data_streak_count = 5
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON
    should_nudge = _needs_repeated_null_data_nudge(context)
    assert should_nudge is False


def test_repeated_frontier_failure_nudge_suppressed_when_flag_set() -> None:
    """The repeated-frontier-failure nudge returns None while a non-retriable nav error is recorded."""
    context = _fresh_context()
    context.repeated_failure_streak_count = 5
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON
    outcome = _repeated_frontier_failure_nudge(context)
    assert outcome is None


# ---------------------------------------------------------------------------
# _extract_url_from_nav_error
# ---------------------------------------------------------------------------


def test_extract_url_parses_standard_format() -> None:
    """The URL embedded in a standard navigation-failure message is extracted."""
    extracted = _extract_url_from_nav_error(_DNS_FAILURE_REASON)
    assert extracted == "https://www.example.invalid/path"


def test_extract_url_returns_none_on_malformed_message() -> None:
    """A message that does not follow the navigation-failure format yields ``None``."""
    extracted = _extract_url_from_nav_error("some unrelated error text")
    assert extracted is None


# ---------------------------------------------------------------------------
# _maybe_raise_non_retriable_nav — deterministic exit-path
# ---------------------------------------------------------------------------


def test_maybe_raise_noops_when_flag_unset() -> None:
    """On a fresh context the exit-path guard returns without raising."""
    context = _fresh_context()
    # Any exception here fails the test; there is nothing else to assert.
    _maybe_raise_non_retriable_nav(context)


def test_maybe_raise_noops_when_last_test_is_ok() -> None:
    """The guard does not raise when the most recent run succeeded.

    A prior successful run does NOT suppress the exception (per CORR-3), but
    the MOST RECENT run being a real success does, because that means this
    turn's test did not hit the non-retriable path.
    """
    context = _fresh_context()
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON
    context.last_test_ok = True
    # Any exception here fails the test; there is nothing else to assert.
    _maybe_raise_non_retriable_nav(context)


def test_maybe_raise_raises_when_flag_and_last_test_failed() -> None:
    """With the flag set and the last test failed, the guard raises with the message and URL."""
    context = _fresh_context()
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON
    context.last_test_ok = False
    with pytest.raises(CopilotNonRetriableNavError) as captured:
        _maybe_raise_non_retriable_nav(context)
    raised = captured.value
    assert raised.error_message == _DNS_FAILURE_REASON
    assert raised.url == "https://www.example.invalid/path"


def test_maybe_raise_raises_when_last_test_ok_is_none() -> None:
    """An ambiguous ``last_test_ok=None`` with the flag set still raises.

    The guard is ``last_test_ok is not True`` (not ``is False``), so an
    ambiguous None (e.g. a suspicious-success run) surfaces the cached nav
    failure rather than letting the loop return silently. This locks in the
    tri-state semantics.
    """
    context = _fresh_context()
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON
    context.last_test_ok = None
    with pytest.raises(CopilotNonRetriableNavError):
        _maybe_raise_non_retriable_nav(context)


def test_maybe_raise_raises_when_prior_run_succeeded_but_current_failed() -> None:
    """A fresh non-retriable failure raises even after an earlier success.

    Codex CORR-3: the guard must NOT be gated on session history. There is
    no ``any_test_ok_ever`` flag; the helper only inspects current state.
    The session history is driven through ``_record_run_blocks_result`` (a
    successful run, then a DNS-failed run) so the scenario in the test name
    is actually exercised.
    """
    ctx = _fresh_context()
    ctx.test_after_update_done = True

    # Earlier run in the same session: a real success with extracted data.
    _record_run_blocks_result(
        ctx,
        {
            "ok": True,
            "data": {
                "blocks": [
                    {
                        "label": "extract",
                        "block_type": "extraction",
                        "status": "completed",
                        "extracted_data": [{"x": 1}],
                    }
                ],
            },
        },
    )

    # Most recent run: a permanent DNS failure.
    _record_run_blocks_result(
        ctx,
        {
            "ok": False,
            "data": {"blocks": [{"failure_reason": _DNS_FAILURE_REASON}]},
        },
    )
    assert ctx.last_test_ok is False

    with pytest.raises(CopilotNonRetriableNavError) as excinfo:
        _maybe_raise_non_retriable_nav(ctx)
    assert excinfo.value.error_message == _DNS_FAILURE_REASON


# ---------------------------------------------------------------------------
# Sanity: exception carries the expected attributes for the agent handler
# ---------------------------------------------------------------------------


def test_exception_carries_url_and_error_message() -> None:
    """The exception exposes ``url`` and ``error_message`` and includes the message in ``str()``."""
    error = CopilotNonRetriableNavError(url="https://x.test", error_message="some reason")
    rendered = str(error)
    assert error.url == "https://x.test"
    assert error.error_message == "some reason"
    assert "some reason" in rendered


# ---------------------------------------------------------------------------
# Sanity: when flag is set, no failure-nudge branch competes
# ---------------------------------------------------------------------------


def test_all_competing_branches_silent_after_latch() -> None:
    """All four competing failure helpers are silent while the flag is set.

    Reproduces the steady state after the one-shot stop nudge has latched:
    a non-retriable error is recorded, ``last_test_ok=False``,
    ``test_after_update_done=True``, and the counters are set so the other
    branches would normally fire.
    """
    context = _fresh_context()
    context.test_after_update_done = True
    context.last_test_ok = False
    context.last_test_suspicious_success = True
    context.null_data_streak_count = 5
    context.repeated_failure_streak_count = 5
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON

    failed_test = _needs_failed_test_nudge(context)
    assert failed_test is False
    suspicious_success = _needs_suspicious_success_nudge(context)
    assert suspicious_success is False
    repeated_null_data = _needs_repeated_null_data_nudge(context)
    assert repeated_null_data is False
    frontier_failure = _repeated_frontier_failure_nudge(context)
    assert frontier_failure is None


def test_without_flag_competing_branches_still_active() -> None:
    """Inverse check: without the flag, the failed-test nudge is still requested.

    Only the failed-test helper is asserted here.
    """
    context = _fresh_context()
    context.test_after_update_done = True
    context.last_test_ok = False
    should_nudge = _needs_failed_test_nudge(context)
    assert should_nudge is True


# ---------------------------------------------------------------------------
# Integration-ish: record -> check -> exception flow
# ---------------------------------------------------------------------------


def test_full_flow_record_then_check_then_raise() -> None:
    """Record a DNS failure, see one stop nudge, then see the exit-path guard raise."""
    context = _fresh_context()
    context.test_after_update_done = True

    # Step 1: a failed run with a DNS error is recorded.
    dns_failed_run = {
        "ok": False,
        "data": {"blocks": [{"failure_reason": _DNS_FAILURE_REASON}]},
    }
    _record_run_blocks_result(context, dns_failed_run)

    # Step 2: enforcement fires the stop nudge.
    first_pass = _check_enforcement(context)
    assert first_pass == POST_NON_RETRIABLE_NAV_ERROR_STOP_NUDGE

    # Step 3: same signature on the next iteration gives no nudge, and the
    # exit-path guard raises because last_test_ok is still False.
    second_pass = _check_enforcement(context)
    assert second_pass is None
    with pytest.raises(CopilotNonRetriableNavError):
        _maybe_raise_non_retriable_nav(context)


def test_full_flow_cleared_after_successful_run() -> None:
    """A real success after a DNS failure clears the latch and disarms the exit-path guard."""
    context = _fresh_context()
    context.test_after_update_done = True

    dns_failed_run = {"ok": False, "data": {"blocks": [{"failure_reason": _DNS_FAILURE_REASON}]}}
    _record_run_blocks_result(context, dns_failed_run)
    stop_nudge = _check_enforcement(context)
    assert stop_nudge == POST_NON_RETRIABLE_NAV_ERROR_STOP_NUDGE

    # Then a real success happens.
    completed_extraction = {
        "label": "extract",
        "block_type": "extraction",
        "status": "completed",
        "extracted_data": [{"x": 1}],
    }
    successful_run = {"ok": True, "data": {"blocks": [completed_extraction]}}
    _record_run_blocks_result(context, successful_run)

    # The signature latch is cleared so a later bad URL can re-fire.
    assert context.non_retriable_nav_error_last_emitted_signature is None
    # Last-test fields now reflect success, so the exit-path guard must not
    # raise; any exception here fails the test.
    _maybe_raise_non_retriable_nav(context)


# ---------------------------------------------------------------------------
# Within-turn fail-fast guard — _tool_loop_error
# ---------------------------------------------------------------------------


def test_tool_loop_error_blocks_update_and_run_blocks_after_dns_failure() -> None:
    """``update_and_run_blocks`` is refused once a permanent nav error is recorded.

    Observed repro: the agent called update_and_run_blocks on
    www.example.invalid, got a DNS failure, then the LLM tried again with the
    bare host on the next tool call, still within the same agent turn. The
    enforcement-loop stop nudge only fires between turns, so the guard must
    live at the tool entrypoint to actually prevent the speculative retry.
    """
    context = _fresh_context()
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON

    message = _tool_loop_error(context, "update_and_run_blocks")

    assert message is not None
    for expected_fragment in (
        "permanent navigation error",
        "net::ERR_NAME_NOT_RESOLVED",
        "Do NOT retry",
    ):
        assert expected_fragment in message


def test_tool_loop_error_blocks_run_blocks_and_collect_debug_after_dns_failure() -> None:
    """``run_blocks_and_collect_debug`` is refused once a permanent nav error is recorded."""
    context = _fresh_context()
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON
    message = _tool_loop_error(context, "run_blocks_and_collect_debug")
    assert message is not None
    assert "permanent navigation error" in message


def test_tool_loop_error_does_not_block_planning_tools() -> None:
    """Planning and inspection tools stay callable after a permanent nav error.

    get_run_results / list_credentials / update_workflow are scoped out of
    BLOCK_RUNNING_TOOLS and should remain callable so the agent can inspect
    the failure and decide how to respond to the user.
    """
    context = _fresh_context()
    context.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON
    planning_tools = ("get_run_results", "list_credentials", "update_workflow")
    for tool in planning_tools:
        verdict = _tool_loop_error(context, tool)
        assert verdict is None, f"{tool} should not be blocked"


def test_tool_loop_error_does_not_block_when_flag_unset() -> None:
    """With no nav error recorded, the block-running tools are not refused."""
    context = _fresh_context()
    update_and_run_verdict = _tool_loop_error(context, "update_and_run_blocks")
    assert update_and_run_verdict is None
    run_and_debug_verdict = _tool_loop_error(context, "run_blocks_and_collect_debug")
    assert run_and_debug_verdict is None