# Skyvern/tests/unit/test_copilot_response_coverage_gate.py

"""Tests for the response-aware coverage gate in _check_enforcement.

Covers a regression where a 2-action user goal slipped past enforcement
because the old `premature_completion_nudge_done` latch was bypassed by a
no-op turn (model emits REPLY JSON without calling any update tool). The
new gate peeks at the final response text to distinguish REPLY with a
coverage gap (nudge), REPLY with progress-narration prose (nudge), and
ASK_QUESTION (always allowed).
"""

from __future__ import annotations

import json
from dataclasses import dataclass, field
from typing import Any

from skyvern.forge.sdk.copilot.enforcement import (
    MAX_FORMAT_NUDGES,
    MAX_INTERMEDIATE_NUDGES,
    POST_FORMAT_NUDGE,
    POST_INTERMEDIATE_SUCCESS_NUDGE,
    _check_enforcement,
    _is_progress_narration,
    _response_coverage_nudge,
)


class _Ctx:
    """Minimal stand-in for CopilotContext used in enforcement checks."""

    def __init__(self) -> None:
        self.navigate_called = False
        self.observation_after_navigate = False
        self.navigate_enforcement_done = False
        self.update_workflow_called = False
        self.test_after_update_done = False
        self.post_update_nudge_count = 0
        self.coverage_nudge_count = 0
        self.format_nudge_count = 0
        self.user_message = ""
        self.last_update_block_count = None
        self.last_test_ok = None
        self.last_test_failure_reason = None
        self.last_test_suspicious_success = False
        self.last_test_anti_bot = None
        self.failed_test_nudge_count = 0
        self.explore_without_workflow_nudge_count = 0
        self.null_data_streak_count = 0
        self.repeated_failure_streak_count = 0
        self.repeated_failure_nudge_emitted_at_streak = 0


@dataclass
class _FakeRunResult:
    """Stand-in for RunResultStreaming — exposes only what extract_final_text uses."""

    final_output: Any = None
    new_items: list[Any] = field(default_factory=list)


def _reply_result(user_response: str) -> _FakeRunResult:
    """Wrap ``user_response`` in a JSON-encoded REPLY payload on a fake run result."""
    payload = {"type": "REPLY", "user_response": user_response}
    return _FakeRunResult(final_output=json.dumps(payload))


def _ask_question_result(question: str) -> _FakeRunResult:
    """Wrap ``question`` in a JSON-encoded ASK_QUESTION payload on a fake run result."""
    payload = {"type": "ASK_QUESTION", "user_response": question}
    return _FakeRunResult(final_output=json.dumps(payload))


def _post_success_ctx(user_message: str, block_count: int = 1) -> _Ctx:
    """Return a context in the 'workflow test passed' state.

    This is the state that would previously have triggered the
    intermediate-success nudge. ``block_count`` is recorded as the block
    count of the last workflow update.
    """
    ctx = _Ctx()
    ctx.user_message = user_message
    ctx.last_update_block_count = block_count
    # Workflow was updated, tested afterwards, and that test passed.
    ctx.update_workflow_called = True
    ctx.test_after_update_done = True
    ctx.last_test_ok = True
    return ctx


# ---------------------------------------------------------------------------
# _response_coverage_nudge — direct unit tests
# ---------------------------------------------------------------------------


def test_reply_with_coverage_gap_fires_nudge() -> None:
    """A REPLY on a 1-block workflow for a go-to-and-download goal is nudged."""
    ctx = _post_success_ctx("go to example.com and download the regulations")
    reply = {"type": "REPLY", "user_response": "I created a nav block."}

    outcome = _response_coverage_nudge(ctx, reply)

    assert outcome == POST_INTERMEDIATE_SUCCESS_NUDGE
    # The fired nudge is recorded on the context's coverage counter.
    assert ctx.coverage_nudge_count == 1


def test_coverage_nudge_respects_counter_cap() -> None:
    """The coverage nudge fires MAX_INTERMEDIATE_NUDGES times and then stops."""
    ctx = _post_success_ctx("go to X and download Y")
    reply = {"type": "REPLY", "user_response": "one block draft"}

    for _attempt in range(MAX_INTERMEDIATE_NUDGES):
        outcome = _response_coverage_nudge(ctx, reply)
        assert outcome == POST_INTERMEDIATE_SUCCESS_NUDGE

    # Budget exhausted: the very next identical response is let through.
    assert _response_coverage_nudge(ctx, reply) is None


def test_ask_question_always_passes_through_even_with_coverage_gap() -> None:
    """ASK_QUESTION is not nudged and does not advance the coverage counter."""
    ctx = _post_success_ctx("go to site and download file")
    question = {"type": "ASK_QUESTION", "user_response": "Which file do you mean?"}

    outcome = _response_coverage_nudge(ctx, question)

    assert outcome is None
    assert ctx.coverage_nudge_count == 0


def test_reply_without_coverage_gap_passes_through() -> None:
    """A REPLY is let through when the block count matches the 2-action goal."""
    # Two actions requested, two blocks recorded: nothing is left uncovered.
    ctx = _post_success_ctx("go to X and download Y", block_count=2)
    reply = {"type": "REPLY", "user_response": "Done. I created a 2-block workflow."}

    outcome = _response_coverage_nudge(ctx, reply)

    assert outcome is None


def test_reply_before_any_successful_test_passes_through() -> None:
    """No nudge is produced while no test result has been recorded."""
    # A fresh context keeps last_test_ok at None, i.e. no successful test yet.
    ctx = _Ctx()
    ctx.user_message = "go to X and download Y"
    reply = {"type": "REPLY", "user_response": "Working on it."}

    outcome = _response_coverage_nudge(ctx, reply)

    assert outcome is None


def test_reply_after_failed_test_passes_through() -> None:
    """No nudge is produced when the last test failed."""
    ctx = _post_success_ctx("go to X and download Y")
    # Override the helper's default: the most recent test did not pass.
    ctx.last_test_ok = False
    reply = {"type": "REPLY", "user_response": "The test failed."}

    outcome = _response_coverage_nudge(ctx, reply)

    assert outcome is None


# ---------------------------------------------------------------------------
# Progress-narration heuristic
# ---------------------------------------------------------------------------


def test_is_progress_narration_detects_future_tense() -> None:
    """Future-tense status prose is classified as progress narration."""
    # Verbatim wording from the regression trace that escaped enforcement,
    # laid out one sentence per fragment.
    narrated = (
        "I ran the first block (open_home). "
        "The navigation block completed. "
        "I did not attempt further blocks yet. "
        "Next I will proceed to run the remaining blocks to locate and "
        "download the regulations unless you want a change."
    )
    assert _is_progress_narration(narrated)


def test_is_progress_narration_ignores_clean_reply() -> None:
    """Completed-work summaries are not classified as progress narration."""
    clean_replies = (
        "I created a 2-block workflow that extracts the top posts.",
        "The workflow is ready. 3 blocks: nav, extract, summarize.",
    )
    for reply_text in clean_replies:
        assert not _is_progress_narration(reply_text)


def test_is_progress_narration_empty_inputs() -> None:
    """Empty string and None are both treated as 'not narration'."""
    blank = ""
    assert not _is_progress_narration(blank)
    # None is passed deliberately even though the type checker rejects it.
    assert not _is_progress_narration(None)  # type: ignore[arg-type]


def test_format_nudge_fires_for_progress_narration_without_coverage_gap() -> None:
    """With no coverage gap, future-tense prose still draws the format nudge."""
    # Two blocks for a 2-action goal, so only the wording is at fault here.
    ctx = _post_success_ctx("go to X and download Y", block_count=2)
    narrated = "I ran the first block. Next I will proceed to add the rest."
    reply = {"type": "REPLY", "user_response": narrated}

    outcome = _response_coverage_nudge(ctx, reply)

    assert outcome == POST_FORMAT_NUDGE
    assert ctx.format_nudge_count == 1


def test_format_nudge_respects_counter_cap() -> None:
    """The format nudge fires MAX_FORMAT_NUDGES times and then stops."""
    ctx = _post_success_ctx("go to X and download Y", block_count=2)
    reply = {"type": "REPLY", "user_response": "Next I will proceed."}

    for _attempt in range(MAX_FORMAT_NUDGES):
        outcome = _response_coverage_nudge(ctx, reply)
        assert outcome == POST_FORMAT_NUDGE

    # Budget exhausted: the next identical response is let through.
    assert _response_coverage_nudge(ctx, reply) is None


def test_coverage_nudge_takes_priority_over_format_nudge() -> None:
    """When both conditions hold, only the coverage nudge fires and is counted."""
    # One block for a 2-action goal (coverage gap) plus future-tense prose.
    ctx = _post_success_ctx("go to X and download Y", block_count=1)
    reply = {"type": "REPLY", "user_response": "Next I will proceed with more blocks."}

    outcome = _response_coverage_nudge(ctx, reply)

    assert outcome == POST_INTERMEDIATE_SUCCESS_NUDGE
    assert ctx.coverage_nudge_count == 1
    # The format counter stays untouched.
    assert ctx.format_nudge_count == 0


# ---------------------------------------------------------------------------
# Integrated _check_enforcement — no-op-turn bypass closed (main regression)
# ---------------------------------------------------------------------------


def test_no_op_turn_bypass_closed_goes_to_phrasing() -> None:
    """Replay the regression's final turn with the 'goes to' phrasing.

    Scenario: the workflow has 1 block, its test passed, and the model emits
    a REPLY without any new tool calls. Before the fix, the latch blocked
    re-nudging; after the fix, the response-aware gate fires.

    For this particular user message the lexical coverage heuristic matches
    only 'download': the message says 'goes to', which is not the 'go to'
    bigram. The coverage branch therefore lets the response through, and
    the progress-narration format branch catches the future-tense REPLY
    instead.
    """
    user_goal = "make a workflow that goes to example.com and downloads the latest regulations"
    ctx = _post_success_ctx(user_goal)
    narrated = (
        "I ran the first block (open_home). "
        "The navigation block completed. "
        "I did not attempt further blocks yet. "
        "Next I will proceed."
    )

    outcome = _check_enforcement(ctx, _reply_result(narrated))

    # Either branch would close the regression; pin the format branch since
    # the coverage heuristic misses this phrasing.
    assert outcome == POST_FORMAT_NUDGE


def test_no_op_turn_bypass_closed_multi_action() -> None:
    """Same structural bug, with a message the coverage heuristic does match.

    The user message spells out 'go to' and 'download' explicitly, so the
    coverage branch is the one that fires.
    """
    ctx = _post_success_ctx("go to example.com and download the regulations")
    final_turn = _reply_result("Ran one block; will do the rest next.")

    outcome = _check_enforcement(ctx, final_turn)

    assert outcome == POST_INTERMEDIATE_SUCCESS_NUDGE


def test_ask_question_reaches_user_after_any_state() -> None:
    """Regression guard for CORR-2.

    With the intermediate-success latch removed, ASK_QUESTION must still be
    let through even when coverage is incomplete, so the agent can ask for
    credentials or disambiguate.
    """
    ctx = _post_success_ctx("login and download my records")
    question_turn = _ask_question_result("Which credential should I use for this login?")

    outcome = _check_enforcement(ctx, question_turn)

    assert outcome is None


def test_check_enforcement_without_result_skips_response_peek() -> None:
    """State-based branches still fire when ``result`` is None.

    The pre-screenshot-handoff path passes result=None, so the response
    peek is skipped.
    """
    ctx = _Ctx()
    # Navigation happened, but observation_after_navigate and
    # navigate_enforcement_done both keep their default of False.
    ctx.navigate_called = True

    outcome = _check_enforcement(ctx, None)

    # The navigate nudge fires.
    assert outcome is not None


def test_check_enforcement_clean_reply_passes_through() -> None:
    """A finished-work REPLY on a 2-block workflow produces no nudge."""
    ctx = _post_success_ctx(
        "go to example.com and extract the top 3 stories",
        block_count=2,
    )
    final_turn = _reply_result("I created a 2-block workflow that extracts the top 3 stories.")

    outcome = _check_enforcement(ctx, final_turn)

    assert outcome is None