Skyvern/tests/unit/test_copilot_response_coverage_gate.py

246 lines
10 KiB
Python

"""Tests for the response-aware coverage gate in _check_enforcement.
Covers a regression where a 2-action user goal slipped past enforcement
because the old `premature_completion_nudge_done` latch was bypassed by a
no-op turn (model emits REPLY JSON without calling any update tool). The
new gate peeks at the final response text to distinguish REPLY with a
coverage gap (nudge), REPLY with progress-narration prose (nudge), and
ASK_QUESTION (always allowed).
"""
from __future__ import annotations
import json
from dataclasses import dataclass, field
from typing import Any
from skyvern.forge.sdk.copilot.enforcement import (
MAX_FORMAT_NUDGES,
MAX_INTERMEDIATE_NUDGES,
POST_FORMAT_NUDGE,
POST_INTERMEDIATE_SUCCESS_NUDGE,
_check_enforcement,
_is_progress_narration,
_response_coverage_nudge,
)
class _Ctx:
"""Minimal stand-in for CopilotContext used in enforcement checks."""
def __init__(self) -> None:
self.navigate_called = False
self.observation_after_navigate = False
self.navigate_enforcement_done = False
self.update_workflow_called = False
self.test_after_update_done = False
self.post_update_nudge_count = 0
self.coverage_nudge_count = 0
self.format_nudge_count = 0
self.user_message = ""
self.last_update_block_count = None
self.last_test_ok = None
self.last_test_failure_reason = None
self.last_test_suspicious_success = False
self.last_test_anti_bot = None
self.failed_test_nudge_count = 0
self.explore_without_workflow_nudge_count = 0
self.null_data_streak_count = 0
self.repeated_failure_streak_count = 0
self.repeated_failure_nudge_emitted_at_streak = 0
@dataclass
class _FakeRunResult:
"""Stand-in for RunResultStreaming — exposes only what extract_final_text uses."""
final_output: Any = None
new_items: list[Any] = field(default_factory=list)
def _reply_result(user_response: str) -> _FakeRunResult:
return _FakeRunResult(
final_output=json.dumps({"type": "REPLY", "user_response": user_response}),
)
def _ask_question_result(question: str) -> _FakeRunResult:
return _FakeRunResult(
final_output=json.dumps({"type": "ASK_QUESTION", "user_response": question}),
)
def _post_success_ctx(user_message: str, block_count: int = 1) -> _Ctx:
"""Build a ctx in the 'workflow test passed' state that would previously
have triggered the intermediate-success nudge."""
ctx = _Ctx()
ctx.user_message = user_message
ctx.update_workflow_called = True
ctx.test_after_update_done = True
ctx.last_test_ok = True
ctx.last_update_block_count = block_count
return ctx
# ---------------------------------------------------------------------------
# _response_coverage_nudge — direct unit tests
# ---------------------------------------------------------------------------
def test_reply_with_coverage_gap_fires_nudge() -> None:
ctx = _post_success_ctx("go to example.com and download the regulations")
parsed = {"type": "REPLY", "user_response": "I created a nav block."}
nudge = _response_coverage_nudge(ctx, parsed)
assert nudge == POST_INTERMEDIATE_SUCCESS_NUDGE
assert ctx.coverage_nudge_count == 1
def test_coverage_nudge_respects_counter_cap() -> None:
ctx = _post_success_ctx("go to X and download Y")
parsed = {"type": "REPLY", "user_response": "one block draft"}
for _ in range(MAX_INTERMEDIATE_NUDGES):
assert _response_coverage_nudge(ctx, parsed) == POST_INTERMEDIATE_SUCCESS_NUDGE
# One more call — the cap should now let the response through.
assert _response_coverage_nudge(ctx, parsed) is None
def test_ask_question_always_passes_through_even_with_coverage_gap() -> None:
ctx = _post_success_ctx("go to site and download file")
parsed = {"type": "ASK_QUESTION", "user_response": "Which file do you mean?"}
assert _response_coverage_nudge(ctx, parsed) is None
assert ctx.coverage_nudge_count == 0
def test_reply_without_coverage_gap_passes_through() -> None:
# 2-action goal and 2 blocks — no gap.
ctx = _post_success_ctx("go to X and download Y", block_count=2)
parsed = {"type": "REPLY", "user_response": "Done. I created a 2-block workflow."}
assert _response_coverage_nudge(ctx, parsed) is None
def test_reply_before_any_successful_test_passes_through() -> None:
ctx = _Ctx()
ctx.user_message = "go to X and download Y"
# last_test_ok is None — no successful test yet.
parsed = {"type": "REPLY", "user_response": "Working on it."}
assert _response_coverage_nudge(ctx, parsed) is None
def test_reply_after_failed_test_passes_through() -> None:
ctx = _post_success_ctx("go to X and download Y")
ctx.last_test_ok = False # test failed
parsed = {"type": "REPLY", "user_response": "The test failed."}
assert _response_coverage_nudge(ctx, parsed) is None
# ---------------------------------------------------------------------------
# Progress-narration heuristic
# ---------------------------------------------------------------------------
def test_is_progress_narration_detects_future_tense() -> None:
# Exact phrasing from the regression trace that escaped enforcement.
text = (
"I ran the first block (open_home). The navigation block completed. "
"I did not attempt further blocks yet. Next I will proceed to run the "
"remaining blocks to locate and download the regulations unless "
"you want a change."
)
assert _is_progress_narration(text)
def test_is_progress_narration_ignores_clean_reply() -> None:
assert not _is_progress_narration("I created a 2-block workflow that extracts the top posts.")
assert not _is_progress_narration("The workflow is ready. 3 blocks: nav, extract, summarize.")
def test_is_progress_narration_empty_inputs() -> None:
assert not _is_progress_narration("")
assert not _is_progress_narration(None) # type: ignore[arg-type]
def test_format_nudge_fires_for_progress_narration_without_coverage_gap() -> None:
# 2 blocks, so no coverage gap. But the text is future-tense progress.
ctx = _post_success_ctx("go to X and download Y", block_count=2)
parsed = {
"type": "REPLY",
"user_response": "I ran the first block. Next I will proceed to add the rest.",
}
nudge = _response_coverage_nudge(ctx, parsed)
assert nudge == POST_FORMAT_NUDGE
assert ctx.format_nudge_count == 1
def test_format_nudge_respects_counter_cap() -> None:
ctx = _post_success_ctx("go to X and download Y", block_count=2)
parsed = {"type": "REPLY", "user_response": "Next I will proceed."}
for _ in range(MAX_FORMAT_NUDGES):
assert _response_coverage_nudge(ctx, parsed) == POST_FORMAT_NUDGE
assert _response_coverage_nudge(ctx, parsed) is None
def test_coverage_nudge_takes_priority_over_format_nudge() -> None:
# Coverage gap AND progress narration — coverage fires first, counter advances.
ctx = _post_success_ctx("go to X and download Y", block_count=1)
parsed = {"type": "REPLY", "user_response": "Next I will proceed with more blocks."}
assert _response_coverage_nudge(ctx, parsed) == POST_INTERMEDIATE_SUCCESS_NUDGE
assert ctx.coverage_nudge_count == 1
assert ctx.format_nudge_count == 0
# ---------------------------------------------------------------------------
# Integrated _check_enforcement — no-op-turn bypass closed (main regression)
# ---------------------------------------------------------------------------
def test_no_op_turn_bypass_closed_goes_to_phrasing() -> None:
"""Simulate the regression's final turn: workflow has 1 block, test passed,
model emits REPLY without any new tool calls. Before the fix, the latch
blocked re-nudging. After the fix, the response-aware gate fires — in
this specific message the lexical coverage heuristic matches only
'download' (not 'goes to' — the bigram is 'goes to' vs 'go to'), so the
coverage branch lets it through. The progress-narration format branch
catches the future-tense REPLY instead."""
ctx = _post_success_ctx("make a workflow that goes to example.com and downloads the latest regulations")
result = _reply_result(
"I ran the first block (open_home). The navigation block completed. "
"I did not attempt further blocks yet. Next I will proceed."
)
nudge = _check_enforcement(ctx, result)
# Either branch is a valid fix for the regression — verify the format
# branch specifically since the coverage heuristic misses this phrasing.
assert nudge == POST_FORMAT_NUDGE
def test_no_op_turn_bypass_closed_multi_action() -> None:
"""Same structural bug, with a message the coverage heuristic does match
(explicit 'go to' + 'download'). The coverage branch fires."""
ctx = _post_success_ctx("go to example.com and download the regulations")
result = _reply_result("Ran one block; will do the rest next.")
nudge = _check_enforcement(ctx, result)
assert nudge == POST_INTERMEDIATE_SUCCESS_NUDGE
def test_ask_question_reaches_user_after_any_state() -> None:
"""Regression guard for CORR-2: once the intermediate-success latch was
removed, we must still let ASK_QUESTION through even when coverage is
incomplete, so the agent can ask for credentials / disambiguate."""
ctx = _post_success_ctx("login and download my records")
result = _ask_question_result("Which credential should I use for this login?")
assert _check_enforcement(ctx, result) is None
def test_check_enforcement_without_result_skips_response_peek() -> None:
"""Pre-screenshot-handoff path passes result=None. State-based branches
still fire; response peek is skipped."""
ctx = _Ctx()
ctx.navigate_called = True # but no observation_after_navigate
# navigate_enforcement_done is still False
nudge = _check_enforcement(ctx, None)
assert nudge is not None # navigate nudge fires
def test_check_enforcement_clean_reply_passes_through() -> None:
ctx = _post_success_ctx("go to example.com and extract the top 3 stories", block_count=2)
result = _reply_result("I created a 2-block workflow that extracts the top 3 stories.")
assert _check_enforcement(ctx, result) is None