# Mirror of https://github.com/Skyvern-AI/skyvern.git
# Synced 2026-04-28 03:30:10 +00:00
# 246 lines, 10 KiB, Python
"""Tests for the response-aware coverage gate in _check_enforcement.
|
|
|
|
Covers a regression where a 2-action user goal slipped past enforcement
|
|
because the old `premature_completion_nudge_done` latch was bypassed by a
|
|
no-op turn (model emits REPLY JSON without calling any update tool). The
|
|
new gate peeks at the final response text to distinguish REPLY with a
|
|
coverage gap (nudge), REPLY with progress-narration prose (nudge), and
|
|
ASK_QUESTION (always allowed).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from dataclasses import dataclass, field
|
|
from typing import Any
|
|
|
|
from skyvern.forge.sdk.copilot.enforcement import (
|
|
MAX_FORMAT_NUDGES,
|
|
MAX_INTERMEDIATE_NUDGES,
|
|
POST_FORMAT_NUDGE,
|
|
POST_INTERMEDIATE_SUCCESS_NUDGE,
|
|
_check_enforcement,
|
|
_is_progress_narration,
|
|
_response_coverage_nudge,
|
|
)
|
|
|
|
|
|
class _Ctx:
|
|
"""Minimal stand-in for CopilotContext used in enforcement checks."""
|
|
|
|
def __init__(self) -> None:
|
|
self.navigate_called = False
|
|
self.observation_after_navigate = False
|
|
self.navigate_enforcement_done = False
|
|
self.update_workflow_called = False
|
|
self.test_after_update_done = False
|
|
self.post_update_nudge_count = 0
|
|
self.coverage_nudge_count = 0
|
|
self.format_nudge_count = 0
|
|
self.user_message = ""
|
|
self.last_update_block_count = None
|
|
self.last_test_ok = None
|
|
self.last_test_failure_reason = None
|
|
self.last_test_suspicious_success = False
|
|
self.last_test_anti_bot = None
|
|
self.failed_test_nudge_count = 0
|
|
self.explore_without_workflow_nudge_count = 0
|
|
self.null_data_streak_count = 0
|
|
self.repeated_failure_streak_count = 0
|
|
self.repeated_failure_nudge_emitted_at_streak = 0
|
|
|
|
|
|
@dataclass
|
|
class _FakeRunResult:
|
|
"""Stand-in for RunResultStreaming — exposes only what extract_final_text uses."""
|
|
|
|
final_output: Any = None
|
|
new_items: list[Any] = field(default_factory=list)
|
|
|
|
|
|
def _reply_result(user_response: str) -> _FakeRunResult:
    """Build a fake run result whose final output is a JSON-encoded REPLY."""
    payload = {"type": "REPLY", "user_response": user_response}
    return _FakeRunResult(final_output=json.dumps(payload))
|
|
|
|
|
|
def _ask_question_result(question: str) -> _FakeRunResult:
    """Build a fake run result whose final output is a JSON-encoded ASK_QUESTION."""
    payload = {"type": "ASK_QUESTION", "user_response": question}
    return _FakeRunResult(final_output=json.dumps(payload))
|
|
|
|
|
|
def _post_success_ctx(user_message: str, block_count: int = 1) -> _Ctx:
    """Return a ctx in the 'workflow test passed' state.

    This is the state that would previously have triggered the
    intermediate-success nudge: the workflow was updated, tested after the
    update, and the last test succeeded.
    """
    state = _Ctx()
    passed_state = {
        "user_message": user_message,
        "update_workflow_called": True,
        "test_after_update_done": True,
        "last_test_ok": True,
        "last_update_block_count": block_count,
    }
    # Dicts preserve insertion order, so attributes are set in the same
    # sequence as listed above.
    for attr_name, attr_value in passed_state.items():
        setattr(state, attr_name, attr_value)
    return state
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# _response_coverage_nudge — direct unit tests
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_reply_with_coverage_gap_fires_nudge() -> None:
    """A REPLY on a one-block workflow with a passing test gets the coverage nudge."""
    state = _post_success_ctx("go to example.com and download the regulations")
    reply = {"type": "REPLY", "user_response": "I created a nav block."}

    assert _response_coverage_nudge(state, reply) == POST_INTERMEDIATE_SUCCESS_NUDGE
    # Firing the nudge must be recorded on the context exactly once.
    assert state.coverage_nudge_count == 1
|
|
|
|
|
|
def test_coverage_nudge_respects_counter_cap() -> None:
    """The coverage nudge fires MAX_INTERMEDIATE_NUDGES times, then lets the reply through."""
    state = _post_success_ctx("go to X and download Y")
    reply = {"type": "REPLY", "user_response": "one block draft"}

    # all() stops at the first mismatch, just like a per-iteration assert.
    assert all(
        _response_coverage_nudge(state, reply) == POST_INTERMEDIATE_SUCCESS_NUDGE
        for _ in range(MAX_INTERMEDIATE_NUDGES)
    )
    # One more call — the cap should now let the response through.
    assert _response_coverage_nudge(state, reply) is None
|
|
|
|
|
|
def test_ask_question_always_passes_through_even_with_coverage_gap() -> None:
    """ASK_QUESTION is never nudged and leaves the coverage counter untouched."""
    state = _post_success_ctx("go to site and download file")
    question = {"type": "ASK_QUESTION", "user_response": "Which file do you mean?"}

    outcome = _response_coverage_nudge(state, question)

    assert outcome is None
    assert state.coverage_nudge_count == 0
|
|
|
|
|
|
def test_reply_without_coverage_gap_passes_through() -> None:
    """A REPLY passes when the block count matches the number of requested actions."""
    # 2-action goal and 2 blocks — no gap.
    state = _post_success_ctx("go to X and download Y", block_count=2)
    reply = {"type": "REPLY", "user_response": "Done. I created a 2-block workflow."}

    outcome = _response_coverage_nudge(state, reply)

    assert outcome is None
|
|
|
|
|
|
def test_reply_before_any_successful_test_passes_through() -> None:
    """With no test outcome recorded, a REPLY is not nudged."""
    # A fresh ctx keeps last_test_ok at None — no successful test yet.
    fresh = _Ctx()
    fresh.user_message = "go to X and download Y"
    reply = {"type": "REPLY", "user_response": "Working on it."}

    outcome = _response_coverage_nudge(fresh, reply)

    assert outcome is None
|
|
|
|
|
|
def test_reply_after_failed_test_passes_through() -> None:
    """A REPLY after a failed test is not nudged by the coverage gate."""
    state = _post_success_ctx("go to X and download Y")
    # Flip the helper's passing state into a failed test.
    state.last_test_ok = False
    reply = {"type": "REPLY", "user_response": "The test failed."}

    outcome = _response_coverage_nudge(state, reply)

    assert outcome is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Progress-narration heuristic
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_is_progress_narration_detects_future_tense() -> None:
    """The future-tense reply from the regression trace counts as progress narration."""
    # Exact phrasing from the regression trace that escaped enforcement.
    regression_reply = (
        "I ran the first block (open_home). The navigation block completed. "
        "I did not attempt further blocks yet. Next I will proceed to run the "
        "remaining blocks to locate and download the regulations unless "
        "you want a change."
    )

    detected = _is_progress_narration(regression_reply)

    assert detected
|
|
|
|
|
|
def test_is_progress_narration_ignores_clean_reply() -> None:
    """Replies that describe a finished workflow are not progress narration."""
    finished_replies = (
        "I created a 2-block workflow that extracts the top posts.",
        "The workflow is ready. 3 blocks: nav, extract, summarize.",
    )
    for reply_text in finished_replies:
        assert not _is_progress_narration(reply_text)
|
|
|
|
|
|
def test_is_progress_narration_empty_inputs() -> None:
    """The empty string and None are both treated as 'not narration'."""
    # None is passed on purpose even though it does not fit the declared type.
    for blank in ("", None):
        assert not _is_progress_narration(blank)  # type: ignore[arg-type]
|
|
|
|
|
|
def test_format_nudge_fires_for_progress_narration_without_coverage_gap() -> None:
    """With no coverage gap, a future-tense REPLY gets the format nudge."""
    # 2 blocks, so no coverage gap. But the text is future-tense progress.
    state = _post_success_ctx("go to X and download Y", block_count=2)
    narration = "I ran the first block. Next I will proceed to add the rest."
    reply = {"type": "REPLY", "user_response": narration}

    assert _response_coverage_nudge(state, reply) == POST_FORMAT_NUDGE
    # The format counter, not the coverage counter, is what this test checks.
    assert state.format_nudge_count == 1
|
|
|
|
|
|
def test_format_nudge_respects_counter_cap() -> None:
    """The format nudge fires MAX_FORMAT_NUDGES times, then lets the reply through."""
    state = _post_success_ctx("go to X and download Y", block_count=2)
    reply = {"type": "REPLY", "user_response": "Next I will proceed."}

    # all() stops at the first mismatch, just like a per-iteration assert.
    assert all(
        _response_coverage_nudge(state, reply) == POST_FORMAT_NUDGE
        for _ in range(MAX_FORMAT_NUDGES)
    )
    # Past the cap the same reply is no longer nudged.
    assert _response_coverage_nudge(state, reply) is None
|
|
|
|
|
|
def test_coverage_nudge_takes_priority_over_format_nudge() -> None:
    """When both conditions hold, only the coverage nudge fires and is counted."""
    # Coverage gap AND progress narration — coverage fires first, counter advances.
    state = _post_success_ctx("go to X and download Y", block_count=1)
    reply = {"type": "REPLY", "user_response": "Next I will proceed with more blocks."}

    outcome = _response_coverage_nudge(state, reply)

    assert outcome == POST_INTERMEDIATE_SUCCESS_NUDGE
    assert state.coverage_nudge_count == 1
    # The format branch must not have been charged for this turn.
    assert state.format_nudge_count == 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Integrated _check_enforcement — no-op-turn bypass closed (main regression)
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_no_op_turn_bypass_closed_goes_to_phrasing() -> None:
    """Replay the regression's final turn through _check_enforcement.

    The workflow has 1 block, its test passed, and the model emits a REPLY
    without any new tool calls. Before the fix, the latch blocked re-nudging;
    after it, the response-aware gate fires. For this particular user message
    the lexical coverage heuristic matches only 'download' (the message says
    'goes to', not the 'go to' bigram), so the coverage branch lets the reply
    through and the progress-narration format branch catches the future-tense
    REPLY instead.
    """
    state = _post_success_ctx("make a workflow that goes to example.com and downloads the latest regulations")
    final_turn = _reply_result(
        "I ran the first block (open_home). The navigation block completed. "
        "I did not attempt further blocks yet. Next I will proceed."
    )

    # Either branch is a valid fix for the regression — verify the format
    # branch specifically since the coverage heuristic misses this phrasing.
    assert _check_enforcement(state, final_turn) == POST_FORMAT_NUDGE
|
|
|
|
|
|
def test_no_op_turn_bypass_closed_multi_action() -> None:
    """Same structural bug, but with a message the coverage heuristic matches.

    The user message contains an explicit 'go to' plus 'download', so the
    coverage branch is the one that fires.
    """
    state = _post_success_ctx("go to example.com and download the regulations")
    final_turn = _reply_result("Ran one block; will do the rest next.")

    assert _check_enforcement(state, final_turn) == POST_INTERMEDIATE_SUCCESS_NUDGE
|
|
|
|
|
|
def test_ask_question_reaches_user_after_any_state() -> None:
    """Regression guard for CORR-2.

    With the intermediate-success latch removed, ASK_QUESTION must still get
    through even when coverage is incomplete, so the agent can ask for
    credentials or disambiguate.
    """
    state = _post_success_ctx("login and download my records")
    question_turn = _ask_question_result("Which credential should I use for this login?")

    outcome = _check_enforcement(state, question_turn)

    assert outcome is None
|
|
|
|
|
|
def test_check_enforcement_without_result_skips_response_peek() -> None:
    """The pre-screenshot-handoff path passes result=None.

    State-based branches still fire; the response peek is skipped.
    """
    state = _Ctx()
    # Navigate was called, but observation_after_navigate and
    # navigate_enforcement_done both keep their False defaults.
    state.navigate_called = True

    # The navigate nudge fires even though there is no result to inspect.
    assert _check_enforcement(state, None) is not None
|
|
|
|
|
|
def test_check_enforcement_clean_reply_passes_through() -> None:
    """A finished-sounding REPLY on a two-block workflow is not nudged."""
    state = _post_success_ctx("go to example.com and extract the top 3 stories", block_count=2)
    final_turn = _reply_result("I created a 2-block workflow that extracts the top 3 stories.")

    outcome = _check_enforcement(state, final_turn)

    assert outcome is None
|