mirror of
https://github.com/Skyvern-AI/skyvern.git
synced 2026-04-28 11:40:32 +00:00
781 lines
30 KiB
Python
781 lines
30 KiB
Python
"""Tests for enforcement pruning and null-data handling.
|
|
|
|
These cover three regressions observed in trace 019d7b5c884dff0ff648680b9f31f715:
|
|
1. Extraction returning all-null fields was treated as success.
|
|
2. Context grew linearly because old tool outputs kept full content.
|
|
3. No escalation when the agent looped on the same null-data failure.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from typing import Any
|
|
from unittest.mock import AsyncMock, MagicMock
|
|
|
|
import pytest
|
|
|
|
from skyvern.forge.sdk.copilot.enforcement import (
|
|
KEEP_RECENT_TOOL_OUTPUTS,
|
|
NULL_DATA_STREAK_ESCALATE_AT,
|
|
POST_REPEATED_NULL_DATA_NUDGE,
|
|
POST_SUSPICIOUS_SUCCESS_NUDGE,
|
|
_check_enforcement,
|
|
_needs_repeated_null_data_nudge,
|
|
_needs_suspicious_success_nudge,
|
|
_prune_input_list,
|
|
_summarize_tool_output,
|
|
)
|
|
from skyvern.forge.sdk.copilot.tools import (
|
|
_analyze_run_blocks,
|
|
_is_meaningful_extracted_data,
|
|
_record_run_blocks_result,
|
|
)
|
|
|
|
|
|
class _Ctx:
|
|
"""Minimal stand-in for CopilotContext used in enforcement checks.
|
|
|
|
Keep this in sync with ``AgentContext`` enforcement-state fields — missing
|
|
attributes would show up as AttributeError in the branches that use bare
|
|
access rather than ``getattr``.
|
|
"""
|
|
|
|
def __init__(self) -> None:
|
|
self.navigate_called = False
|
|
self.observation_after_navigate = False
|
|
self.navigate_enforcement_done = False
|
|
self.update_workflow_called = False
|
|
self.test_after_update_done = False
|
|
self.post_update_nudge_count = 0
|
|
self.coverage_nudge_count = 0
|
|
self.format_nudge_count = 0
|
|
self.user_message = ""
|
|
self.last_update_block_count = None
|
|
self.last_test_ok = None
|
|
self.last_test_failure_reason = None
|
|
self.last_test_suspicious_success = False
|
|
self.last_test_anti_bot = None
|
|
self.last_failure_category_top = None
|
|
self.failed_test_nudge_count = 0
|
|
self.explore_without_workflow_nudge_count = 0
|
|
self.null_data_streak_count = 0
|
|
self.repeated_failure_streak_count = 0
|
|
self.repeated_failure_nudge_emitted_at_streak = 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _is_meaningful_extracted_data
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_meaningful_data_none() -> None:
|
|
assert _is_meaningful_extracted_data(None) is False
|
|
|
|
|
|
def test_meaningful_data_empty_dict() -> None:
|
|
assert _is_meaningful_extracted_data({}) is False
|
|
|
|
|
|
def test_meaningful_data_all_null_dict() -> None:
|
|
# The regression: {"price": None} used to count as meaningful because
|
|
# the dict itself is truthy. It must NOT count as meaningful.
|
|
assert _is_meaningful_extracted_data({"price": None}) is False
|
|
|
|
|
|
def test_meaningful_data_nested_all_null() -> None:
|
|
assert _is_meaningful_extracted_data({"a": None, "b": {"c": None}}) is False
|
|
|
|
|
|
def test_meaningful_data_one_real_value() -> None:
|
|
assert _is_meaningful_extracted_data({"price": "260.48", "other": None}) is True
|
|
|
|
|
|
def test_meaningful_data_empty_list() -> None:
|
|
assert _is_meaningful_extracted_data([]) is False
|
|
|
|
|
|
def test_meaningful_data_list_of_nulls() -> None:
|
|
assert _is_meaningful_extracted_data([None, None]) is False
|
|
|
|
|
|
def test_meaningful_data_scalar_zero() -> None:
|
|
# A literal 0 is still meaningful output — it's a value, not absence of data.
|
|
assert _is_meaningful_extracted_data(0) is True
|
|
|
|
|
|
def test_meaningful_data_empty_string() -> None:
|
|
assert _is_meaningful_extracted_data("") is False
|
|
|
|
|
|
def test_meaningful_data_string() -> None:
|
|
assert _is_meaningful_extracted_data("$260.48") is True
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _analyze_run_blocks — envelope-unwrap for EXTRACTION blocks
|
|
#
|
|
# ExtractionBlock stores TaskOutput.from_task() on block.output. Envelope
|
|
# fields (task_id, status, *_screenshot_artifact_ids) are always populated on
|
|
# a completed run and would short-circuit _is_meaningful_extracted_data to
|
|
# True even when the real payload fields (extracted_information,
|
|
# downloaded_files, downloaded_file_urls) are empty. The meaningful-data
|
|
# check must judge against the payload slice, not the envelope.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
_EMPTY_EXTRACTION_ENVELOPE: dict[str, Any] = {
|
|
"task_id": "tsk_00000000000000000001",
|
|
"status": "completed",
|
|
"extracted_information": [],
|
|
"failure_reason": None,
|
|
"errors": [],
|
|
"failure_category": None,
|
|
"downloaded_files": [],
|
|
"downloaded_file_urls": None,
|
|
"task_screenshots": None,
|
|
"workflow_screenshots": None,
|
|
"task_screenshot_artifact_ids": ["a_00000000000000000001", "a_00000000000000000002"],
|
|
"workflow_screenshot_artifact_ids": ["a_00000000000000000001", "a_00000000000000000003"],
|
|
}
|
|
|
|
|
|
def _run_result(blocks: list[dict[str, Any]], ok: bool = True) -> dict[str, Any]:
|
|
return {"ok": ok, "data": {"blocks": blocks}}
|
|
|
|
|
|
def _envelope(**overrides: Any) -> dict[str, Any]:
|
|
"""Return a fresh copy of the empty-extraction envelope with field overrides."""
|
|
return {**_EMPTY_EXTRACTION_ENVELOPE, **overrides}
|
|
|
|
|
|
def _extraction_block(extracted_data: dict[str, Any]) -> dict[str, Any]:
|
|
return {
|
|
"label": "extract_flights",
|
|
"block_type": "EXTRACTION",
|
|
"status": "completed",
|
|
"extracted_data": extracted_data,
|
|
}
|
|
|
|
|
|
def _text_prompt_block(extracted_data: Any) -> dict[str, Any]:
|
|
return {
|
|
"label": "summarize",
|
|
"block_type": "TEXT_PROMPT",
|
|
"status": "completed",
|
|
"extracted_data": extracted_data,
|
|
}
|
|
|
|
|
|
# Case id -> (envelope overrides, expected empty_data_blocks)
|
|
#
|
|
# empty_payload_trace_repro: extracted_information=[], downloaded_files=[],
|
|
# downloaded_file_urls=None, envelope metadata populated. Envelope-as-a-whole
|
|
# is truthy; real payload is empty; gate must flip. (SKY-9143 repro.)
|
|
# download_only_files / download_only_urls: legitimate extraction success where the
|
|
# block produced files but no structured payload — must NOT flip the gate.
|
|
_EXTRACTION_ENVELOPE_CASES: list[tuple[str, dict[str, Any], bool]] = [
|
|
("empty_payload_trace_repro", {}, True),
|
|
("real_extraction", {"extracted_information": [{"price": "260.48"}]}, False),
|
|
(
|
|
"download_only_files",
|
|
{"downloaded_files": [{"url": "https://example.com/a.pdf", "checksum": "abc123"}]},
|
|
False,
|
|
),
|
|
(
|
|
"download_only_urls",
|
|
{"extracted_information": None, "downloaded_file_urls": ["https://example.com/a.pdf"]},
|
|
False,
|
|
),
|
|
]
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"overrides,expected_empty",
|
|
[(ovr, exp) for _, ovr, exp in _EXTRACTION_ENVELOPE_CASES],
|
|
ids=[case_id for case_id, _, _ in _EXTRACTION_ENVELOPE_CASES],
|
|
)
|
|
def test_analyze_extraction_envelope(overrides: dict[str, Any], expected_empty: bool) -> None:
|
|
_, empty, _ = _analyze_run_blocks(_run_result([_extraction_block(_envelope(**overrides))]))
|
|
assert empty is expected_empty
|
|
|
|
|
|
def test_analyze_text_prompt_default_schema_is_not_empty() -> None:
|
|
# TEXT_PROMPT blocks return the raw LLM response dict (no Task envelope).
|
|
# Default schema is {"llm_response": "<text>"}.
|
|
_, empty, _ = _analyze_run_blocks(_run_result([_text_prompt_block({"llm_response": "the sentiment is positive"})]))
|
|
assert empty is False
|
|
|
|
|
|
def test_analyze_text_prompt_user_schema_named_extracted_information_is_not_sliced() -> None:
|
|
# Guard against a too-broad unwrap: a user's json_schema may name a
|
|
# top-level field "extracted_information". The helper must not mistake
|
|
# that for an EXTRACTION envelope and discard sibling fields.
|
|
block = _text_prompt_block({"extracted_information": "ignored because this is TEXT_PROMPT", "summary": "x"})
|
|
_, empty, _ = _analyze_run_blocks(_run_result([block]))
|
|
assert empty is False
|
|
|
|
|
|
def test_analyze_text_prompt_all_null_is_empty() -> None:
|
|
# Symmetric to {"price": None} — a text-prompt response with all-null
|
|
# fields counts as no meaningful output.
|
|
_, empty, _ = _analyze_run_blocks(_run_result([_text_prompt_block({"summary": None})]))
|
|
assert empty is True
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _record_run_blocks_result — end-to-end flip of last_test_ok on empty envelope
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _fresh_ctx_for_record() -> Any:
|
|
"""SimpleNamespace shaped for _record_run_blocks_result + update_repeated_failure_state.
|
|
|
|
Uses getattr-with-default-compatible defaults so the function under test
|
|
populates the interesting fields without tripping AttributeError on the
|
|
downstream update_repeated_failure_state call.
|
|
"""
|
|
from types import SimpleNamespace
|
|
|
|
return SimpleNamespace(
|
|
last_test_ok=True,
|
|
last_test_failure_reason=None,
|
|
last_test_suspicious_success=False,
|
|
last_test_anti_bot=None,
|
|
last_failure_category_top=None,
|
|
last_test_non_retriable_nav_error=None,
|
|
null_data_streak_count=0,
|
|
failed_test_nudge_count=0,
|
|
last_failed_workflow_yaml=None,
|
|
non_retriable_nav_error_last_emitted_signature=None,
|
|
workflow_yaml=None,
|
|
last_workflow=None,
|
|
last_frontier_start_label=None,
|
|
last_executed_block_labels=[],
|
|
last_failure_signature=None,
|
|
last_frontier_fingerprint=None,
|
|
repeated_failure_streak_count=0,
|
|
repeated_failure_nudge_emitted_at_streak=0,
|
|
pending_action_sequence_fingerprint=None,
|
|
last_action_sequence_fingerprint=None,
|
|
repeated_action_fingerprint_streak_count=0,
|
|
)
|
|
|
|
|
|
def test_record_run_blocks_result_flips_last_test_ok_on_empty_extraction_envelope() -> None:
|
|
# End-to-end: a run reporting ok=true but whose sole EXTRACTION block
|
|
# produced the empty envelope must push last_test_ok from True to None,
|
|
# so _verified_workflow_or_none blocks the proposal. This is the user-
|
|
# visible SKY-9143 regression.
|
|
ctx = _fresh_ctx_for_record()
|
|
result = _run_result([_extraction_block(_envelope())])
|
|
_record_run_blocks_result(ctx, result)
|
|
assert ctx.last_test_ok is None
|
|
assert ctx.last_test_suspicious_success is True
|
|
assert ctx.last_test_failure_reason is not None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Repeated null-data escalation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_suspicious_success_fires_when_flag_set() -> None:
|
|
ctx = _Ctx()
|
|
ctx.last_test_suspicious_success = True
|
|
ctx.null_data_streak_count = 1
|
|
assert _needs_suspicious_success_nudge(ctx) is True
|
|
assert _needs_repeated_null_data_nudge(ctx) is False
|
|
|
|
|
|
def test_repeated_null_data_fires_at_threshold() -> None:
|
|
ctx = _Ctx()
|
|
ctx.last_test_suspicious_success = True
|
|
ctx.null_data_streak_count = NULL_DATA_STREAK_ESCALATE_AT
|
|
assert _needs_repeated_null_data_nudge(ctx) is True
|
|
|
|
|
|
def test_check_enforcement_returns_repeated_nudge_at_threshold() -> None:
|
|
ctx = _Ctx()
|
|
ctx.last_test_suspicious_success = True
|
|
ctx.null_data_streak_count = NULL_DATA_STREAK_ESCALATE_AT
|
|
nudge = _check_enforcement(ctx)
|
|
assert nudge == POST_REPEATED_NULL_DATA_NUDGE
|
|
|
|
|
|
def test_check_enforcement_returns_regular_suspicious_nudge_below_threshold() -> None:
|
|
ctx = _Ctx()
|
|
ctx.last_test_suspicious_success = True
|
|
ctx.null_data_streak_count = 1
|
|
nudge = _check_enforcement(ctx)
|
|
assert nudge == POST_SUSPICIOUS_SUCCESS_NUDGE
|
|
|
|
|
|
def test_repeated_null_data_requires_suspicious_flag() -> None:
|
|
# If the current test wasn't a suspicious success, don't fire even with a high streak.
|
|
ctx = _Ctx()
|
|
ctx.last_test_suspicious_success = False
|
|
ctx.null_data_streak_count = 99
|
|
assert _needs_repeated_null_data_nudge(ctx) is False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tool-output pruning
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _fco(call_id: str, output: str) -> dict:
|
|
return {"type": "function_call_output", "call_id": call_id, "output": output}
|
|
|
|
|
|
def test_recent_outputs_preserved_full() -> None:
|
|
# Build KEEP_RECENT_TOOL_OUTPUTS + 1 items so exactly one is "old".
|
|
items = []
|
|
short = '{"ok":true,"data":{"overall_status":"completed"}}'
|
|
for i in range(KEEP_RECENT_TOOL_OUTPUTS + 1):
|
|
items.append(_fco(f"c{i}", short))
|
|
|
|
pruned = _prune_input_list(items)
|
|
# Each recent item is unchanged (they're all short and JSON).
|
|
for i in range(1, KEEP_RECENT_TOOL_OUTPUTS + 1):
|
|
assert pruned[i]["output"] == short
|
|
|
|
|
|
def test_old_large_output_is_summarized() -> None:
|
|
# An older, large JSON tool output gets compressed into a synopsis.
|
|
heavy_payload = {
|
|
"ok": True,
|
|
"data": {
|
|
"workflow_run_id": "wr_123",
|
|
"overall_status": "completed",
|
|
"blocks": [
|
|
{
|
|
"label": "open_quote_page",
|
|
"status": "completed",
|
|
"block_type": "GOTO_URL",
|
|
"extracted_data": None,
|
|
},
|
|
{
|
|
"label": "extract_stock_price",
|
|
"status": "completed",
|
|
"block_type": "EXTRACTION",
|
|
"extracted_data": {"price": None},
|
|
"failure_reason": None,
|
|
},
|
|
],
|
|
"visible_elements_html": "<html>" + ("x" * 4000) + "</html>",
|
|
"screenshot_base64": "[base64 image omitted]",
|
|
},
|
|
}
|
|
heavy_output = json.dumps(heavy_payload)
|
|
assert len(heavy_output) > 4000
|
|
|
|
items = [_fco("c_old", heavy_output)]
|
|
# Add enough recent outputs to push the first one out of the recent window.
|
|
for i in range(KEEP_RECENT_TOOL_OUTPUTS):
|
|
items.append(_fco(f"c_new_{i}", '{"ok":true,"data":{"overall_status":"completed"}}'))
|
|
|
|
pruned = _prune_input_list(items)
|
|
summarized = pruned[0]["output"]
|
|
# The summary must be drastically shorter than the original.
|
|
assert len(summarized) < 1000
|
|
# It must preserve the key signal fields so the agent can still reason about past calls.
|
|
parsed = json.loads(summarized)
|
|
assert parsed["ok"] is True
|
|
assert parsed["overall_status"] == "completed"
|
|
assert parsed["workflow_run_id"] == "wr_123"
|
|
assert parsed["_summarized"]
|
|
assert len(parsed["blocks"]) == 2
|
|
assert parsed["blocks"][1]["label"] == "extract_stock_price"
|
|
assert parsed["blocks"][1]["status"] == "completed"
|
|
|
|
|
|
def test_summarize_non_json_output_falls_back_to_head_truncation() -> None:
|
|
text = "not-json " * 1000
|
|
result = _summarize_tool_output(text)
|
|
assert len(result) < len(text)
|
|
assert result.startswith("not-json")
|
|
assert "older tool output truncated" in result
|
|
|
|
|
|
def test_summarize_short_output_is_unchanged() -> None:
|
|
assert _summarize_tool_output("small") == "small"
|
|
|
|
|
|
def test_recent_large_output_is_head_truncated_not_summarized() -> None:
|
|
# Big JSON in the most-recent slot should be head-truncated at 2000 chars,
|
|
# NOT replaced with a summary.
|
|
large = '{"ok":true,"data":{"value":"' + ("y" * 3000) + '"}}'
|
|
items = [_fco("c_recent", large)]
|
|
pruned = _prune_input_list(items)
|
|
out = pruned[0]["output"]
|
|
assert out.startswith('{"ok":true,')
|
|
assert out.endswith("\n... [truncated]")
|
|
assert len(out) <= 2020
|
|
|
|
|
|
class TestEnforcement:
|
|
def _make_ctx(self, **overrides: Any) -> Any:
|
|
"""Create a mock context with enforcement attributes."""
|
|
ctx = MagicMock()
|
|
ctx.navigate_called = False
|
|
ctx.observation_after_navigate = False
|
|
ctx.navigate_enforcement_done = False
|
|
ctx.update_workflow_called = False
|
|
ctx.test_after_update_done = False
|
|
ctx.post_update_nudge_count = 0
|
|
ctx.coverage_nudge_count = 0
|
|
ctx.format_nudge_count = 0
|
|
ctx.explore_without_workflow_nudge_count = 0
|
|
ctx.last_test_suspicious_success = False
|
|
ctx.last_test_anti_bot = None
|
|
for k, v in overrides.items():
|
|
setattr(ctx, k, v)
|
|
return ctx
|
|
|
|
@staticmethod
|
|
def _reply_result(user_response: str = "") -> Any:
|
|
"""Build a RunResultStreaming-shaped mock whose final_output parses as REPLY."""
|
|
import json
|
|
|
|
result = MagicMock()
|
|
result.final_output = json.dumps({"type": "REPLY", "user_response": user_response})
|
|
result.new_items = []
|
|
return result
|
|
|
|
@staticmethod
|
|
def _empty_result() -> Any:
|
|
"""Build a mock with no final text — triggers the 'not sure how to help' fallback."""
|
|
result = MagicMock()
|
|
result.final_output = None
|
|
result.new_items = []
|
|
return result
|
|
|
|
def test_no_enforcement_when_nothing_pending(self) -> None:
|
|
from skyvern.forge.sdk.copilot.enforcement import _check_enforcement
|
|
|
|
ctx = self._make_ctx()
|
|
assert _check_enforcement(ctx) is None
|
|
|
|
def test_post_navigate_nudge(self) -> None:
|
|
from skyvern.forge.sdk.copilot.enforcement import _check_enforcement
|
|
|
|
ctx = self._make_ctx(navigate_called=True, observation_after_navigate=False)
|
|
nudge = _check_enforcement(ctx)
|
|
assert nudge is not None
|
|
assert "observe" in nudge.lower() or "inspect" in nudge.lower()
|
|
assert ctx.navigate_enforcement_done is True
|
|
|
|
def test_post_navigate_only_fires_once(self) -> None:
|
|
from skyvern.forge.sdk.copilot.enforcement import _check_enforcement
|
|
|
|
ctx = self._make_ctx(
|
|
navigate_called=True,
|
|
observation_after_navigate=False,
|
|
navigate_enforcement_done=True,
|
|
)
|
|
assert _check_enforcement(ctx) is None
|
|
|
|
def test_post_update_nudge(self) -> None:
|
|
from skyvern.forge.sdk.copilot.enforcement import _check_enforcement
|
|
|
|
ctx = self._make_ctx(update_workflow_called=True, test_after_update_done=False)
|
|
nudge = _check_enforcement(ctx)
|
|
assert nudge is not None
|
|
assert "test" in nudge.lower() or "run_blocks" in nudge.lower()
|
|
|
|
def test_navigate_takes_priority_over_update(self) -> None:
|
|
from skyvern.forge.sdk.copilot.enforcement import _check_enforcement
|
|
|
|
ctx = self._make_ctx(
|
|
navigate_called=True,
|
|
observation_after_navigate=False,
|
|
update_workflow_called=True,
|
|
test_after_update_done=False,
|
|
)
|
|
nudge = _check_enforcement(ctx)
|
|
assert "observe" in nudge.lower() or "inspect" in nudge.lower()
|
|
|
|
def test_intermediate_success_nudge_for_multistep_goal(self) -> None:
|
|
from skyvern.forge.sdk.copilot.enforcement import _check_enforcement
|
|
|
|
ctx = self._make_ctx(
|
|
update_workflow_called=True,
|
|
test_after_update_done=True,
|
|
last_test_ok=True,
|
|
last_update_block_count=1,
|
|
user_message="Go to france.fr and then download all french regulations",
|
|
coverage_nudge_count=0,
|
|
)
|
|
from skyvern.forge.sdk.copilot.enforcement import POST_INTERMEDIATE_SUCCESS_NUDGE
|
|
|
|
# Coverage gate only fires when the model tries to emit a REPLY.
|
|
nudge = _check_enforcement(ctx, self._reply_result("draft response"))
|
|
assert nudge == POST_INTERMEDIATE_SUCCESS_NUDGE
|
|
assert ctx.coverage_nudge_count == 1
|
|
|
|
def test_no_intermediate_success_nudge_for_single_step_goal(self) -> None:
|
|
from skyvern.forge.sdk.copilot.enforcement import _check_enforcement
|
|
|
|
ctx = self._make_ctx(
|
|
update_workflow_called=True,
|
|
test_after_update_done=True,
|
|
last_test_ok=True,
|
|
last_update_block_count=1,
|
|
user_message="Go to france.fr",
|
|
coverage_nudge_count=0,
|
|
)
|
|
assert _check_enforcement(ctx, self._reply_result("done")) is None
|
|
|
|
def test_intermediate_success_nudge_fires_for_two_blocks(self) -> None:
|
|
"""Key regression: nudge must fire even when block_count > 1."""
|
|
from skyvern.forge.sdk.copilot.enforcement import _check_enforcement
|
|
|
|
ctx = self._make_ctx(
|
|
update_workflow_called=True,
|
|
test_after_update_done=True,
|
|
last_test_ok=True,
|
|
last_update_block_count=2,
|
|
user_message="Go to france.fr and then download all french regulations and extract the titles",
|
|
coverage_nudge_count=0,
|
|
)
|
|
from skyvern.forge.sdk.copilot.enforcement import POST_INTERMEDIATE_SUCCESS_NUDGE
|
|
|
|
nudge = _check_enforcement(ctx, self._reply_result("two-block draft"))
|
|
assert nudge == POST_INTERMEDIATE_SUCCESS_NUDGE
|
|
|
|
def test_intermediate_nudge_respects_global_cap(self) -> None:
|
|
from skyvern.forge.sdk.copilot.enforcement import MAX_INTERMEDIATE_NUDGES, _check_enforcement
|
|
|
|
ctx = self._make_ctx(
|
|
update_workflow_called=True,
|
|
test_after_update_done=True,
|
|
last_test_ok=True,
|
|
last_update_block_count=2,
|
|
user_message="Go to france.fr and then download all french regulations",
|
|
coverage_nudge_count=MAX_INTERMEDIATE_NUDGES,
|
|
)
|
|
assert _check_enforcement(ctx, self._reply_result("capped")) is None
|
|
|
|
def test_intermediate_nudge_does_not_fire_for_ten_plus_blocks(self) -> None:
|
|
from skyvern.forge.sdk.copilot.enforcement import _check_enforcement
|
|
|
|
ctx = self._make_ctx(
|
|
update_workflow_called=True,
|
|
test_after_update_done=True,
|
|
last_test_ok=True,
|
|
last_update_block_count=10,
|
|
user_message="Go to france.fr and then download all french regulations",
|
|
coverage_nudge_count=0,
|
|
)
|
|
assert _check_enforcement(ctx, self._reply_result("ten blocks")) is None
|
|
|
|
def test_ask_question_always_passes_even_with_coverage_gap(self) -> None:
|
|
"""Regression guard: ASK_QUESTION must never be blocked by coverage."""
|
|
import json
|
|
|
|
from skyvern.forge.sdk.copilot.enforcement import _check_enforcement
|
|
|
|
ctx = self._make_ctx(
|
|
update_workflow_called=True,
|
|
test_after_update_done=True,
|
|
last_test_ok=True,
|
|
last_update_block_count=1,
|
|
user_message="Go to france.fr and then download all french regulations",
|
|
coverage_nudge_count=0,
|
|
)
|
|
ask = MagicMock()
|
|
ask.final_output = json.dumps({"type": "ASK_QUESTION", "user_response": "Which source?"})
|
|
ask.new_items = []
|
|
assert _check_enforcement(ctx, ask) is None
|
|
|
|
def test_explore_without_workflow_nudge(self) -> None:
|
|
from skyvern.forge.sdk.copilot.enforcement import POST_EXPLORE_WITHOUT_WORKFLOW_NUDGE, _check_enforcement
|
|
|
|
ctx = self._make_ctx(
|
|
navigate_called=True,
|
|
observation_after_navigate=True,
|
|
update_workflow_called=False,
|
|
test_after_update_done=False,
|
|
)
|
|
nudge = _check_enforcement(ctx)
|
|
assert nudge == POST_EXPLORE_WITHOUT_WORKFLOW_NUDGE
|
|
assert ctx.explore_without_workflow_nudge_count == 1
|
|
|
|
def test_explore_without_workflow_not_when_update_called(self) -> None:
|
|
from skyvern.forge.sdk.copilot.enforcement import (
|
|
POST_EXPLORE_WITHOUT_WORKFLOW_NUDGE,
|
|
POST_UPDATE_NUDGE,
|
|
_check_enforcement,
|
|
)
|
|
|
|
ctx = self._make_ctx(
|
|
navigate_called=True,
|
|
observation_after_navigate=True,
|
|
update_workflow_called=True,
|
|
test_after_update_done=False,
|
|
)
|
|
nudge = _check_enforcement(ctx)
|
|
assert nudge == POST_UPDATE_NUDGE
|
|
assert nudge != POST_EXPLORE_WITHOUT_WORKFLOW_NUDGE
|
|
assert ctx.explore_without_workflow_nudge_count == 0
|
|
|
|
def test_explore_without_workflow_not_when_test_done(self) -> None:
|
|
from skyvern.forge.sdk.copilot.enforcement import POST_EXPLORE_WITHOUT_WORKFLOW_NUDGE, _check_enforcement
|
|
|
|
ctx = self._make_ctx(
|
|
navigate_called=True,
|
|
observation_after_navigate=True,
|
|
update_workflow_called=False,
|
|
test_after_update_done=True,
|
|
)
|
|
nudge = _check_enforcement(ctx)
|
|
assert nudge != POST_EXPLORE_WITHOUT_WORKFLOW_NUDGE
|
|
|
|
def test_explore_without_workflow_respects_cap(self) -> None:
|
|
from skyvern.forge.sdk.copilot.enforcement import (
|
|
MAX_EXPLORE_WITHOUT_WORKFLOW_NUDGES,
|
|
POST_EXPLORE_WITHOUT_WORKFLOW_NUDGE,
|
|
_check_enforcement,
|
|
)
|
|
|
|
ctx = self._make_ctx(
|
|
navigate_called=True,
|
|
observation_after_navigate=True,
|
|
update_workflow_called=False,
|
|
test_after_update_done=False,
|
|
explore_without_workflow_nudge_count=MAX_EXPLORE_WITHOUT_WORKFLOW_NUDGES,
|
|
)
|
|
nudge = _check_enforcement(ctx)
|
|
assert nudge != POST_EXPLORE_WITHOUT_WORKFLOW_NUDGE
|
|
|
|
def test_explore_without_workflow_not_without_observation(self) -> None:
|
|
from skyvern.forge.sdk.copilot.enforcement import POST_EXPLORE_WITHOUT_WORKFLOW_NUDGE, _check_enforcement
|
|
|
|
ctx = self._make_ctx(
|
|
navigate_called=True,
|
|
observation_after_navigate=False,
|
|
update_workflow_called=False,
|
|
test_after_update_done=False,
|
|
)
|
|
nudge = _check_enforcement(ctx)
|
|
# Should get navigate nudge, not explore-without-workflow
|
|
assert nudge != POST_EXPLORE_WITHOUT_WORKFLOW_NUDGE
|
|
assert ctx.explore_without_workflow_nudge_count == 0
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_post_navigate_nudge_does_not_increment_post_update_counter(
|
|
self,
|
|
monkeypatch: pytest.MonkeyPatch,
|
|
) -> None:
|
|
from skyvern.forge.sdk.copilot.enforcement import run_with_enforcement
|
|
|
|
ctx = self._make_ctx(
|
|
navigate_called=True,
|
|
observation_after_navigate=False,
|
|
update_workflow_called=False,
|
|
post_update_nudge_count=0,
|
|
)
|
|
stream = MagicMock()
|
|
stream.is_disconnected = AsyncMock(return_value=False)
|
|
|
|
call_count = {"count": 0}
|
|
|
|
# final_output=None + new_items=[] makes extract_final_text return "",
|
|
# which parses to a REPLY fallback — safe for the response-peek path
|
|
# when the state-based branches may or may not short-circuit first.
|
|
fake_result = self._empty_result()
|
|
fake_result.to_input_list.return_value = []
|
|
|
|
def fake_run_streamed(*args: Any, **kwargs: Any) -> Any:
|
|
call_count["count"] += 1
|
|
return fake_result
|
|
|
|
async def fake_stream_to_sse(result: Any, s: Any, c: Any) -> None:
|
|
# Resolve post-navigate enforcement on second pass.
|
|
if call_count["count"] >= 2:
|
|
c.observation_after_navigate = True
|
|
|
|
monkeypatch.setattr("skyvern.forge.sdk.copilot.enforcement.Runner.run_streamed", fake_run_streamed)
|
|
monkeypatch.setattr(
|
|
"skyvern.forge.sdk.copilot.streaming_adapter.stream_to_sse",
|
|
fake_stream_to_sse,
|
|
)
|
|
|
|
returned = await run_with_enforcement(
|
|
agent=MagicMock(),
|
|
initial_input="hello",
|
|
ctx=ctx,
|
|
stream=stream,
|
|
)
|
|
assert returned is fake_result
|
|
assert ctx.post_update_nudge_count == 0
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_post_update_nudge_increments_counter(self, monkeypatch: pytest.MonkeyPatch) -> None:
|
|
from skyvern.forge.sdk.copilot.enforcement import run_with_enforcement
|
|
|
|
ctx = self._make_ctx(
|
|
update_workflow_called=True,
|
|
test_after_update_done=False,
|
|
post_update_nudge_count=0,
|
|
)
|
|
stream = MagicMock()
|
|
stream.is_disconnected = AsyncMock(return_value=False)
|
|
|
|
call_count = {"count": 0}
|
|
fake_result = self._empty_result()
|
|
fake_result.to_input_list.return_value = []
|
|
|
|
def fake_run_streamed(*args: Any, **kwargs: Any) -> Any:
|
|
call_count["count"] += 1
|
|
return fake_result
|
|
|
|
async def fake_stream_to_sse(result: Any, s: Any, c: Any) -> None:
|
|
# Resolve post-update enforcement on second pass.
|
|
if call_count["count"] >= 2:
|
|
c.test_after_update_done = True
|
|
|
|
monkeypatch.setattr("skyvern.forge.sdk.copilot.enforcement.Runner.run_streamed", fake_run_streamed)
|
|
monkeypatch.setattr(
|
|
"skyvern.forge.sdk.copilot.streaming_adapter.stream_to_sse",
|
|
fake_stream_to_sse,
|
|
)
|
|
|
|
returned = await run_with_enforcement(
|
|
agent=MagicMock(),
|
|
initial_input="hello",
|
|
ctx=ctx,
|
|
stream=stream,
|
|
)
|
|
assert returned is fake_result
|
|
assert ctx.post_update_nudge_count == 1
|
|
|
|
|
|
class TestGoalLikelyNeedsMoreBlocks:
|
|
@staticmethod
|
|
def _check(user_message: str, block_count: int) -> bool:
|
|
from skyvern.forge.sdk.copilot.enforcement import _goal_likely_needs_more_blocks
|
|
|
|
return _goal_likely_needs_more_blocks(user_message, block_count)
|
|
|
|
def test_navigate_and_download_needs_two(self) -> None:
|
|
assert self._check("Go to france.fr and then download regulations", 1) is True
|
|
assert self._check("Go to france.fr and then download regulations", 2) is False
|
|
|
|
def test_login_search_and_extract_needs_three(self) -> None:
|
|
assert self._check("Login to the site, search for products, and extract prices", 1) is True
|
|
assert self._check("Login to the site, search for products, and extract prices", 2) is True
|
|
assert self._check("Login to the site, search for products, and extract prices", 3) is False
|
|
|
|
def test_single_action_does_not_need_more(self) -> None:
|
|
assert self._check("Go to france.fr", 1) is False
|
|
|
|
def test_sequential_connector_needs_at_least_two(self) -> None:
|
|
assert self._check("Do X and then do Y", 1) is True
|
|
|
|
def test_ten_plus_blocks_always_false(self) -> None:
|
|
assert self._check("Go to X and then download Y and extract Z", 10) is False
|
|
|
|
def test_non_string_returns_false(self) -> None:
|
|
assert self._check(None, 1) is False # type: ignore[arg-type]
|
|
assert self._check(123, 1) is False # type: ignore[arg-type]
|