diff --git a/tests/unit/test_copilot_cancel_helpers.py b/tests/unit/test_copilot_cancel_helpers.py index 8ae451c1d..c6554c263 100644 --- a/tests/unit/test_copilot_cancel_helpers.py +++ b/tests/unit/test_copilot_cancel_helpers.py @@ -238,7 +238,7 @@ def test_trusted_post_drain_handles_missing_row() -> None: from types import SimpleNamespace as _NS # noqa: E402 (grouped with test block) -from skyvern.forge.sdk.copilot.tools import _BLOCK_RUNNING_TOOLS, _tool_loop_error # noqa: E402 +from skyvern.forge.sdk.copilot.tools import BLOCK_RUNNING_TOOLS, _tool_loop_error # noqa: E402 def _guard_ctx(pending_run_id: str | None = None) -> _NS: @@ -251,9 +251,9 @@ def _guard_ctx(pending_run_id: str | None = None) -> _NS: ) -@pytest.mark.parametrize("tool_name", sorted(_BLOCK_RUNNING_TOOLS)) +@pytest.mark.parametrize("tool_name", sorted(BLOCK_RUNNING_TOOLS)) def test_reconciliation_guard_blocks_block_running_tools(tool_name: str) -> None: - """With a pending run set, every tool in ``_BLOCK_RUNNING_TOOLS`` is + """With a pending run set, every tool in ``BLOCK_RUNNING_TOOLS`` is rejected with an error that names the run id and directs the LLM to call ``get_run_results`` first.""" ctx = _guard_ctx(pending_run_id="wr_pending_123") @@ -268,7 +268,7 @@ def test_reconciliation_guard_blocks_block_running_tools(tool_name: str) -> None ["get_run_results", "update_workflow", "list_credentials"], ) def test_reconciliation_guard_ignores_non_block_running_tools(tool_name: str) -> None: - """The guard is scoped to ``_BLOCK_RUNNING_TOOLS``. Planning / metadata + """The guard is scoped to ``BLOCK_RUNNING_TOOLS``. Planning / metadata tools (including ``get_run_results`` itself, which is the tool that can CLEAR the flag) must not be rejected.""" ctx = _guard_ctx(pending_run_id="wr_pending_123") @@ -279,7 +279,7 @@ def test_reconciliation_guard_passes_when_flag_empty() -> None: """No pending run → `_tool_loop_error` returns None for block-running tools (assuming no other guard trips).""" ctx = _guard_ctx(pending_run_id=None) - for name in _BLOCK_RUNNING_TOOLS: + for name in BLOCK_RUNNING_TOOLS: assert _tool_loop_error(ctx, name) is None @@ -288,7 +288,7 @@ def test_reconciliation_guard_rejects_empty_string_run_id() -> None: not set. Prevents a spurious guard trip if anything ever clears the flag to ``''`` instead of ``None``.""" ctx = _guard_ctx(pending_run_id="") - for name in _BLOCK_RUNNING_TOOLS: + for name in BLOCK_RUNNING_TOOLS: assert _tool_loop_error(ctx, name) is None diff --git a/tests/unit/test_copilot_enforcement_pruning.py b/tests/unit/test_copilot_enforcement_pruning.py index 7cead0262..a0344184d 100644 --- a/tests/unit/test_copilot_enforcement_pruning.py +++ b/tests/unit/test_copilot_enforcement_pruning.py @@ -25,7 +25,11 @@ from skyvern.forge.sdk.copilot.enforcement import ( _prune_input_list, _summarize_tool_output, ) -from skyvern.forge.sdk.copilot.tools import _is_meaningful_extracted_data +from skyvern.forge.sdk.copilot.tools import ( + _analyze_run_blocks, + _is_meaningful_extracted_data, + _record_run_blocks_result, +) class _Ctx: @@ -107,6 +111,169 @@ def test_meaningful_data_string() -> None: assert _is_meaningful_extracted_data("$260.48") is True +# --------------------------------------------------------------------------- +# _analyze_run_blocks — envelope-unwrap for EXTRACTION blocks +# +# ExtractionBlock stores TaskOutput.from_task() on block.output. Envelope +# fields (task_id, status, *_screenshot_artifact_ids) are always populated on +# a completed run and would short-circuit _is_meaningful_extracted_data to +# True even when the real payload fields (extracted_information, +# downloaded_files, downloaded_file_urls) are empty. The meaningful-data +# check must judge against the payload slice, not the envelope. +# --------------------------------------------------------------------------- + + +_EMPTY_EXTRACTION_ENVELOPE: dict[str, Any] = { + "task_id": "tsk_00000000000000000001", + "status": "completed", + "extracted_information": [], + "failure_reason": None, + "errors": [], + "failure_category": None, + "downloaded_files": [], + "downloaded_file_urls": None, + "task_screenshots": None, + "workflow_screenshots": None, + "task_screenshot_artifact_ids": ["a_00000000000000000001", "a_00000000000000000002"], + "workflow_screenshot_artifact_ids": ["a_00000000000000000001", "a_00000000000000000003"], +} + + +def _run_result(blocks: list[dict[str, Any]], ok: bool = True) -> dict[str, Any]: + return {"ok": ok, "data": {"blocks": blocks}} + + +def _envelope(**overrides: Any) -> dict[str, Any]: + """Return a fresh copy of the empty-extraction envelope with field overrides.""" + return {**_EMPTY_EXTRACTION_ENVELOPE, **overrides} + + +def _extraction_block(extracted_data: dict[str, Any]) -> dict[str, Any]: + return { + "label": "extract_flights", + "block_type": "EXTRACTION", + "status": "completed", + "extracted_data": extracted_data, + } + + +def _text_prompt_block(extracted_data: Any) -> dict[str, Any]: + return { + "label": "summarize", + "block_type": "TEXT_PROMPT", + "status": "completed", + "extracted_data": extracted_data, + } + + +# Case id -> (envelope overrides, expected empty_data_blocks) +# +# empty_payload_trace_repro: extracted_information=[], downloaded_files=[], +# downloaded_file_urls=None, envelope metadata populated. Envelope-as-a-whole +# is truthy; real payload is empty; gate must flip. (SKY-9143 repro.) +# download_only_files / download_only_urls: legitimate extraction success where the +# block produced files but no structured payload — must NOT flip the gate. +_EXTRACTION_ENVELOPE_CASES: list[tuple[str, dict[str, Any], bool]] = [ + ("empty_payload_trace_repro", {}, True), + ("real_extraction", {"extracted_information": [{"price": "260.48"}]}, False), + ( + "download_only_files", + {"downloaded_files": [{"url": "https://example.com/a.pdf", "checksum": "abc123"}]}, + False, + ), + ( + "download_only_urls", + {"extracted_information": None, "downloaded_file_urls": ["https://example.com/a.pdf"]}, + False, + ), +] + + +@pytest.mark.parametrize( + "overrides,expected_empty", + [(ovr, exp) for _, ovr, exp in _EXTRACTION_ENVELOPE_CASES], + ids=[case_id for case_id, _, _ in _EXTRACTION_ENVELOPE_CASES], +) +def test_analyze_extraction_envelope(overrides: dict[str, Any], expected_empty: bool) -> None: + _, empty, _ = _analyze_run_blocks(_run_result([_extraction_block(_envelope(**overrides))])) + assert empty is expected_empty + + +def test_analyze_text_prompt_default_schema_is_not_empty() -> None: + # TEXT_PROMPT blocks return the raw LLM response dict (no Task envelope). + # Default schema is {"llm_response": ""}. + _, empty, _ = _analyze_run_blocks(_run_result([_text_prompt_block({"llm_response": "the sentiment is positive"})])) + assert empty is False + + +def test_analyze_text_prompt_user_schema_named_extracted_information_is_not_sliced() -> None: + # Guard against a too-broad unwrap: a user's json_schema may name a + # top-level field "extracted_information". The helper must not mistake + # that for an EXTRACTION envelope and discard sibling fields. + block = _text_prompt_block({"extracted_information": "ignored because this is TEXT_PROMPT", "summary": "x"}) + _, empty, _ = _analyze_run_blocks(_run_result([block])) + assert empty is False + + +def test_analyze_text_prompt_all_null_is_empty() -> None: + # Symmetric to {"price": None} — a text-prompt response with all-null + # fields counts as no meaningful output. + _, empty, _ = _analyze_run_blocks(_run_result([_text_prompt_block({"summary": None})])) + assert empty is True + + +# --------------------------------------------------------------------------- +# _record_run_blocks_result — end-to-end flip of last_test_ok on empty envelope +# --------------------------------------------------------------------------- + + +def _fresh_ctx_for_record() -> Any: + """SimpleNamespace shaped for _record_run_blocks_result + update_repeated_failure_state. + + Uses getattr-with-default-compatible defaults so the function under test + populates the interesting fields without tripping AttributeError on the + downstream update_repeated_failure_state call. + """ + from types import SimpleNamespace + + return SimpleNamespace( + last_test_ok=True, + last_test_failure_reason=None, + last_test_suspicious_success=False, + last_test_anti_bot=None, + last_failure_category_top=None, + last_test_non_retriable_nav_error=None, + null_data_streak_count=0, + failed_test_nudge_count=0, + last_failed_workflow_yaml=None, + non_retriable_nav_error_last_emitted_signature=None, + workflow_yaml=None, + last_workflow=None, + last_frontier_start_label=None, + last_executed_block_labels=[], + last_failure_signature=None, + last_frontier_fingerprint=None, + repeated_failure_streak_count=0, + repeated_failure_nudge_emitted_at_streak=0, + pending_action_sequence_fingerprint=None, + last_action_sequence_fingerprint=None, + repeated_action_fingerprint_streak_count=0, + ) + + +def test_record_run_blocks_result_flips_last_test_ok_on_empty_extraction_envelope() -> None: + # End-to-end: a run reporting ok=true but whose sole EXTRACTION block + # produced the empty envelope must push last_test_ok from True to None, + # so _verified_workflow_or_none blocks the proposal. This is the user- + # visible SKY-9143 regression. + ctx = _fresh_ctx_for_record() + result = _run_result([_extraction_block(_envelope())]) + _record_run_blocks_result(ctx, result) + assert ctx.last_test_ok is None + assert ctx.last_test_suspicious_success is True + assert ctx.last_test_failure_reason is not None + + # --------------------------------------------------------------------------- # Repeated null-data escalation # --------------------------------------------------------------------------- diff --git a/tests/unit/test_copilot_non_retriable_nav.py b/tests/unit/test_copilot_non_retriable_nav.py index d79b8e419..8bdf63952 100644 --- a/tests/unit/test_copilot_non_retriable_nav.py +++ b/tests/unit/test_copilot_non_retriable_nav.py @@ -570,7 +570,7 @@ def test_tool_loop_error_blocks_run_blocks_and_collect_debug_after_dns_failure() def test_tool_loop_error_does_not_block_planning_tools() -> None: # get_run_results / list_credentials / update_workflow are scoped out of - # _BLOCK_RUNNING_TOOLS and should remain callable so the agent can inspect + # BLOCK_RUNNING_TOOLS and should remain callable so the agent can inspect # the failure and decide how to respond to the user. ctx = _fresh_context() ctx.last_test_non_retriable_nav_error = _DNS_FAILURE_REASON