Skyvern/tests/unit/test_copilot_output_utils.py

366 lines
13 KiB
Python

"""Tests for truncate_output, sanitize_tool_result_for_llm, summarize_tool_result and parse_final_response."""
from __future__ import annotations
from unittest.mock import MagicMock
from skyvern.forge.sdk.copilot.output_utils import (
parse_final_response,
sanitize_tool_result_for_llm,
summarize_tool_result,
truncate_output,
)
def test_truncate_output_none() -> None:
    """``None`` in is expected to be ``None`` out."""
    outcome = truncate_output(None)
    assert outcome is None
def test_truncate_output_short_string() -> None:
    """A short string is expected to come back verbatim."""
    outcome = truncate_output("ok")
    assert outcome == "ok"
def test_truncate_output_long_string_truncates() -> None:
    """Text longer than ``max_chars`` keeps its head and ends with the truncation marker."""
    limit = 2000
    oversized = "x" * (limit + 100)

    shortened = truncate_output(oversized, max_chars=limit)

    assert shortened is not None
    # The first ``limit`` characters survive, followed (at the end) by the marker.
    assert shortened.startswith("x" * limit)
    assert shortened.endswith("\n... [truncated]")
def test_truncate_output_serializes_dict() -> None:
    """A dict input is expected to come back as its JSON text."""
    payload = {"a": 1, "b": True}
    expected_json = '{"a": 1, "b": true}'
    assert truncate_output(payload) == expected_json
def test_truncate_output_falls_back_to_str_on_json_error() -> None:
    """A self-referencing dict must still produce a string that mentions its key."""
    # A dict containing itself cannot be encoded by ``json.dumps`` (circular
    # reference); per the test name this is meant to drive the ``str`` fallback.
    looped: dict[str, object] = {}
    looped["self"] = looped

    rendered = truncate_output(looped)

    assert rendered is not None
    assert "self" in rendered
def test_sanitize_get_run_results_scrubs_nested_block_screenshots() -> None:
    """For ``get_run_results``, each block's ``screenshot_b64`` becomes a placeholder.

    The other per-block fields checked here (``status``, ``failure_reason``)
    must pass through untouched.
    """
    placeholder = "[base64 image omitted — screenshot was taken successfully]"
    completed_block = {
        "label": "open_page",
        "status": "completed",
        "screenshot_b64": "iVBORw0KGgoAAAANSUhEUgAAA" + "A" * 500,
    }
    failed_block = {
        "label": "extract_data",
        "status": "failed",
        "failure_reason": "timeout",
        "screenshot_b64": "iVBORw0KGgo" + "B" * 800,
    }
    tool_result = {
        "ok": True,
        "data": {
            "workflow_run_id": "wr_123",
            "overall_status": "failed",
            "blocks": [completed_block, failed_block],
        },
    }

    sanitized = sanitize_tool_result_for_llm("get_run_results", tool_result)
    scrubbed = sanitized["data"]["blocks"]

    assert scrubbed[0]["screenshot_b64"] == placeholder
    assert scrubbed[1]["screenshot_b64"] == placeholder
    assert scrubbed[1]["failure_reason"] == "timeout"
    assert scrubbed[0]["status"] == "completed"
def test_sanitize_does_not_mutate_original_blocks() -> None:
    """Sanitizing must return a scrubbed, distinct block and leave the input block intact."""
    raw_screenshot = "iVBORw0KGgo" + "B" * 500
    input_block = {"label": "extract", "screenshot_b64": raw_screenshot}
    tool_result = {
        "ok": True,
        "data": {
            "blocks": [input_block],
        },
    }

    sanitized = sanitize_tool_result_for_llm("get_run_results", tool_result)

    # The caller's block still carries the real screenshot payload...
    assert input_block["screenshot_b64"] == raw_screenshot
    # ...while the returned block is a different object holding the placeholder.
    output_block = sanitized["data"]["blocks"][0]
    assert output_block["screenshot_b64"].startswith("[base64 image omitted")
    assert output_block is not input_block
def test_sanitize_run_blocks_debug_does_not_mutate_extracted_data() -> None:
    """After sanitizing, the input block must still reference the very same ``extracted_data`` list."""
    extracted_rows = [{"price": 19.99, "name": "widget"}]
    input_block = {"label": "extract", "extracted_data": extracted_rows}
    tool_result = {
        "ok": True,
        "data": {
            "blocks": [input_block],
        },
    }

    # Return value is deliberately ignored: only the input's state matters here.
    sanitize_tool_result_for_llm("run_blocks_and_collect_debug", tool_result)

    assert input_block["extracted_data"] is extracted_rows
def test_sanitize_other_tools_do_not_touch_block_screenshot_b64() -> None:
    """Nested ``screenshot_b64`` is left as-is for ``run_blocks_and_collect_debug``."""
    # `run_blocks_and_collect_debug` does not attach nested `screenshot_b64`;
    # if one somehow shows up there, leave it alone so behavior is scoped.
    stray_block = {
        "label": "a",
        "status": "completed",
        "screenshot_b64": "stays_here",
    }
    tool_result = {
        "ok": True,
        "data": {
            "overall_status": "completed",
            "blocks": [stray_block],
        },
    }

    sanitized = sanitize_tool_result_for_llm("run_blocks_and_collect_debug", tool_result)

    returned_block = sanitized["data"]["blocks"][0]
    assert returned_block["screenshot_b64"] == "stays_here"
class TestSanitization:
    """Checks on what ``sanitize_tool_result_for_llm`` replaces, strips, or keeps.

    ``sanitize_tool_result_for_llm`` is imported once at module level, so the
    individual tests do not re-import it.
    """

    def test_screenshot_sanitization(self) -> None:
        """``screenshot_base64`` is swapped for a placeholder while ``url`` is kept."""
        result = {
            "ok": True,
            "data": {
                "screenshot_base64": "iVBOR...",
                "url": "https://example.com",
            },
        }
        sanitized = sanitize_tool_result_for_llm("get_browser_screenshot", result)
        expected = "[base64 image omitted — screenshot was taken successfully]"
        assert sanitized["data"]["screenshot_base64"] == expected
        assert sanitized["data"]["url"] == "https://example.com"

    def test_mcp_fields_stripped(self) -> None:
        """Top-level MCP bookkeeping keys and ``data.sdk_equivalent`` are removed."""
        result = {
            "ok": True,
            "action": "skyvern_navigate",
            "browser_context": {"mode": "cloud_session"},
            "timing_ms": {"total": 500},
            "artifacts": [],
            "data": {
                "url": "https://example.com",
                "sdk_equivalent": "await page.goto(...)",
            },
        }
        sanitized = sanitize_tool_result_for_llm("navigate_browser", result)
        assert "action" not in sanitized
        assert "browser_context" not in sanitized
        assert "timing_ms" not in sanitized
        assert "artifacts" not in sanitized
        # ``.get`` keeps this assertion meaningful even if ``data`` itself is dropped.
        assert "sdk_equivalent" not in sanitized.get("data", {})

    def test_workflow_key_stripped(self) -> None:
        """The private ``_workflow`` entry must not appear in the sanitized result."""
        result = {
            "ok": True,
            "data": {"block_count": 2},
            # Stand-in for a non-serializable workflow object.
            "_workflow": MagicMock(),
        }
        sanitized = sanitize_tool_result_for_llm("update_workflow", result)
        assert "_workflow" not in sanitized

    def test_large_schema_truncated(self) -> None:
        """A 200-field schema is expected to come back flagged with ``_truncated``."""
        big_schema = {f"field_{i}": {"type": "string"} for i in range(200)}
        result = {
            "ok": True,
            "data": {"schema": big_schema},
        }
        sanitized = sanitize_tool_result_for_llm("get_block_schema", result)
        assert sanitized["data"]["schema"]["_truncated"] is True

    def test_run_blocks_sanitizer_preserves_compact_packet_fields(self) -> None:
        """Compact run-blocks packet fields pass through the sanitizer unchanged."""
        # visible_elements_html is no longer in the default run-blocks payload
        # (it moved to the heavier get_run_results / direct browser path). The
        # sanitizer should leave the compact packet fields intact.
        result = {
            "ok": False,
            "data": {
                "workflow_run_id": "wr_1",
                "overall_status": "failed",
                "requested_block_labels": ["a", "b"],
                "executed_block_labels": ["b"],
                "frontier_start_label": "b",
                "current_url": "https://example.test",
                "page_title": "Example",
                "action_trace_summary": ["click #submit failed"],
                "blocks": [{"label": "b", "block_type": "EXTRACTION", "status": "failed"}],
            },
        }
        sanitized = sanitize_tool_result_for_llm("run_blocks_and_collect_debug", result)
        data = sanitized["data"]
        assert "visible_elements_html" not in data
        assert data["requested_block_labels"] == ["a", "b"]
        assert data["executed_block_labels"] == ["b"]
        assert data["frontier_start_label"] == "b"
        assert data["action_trace_summary"] == ["click #submit failed"]
        assert data["current_url"] == "https://example.test"
class TestSummarizeToolResult:
    """Expectations for the summary strings produced by ``summarize_tool_result``."""

    @staticmethod
    def _summarize(tool_name: str, result: dict) -> str:
        """Forward to ``summarize_tool_result`` so every test calls it the same way."""
        return summarize_tool_result(tool_name, result)

    def test_error_result(self) -> None:
        """A failed result carrying ``error`` yields a summary that names the failure."""
        failed = {"ok": False, "error": "oops"}
        text = self._summarize("any_tool", failed)
        assert "Failed" in text
        assert "oops" in text

    def test_failed_run_surfaces_block_failure_reason_when_error_absent(self) -> None:
        """With no top-level ``error``, a block's ``failure_reason`` reaches the summary."""
        failing_block = {
            "label": "navigate",
            "status": "failed",
            "failure_reason": (
                "Failed to navigate to url https://example.invalid. "
                "Error message: net::ERR_NAME_NOT_RESOLVED"
            ),
        }
        payload = {
            "ok": False,
            "data": {
                "overall_status": "failed",
                "blocks": [failing_block],
            },
        }
        text = self._summarize("run_blocks_and_collect_debug", payload)
        assert "ERR_NAME_NOT_RESOLVED" in text
        assert "Unknown error" not in text

    def test_failed_run_prefers_top_level_error_over_nested(self) -> None:
        """A top-level ``error`` wins over a nested block ``failure_reason``."""
        payload = {
            "ok": False,
            "error": "top-level message",
            "data": {"blocks": [{"failure_reason": "nested message"}]},
        }
        text = self._summarize("run_blocks_and_collect_debug", payload)
        assert "top-level message" in text
        assert "nested message" not in text

    def test_failed_run_prefers_data_failure_reason_over_block_failure_reason(self) -> None:
        """``data.failure_reason`` wins over a per-block ``failure_reason``."""
        payload = {
            "ok": False,
            "data": {
                "failure_reason": "run-level",
                "blocks": [{"failure_reason": "block-level"}],
            },
        }
        text = self._summarize("run_blocks_and_collect_debug", payload)
        assert "run-level" in text
        assert "block-level" not in text

    def test_failed_run_falls_back_to_unknown_error_when_nothing_present(self) -> None:
        """With no error text anywhere, the summary says ``Unknown error``."""
        payload = {"ok": False, "data": {"blocks": []}}
        text = self._summarize("run_blocks_and_collect_debug", payload)
        assert "Unknown error" in text

    def test_update_workflow(self) -> None:
        """The ``update_workflow`` summary mentions the block count."""
        payload = {
            "ok": True,
            "data": {"block_count": 3},
        }
        text = self._summarize("update_workflow", payload)
        assert "3" in text

    def test_navigate_browser(self) -> None:
        """The ``navigate_browser`` summary is exactly ``Navigated to <url>``."""
        payload = {
            "ok": True,
            "url": "https://example.com",
        }
        text = self._summarize("navigate_browser", payload)
        assert text == "Navigated to https://example.com"

    def test_type_text_typed_length(self) -> None:
        """``typed_length`` shows up in the ``type_text`` summary."""
        payload = {
            "ok": True,
            "data": {"selector": "#email", "typed_length": 10},
        }
        text = self._summarize("type_text", payload)
        assert "10" in text

    def test_type_text_text_length(self) -> None:
        """``text_length`` shows up in the ``type_text`` summary."""
        payload = {
            "ok": True,
            "data": {"selector": "#email", "text_length": 20},
        }
        text = self._summarize("type_text", payload)
        assert "20" in text

    def test_unknown_tool_returns_ok(self) -> None:
        """A successful result from an unrecognized tool summarizes to ``OK``."""
        text = self._summarize("unknown_tool", {"ok": True})
        assert text == "OK"
class TestParseFinalResponse:
    """Guards the last hop between model output and the frontend.

    When ``parse_final_response`` cannot parse its input it falls back to
    `{"type": "REPLY", "user_response": text}`, so the raw JSON object ends
    up rendered in the chat bubble. Real model outputs sometimes carry
    literal newlines inside string values, which strict `json.loads`
    rejects — observed in SKY-9189 test-2, where the whole refusal envelope
    landed in the user bubble instead of just user_response.
    """

    def test_parses_clean_json_envelope(self) -> None:
        """A well-formed JSON envelope is returned as the equivalent dict."""
        raw = '{"type": "ASK_QUESTION", "user_response": "hi"}'
        expected = {"type": "ASK_QUESTION", "user_response": "hi"}
        assert parse_final_response(raw) == expected

    def test_strips_json_code_fence(self) -> None:
        """An envelope wrapped in a ```json fence still yields its ``type``."""
        fenced = '```json\n{"type": "REPLY", "user_response": "ok"}\n```'
        result = parse_final_response(fenced)
        assert result["type"] == "REPLY"

    def test_tolerates_literal_newline_inside_string_value(self) -> None:
        """A raw newline inside a string value must not break envelope parsing."""
        # Real model output shape: a multi-line user_response split across
        # actual newlines instead of \n escapes. strict=True rejects this,
        # strict=False accepts it. Without the fallback, the whole JSON blob
        # gets shown to the user.
        raw = '{"type": "ASK_QUESTION", "user_response": "line one\nline two"}'
        result = parse_final_response(raw)
        assert result["type"] == "ASK_QUESTION"
        assert result["user_response"] == "line one\nline two"

    def test_unparseable_text_falls_back_to_reply(self) -> None:
        """Text that is not JSON is wrapped in a ``REPLY`` envelope."""
        # Genuinely broken output still degrades gracefully.
        result = parse_final_response("not json at all")
        assert result == {"type": "REPLY", "user_response": "not json at all"}

    def test_non_dict_json_falls_back_to_reply(self) -> None:
        """Valid JSON that is not an object is wrapped in a ``REPLY`` envelope."""
        # A JSON array at top level is valid JSON but not a valid envelope.
        result = parse_final_response("[1, 2, 3]")
        assert result == {"type": "REPLY", "user_response": "[1, 2, 3]"}