# Skyvern/tests/unit/test_copilot_output_utils.py

"""Tests for truncate_output, sanitize_tool_result_for_llm, summarize_tool_result, and parse_final_response."""

from __future__ import annotations

from unittest.mock import MagicMock

from skyvern.forge.sdk.copilot.output_utils import (
    parse_final_response,
    sanitize_tool_result_for_llm,
    summarize_tool_result,
    truncate_output,
)


def test_truncate_output_none() -> None:
    """Passing ``None`` must give ``None`` back."""
    outcome = truncate_output(None)
    assert outcome is None


def test_truncate_output_short_string() -> None:
    """The two-character string ``"ok"`` is returned unchanged."""
    outcome = truncate_output("ok")
    assert outcome == "ok"


def test_truncate_output_long_string_truncates() -> None:
    """A 2100-char input with ``max_chars=2000`` keeps its first 2000 chars and ends with the marker."""
    limit = 2000
    oversized = "x" * (limit + 100)

    shortened = truncate_output(oversized, max_chars=limit)

    assert shortened is not None
    assert shortened.startswith("x" * limit)
    assert shortened.endswith("\n... [truncated]")


def test_truncate_output_serializes_dict() -> None:
    """A small dict is rendered as its JSON text (note ``True`` -> ``true``)."""
    payload = {"a": 1, "b": True}
    expected_json = '{"a": 1, "b": true}'

    assert truncate_output(payload) == expected_json


def test_truncate_output_falls_back_to_str_on_json_error() -> None:
    """A dict that contains itself must still produce a non-``None`` result mentioning its key."""
    looped: dict[str, object] = {}
    # Self-reference: this structure cannot be encoded as JSON.
    looped["self"] = looped

    rendered = truncate_output(looped)

    assert rendered is not None
    assert "self" in rendered


def test_sanitize_get_run_results_scrubs_nested_block_screenshots() -> None:
    """For ``get_run_results``, each block's ``screenshot_b64`` is swapped for a placeholder.

    The other per-block fields checked here (``status``, ``failure_reason``)
    must come through untouched.
    """
    placeholder = "[base64 image omitted — screenshot was taken successfully]"
    open_page_block = {
        "label": "open_page",
        "status": "completed",
        "screenshot_b64": "iVBORw0KGgoAAAANSUhEUgAAA" + "A" * 500,
    }
    extract_block = {
        "label": "extract_data",
        "status": "failed",
        "failure_reason": "timeout",
        "screenshot_b64": "iVBORw0KGgo" + "B" * 800,
    }
    tool_result = {
        "ok": True,
        "data": {
            "workflow_run_id": "wr_123",
            "overall_status": "failed",
            "blocks": [open_page_block, extract_block],
        },
    }

    cleaned = sanitize_tool_result_for_llm("get_run_results", tool_result)
    cleaned_blocks = cleaned["data"]["blocks"]

    assert cleaned_blocks[0]["screenshot_b64"] == placeholder
    assert cleaned_blocks[1]["screenshot_b64"] == placeholder
    assert cleaned_blocks[1]["failure_reason"] == "timeout"
    assert cleaned_blocks[0]["status"] == "completed"


def test_sanitize_does_not_mutate_original_blocks() -> None:
    """Sanitizing must leave the caller's block dict intact and return a distinct block object."""
    raw_screenshot = "iVBORw0KGgo" + "B" * 500
    input_block = {"label": "extract", "screenshot_b64": raw_screenshot}
    tool_result = {
        "ok": True,
        "data": {
            "blocks": [input_block],
        },
    }

    cleaned = sanitize_tool_result_for_llm("get_run_results", tool_result)

    # The input still carries the real screenshot...
    assert input_block["screenshot_b64"] == raw_screenshot
    # ...while the output carries the placeholder on a different dict object.
    cleaned_block = cleaned["data"]["blocks"][0]
    assert cleaned_block["screenshot_b64"].startswith("[base64 image omitted")
    assert cleaned_block is not input_block


def test_sanitize_run_blocks_debug_does_not_mutate_extracted_data() -> None:
    """After sanitizing, the input block must still reference the very same ``extracted_data`` list."""
    extracted_rows = [{"price": 19.99, "name": "widget"}]
    input_block = {"label": "extract", "extracted_data": extracted_rows}
    tool_result = {
        "ok": True,
        "data": {
            "blocks": [input_block],
        },
    }

    # Return value is irrelevant here; only the effect on the input matters.
    sanitize_tool_result_for_llm("run_blocks_and_collect_debug", tool_result)

    assert input_block["extracted_data"] is extracted_rows


def test_sanitize_other_tools_do_not_touch_block_screenshot_b64() -> None:
    """A block-level ``screenshot_b64`` passes through unchanged for ``run_blocks_and_collect_debug``."""
    # `run_blocks_and_collect_debug` does not attach nested `screenshot_b64`;
    # if one somehow shows up there, leave it alone so behavior is scoped.
    stray_block = {
        "label": "a",
        "status": "completed",
        "screenshot_b64": "stays_here",
    }
    tool_result = {
        "ok": True,
        "data": {
            "overall_status": "completed",
            "blocks": [stray_block],
        },
    }

    cleaned = sanitize_tool_result_for_llm("run_blocks_and_collect_debug", tool_result)

    first_block = cleaned["data"]["blocks"][0]
    assert first_block["screenshot_b64"] == "stays_here"


class TestSanitization:
    """Checks on ``sanitize_tool_result_for_llm`` for several tool names.

    The sanitizer is taken from the module-level import, matching how the
    other helpers in this file are used, instead of being re-imported inside
    every test method.
    """

    def test_screenshot_sanitization(self) -> None:
        """``data.screenshot_base64`` is replaced by a placeholder while ``data.url`` is kept."""
        result = {
            "ok": True,
            "data": {
                "screenshot_base64": "iVBOR...",
                "url": "https://example.com",
            },
        }
        sanitized = sanitize_tool_result_for_llm("get_browser_screenshot", result)
        expected = "[base64 image omitted — screenshot was taken successfully]"
        assert sanitized["data"]["screenshot_base64"] == expected
        assert sanitized["data"]["url"] == "https://example.com"

    def test_mcp_fields_stripped(self) -> None:
        """Top-level MCP bookkeeping keys and ``data.sdk_equivalent`` are absent from the output."""
        result = {
            "ok": True,
            "action": "skyvern_navigate",
            "browser_context": {"mode": "cloud_session"},
            "timing_ms": {"total": 500},
            "artifacts": [],
            "data": {
                "url": "https://example.com",
                "sdk_equivalent": "await page.goto(...)",
            },
        }
        sanitized = sanitize_tool_result_for_llm("navigate_browser", result)
        assert "action" not in sanitized
        assert "browser_context" not in sanitized
        assert "timing_ms" not in sanitized
        assert "artifacts" not in sanitized
        # `.get` keeps this check valid even if `data` itself were dropped.
        assert "sdk_equivalent" not in sanitized.get("data", {})

    def test_workflow_key_stripped(self) -> None:
        """The ``_workflow`` key is removed from the output."""
        result = {
            "ok": True,
            "data": {"block_count": 2},
            "_workflow": MagicMock(),
        }
        sanitized = sanitize_tool_result_for_llm("update_workflow", result)
        assert "_workflow" not in sanitized

    def test_large_schema_truncated(self) -> None:
        """A 200-field schema comes back with ``_truncated`` set to ``True``."""
        big_schema = {f"field_{i}": {"type": "string"} for i in range(200)}
        result = {
            "ok": True,
            "data": {"schema": big_schema},
        }
        sanitized = sanitize_tool_result_for_llm("get_block_schema", result)
        assert sanitized["data"]["schema"]["_truncated"] is True

    def test_run_blocks_sanitizer_preserves_compact_packet_fields(self) -> None:
        """The compact run-blocks fields asserted below pass through the sanitizer unchanged."""
        # visible_elements_html is no longer in the default run-blocks payload
        # (it moved to the heavier get_run_results / direct browser path). The
        # sanitizer should leave the compact packet fields intact.
        result = {
            "ok": False,
            "data": {
                "workflow_run_id": "wr_1",
                "overall_status": "failed",
                "requested_block_labels": ["a", "b"],
                "executed_block_labels": ["b"],
                "frontier_start_label": "b",
                "current_url": "https://example.test",
                "page_title": "Example",
                "action_trace_summary": ["click #submit failed"],
                "blocks": [{"label": "b", "block_type": "EXTRACTION", "status": "failed"}],
            },
        }
        sanitized = sanitize_tool_result_for_llm("run_blocks_and_collect_debug", result)
        data = sanitized["data"]
        assert "visible_elements_html" not in data
        assert data["requested_block_labels"] == ["a", "b"]
        assert data["executed_block_labels"] == ["b"]
        assert data["frontier_start_label"] == "b"
        assert data["action_trace_summary"] == ["click #submit failed"]
        assert data["current_url"] == "https://example.test"


class TestSummarizeToolResult:
    """Checks on the summary strings returned by ``summarize_tool_result``."""

    @staticmethod
    def _summarize(tool_name: str, result: dict) -> str:
        """Forward both arguments to ``summarize_tool_result`` and return its result."""
        summary_text = summarize_tool_result(tool_name, result)
        return summary_text

    def test_error_result(self) -> None:
        """A failed result with ``error`` yields a summary containing "Failed" and the error text."""
        failed_call = {"ok": False, "error": "oops"}

        text = self._summarize("any_tool", failed_call)

        assert "Failed" in text
        assert "oops" in text

    def test_failed_run_surfaces_block_failure_reason_when_error_absent(self) -> None:
        """With no top-level ``error``, the block's ``failure_reason`` shows up in the summary."""
        navigate_failure = (
            "Failed to navigate to url https://example.invalid. "
            "Error message: net::ERR_NAME_NOT_RESOLVED"
        )
        failed_block = {
            "label": "navigate",
            "status": "failed",
            "failure_reason": navigate_failure,
        }
        payload = {
            "ok": False,
            "data": {
                "overall_status": "failed",
                "blocks": [failed_block],
            },
        }

        text = self._summarize("run_blocks_and_collect_debug", payload)

        assert "ERR_NAME_NOT_RESOLVED" in text
        assert "Unknown error" not in text

    def test_failed_run_prefers_top_level_error_over_nested(self) -> None:
        """When both exist, the top-level ``error`` is reported and the block reason is not."""
        payload = {
            "ok": False,
            "error": "top-level message",
            "data": {"blocks": [{"failure_reason": "nested message"}]},
        }

        text = self._summarize("run_blocks_and_collect_debug", payload)

        assert "top-level message" in text
        assert "nested message" not in text

    def test_failed_run_prefers_data_failure_reason_over_block_failure_reason(self) -> None:
        """``data.failure_reason`` is reported in preference to a block's ``failure_reason``."""
        run_data = {
            "failure_reason": "run-level",
            "blocks": [{"failure_reason": "block-level"}],
        }
        payload = {"ok": False, "data": run_data}

        text = self._summarize("run_blocks_and_collect_debug", payload)

        assert "run-level" in text
        assert "block-level" not in text

    def test_failed_run_falls_back_to_unknown_error_when_nothing_present(self) -> None:
        """With no error text anywhere, the summary contains "Unknown error"."""
        payload = {"ok": False, "data": {"blocks": []}}

        text = self._summarize("run_blocks_and_collect_debug", payload)

        assert "Unknown error" in text

    def test_update_workflow(self) -> None:
        """The ``block_count`` value appears in the ``update_workflow`` summary."""
        payload = {
            "ok": True,
            "data": {"block_count": 3},
        }

        text = self._summarize("update_workflow", payload)

        assert "3" in text

    def test_navigate_browser(self) -> None:
        """A successful ``navigate_browser`` result with a top-level ``url`` gives the exact sentence below."""
        payload = {
            "ok": True,
            "url": "https://example.com",
        }

        text = self._summarize("navigate_browser", payload)

        assert text == "Navigated to https://example.com"

    def test_type_text_typed_length(self) -> None:
        """The ``typed_length`` value appears in the ``type_text`` summary."""
        payload = {
            "ok": True,
            "data": {"selector": "#email", "typed_length": 10},
        }

        text = self._summarize("type_text", payload)

        assert "10" in text

    def test_type_text_text_length(self) -> None:
        """The ``text_length`` value appears in the ``type_text`` summary."""
        payload = {
            "ok": True,
            "data": {"selector": "#email", "text_length": 20},
        }

        text = self._summarize("type_text", payload)

        assert "20" in text

    def test_unknown_tool_returns_ok(self) -> None:
        """A successful result for an unrecognized tool name is summarized as exactly "OK"."""
        text = self._summarize("unknown_tool", {"ok": True})

        assert text == "OK"


class TestParseFinalResponse:
    """parse_final_response sits between the model's final output and the frontend.

    When parsing fails it falls back to `{"type": "REPLY", "user_response": text}`,
    so the raw JSON object ends up rendered in the chat bubble. Real model
    outputs sometimes carry literal newlines inside string values, which strict
    `json.loads` rejects — observed in SKY-9189 test-2, where the whole refusal
    envelope was shown in the user bubble instead of only user_response.
    """

    def test_parses_clean_json_envelope(self) -> None:
        """A well-formed envelope is returned as the equivalent dict."""
        raw = '{"type": "ASK_QUESTION", "user_response": "hi"}'
        expected = {"type": "ASK_QUESTION", "user_response": "hi"}

        assert parse_final_response(raw) == expected

    def test_strips_json_code_fence(self) -> None:
        """An envelope wrapped in a ```json fence is still parsed."""
        fenced = '```json\n{"type": "REPLY", "user_response": "ok"}\n```'

        parsed = parse_final_response(fenced)

        assert parsed["type"] == "REPLY"

    def test_tolerates_literal_newline_inside_string_value(self) -> None:
        """A raw newline inside ``user_response`` must not defeat parsing."""
        # Real model output shape: a multi-line user_response split across
        # actual newlines instead of \n escapes. strict=True rejects this,
        # strict=False accepts it. Without the fallback, the whole JSON blob
        # gets shown to the user.
        raw = '{"type": "ASK_QUESTION", "user_response": "line one\nline two"}'

        parsed = parse_final_response(raw)

        assert parsed["type"] == "ASK_QUESTION"
        assert parsed["user_response"] == "line one\nline two"

    def test_unparseable_text_falls_back_to_reply(self) -> None:
        """Text that is not JSON is wrapped in a REPLY envelope."""
        # Genuinely broken output still degrades gracefully.
        raw = "not json at all"

        parsed = parse_final_response(raw)

        assert parsed == {"type": "REPLY", "user_response": "not json at all"}

    def test_non_dict_json_falls_back_to_reply(self) -> None:
        """Valid JSON whose top level is not an object is wrapped in a REPLY envelope."""
        # A JSON array at top level is valid JSON but not a valid envelope.
        raw = "[1, 2, 3]"

        parsed = parse_final_response(raw)

        assert parsed == {"type": "REPLY", "user_response": "[1, 2, 3]"}