mirror of
https://github.com/Skyvern-AI/skyvern.git
synced 2026-04-28 03:30:10 +00:00
355 lines
15 KiB
Python
355 lines
15 KiB
Python
"""Tests for agent.py helpers that are hard to drive through run_copilot_agent."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from types import SimpleNamespace
|
|
from unittest.mock import MagicMock
|
|
|
|
|
|
def _ctx(**overrides):
|
|
from skyvern.forge.sdk.copilot.context import CopilotContext
|
|
|
|
defaults = dict(
|
|
organization_id="org-1",
|
|
workflow_id="wf-1",
|
|
workflow_permanent_id="wfp-1",
|
|
workflow_yaml="",
|
|
browser_session_id=None,
|
|
stream=MagicMock(),
|
|
)
|
|
defaults.update(overrides)
|
|
return CopilotContext(**defaults)
|
|
|
|
|
|
class TestFailedTestResponseNormalization:
|
|
def test_rewrite_failed_test_response_avoids_success_language(self) -> None:
|
|
from skyvern.forge.sdk.copilot.agent import _rewrite_failed_test_response
|
|
|
|
ctx = _ctx(
|
|
last_update_block_count=2,
|
|
last_test_ok=False,
|
|
last_test_failure_reason=(
|
|
"Failed to navigate to url https://bad.example. "
|
|
"Error: net::ERR_NAME_NOT_RESOLVED Call log: navigating..."
|
|
),
|
|
)
|
|
rewritten = _rewrite_failed_test_response("The workflow was successfully created.", ctx)
|
|
|
|
assert "successfully created" not in rewritten.lower()
|
|
assert "draft workflow with 2 blocks" in rewritten
|
|
assert "test failed" in rewritten.lower()
|
|
assert "Call log:" not in rewritten
|
|
|
|
def test_failed_run_does_not_clear_last_workflow_state(self) -> None:
|
|
from skyvern.forge.sdk.copilot.tools import _record_run_blocks_result
|
|
|
|
sentinel_workflow = object()
|
|
ctx = MagicMock()
|
|
ctx.last_workflow = sentinel_workflow
|
|
ctx.last_test_ok = None
|
|
ctx.last_test_failure_reason = None
|
|
|
|
_record_run_blocks_result(
|
|
ctx,
|
|
{
|
|
"ok": False,
|
|
"data": {
|
|
"blocks": [
|
|
{
|
|
"label": "open_website",
|
|
"failure_reason": "net::ERR_NAME_NOT_RESOLVED",
|
|
}
|
|
]
|
|
},
|
|
},
|
|
)
|
|
|
|
assert ctx.last_workflow is sentinel_workflow
|
|
assert ctx.last_test_ok is False
|
|
assert ctx.last_test_failure_reason == "net::ERR_NAME_NOT_RESOLVED"
|
|
|
|
def test_rewrite_includes_navigation_follow_up_when_category_matches(self) -> None:
|
|
from skyvern.forge.sdk.copilot.agent import _rewrite_failed_test_response
|
|
|
|
ctx = _ctx(
|
|
last_update_block_count=1,
|
|
last_test_ok=False,
|
|
last_test_failure_reason="Failed to navigate to url https://bad.example.",
|
|
last_failure_category_top="NAVIGATION_FAILURE",
|
|
)
|
|
rewritten = _rewrite_failed_test_response("done", ctx)
|
|
|
|
assert "test failed" in rewritten.lower()
|
|
assert "confirm the url" in rewritten.lower()
|
|
|
|
def test_rewrite_untested_edit_asks_for_more_context(self) -> None:
|
|
# SKY-9143 row 3: agent updated YAML without testing. The reply must
|
|
# not promise the backend can re-run a durable draft — the restore
|
|
# helper rolled it back and there is nothing to re-test next turn.
|
|
from skyvern.forge.sdk.copilot.agent import _rewrite_failed_test_response
|
|
|
|
sentinel_workflow = object()
|
|
ctx = _ctx(
|
|
last_update_block_count=1,
|
|
last_test_ok=None,
|
|
last_workflow=sentinel_workflow,
|
|
)
|
|
rewritten = _rewrite_failed_test_response("Here's the updated YAML.", ctx)
|
|
|
|
assert "drafted an update" in rewritten.lower()
|
|
assert "run it" not in rewritten.lower()
|
|
assert "more context" in rewritten.lower() or "clarify" in rewritten.lower()
|
|
|
|
def test_rewrite_passes_through_when_no_update_or_failure(self) -> None:
|
|
from skyvern.forge.sdk.copilot.agent import _rewrite_failed_test_response
|
|
|
|
ctx = _ctx()
|
|
original = "Let me know what you want to build."
|
|
assert _rewrite_failed_test_response(original, ctx) == original
|
|
|
|
|
|
class TestVerifiedWorkflowOrNone:
|
|
"""SKY-9143 strict invariant: a proposal surfaces only after a passing test this turn."""
|
|
|
|
def _wf(self) -> object:
|
|
return object()
|
|
|
|
def test_passes_workflow_when_tested_successfully(self) -> None:
|
|
from skyvern.forge.sdk.copilot.agent import _verified_workflow_or_none
|
|
|
|
wf = self._wf()
|
|
ctx = _ctx(last_workflow=wf, last_workflow_yaml="foo: bar", last_test_ok=True)
|
|
assert _verified_workflow_or_none(ctx) == (wf, "foo: bar")
|
|
|
|
def test_zeros_when_test_failed(self) -> None:
|
|
from skyvern.forge.sdk.copilot.agent import _verified_workflow_or_none
|
|
|
|
ctx = _ctx(last_workflow=self._wf(), last_workflow_yaml="foo: bar", last_test_ok=False)
|
|
assert _verified_workflow_or_none(ctx) == (None, None)
|
|
|
|
def test_zeros_when_untested_update(self) -> None:
|
|
# Exactly the scenario where _record_workflow_update_result reset
|
|
# last_test_ok to None after a standalone update_workflow or after
|
|
# the agent edited post-failure without re-testing.
|
|
from skyvern.forge.sdk.copilot.agent import _verified_workflow_or_none
|
|
|
|
ctx = _ctx(last_workflow=self._wf(), last_workflow_yaml="foo: bar", last_test_ok=None)
|
|
assert _verified_workflow_or_none(ctx) == (None, None)
|
|
|
|
def test_zeros_when_no_last_workflow(self) -> None:
|
|
from skyvern.forge.sdk.copilot.agent import _verified_workflow_or_none
|
|
|
|
ctx = _ctx(last_workflow=None, last_test_ok=True)
|
|
assert _verified_workflow_or_none(ctx) == (None, None)
|
|
|
|
def test_zeros_on_suspicious_success(self) -> None:
|
|
# _record_run_blocks_result sets last_test_ok=None when blocks ran ok
|
|
# but produced no meaningful extraction data. Still an unverified
|
|
# outcome; must not surface a proposal.
|
|
from skyvern.forge.sdk.copilot.agent import _verified_workflow_or_none
|
|
|
|
ctx = _ctx(
|
|
last_workflow=self._wf(),
|
|
last_workflow_yaml="foo: bar",
|
|
last_test_ok=None,
|
|
last_test_suspicious_success=True,
|
|
)
|
|
assert _verified_workflow_or_none(ctx) == (None, None)
|
|
|
|
|
|
class TestShouldRestorePersistedWorkflow:
|
|
"""SKY-9143: auto_accept=True must still restore when no proposal shipped."""
|
|
|
|
def _result(self, *, persisted: bool, updated_workflow: object | None):
|
|
r = MagicMock()
|
|
r.workflow_was_persisted = persisted
|
|
r.updated_workflow = updated_workflow
|
|
return r
|
|
|
|
def test_restores_when_no_proposal_even_under_auto_accept(self) -> None:
|
|
from skyvern.forge.sdk.routes.workflow_copilot import _should_restore_persisted_workflow
|
|
|
|
r = self._result(persisted=True, updated_workflow=None)
|
|
assert _should_restore_persisted_workflow(True, r) is True
|
|
|
|
def test_keeps_persisted_write_under_auto_accept_when_proposal_valid(self) -> None:
|
|
from skyvern.forge.sdk.routes.workflow_copilot import _should_restore_persisted_workflow
|
|
|
|
r = self._result(persisted=True, updated_workflow=object())
|
|
assert _should_restore_persisted_workflow(True, r) is False
|
|
|
|
def test_restores_when_not_auto_accept_and_persisted(self) -> None:
|
|
from skyvern.forge.sdk.routes.workflow_copilot import _should_restore_persisted_workflow
|
|
|
|
r = self._result(persisted=True, updated_workflow=object())
|
|
assert _should_restore_persisted_workflow(False, r) is True
|
|
|
|
def test_noop_when_nothing_was_persisted(self) -> None:
|
|
from skyvern.forge.sdk.routes.workflow_copilot import _should_restore_persisted_workflow
|
|
|
|
r = self._result(persisted=False, updated_workflow=None)
|
|
assert _should_restore_persisted_workflow(True, r) is False
|
|
assert _should_restore_persisted_workflow(False, r) is False
|
|
|
|
|
|
def _fake_run_result(payload: dict) -> SimpleNamespace:
|
|
"""Minimal shim for ``RunResultStreaming`` — extract_final_text reads ``final_output``."""
|
|
return SimpleNamespace(final_output=json.dumps(payload), new_items=[])
|
|
|
|
|
|
def _chat_request() -> SimpleNamespace:
|
|
return SimpleNamespace(
|
|
workflow_id="wf-1",
|
|
workflow_permanent_id="wfp-1",
|
|
workflow_copilot_chat_id="chat-1",
|
|
)
|
|
|
|
|
|
class TestTranslateToAgentResultGating:
|
|
"""Covers the three SKY-9143 invariants that live in _translate_to_agent_result."""
|
|
|
|
def test_inline_replace_workflow_resets_test_ok_after_prior_pass(self, monkeypatch) -> None:
|
|
# A prior run_blocks test passed for the old workflow (ctx.last_test_ok=True,
|
|
# ctx.last_workflow=old_wf). The agent then emits inline REPLACE_WORKFLOW
|
|
# with a different yaml. The translate helper must invalidate the prior
|
|
# test result so _verified_workflow_or_none rejects the untested REPLACE.
|
|
from skyvern.forge.sdk.copilot import agent as agent_module
|
|
|
|
old_wf = SimpleNamespace(name="old")
|
|
new_wf = SimpleNamespace(name="new-from-replace")
|
|
monkeypatch.setattr(
|
|
"skyvern.forge.sdk.copilot.tools._process_workflow_yaml",
|
|
lambda **kwargs: new_wf,
|
|
)
|
|
|
|
ctx = _ctx(last_workflow=old_wf, last_workflow_yaml="old: yaml", last_test_ok=True)
|
|
result = _fake_run_result(
|
|
{"type": "REPLACE_WORKFLOW", "user_response": "Here you go.", "workflow_yaml": "new: yaml"}
|
|
)
|
|
agent_result = agent_module._translate_to_agent_result(
|
|
result, ctx, global_llm_context=None, chat_request=_chat_request(), organization_id="org-1"
|
|
)
|
|
|
|
assert ctx.last_test_ok is None
|
|
assert ctx.last_workflow is new_wf
|
|
# The REPLACE yaml itself (not the stale snapshot) must land on ctx;
|
|
# otherwise a future code path that reads last_workflow_yaml would
|
|
# see a string that no longer matches last_workflow.
|
|
assert ctx.last_workflow_yaml == "new: yaml"
|
|
assert agent_result.updated_workflow is None
|
|
assert agent_result.workflow_yaml is None
|
|
assert agent_result.response_type == "REPLACE_WORKFLOW"
|
|
|
|
def test_inline_replace_with_invalid_yaml_keeps_prior_pass(self, monkeypatch) -> None:
|
|
# _process_workflow_yaml raising on a malformed REPLACE must leave
|
|
# ctx untouched — no spurious last_test_ok reset, no workflow swap —
|
|
# so a prior tested workflow remains available.
|
|
import yaml as yaml_mod
|
|
|
|
from skyvern.forge.sdk.copilot import agent as agent_module
|
|
|
|
tested_wf = SimpleNamespace(name="tested")
|
|
|
|
def boom(**kwargs):
|
|
raise yaml_mod.YAMLError("mangled yaml")
|
|
|
|
monkeypatch.setattr("skyvern.forge.sdk.copilot.tools._process_workflow_yaml", boom)
|
|
|
|
ctx = _ctx(last_workflow=tested_wf, last_workflow_yaml="tested: yaml", last_test_ok=True)
|
|
result = _fake_run_result(
|
|
{"type": "REPLACE_WORKFLOW", "user_response": "here", "workflow_yaml": "::: not yaml"}
|
|
)
|
|
agent_result = agent_module._translate_to_agent_result(
|
|
result, ctx, global_llm_context=None, chat_request=_chat_request(), organization_id="org-1"
|
|
)
|
|
|
|
assert ctx.last_workflow is tested_wf
|
|
assert ctx.last_workflow_yaml == "tested: yaml"
|
|
assert ctx.last_test_ok is True
|
|
assert agent_result.updated_workflow is tested_wf
|
|
assert "validation error" in agent_result.user_response.lower()
|
|
|
|
def test_ask_question_preserves_model_specific_question(self) -> None:
|
|
# The new prompt instructs the model to stop and ASK_QUESTION when it
|
|
# cannot test an edit. Row-3 of _rewrite_failed_test_response would
|
|
# clobber that specific unblocker with "Could you share more context";
|
|
# the resp_type==ASK_QUESTION guard must skip the rewrite.
|
|
from skyvern.forge.sdk.copilot import agent as agent_module
|
|
|
|
ctx = _ctx(
|
|
last_update_block_count=1,
|
|
last_test_ok=None,
|
|
last_workflow=SimpleNamespace(name="drafted"),
|
|
last_workflow_yaml="drafted: yaml",
|
|
)
|
|
specific_question = "I need credentials for site.example — can you link one in Settings?"
|
|
result = _fake_run_result({"type": "ASK_QUESTION", "user_response": specific_question})
|
|
agent_result = agent_module._translate_to_agent_result(
|
|
result, ctx, global_llm_context=None, chat_request=_chat_request(), organization_id="org-1"
|
|
)
|
|
|
|
assert agent_result.user_response == specific_question
|
|
# Even ASK_QUESTION must obey the strict gate — no verified workflow this turn.
|
|
assert agent_result.updated_workflow is None
|
|
assert agent_result.response_type == "ASK_QUESTION"
|
|
|
|
def test_reply_still_rewrites_after_failed_test(self) -> None:
|
|
# Guard rail for the above: a plain REPLY after a failed test must
|
|
# still flow through the "test failed" rewrite so we don't regress
|
|
# the original SKY-9143 behavior.
|
|
from skyvern.forge.sdk.copilot import agent as agent_module
|
|
|
|
ctx = _ctx(
|
|
last_update_block_count=2,
|
|
last_test_ok=False,
|
|
last_test_failure_reason="Failed to navigate to url https://bad.example.",
|
|
last_failure_category_top="NAVIGATION_FAILURE",
|
|
)
|
|
result = _fake_run_result({"type": "REPLY", "user_response": "All done — your workflow is ready."})
|
|
agent_result = agent_module._translate_to_agent_result(
|
|
result, ctx, global_llm_context=None, chat_request=_chat_request(), organization_id="org-1"
|
|
)
|
|
|
|
assert "test failed" in agent_result.user_response.lower()
|
|
assert "All done" not in agent_result.user_response
|
|
assert agent_result.updated_workflow is None
|
|
|
|
|
|
class TestCredentialRefusalReachesAgent:
|
|
"""Prove the SKY-9189 refusal rule is actually delivered to the agent.
|
|
|
|
`run_copilot_agent` constructs the openai-agents SDK `Agent(...)` with
|
|
`instructions=_build_system_prompt(...)` and `tools=list(NATIVE_TOOLS)`.
|
|
A behavior test would require patching the agent loop and is fragile; a
|
|
construction test (rule text flows through the exact helpers the route
|
|
uses) is deterministic and catches both prompt and tool-surface drift.
|
|
"""
|
|
|
|
def test_build_system_prompt_carries_refusal_clause(self) -> None:
|
|
from skyvern.forge.sdk.copilot.agent import _build_system_prompt
|
|
|
|
prompt = _build_system_prompt(tool_usage_guide="", security_rules="")
|
|
|
|
assert "CREDENTIAL HANDLING - CRITICAL" in prompt
|
|
assert "DO NOT PROVIDE RAW LOGIN/PASSWORD" in prompt
|
|
assert "MUST NOT build, update, or run a workflow" in prompt
|
|
assert "redacted from the outbound client stream" not in prompt
|
|
|
|
def test_native_tools_carry_refusal_reference(self) -> None:
|
|
import re
|
|
|
|
from skyvern.forge.sdk.copilot.tools import NATIVE_TOOLS
|
|
|
|
targets = {"run_blocks_and_collect_debug", "update_and_run_blocks"}
|
|
matched = {tool.name for tool in NATIVE_TOOLS if tool.name in targets}
|
|
assert matched == targets, f"missing tools in NATIVE_TOOLS: {targets - matched}"
|
|
|
|
cross_ref = re.compile(r"CREDENTIAL\s+HANDLING refusal rule")
|
|
for tool in NATIVE_TOOLS:
|
|
if tool.name not in targets:
|
|
continue
|
|
desc = tool.description
|
|
assert "redacted from" not in desc, f"{tool.name} still claims redaction"
|
|
assert "you may pass it via" not in desc, f"{tool.name} still permits inline secrets"
|
|
assert cross_ref.search(desc), f"{tool.name} missing refusal cross-reference"
|