mirror of
https://github.com/Skyvern-AI/skyvern.git
synced 2026-04-28 03:30:10 +00:00
feat: teach script reviewer about page.terminate() with conditional-only rule (SKY-8334) (#5097)
This commit is contained in:
parent
c45d14527e
commit
ef4e68a8a5
3 changed files with 444 additions and 1 deletions
|
|
@ -164,6 +164,7 @@ When no suggested selector is available, build one from the element's `[tag]`, `
|
|||
| `click: Submit` [tag: button] [text: "Submit"] | `await page.click(selector='button:has-text("Submit")', ai='fallback', prompt='Click Submit')` |
|
||||
| `click: Gender option` (no tag/attrs) | `await page.click(ai='fallback', prompt='Select the first gender option')` |
|
||||
| `complete: ...` | `await page.complete()` |
|
||||
| `terminate: ...` | `if <condition>: await page.terminate(errors=["reason"])` — see **Termination Rules** in the Lifecycle section below |
|
||||
|
||||
### Selector priority (most reliable first)
|
||||
1. `#id` — if the element has a stable `id` attribute (not auto-generated hex/numeric IDs)
|
||||
|
|
@ -379,6 +380,13 @@ async def block_fn(page, context):
|
|||
- `await page.complete()` — Mark the task as complete (call at the end)
|
||||
- `await page.wait(timeout_ms=)` — Wait for a duration
|
||||
- `await page.solve_captcha()` — Solve a CAPTCHA on the page
|
||||
- `await page.terminate(errors=["reason"])` — Stop the task when the goal cannot be achieved (see rules below)
|
||||
|
||||
### Termination Rules
|
||||
- `page.terminate()` MUST be inside an `if`/`elif` branch guarded by `page.classify()` or `page.extract()`. NEVER call terminate unconditionally.
|
||||
- After a `page.classify()`, the `else` branch must be `element_fallback()`, not terminate — "unknown state" means "let the agent figure it out", not "give up." After a `page.extract()` with a known set of outcomes, terminate in `else` is acceptable when the else represents a definite failure condition (not an unknown state).
|
||||
- Only use terminate when the navigation goal explicitly describes conditions under which the task should stop (e.g., "if no invoice is available, terminate", "if the account doesn't exist, stop").
|
||||
- Do NOT use terminate as a fallback for "something went wrong" — that's what `element_fallback()` is for.
|
||||
|
||||
### AI Fallback
|
||||
- `await page.element_fallback(navigation_goal="...", max_steps=10)` — Hand off to AI agent
|
||||
|
|
|
|||
|
|
@ -2,8 +2,9 @@ from __future__ import annotations
|
|||
|
||||
import json
|
||||
import re
|
||||
from typing import Literal
|
||||
from typing import Literal, Sequence
|
||||
|
||||
import libcst as cst
|
||||
import structlog
|
||||
|
||||
from skyvern.forge import app
|
||||
|
|
@ -742,6 +743,19 @@ class ScriptReviewer:
|
|||
current_prompt = self._build_retry_prompt(updated_code, regression_error, function_signature)
|
||||
continue
|
||||
|
||||
# Validate bare terminate calls (must be inside if/elif, never unconditional)
|
||||
terminate_error = self._validate_bare_terminate(updated_code)
|
||||
if terminate_error is not None:
|
||||
LOG.warning(
|
||||
"ScriptReviewer: bare terminate detected, retrying",
|
||||
block_label=block_label,
|
||||
attempt=attempt,
|
||||
error=terminate_error,
|
||||
)
|
||||
if attempt < max_attempts:
|
||||
current_prompt = self._build_retry_prompt(updated_code, terminate_error, function_signature)
|
||||
continue
|
||||
|
||||
# Validate no hardcoded parameter values (catch leaked run-specific data)
|
||||
hardcoded_error = self._validate_no_hardcoded_values(updated_code, run_parameter_values)
|
||||
if hardcoded_error is not None:
|
||||
|
|
@ -1217,6 +1231,7 @@ class ScriptReviewer:
|
|||
"scroll": frozenset({"direction", "amount", "selector"}),
|
||||
"keypress": frozenset({"keys", "prompt"}),
|
||||
"wait": frozenset({"timeout_ms"}),
|
||||
"terminate": frozenset({"errors"}),
|
||||
"complete": frozenset(set()),
|
||||
"goto": frozenset({"url", "timeout"}),
|
||||
}
|
||||
|
|
@ -1467,6 +1482,177 @@ class ScriptReviewer:
|
|||
|
||||
return None
|
||||
|
||||
def _validate_bare_terminate(self, code: str) -> str | None:
|
||||
"""Validate that page.terminate() is never called unconditionally.
|
||||
|
||||
Every page.terminate() call must be inside an if/elif branch (guarded by
|
||||
a classify or extract result). A terminate at function-body level (not
|
||||
inside any conditional) is rejected.
|
||||
|
||||
Uses libcst.parse_module to walk the tree and check that every
|
||||
page.terminate() Expr node has a cst.If ancestor.
|
||||
|
||||
Note: the prompt distinguishes classify-else (should use element_fallback)
|
||||
from extract-else (terminate is acceptable). This validator enforces only
|
||||
the structural rule; else-branch semantics are left to the LLM prompt.
|
||||
|
||||
Returns an error message or None if valid.
|
||||
"""
|
||||
# Fast short-circuit: skip libcst parsing when terminate is not present
|
||||
if "terminate" not in code:
|
||||
return None
|
||||
|
||||
try:
|
||||
tree = cst.parse_module(code)
|
||||
except cst.ParserSyntaxError:
|
||||
return None # compile check handles syntax errors separately
|
||||
|
||||
# Walk the CST looking for page.terminate() calls not inside an If node.
|
||||
# Each top-level FunctionDef is validated independently. Nested function
|
||||
# definitions are intentionally not recursed into — generated script blocks
|
||||
# are always a single top-level async function with no inner defs.
|
||||
# libcst uses a single FunctionDef for both sync and async.
|
||||
for stmt in tree.body:
|
||||
func_def = stmt if isinstance(stmt, cst.FunctionDef) else None
|
||||
if func_def is None:
|
||||
continue
|
||||
bare = ScriptReviewer._find_bare_terminate_in_body(func_def.body.body, inside_conditional=False)
|
||||
if bare is not None:
|
||||
return bare
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _find_bare_terminate_in_body(
|
||||
stmts: Sequence[cst.BaseStatement],
|
||||
inside_conditional: bool,
|
||||
) -> str | None:
|
||||
"""Recursively check statements for bare page.terminate() calls.
|
||||
|
||||
Returns an error message if a bare terminate is found, None otherwise.
|
||||
|
||||
Note: the prompt distinguishes classify-else (should use element_fallback)
|
||||
from extract-else (terminate is acceptable). This validator only enforces
|
||||
the structural rule (terminate must be inside *some* conditional). Finer
|
||||
classify-vs-extract else-branch enforcement is left to the LLM prompt.
|
||||
"""
|
||||
|
||||
def _unwrap_body(suite: cst.BaseSuite | cst.Else | cst.Finally | None) -> Sequence[cst.BaseStatement]:
|
||||
"""Extract the statement list from an IndentedBlock, Else, or Finally."""
|
||||
if suite is None:
|
||||
return ()
|
||||
if isinstance(suite, (cst.Else, cst.Finally)):
|
||||
suite = suite.body
|
||||
if isinstance(suite, cst.IndentedBlock):
|
||||
return suite.body
|
||||
return ()
|
||||
|
||||
def _check_bodies(bodies: list[Sequence[cst.BaseStatement]], cond: bool) -> str | None:
|
||||
"""Check multiple statement lists, returning the first error."""
|
||||
for body in bodies:
|
||||
err = ScriptReviewer._find_bare_terminate_in_body(body, inside_conditional=cond)
|
||||
if err:
|
||||
return err
|
||||
return None
|
||||
|
||||
for stmt in stmts:
|
||||
# In libcst, expression statements are wrapped in SimpleStatementLine
|
||||
if isinstance(stmt, cst.SimpleStatementLine):
|
||||
for small_stmt in stmt.body:
|
||||
if isinstance(small_stmt, cst.Expr) and ScriptReviewer._is_terminate_call(small_stmt.value):
|
||||
if not inside_conditional:
|
||||
return (
|
||||
"page.terminate() must be inside an if/elif branch — unconditional terminate rejected"
|
||||
)
|
||||
|
||||
# Recurse into if/elif/else bodies
|
||||
if isinstance(stmt, cst.If):
|
||||
# The if and elif bodies are "inside a conditional".
|
||||
# cst.If.orelse is If (elif) | Else (else) | None.
|
||||
# Collect all branch bodies via the elif/else chain.
|
||||
branch_bodies: list[Sequence[cst.BaseStatement]] = [_unwrap_body(stmt.body)]
|
||||
orelse: cst.If | cst.Else | None = stmt.orelse
|
||||
while orelse is not None:
|
||||
if isinstance(orelse, cst.If):
|
||||
branch_bodies.append(_unwrap_body(orelse.body))
|
||||
orelse = orelse.orelse
|
||||
elif isinstance(orelse, cst.Else):
|
||||
branch_bodies.append(_unwrap_body(orelse.body))
|
||||
orelse = None
|
||||
else:
|
||||
break
|
||||
err = _check_bodies(branch_bodies, cond=True)
|
||||
if err:
|
||||
return err
|
||||
|
||||
elif isinstance(stmt, (cst.For, cst.While)):
|
||||
bodies: list[Sequence[cst.BaseStatement]] = [_unwrap_body(stmt.body)]
|
||||
if stmt.orelse is not None:
|
||||
bodies.append(_unwrap_body(stmt.orelse))
|
||||
err = _check_bodies(bodies, cond=inside_conditional)
|
||||
if err:
|
||||
return err
|
||||
|
||||
elif isinstance(stmt, cst.With):
|
||||
err = _check_bodies([_unwrap_body(stmt.body)], cond=inside_conditional)
|
||||
if err:
|
||||
return err
|
||||
|
||||
elif isinstance(stmt, cst.Try):
|
||||
# except handler bodies inherit inside_conditional from the
|
||||
# enclosing scope (not set to True) — terminate in an except
|
||||
# block is "something went wrong" error handling, which should
|
||||
# use element_fallback, not terminate.
|
||||
handler_bodies = [_unwrap_body(h.body) for h in stmt.handlers]
|
||||
err = _check_bodies(
|
||||
[_unwrap_body(stmt.body), *handler_bodies, _unwrap_body(stmt.orelse), _unwrap_body(stmt.finalbody)],
|
||||
cond=inside_conditional,
|
||||
)
|
||||
if err:
|
||||
return err
|
||||
|
||||
elif isinstance(stmt, cst.TryStar):
|
||||
handler_bodies = [_unwrap_body(h.body) for h in stmt.handlers]
|
||||
err = _check_bodies(
|
||||
[_unwrap_body(stmt.body), *handler_bodies, _unwrap_body(stmt.orelse), _unwrap_body(stmt.finalbody)],
|
||||
cond=inside_conditional,
|
||||
)
|
||||
if err:
|
||||
return err
|
||||
|
||||
# Python 3.10+ match/case — each case body is conditional
|
||||
elif isinstance(stmt, cst.Match):
|
||||
err = _check_bodies([_unwrap_body(case.body) for case in stmt.cases], cond=True)
|
||||
if err:
|
||||
return err
|
||||
|
||||
# FunctionDef stmts are intentionally not recursed into —
|
||||
# generated scripts never contain nested function definitions.
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _is_terminate_call(node: cst.BaseExpression) -> bool:
|
||||
"""Check if a CST expression node is a page.terminate(...) call.
|
||||
|
||||
Handles both ``await page.terminate(...)`` and bare ``page.terminate(...)``.
|
||||
Generated scripts always use ``await``, but we check both defensively.
|
||||
|
||||
The caller passes ``small_stmt.value`` (the inner expression of a
|
||||
``cst.Expr`` small statement), so this method receives a ``cst.Await``
|
||||
or ``cst.Call`` node, never a ``cst.Expr`` wrapper.
|
||||
"""
|
||||
call = node.expression if isinstance(node, cst.Await) else node
|
||||
if not isinstance(call, cst.Call):
|
||||
return False
|
||||
func = call.func
|
||||
return (
|
||||
isinstance(func, cst.Attribute)
|
||||
and isinstance(func.attr, cst.Name)
|
||||
and func.attr.value == "terminate"
|
||||
and isinstance(func.value, cst.Name)
|
||||
and func.value.value == "page"
|
||||
)
|
||||
|
||||
# Regex to extract string literals (single or double quoted, excluding escaped quotes)
|
||||
_STRING_LITERAL_RE: re.Pattern[str] = re.compile(r"""(?<![\\])(['"])((?:(?!\1)[^\\]|\\.)*)(\1)""")
|
||||
|
||||
|
|
|
|||
249
tests/unit/test_script_reviewer_terminate.py
Normal file
249
tests/unit/test_script_reviewer_terminate.py
Normal file
|
|
@ -0,0 +1,249 @@
|
|||
"""Tests for ScriptReviewer bare-terminate validation."""
|
||||
|
||||
from skyvern.services.script_reviewer import ScriptReviewer
|
||||
|
||||
|
||||
class TestValidateBareTerminate:
|
||||
"""Tests for _validate_bare_terminate."""
|
||||
|
||||
def setup_method(self) -> None:
|
||||
self.reviewer = ScriptReviewer()
|
||||
|
||||
def test_terminate_inside_if_passes(self) -> None:
|
||||
"""page.terminate() inside an if block should pass validation."""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
result = await page.extract(
|
||||
prompt='Is there an invoice available?',
|
||||
schema={'type': 'object', 'properties': {'available': {'type': 'boolean'}}}
|
||||
)
|
||||
if not result.get('available', True):
|
||||
await page.terminate(errors=["No invoice available"])
|
||||
else:
|
||||
await page.element_fallback(navigation_goal="Download the invoice")
|
||||
"""
|
||||
assert self.reviewer._validate_bare_terminate(code) is None
|
||||
|
||||
def test_terminate_inside_elif_passes(self) -> None:
|
||||
"""page.terminate() inside an elif block should pass validation."""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
state = await page.classify(
|
||||
options={"has_invoice": "invoice is available", "no_invoice": "no invoice found"},
|
||||
text_patterns={"has_invoice": "Download Invoice", "no_invoice": "No invoices"},
|
||||
)
|
||||
if state == "has_invoice":
|
||||
await page.click(selector='button:has-text("Download")', ai='fallback', prompt='Click download')
|
||||
elif state == "no_invoice":
|
||||
await page.terminate(errors=["No invoice available for this period"])
|
||||
else:
|
||||
await page.element_fallback(navigation_goal="Handle invoice page")
|
||||
"""
|
||||
assert self.reviewer._validate_bare_terminate(code) is None
|
||||
|
||||
def test_bare_terminate_at_function_body_rejected(self) -> None:
|
||||
"""page.terminate() at function body level (unconditional) should be rejected."""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
await page.terminate(errors=["Something went wrong"])
|
||||
"""
|
||||
error = self.reviewer._validate_bare_terminate(code)
|
||||
assert error is not None
|
||||
assert "unconditional terminate rejected" in error
|
||||
assert "if/elif" in error
|
||||
|
||||
def test_bare_terminate_after_other_calls_rejected(self) -> None:
|
||||
"""page.terminate() at function body level after other calls should be rejected."""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
await page.click(selector='button', ai='fallback', prompt='Click button')
|
||||
await page.terminate(errors=["Done but wrong"])
|
||||
"""
|
||||
error = self.reviewer._validate_bare_terminate(code)
|
||||
assert error is not None
|
||||
assert "unconditional terminate rejected" in error
|
||||
|
||||
def test_no_terminate_passes(self) -> None:
|
||||
"""Code without any terminate call should pass."""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
await page.click(selector='button:has-text("Submit")', ai='fallback', prompt='Click submit')
|
||||
await page.complete()
|
||||
"""
|
||||
assert self.reviewer._validate_bare_terminate(code) is None
|
||||
|
||||
def test_terminate_in_nested_if_passes(self) -> None:
|
||||
"""page.terminate() inside a nested if should pass."""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
result = await page.extract(
|
||||
prompt='What is the account status?',
|
||||
schema={'type': 'object', 'properties': {'status': {'type': 'string'}}}
|
||||
)
|
||||
if result.get('status') == 'active':
|
||||
data = await page.extract(
|
||||
prompt='Is there a balance?',
|
||||
schema={'type': 'object', 'properties': {'has_balance': {'type': 'boolean'}}}
|
||||
)
|
||||
if not data.get('has_balance', True):
|
||||
await page.terminate(errors=["No balance to process"])
|
||||
else:
|
||||
await page.click(selector='button:has-text("Pay")', ai='fallback', prompt='Click pay')
|
||||
else:
|
||||
await page.element_fallback(navigation_goal="Handle account status")
|
||||
"""
|
||||
assert self.reviewer._validate_bare_terminate(code) is None
|
||||
|
||||
def test_terminate_in_for_loop_without_conditional_rejected(self) -> None:
|
||||
"""page.terminate() inside a for loop but not in an if should be rejected."""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
for item in items:
|
||||
await page.terminate(errors=["Bad item"])
|
||||
"""
|
||||
error = self.reviewer._validate_bare_terminate(code)
|
||||
assert error is not None
|
||||
assert "unconditional terminate rejected" in error
|
||||
|
||||
def test_terminate_in_for_loop_inside_if_passes(self) -> None:
|
||||
"""page.terminate() inside a for loop AND inside an if should pass."""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
for item in items:
|
||||
if item.get('invalid'):
|
||||
await page.terminate(errors=["Invalid item found"])
|
||||
else:
|
||||
await page.click(selector='button', ai='fallback', prompt='Process item')
|
||||
"""
|
||||
assert self.reviewer._validate_bare_terminate(code) is None
|
||||
|
||||
def test_terminate_in_else_branch_passes(self) -> None:
|
||||
"""page.terminate() inside an else branch still passes (it's inside a conditional).
|
||||
|
||||
This test uses an extract-pattern example. The validator would also pass
|
||||
terminate in a classify-else branch -- classify-vs-extract else-branch
|
||||
semantics are enforced by the LLM prompt, not the structural validator.
|
||||
"""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
result = await page.extract(
|
||||
prompt='Check status',
|
||||
schema={'type': 'object', 'properties': {'ok': {'type': 'boolean'}}}
|
||||
)
|
||||
if result.get('ok'):
|
||||
await page.complete()
|
||||
else:
|
||||
await page.terminate(errors=["Status check failed"])
|
||||
"""
|
||||
assert self.reviewer._validate_bare_terminate(code) is None
|
||||
|
||||
def test_bare_non_awaited_terminate_rejected(self) -> None:
|
||||
"""page.terminate() without await at function body level should still be rejected."""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
page.terminate(errors=["Something went wrong"])
|
||||
"""
|
||||
error = self.reviewer._validate_bare_terminate(code)
|
||||
assert error is not None
|
||||
assert "unconditional terminate rejected" in error
|
||||
|
||||
def test_non_awaited_terminate_inside_if_passes(self) -> None:
|
||||
"""page.terminate() without await but inside an if block should pass."""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
if some_condition:
|
||||
page.terminate(errors=["Condition met"])
|
||||
else:
|
||||
await page.element_fallback(navigation_goal="Handle it")
|
||||
"""
|
||||
assert self.reviewer._validate_bare_terminate(code) is None
|
||||
|
||||
def test_terminate_inside_match_case_passes(self) -> None:
|
||||
"""page.terminate() inside a match/case body should pass (case is conditional)."""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
result = await page.extract(
|
||||
prompt='What is the status?',
|
||||
schema={'type': 'object', 'properties': {'status': {'type': 'string'}}}
|
||||
)
|
||||
match result.get('status'):
|
||||
case 'active':
|
||||
await page.click(selector='button', ai='fallback', prompt='Click button')
|
||||
case 'cancelled':
|
||||
await page.terminate(errors=["Account cancelled — cannot proceed"])
|
||||
case _:
|
||||
await page.element_fallback(navigation_goal="Handle unknown status")
|
||||
"""
|
||||
assert self.reviewer._validate_bare_terminate(code) is None
|
||||
|
||||
def test_terminate_in_bare_try_rejected(self) -> None:
|
||||
"""page.terminate() inside a try block but not in an if should be rejected."""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
try:
|
||||
await page.terminate(errors=["Something failed"])
|
||||
except Exception:
|
||||
await page.element_fallback(navigation_goal="Handle error")
|
||||
"""
|
||||
error = self.reviewer._validate_bare_terminate(code)
|
||||
assert error is not None
|
||||
assert "unconditional terminate rejected" in error
|
||||
|
||||
def test_terminate_in_try_inside_if_passes(self) -> None:
|
||||
"""page.terminate() inside try > if should pass."""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
try:
|
||||
result = await page.extract(
|
||||
prompt='Check status',
|
||||
schema={'type': 'object', 'properties': {'ok': {'type': 'boolean'}}}
|
||||
)
|
||||
if not result.get('ok'):
|
||||
await page.terminate(errors=["Status check failed"])
|
||||
else:
|
||||
await page.complete()
|
||||
except Exception:
|
||||
await page.element_fallback(navigation_goal="Handle error")
|
||||
"""
|
||||
assert self.reviewer._validate_bare_terminate(code) is None
|
||||
|
||||
def test_terminate_in_bare_except_rejected(self) -> None:
|
||||
"""page.terminate() in an except handler (not inside if) should be rejected."""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
try:
|
||||
await page.click(selector='button', ai='fallback', prompt='Click')
|
||||
except Exception:
|
||||
await page.terminate(errors=["Exception occurred"])
|
||||
"""
|
||||
error = self.reviewer._validate_bare_terminate(code)
|
||||
assert error is not None
|
||||
assert "unconditional terminate rejected" in error
|
||||
|
||||
def test_terminate_in_bare_with_rejected(self) -> None:
|
||||
"""page.terminate() inside a with block but not in an if should be rejected."""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
async with some_context_manager() as ctx:
|
||||
await page.terminate(errors=["Bad state"])
|
||||
"""
|
||||
error = self.reviewer._validate_bare_terminate(code)
|
||||
assert error is not None
|
||||
assert "unconditional terminate rejected" in error
|
||||
|
||||
def test_terminate_in_with_inside_if_passes(self) -> None:
|
||||
"""page.terminate() inside with > if should pass."""
|
||||
code = """
|
||||
async def block_fn(page, context):
|
||||
async with some_context_manager() as ctx:
|
||||
if ctx.failed:
|
||||
await page.terminate(errors=["Context failed"])
|
||||
else:
|
||||
await page.complete()
|
||||
"""
|
||||
assert self.reviewer._validate_bare_terminate(code) is None
|
||||
|
||||
def test_syntax_error_code_returns_none(self) -> None:
|
||||
"""Code that doesn't parse should return None (syntax errors handled elsewhere)."""
|
||||
code = "this is not valid python at all {"
|
||||
assert self.reviewer._validate_bare_terminate(code) is None
|
||||
Loading…
Add table
Add a link
Reference in a new issue