Skyvern/tests/unit/test_validation_span_attrs.py

182 lines
7.3 KiB
Python

"""Tests for the ``validation.decision`` / ``validation.reasoning_kind`` span
attributes (SKY-9174, Part D.3).

The two attributes give us a post-merge logfire signal for when a validation
block's LLM reasons literally and/or terminates — the failure mode Part D aims
to reduce. Query shape::

    SELECT COUNT(*) FROM records
    WHERE span_name = 'skyvern.agent.step_body'
      AND attributes->>'validation.decision' = 'terminate'
      AND attributes->>'validation.reasoning_kind' = 'literal'
      AND start_timestamp > now() - INTERVAL '24 hours';

Pre-fix this count should be non-trivial; post-fix it should trend to zero on
the copilot-v2 cohort. These tests cover the attribute-writing logic directly
(the helper is pure, so we don't need to drive the full agent step).
"""
from __future__ import annotations
from datetime import UTC, datetime
import opentelemetry.trace as otel_trace
import pytest
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from skyvern.forge.agent import record_validation_span_attrs
from skyvern.forge.sdk.db.enums import TaskType
from skyvern.webeye.actions.actions import (
Action,
ActionType,
ClickAction,
CompleteAction,
TerminateAction,
)
from tests.unit.helpers import make_organization, make_task
# Name of the span opened by ``_run_with_span``; ``_span_attrs`` uses the same
# name to pick that span out of the exporter's finished spans.
STEP_SPAN_NAME = "skyvern.agent.validation_step_body_fixture"
def _validation_task() -> object:
    """Build a ``TaskType.validation`` task via the shared test helpers.

    One timestamp is taken up front and handed to both ``make_organization``
    and ``make_task``.
    """
    created_at = datetime.now(tz=UTC)
    return make_task(
        created_at,
        make_organization(created_at),
        task_type=TaskType.validation,
    )
def _general_task() -> object:
    """Build a ``TaskType.general`` task via the shared test helpers.

    One timestamp is taken up front and handed to both ``make_organization``
    and ``make_task``.
    """
    created_at = datetime.now(tz=UTC)
    return make_task(
        created_at,
        make_organization(created_at),
        task_type=TaskType.general,
    )
def _run_with_span(task: object, actions: list[Action]) -> dict:
    """Invoke ``record_validation_span_attrs`` inside a span named ``STEP_SPAN_NAME``.

    The span is opened as the current span, passed to the helper together with
    *task* and *actions*, and ended when the ``with`` block exits.

    Returns:
        Always an empty dict. The attributes written to the span are not
        returned here; callers read them back from the exporter.
    """
    fixture_tracer = otel_trace.get_tracer("sky-9174-test")
    with fixture_tracer.start_as_current_span(STEP_SPAN_NAME) as active_span:
        record_validation_span_attrs(active_span, task, actions)
    # Intentionally empty: attributes are fetched from the exporter by the caller.
    return {}
def _span_attrs(span_exporter: InMemorySpanExporter) -> dict:
    """Return a copy of the attributes of the first finished span named ``STEP_SPAN_NAME``.

    Fails with an assertion error when the exporter holds no such span. A span
    whose ``attributes`` is falsy yields an empty dict.
    """
    matching = [
        finished
        for finished in span_exporter.get_finished_spans()
        if finished.name == STEP_SPAN_NAME
    ]
    assert matching, "expected fixture span to be recorded"
    recorded = matching[0].attributes
    return dict(recorded) if recorded else {}
def _complete_action(reasoning: str) -> CompleteAction:
    """Build a ``CompleteAction`` using *reasoning* for both reasoning and intention."""
    fields = {
        "reasoning": reasoning,
        "intention": reasoning,
        "action_type": ActionType.COMPLETE,
    }
    return CompleteAction(**fields)
def _terminate_action(reasoning: str) -> TerminateAction:
    """Build a ``TerminateAction`` using *reasoning* for both reasoning and intention."""
    fields = {
        "reasoning": reasoning,
        "intention": reasoning,
        "action_type": ActionType.TERMINATE,
    }
    return TerminateAction(**fields)
def test_complete_with_semantic_reasoning_records_semantic(span_exporter: InMemorySpanExporter) -> None:
    """A COMPLETE whose reasoning has no literal signal is tagged (complete, semantic)."""
    decision = _complete_action("The current page shows a thank-you confirmation.")
    _run_with_span(_validation_task(), [decision])

    recorded = _span_attrs(span_exporter)
    assert recorded.get("validation.decision") == "complete"
    assert recorded.get("validation.reasoning_kind") == "semantic"
def test_terminate_with_literal_reasoning_records_literal(span_exporter: InMemorySpanExporter) -> None:
    """Key regression case: the LLM terminates because an exact string is missing.

    (terminate, literal) is the combination Part D aims to drive toward zero.
    """
    reasoning = "The page does not contain the exact complete-criterion text 'Your message has been sent'. TERMINATE."
    _run_with_span(_validation_task(), [_terminate_action(reasoning)])

    recorded = _span_attrs(span_exporter)
    assert recorded.get("validation.decision") == "terminate"
    assert recorded.get("validation.reasoning_kind") == "literal"
def test_terminate_with_semantic_reasoning_records_semantic(span_exporter: InMemorySpanExporter) -> None:
    """A TERMINATE whose reasoning has no literal signal is tagged (terminate, semantic)."""
    decision = _terminate_action("An error banner surfaced at the top of the page saying the submission failed.")
    _run_with_span(_validation_task(), [decision])

    recorded = _span_attrs(span_exporter)
    assert recorded.get("validation.decision") == "terminate"
    assert recorded.get("validation.reasoning_kind") == "semantic"
def test_complete_with_literal_reasoning_records_literal(span_exporter: InMemorySpanExporter) -> None:
    """Mirror of the terminate/literal case for COMPLETE.

    A literal COMPLETE is harmless, but it is still tagged because the
    post-merge dashboard looks at the distribution across both axes rather
    than only the terminate one.
    """
    decision = _complete_action("The page contains the exact phrase 'Your message has been sent'.")
    _run_with_span(_validation_task(), [decision])

    recorded = _span_attrs(span_exporter)
    assert recorded.get("validation.decision") == "complete"
    assert recorded.get("validation.reasoning_kind") == "literal"
def test_non_validation_task_does_not_tag_span(span_exporter: InMemorySpanExporter) -> None:
    """Step spans of non-validation tasks must stay untagged.

    The two ``validation.*`` attributes are reserved for ``TaskType.validation``.
    """
    decision = _complete_action("The current page shows a thank-you confirmation.")
    _run_with_span(_general_task(), [decision])

    recorded = _span_attrs(span_exporter)
    for reserved_key in ("validation.decision", "validation.reasoning_kind"):
        assert reserved_key not in recorded
def test_non_decisive_action_does_not_tag_span(span_exporter: InMemorySpanExporter) -> None:
    """A validation task led by a non-Complete/Terminate action stays untagged.

    This is unusual but can happen during partial parsing.
    """
    # ClickAction is used here as a representative non-decisive first action.
    leading_click = ClickAction(action_type=ActionType.CLICK, element_id="AAAB", reasoning="click")
    _run_with_span(_validation_task(), [leading_click])

    recorded = _span_attrs(span_exporter)
    for reserved_key in ("validation.decision", "validation.reasoning_kind"):
        assert reserved_key not in recorded
def test_empty_actions_list_does_not_tag_span(span_exporter: InMemorySpanExporter) -> None:
    """With no actions at all, neither ``validation.*`` attribute is written."""
    _run_with_span(_validation_task(), [])

    recorded = _span_attrs(span_exporter)
    for reserved_key in ("validation.decision", "validation.reasoning_kind"):
        assert reserved_key not in recorded
def test_missing_reasoning_defaults_to_semantic(span_exporter: InMemorySpanExporter) -> None:
    """Empty reasoning must not crash the helper.

    With no literal signal present, the reasoning is expected to be classified
    as semantic. Only the empty-string case is exercised here.
    """
    blank_decision = _complete_action("")
    _run_with_span(_validation_task(), [blank_decision])

    recorded = _span_attrs(span_exporter)
    assert recorded.get("validation.decision") == "complete"
    assert recorded.get("validation.reasoning_kind") == "semantic"
@pytest.mark.parametrize(
    "signal",
    ["exact", "literal", "verbatim", "word-for-word", "word for word"],
)
def test_every_literal_signal_flags_reasoning(signal: str, span_exporter: InMemorySpanExporter) -> None:
    """Any single signal word in the reasoning is enough to classify it as literal."""
    decision = _terminate_action(f"The criterion does not appear {signal} on the page.")
    _run_with_span(_validation_task(), [decision])

    reasoning_kind = _span_attrs(span_exporter).get("validation.reasoning_kind")
    # The signal is the assertion message so a failure names the offending word.
    assert reasoning_kind == "literal", signal