mirror of
https://github.com/Skyvern-AI/skyvern.git
synced 2026-04-28 11:40:32 +00:00
911 lines
32 KiB
Python
911 lines
32 KiB
Python
"""Unit tests for the extract-information shadow-mode correctness verification."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
from typing import Any
|
|
|
|
import pytest
|
|
|
|
from skyvern.forge.sdk.cache import extraction_shadow
|
|
|
|
# ---------------------------------------------------------------------------
# compare_results — strict equality
# ---------------------------------------------------------------------------


def test_compare_strict_identical_dicts_match() -> None:
    """Strict comparison reports a match for two field-identical dicts."""
    payload = {"title": "Invoice #123", "total": 42.5}
    outcome = extraction_shadow.compare_results(payload, dict(payload), schema=None)
    assert outcome.match is True
    assert outcome.mode == "strict"
    assert outcome.diff_summary == set()


def test_compare_strict_field_value_mismatch_reports_diff() -> None:
    outcome = extraction_shadow.compare_results(
        {"title": "Invoice #123", "total": 42.5},
        {"title": "Invoice #123", "total": 42.6},
        schema=None,
    )
    assert outcome.match is False
    assert outcome.mode == "strict"
    # The mismatching path must be named in diff_summary...
    assert "total" in outcome.diff_summary
    # ...but the raw values must stay out of it: diff_summary is destined for
    # a log line, so we record *which* path diverged, never the content.
    rendered = str(outcome.diff_summary)
    assert "42.5" not in rendered
    assert "42.6" not in rendered


def test_compare_strict_missing_field_reports_diff() -> None:
    outcome = extraction_shadow.compare_results(
        {"title": "x", "total": 1}, {"title": "x"}, schema=None
    )
    assert outcome.match is False
    assert "total" in outcome.diff_summary


def test_compare_strict_extra_field_reports_diff() -> None:
    outcome = extraction_shadow.compare_results(
        {"title": "x"}, {"title": "x", "extra": True}, schema=None
    )
    assert outcome.match is False
    assert "extra" in outcome.diff_summary


def test_compare_strict_nested_dict_mismatch_reports_path() -> None:
    outcome = extraction_shadow.compare_results(
        {"meta": {"page": 1, "count": 10}},
        {"meta": {"page": 1, "count": 11}},
        schema=None,
    )
    assert outcome.match is False
    # The nested key must surface in some reported path so regressions can be
    # bucketed per field.
    assert any("count" in path for path in outcome.diff_summary)


def test_compare_strict_list_order_matters_without_schema() -> None:
    # Without a schema hint, lists are ordered — a reorder is a mismatch.
    outcome = extraction_shadow.compare_results(
        {"docs": ["a.pdf", "b.pdf"]}, {"docs": ["b.pdf", "a.pdf"]}, schema=None
    )
    assert outcome.match is False


def test_compare_strict_list_identical_order_match() -> None:
    outcome = extraction_shadow.compare_results(
        {"docs": ["a.pdf", "b.pdf"]}, {"docs": ["a.pdf", "b.pdf"]}, schema=None
    )
    assert outcome.match is True


def test_compare_string_result_match() -> None:
    outcome = extraction_shadow.compare_results("hello world", "hello world", schema=None)
    assert outcome.match is True


def test_compare_string_result_mismatch() -> None:
    outcome = extraction_shadow.compare_results("hello world", "hello universe", schema=None)
    assert outcome.match is False
    assert "root" in outcome.diff_summary or "" in outcome.diff_summary


def test_compare_root_list_result_match() -> None:
    """Root-level lists (some extraction schemas produce them) must compare correctly."""
    outcome = extraction_shadow.compare_results(
        [{"id": 1}, {"id": 2}], [{"id": 1}, {"id": 2}], schema=None
    )
    assert outcome.match is True


def test_compare_none_results_match() -> None:
    outcome = extraction_shadow.compare_results(None, None, schema=None)
    assert outcome.match is True


def test_compare_one_none_one_populated_mismatch() -> None:
    outcome = extraction_shadow.compare_results(None, {"a": 1}, schema=None)
    assert outcome.match is False
|
|
|
# ---------------------------------------------------------------------------
# compare_results — semantic list-as-set when schema declares uniqueItems
# ---------------------------------------------------------------------------


def _object_with_string_array(field: str, *, unique: bool) -> dict[str, Any]:
    """Build an object schema with one string-array property, optionally uniqueItems."""
    array_schema: dict[str, Any] = {"type": "array", "items": {"type": "string"}}
    if unique:
        array_schema["uniqueItems"] = True
    return {"type": "object", "properties": {field: array_schema}}


def test_compare_semantic_unique_items_list_order_insensitive() -> None:
    """Reordering a uniqueItems list is a match, not a diff.

    Per the RFC, a fresh extract-information run may return list elements in
    a different order even though the set of items is identical.
    """
    outcome = extraction_shadow.compare_results(
        {"docs": ["a.pdf", "b.pdf"]},
        {"docs": ["b.pdf", "a.pdf"]},
        schema=_object_with_string_array("docs", unique=True),
    )
    assert outcome.match is True
    assert outcome.mode == "semantic"


def test_compare_semantic_unique_items_list_content_mismatch_is_diff() -> None:
    """A genuine content difference — not just order — is still a mismatch."""
    outcome = extraction_shadow.compare_results(
        {"docs": ["a.pdf", "b.pdf"]},
        {"docs": ["a.pdf", "c.pdf"]},
        schema=_object_with_string_array("docs", unique=True),
    )
    assert outcome.match is False
    assert "docs" in outcome.diff_summary


def test_compare_semantic_non_unique_list_still_order_sensitive() -> None:
    """Without uniqueItems we cannot assume set semantics — order still counts."""
    outcome = extraction_shadow.compare_results(
        {"entries": ["a", "b"]},
        {"entries": ["b", "a"]},
        schema=_object_with_string_array("entries", unique=False),
    )
    assert outcome.match is False


def test_compare_semantic_unique_items_list_of_dicts() -> None:
    """Unique-item lists of dicts must compare as sets (hashable via sorted-json)."""
    schema = {
        "type": "object",
        "properties": {
            "items": {"type": "array", "uniqueItems": True, "items": {"type": "object"}},
        },
    }
    outcome = extraction_shadow.compare_results(
        {"items": [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]},
        {"items": [{"id": 2, "name": "b"}, {"id": 1, "name": "a"}]},
        schema=schema,
    )
    assert outcome.match is True


def test_compare_semantic_root_array_with_unique_items_order_insensitive() -> None:
    """A *root-level* uniqueItems array must also get set semantics.

    Regression guard for a bug where _collect_unique_item_paths only recorded
    uniqueItems paths with a non-empty dotted prefix, leaving root arrays
    order-sensitive and inflating the shadow FP metric.
    """
    root_schema = {"type": "array", "uniqueItems": True, "items": {"type": "string"}}
    outcome = extraction_shadow.compare_results(
        ["a.pdf", "b.pdf"], ["b.pdf", "a.pdf"], schema=root_schema
    )
    assert outcome.match is True
    assert outcome.mode == "semantic"


def test_compare_semantic_root_array_with_unique_items_content_mismatch() -> None:
    root_schema = {"type": "array", "uniqueItems": True, "items": {"type": "string"}}
    outcome = extraction_shadow.compare_results(
        ["a.pdf", "b.pdf"], ["a.pdf", "c.pdf"], schema=root_schema
    )
    assert outcome.match is False
|
|
|
# ---------------------------------------------------------------------------
# compare_results — combinator schemas (allOf/anyOf/oneOf)
# ---------------------------------------------------------------------------


def test_compare_semantic_unique_items_inside_all_of_wrapper() -> None:
    """uniqueItems hidden behind an allOf wrapper must still be honored.

    Pydantic wraps array fields in allOf when Field(description=...) is used;
    without combinator traversal those uniqueItems markers would be missed
    and reorder-only diffs would inflate the shadow FP metric for most
    real-world extraction schemas.
    """
    wrapped_ids = {
        "allOf": [{"type": "array", "uniqueItems": True, "items": {"type": "integer"}}],
        "description": "unique identifiers",
    }
    schema = {"type": "object", "properties": {"ids": wrapped_ids}}
    outcome = extraction_shadow.compare_results(
        {"ids": [1, 2, 3]}, {"ids": [3, 2, 1]}, schema=schema
    )
    assert outcome.match is True
    assert outcome.mode == "semantic"


def test_compare_semantic_unique_items_inside_any_of_nullable() -> None:
    """Optional[list[...]] renders as anyOf — uniqueItems inside must still apply."""
    nullable_tags = {
        "anyOf": [
            {"type": "array", "uniqueItems": True, "items": {"type": "string"}},
            {"type": "null"},
        ],
    }
    schema = {"type": "object", "properties": {"tags": nullable_tags}}
    outcome = extraction_shadow.compare_results(
        {"tags": ["a", "b"]}, {"tags": ["b", "a"]}, schema=schema
    )
    assert outcome.match is True


def test_compare_semantic_unique_items_inside_one_of() -> None:
    one_of_vals = {
        "oneOf": [
            {"type": "array", "uniqueItems": True, "items": {"type": "integer"}},
            {"type": "string"},
        ],
    }
    schema = {"type": "object", "properties": {"vals": one_of_vals}}
    outcome = extraction_shadow.compare_results(
        {"vals": [1, 2]}, {"vals": [2, 1]}, schema=schema
    )
    assert outcome.match is True
|
|
|
# ---------------------------------------------------------------------------
# compare_results — $ref resolution for Pydantic-generated schemas
# ---------------------------------------------------------------------------


def test_compare_semantic_unique_items_behind_ref() -> None:
    """uniqueItems inside a $defs entry referenced via $ref must be detected.

    Pydantic places nested models under $defs and points at them with $ref;
    without resolution those uniqueItems would be missed and reorder-only
    diffs would inflate the shadow FP metric.
    """
    schema = {
        "$defs": {
            "Item": {
                "type": "object",
                "properties": {
                    "tags": {"type": "array", "uniqueItems": True, "items": {"type": "string"}},
                },
            },
        },
        "type": "object",
        "properties": {"item": {"$ref": "#/$defs/Item"}},
    }
    outcome = extraction_shadow.compare_results(
        {"item": {"tags": ["a", "b"]}},
        {"item": {"tags": ["b", "a"]}},
        schema=schema,
    )
    assert outcome.match is True


def test_compare_semantic_unique_items_ref_cycle_does_not_skip_siblings() -> None:
    """Hitting a $ref cycle must not short-circuit the node's remaining keys.

    Regression guard: the cycle check used to `return` early, dropping
    sibling traversal (properties/items/combinators) on any node containing
    a `$ref` already on the current expansion path.
    """
    # Both the self-referential "parent" ref and the sibling uniqueItems list
    # must be picked up by the collector.
    schema = {
        "$defs": {
            "Container": {
                "type": "object",
                "properties": {
                    # Self-referential: Container.parent → Container
                    "parent": {"$ref": "#/$defs/Container"},
                    # Sibling that depends on cycle-guard allowing traversal.
                    "ids": {"type": "array", "uniqueItems": True, "items": {"type": "integer"}},
                },
            },
        },
        "$ref": "#/$defs/Container",
    }
    # The only difference is a reorder at ids — must match once uniqueItems
    # is detected despite the cycle.
    outcome = extraction_shadow.compare_results(
        {"ids": [1, 2, 3], "parent": {"ids": [1, 2, 3]}},
        {"ids": [3, 2, 1], "parent": {"ids": [1, 2, 3]}},
        schema=schema,
    )
    assert outcome.match is True


def test_compare_semantic_unique_items_ref_circular_safe() -> None:
    """A circular $ref must not send the collector into infinite recursion."""
    schema = {
        "$defs": {
            "Node": {
                "type": "object",
                "properties": {
                    "children": {
                        "type": "array",
                        "uniqueItems": True,
                        "items": {"$ref": "#/$defs/Node"},
                    },
                },
            },
        },
        "$ref": "#/$defs/Node",
    }
    # Structurally identical trees — should match without blowing the stack.
    outcome = extraction_shadow.compare_results(
        {"children": [{"children": []}, {"children": []}]},
        {"children": [{"children": []}, {"children": []}]},
        schema=schema,
    )
    assert outcome.match is True


def test_compare_semantic_unique_items_external_ref_ignored() -> None:
    """An external $ref (non-#/) is silently skipped — it must never crash."""
    schema = {
        "type": "object",
        "properties": {"ids": {"$ref": "https://example.com/schema.json#/Foo"}},
    }
    # Unresolvable external refs fall back to strict comparison; the property
    # under test is simply that the collector does not raise.
    outcome = extraction_shadow.compare_results({"ids": [1]}, {"ids": [1]}, schema=schema)
    assert outcome.match is True
|
|
|
# ---------------------------------------------------------------------------
# compare_results — bool vs int must be a mismatch (Python treats True == 1)
# ---------------------------------------------------------------------------


def test_compare_bool_vs_int_at_field_is_mismatch() -> None:
    """True vs 1 is a diff — Python's equality collapses them, the cache metric must not."""
    outcome = extraction_shadow.compare_results({"flag": True}, {"flag": 1}, schema=None)
    assert outcome.match is False
    assert "flag" in outcome.diff_summary


def test_compare_bool_vs_int_at_root_is_mismatch() -> None:
    outcome = extraction_shadow.compare_results(True, 1, schema=None)
    assert outcome.match is False


def test_compare_false_vs_zero_is_mismatch() -> None:
    outcome = extraction_shadow.compare_results({"f": False}, {"f": 0}, schema=None)
    assert outcome.match is False


def test_compare_int_vs_float_still_allowed_when_equal() -> None:
    """Equal int and float values still match — a JSON-ism, not a real diff."""
    outcome = extraction_shadow.compare_results({"n": 1}, {"n": 1.0}, schema=None)
    assert outcome.match is True
|
|
|
# ---------------------------------------------------------------------------
# compare_results — uniqueItems set comparison must preserve multiplicity
# ---------------------------------------------------------------------------

# Shared by the multiplicity tests below: object with one uniqueItems string array.
_DOCS_UNIQUE_SCHEMA = {
    "type": "object",
    "properties": {
        "docs": {"type": "array", "uniqueItems": True, "items": {"type": "string"}},
    },
}


def test_compare_semantic_unique_items_preserves_multiplicity() -> None:
    """Set comparison for uniqueItems must not collapse duplicates.

    ['a', 'a'] vs ['a'] differ even though the underlying set is identical —
    treating them as equal would make the FP metric undercount real
    divergences.
    """
    outcome = extraction_shadow.compare_results(
        {"docs": ["a.pdf", "a.pdf"]},
        {"docs": ["a.pdf"]},
        schema=_DOCS_UNIQUE_SCHEMA,
    )
    assert outcome.match is False
    assert "docs" in outcome.diff_summary


def test_compare_semantic_unique_items_multiplicity_match() -> None:
    """The same multiset in a different order still matches."""
    outcome = extraction_shadow.compare_results(
        {"docs": ["a.pdf", "b.pdf", "a.pdf"]},
        {"docs": ["a.pdf", "a.pdf", "b.pdf"]},
        schema=_DOCS_UNIQUE_SCHEMA,
    )
    assert outcome.match is True
|
|
|
# ---------------------------------------------------------------------------
# compare_results — uniqueItems inside array items (nested arrays)
# ---------------------------------------------------------------------------

# Outer "groups" array is ordered; each inner array carries uniqueItems.
_NESTED_GROUPS_SCHEMA = {
    "type": "object",
    "properties": {
        "groups": {
            "type": "array",
            "items": {"type": "array", "uniqueItems": True, "items": {"type": "string"}},
        },
    },
}


def test_compare_semantic_unique_items_nested_inside_array_items() -> None:
    """Inner reordering inside array<array(uniqueItems)> must be a match."""
    outcome = extraction_shadow.compare_results(
        {"groups": [["a", "b"], ["c", "d"]]},
        {"groups": [["b", "a"], ["d", "c"]]},
        schema=_NESTED_GROUPS_SCHEMA,
    )
    assert outcome.match is True
    assert outcome.mode == "semantic"


def test_compare_semantic_outer_array_without_unique_stays_order_sensitive_when_inner_is_unique() -> None:
    """The outer array (no uniqueItems) must not inherit set semantics from inner arrays."""
    # Only the outer order changed — still a diff even though the inner
    # elements are identical sets.
    outcome = extraction_shadow.compare_results(
        {"groups": [["a"], ["b"]]},
        {"groups": [["b"], ["a"]]},
        schema=_NESTED_GROUPS_SCHEMA,
    )
    assert outcome.match is False


def test_compare_semantic_unique_items_preserves_large_int_precision() -> None:
    """Ints above 2**53 must not collapse to the same float in canonical form."""
    schema = {
        "type": "object",
        "properties": {
            "ids": {"type": "array", "uniqueItems": True, "items": {"type": "integer"}},
        },
    }
    # 2**53 + 1 has no exact float64 representation; a naive float(int)
    # conversion would merge these two distinct ids into one value.
    outcome = extraction_shadow.compare_results(
        {"ids": [9007199254740992]},
        {"ids": [9007199254740993]},
        schema=schema,
    )
    assert outcome.match is False
    assert "ids" in outcome.diff_summary


def test_compare_semantic_unique_items_number_array_int_vs_float_match() -> None:
    """A uniqueItems number array treats 1 and 1.0 as equal, matching _diff_paths."""
    schema = {
        "type": "object",
        "properties": {
            "vals": {"type": "array", "uniqueItems": True, "items": {"type": "number"}},
        },
    }
    outcome = extraction_shadow.compare_results(
        {"vals": [1, 2]}, {"vals": [1.0, 2.0]}, schema=schema
    )
    assert outcome.match is True


def test_compare_semantic_unique_items_close_but_unequal_number_array() -> None:
    """One differing numeric value must register as a mismatch under set-equality."""
    schema = {
        "type": "object",
        "properties": {
            "vals": {"type": "array", "uniqueItems": True, "items": {"type": "number"}},
        },
    }
    outcome = extraction_shadow.compare_results(
        {"vals": [1, 2, 3]}, {"vals": [1, 2, 4]}, schema=schema
    )
    assert outcome.match is False
    assert "vals" in outcome.diff_summary


def test_compare_semantic_unique_items_bool_still_distinct_from_int() -> None:
    """True must not equal 1 even inside a uniqueItems array."""
    schema = {
        "type": "object",
        "properties": {"flags": {"type": "array", "uniqueItems": True, "items": {}}},
    }
    outcome = extraction_shadow.compare_results(
        {"flags": [True]}, {"flags": [1]}, schema=schema
    )
    assert outcome.match is False


def test_compare_semantic_unique_items_with_nested_unique_objects_reorder_matches() -> None:
    """Recursive semantic rules must apply *inside* the set-equality comparison.

    Cached and fresh hold identical elements modulo (a) an outer reorder and
    (b) an inner uniqueItems-list reorder — both must be tolerated, so the
    overall result is a match, not just top-level set semantics.
    """
    schema = {
        "type": "object",
        "properties": {
            "items": {
                "type": "array",
                "uniqueItems": True,
                "items": {
                    "type": "object",
                    "properties": {
                        "tags": {"type": "array", "uniqueItems": True, "items": {"type": "string"}},
                    },
                },
            },
        },
    }
    outcome = extraction_shadow.compare_results(
        {"items": [{"tags": ["a", "b"]}, {"tags": ["c", "d"]}]},
        {"items": [{"tags": ["d", "c"]}, {"tags": ["b", "a"]}]},
        schema=schema,
    )
    assert outcome.match is True
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class _DummyLogCapture:
|
|
"""Structlog capture helper — records each call as (event, kwargs)."""
|
|
|
|
def __init__(self) -> None:
|
|
self.calls: list[tuple[str, dict[str, Any]]] = []
|
|
|
|
def debug(self, event: str, **kwargs: Any) -> None:
|
|
self.calls.append((event, kwargs))
|
|
|
|
def info(self, event: str, **kwargs: Any) -> None:
|
|
self.calls.append((event, kwargs))
|
|
|
|
def warning(self, event: str, **kwargs: Any) -> None:
|
|
self.calls.append((event, kwargs))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# run_shadow_comparison — exception sanitization
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_run_shadow_comparison_error_log_does_not_leak_exception_message() -> None:
    """Only the exception *type* may be logged — messages can carry raw LLM payloads."""
    log_sink = _DummyLogCapture()

    async def llm_call() -> Any:
        raise ValueError("SSN: 123-45-6789 leaked from model response")

    await extraction_shadow.run_shadow_comparison(
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"a": 1},
        cached_age_seconds=0.0,
        llm_call=llm_call,
        schema=None,
        logger=log_sink,
    )

    assert len(log_sink.calls) == 1
    _event, payload = log_sink.calls[0]
    rendered = " ".join(str(value) for value in payload.values())
    # Neither the sensitive value nor its label may appear anywhere in the log.
    assert "123-45-6789" not in rendered
    assert "SSN" not in rendered
    # The exception class name alone is safe to log.
    assert "ValueError" in rendered
|
|
|
# ---------------------------------------------------------------------------
|
|
# run_shadow_comparison — background runner
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def _fresh_ok(_result: Any) -> Any:
|
|
return _result
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_run_shadow_comparison_logs_match_event() -> None:
    log_sink = _DummyLogCapture()

    async def llm_call() -> Any:
        return {"docs": ["a.pdf"]}

    await extraction_shadow.run_shadow_comparison(
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"docs": ["a.pdf"]},
        cached_age_seconds=12.3,
        llm_call=llm_call,
        schema=None,
        logger=log_sink,
    )

    assert len(log_sink.calls) == 1
    event_name, payload = log_sink.calls[0]
    assert event_name == "extract_information.shadow_comparison"
    assert payload["status"] == "ok"
    assert payload["cache_key"] == "k1"
    assert payload["workflow_run_id"] == "wfr_1"
    assert payload["match"] is True
    assert payload["cached_age_seconds"] == 12.3
    assert "shadow_duration_ms" in payload
    assert payload["shadow_duration_ms"] >= 0
    assert payload["mode"] == "strict"


@pytest.mark.asyncio
async def test_run_shadow_comparison_logs_mismatch_with_diff() -> None:
    log_sink = _DummyLogCapture()

    async def llm_call() -> Any:
        return {"docs": ["a.pdf", "b.pdf"]}

    await extraction_shadow.run_shadow_comparison(
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"docs": ["a.pdf"]},
        cached_age_seconds=0.0,
        llm_call=llm_call,
        schema=None,
        logger=log_sink,
    )

    assert len(log_sink.calls) == 1
    event_name, payload = log_sink.calls[0]
    assert event_name == "extract_information.shadow_comparison"
    assert payload["match"] is False
    assert payload["diff_summary"]  # non-empty
    assert "docs" in payload["diff_summary"]


@pytest.mark.asyncio
async def test_run_shadow_comparison_swallows_llm_errors() -> None:
    """Shadow runs are best-effort and fire-and-forget: LLM failures must not propagate."""
    log_sink = _DummyLogCapture()

    async def llm_call() -> Any:
        raise RuntimeError("LLM down")

    # This call must return normally despite the failing LLM.
    await extraction_shadow.run_shadow_comparison(
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"a": 1},
        cached_age_seconds=0.0,
        llm_call=llm_call,
        schema=None,
        logger=log_sink,
    )

    assert len(log_sink.calls) == 1
    event_name, payload = log_sink.calls[0]
    # One consolidated event — dashboards filter status=error out of the FP metric.
    assert event_name == "extract_information.shadow_comparison"
    assert payload["status"] == "error"
    assert payload["cache_key"] == "k1"
    assert payload["error_type"] == "RuntimeError"
    assert payload["error_stage"] == "llm_call"


@pytest.mark.asyncio
async def test_run_shadow_comparison_uses_structlog_by_default(caplog: pytest.LogCaptureFixture) -> None:
    """Omitting the logger argument falls back to the module's structlog logger."""

    async def llm_call() -> Any:
        return {"a": 1}

    with caplog.at_level(logging.INFO):
        await extraction_shadow.run_shadow_comparison(
            cache_key="k1",
            workflow_run_id="wfr_1",
            cached_value={"a": 1},
            cached_age_seconds=0.0,
            llm_call=llm_call,
            schema=None,
        )

    # No assertions on caplog content — structlog routing varies by test env.
    # The success criterion is only that the default-logger path raised nothing.
|
|
|
@pytest.mark.asyncio
async def test_schedule_shadow_check_runs_gate_in_background() -> None:
    """The gate must run off the caller's stack, never be awaited inline.

    Regression guard for the P1 where handler.py used to `await` the PostHog
    flag lookup directly, blocking cache-hit returns on the flag provider.
    """
    gate_release = asyncio.Event()
    log_sink = _DummyLogCapture()

    async def slow_gate() -> bool:
        await gate_release.wait()
        return True

    async def llm_call() -> Any:
        return {"a": 1}

    task = extraction_shadow.schedule_shadow_check(
        gate=slow_gate,
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"a": 1},
        cached_age_seconds=0.0,
        llm_call=llm_call,
        schema=None,
        logger=log_sink,
    )

    # Control came back while the gate is still blocked — nothing logged yet.
    assert len(log_sink.calls) == 0

    gate_release.set()
    await task
    assert len(log_sink.calls) == 1
    assert log_sink.calls[0][1]["status"] == "ok"


@pytest.mark.asyncio
async def test_schedule_shadow_check_skips_when_gate_returns_false() -> None:
    log_sink = _DummyLogCapture()

    async def gate() -> bool:
        return False

    async def llm_call() -> Any:  # pragma: no cover — must not be called
        raise AssertionError("LLM should not be called when gate is False")

    task = extraction_shadow.schedule_shadow_check(
        gate=gate,
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"a": 1},
        cached_age_seconds=0.0,
        llm_call=llm_call,
        schema=None,
        logger=log_sink,
    )
    await task
    # Exactly one info log marks the gate evaluating to False — status:skipped
    # joins status:ok/error as the sampling-rate denominator.
    assert len(log_sink.calls) == 1
    event_name, payload = log_sink.calls[0]
    assert event_name == "extract_information.shadow_comparison"
    assert payload["status"] == "skipped"


@pytest.mark.asyncio
async def test_schedule_shadow_check_swallows_gate_errors() -> None:
    log_sink = _DummyLogCapture()

    async def gate() -> bool:
        raise RuntimeError("posthog unavailable")

    async def llm_call() -> Any:  # pragma: no cover — must not be called
        raise AssertionError("LLM should not be called when gate raises")

    task = extraction_shadow.schedule_shadow_check(
        gate=gate,
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"a": 1},
        cached_age_seconds=0.0,
        llm_call=llm_call,
        schema=None,
        logger=log_sink,
    )
    await task  # must not raise
    assert len(log_sink.calls) == 1
    event_name, payload = log_sink.calls[0]
    assert event_name == "extract_information.shadow_comparison"
    assert payload["status"] == "error"
    assert payload["error_stage"] == "gate"
    assert payload["error_type"] == "RuntimeError"
|
|
|
@pytest.mark.asyncio
async def test_schedule_returns_none_and_warns_when_cap_reached(monkeypatch: pytest.MonkeyPatch) -> None:
    """Safety valve: when _PENDING_SHADOW_TASKS is full, schedule must skip and warn.

    Protects the hot path from LLM-provider rate-limit contention when shadow
    tasks pile up (slow provider, sustained cache-hit burst).
    """

    # Fill the pending set with markers whose done() reports False, so
    # _prune_pending() cannot evict them — simulating a genuinely in-flight
    # backlog rather than leaked finished tasks.  (The previous comment said
    # "already-done tasks", contradicting the done() == False marker below.)
    class _PendingMarker:
        def done(self) -> bool:
            return False

    fake_pending: set[Any] = {_PendingMarker() for _ in range(extraction_shadow._MAX_PENDING_SHADOWS)}
    monkeypatch.setattr(extraction_shadow, "_PENDING_SHADOW_TASKS", fake_pending)

    captured = _DummyLogCapture()

    async def llm_call() -> Any:  # pragma: no cover — must not run when capped
        raise AssertionError("shadow LLM must not run when the cap is hit")

    task = extraction_shadow.schedule_shadow_comparison(
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"a": 1},
        cached_age_seconds=0.0,
        llm_call=llm_call,
        schema=None,
        logger=captured,
    )
    # Capped: no task is created, and a single warning event is emitted.
    assert task is None
    assert len(captured.calls) == 1
    event, fields = captured.calls[0]
    assert event == "shadow_task_cap_reached"
    assert fields["pending"] == extraction_shadow._MAX_PENDING_SHADOWS
|
|
|
|
|
@pytest.mark.asyncio
async def test_schedule_shadow_comparison_does_not_block_caller() -> None:
    """schedule_shadow_comparison must return immediately; background task executes after."""
    release = asyncio.Event()

    async def slow_llm_call() -> Any:
        # Block until the test signals, proving the caller isn't awaiting us.
        await release.wait()
        return {"a": 1}

    captured = _DummyLogCapture()

    task = extraction_shadow.schedule_shadow_comparison(
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"a": 1},
        cached_age_seconds=0.0,
        llm_call=slow_llm_call,
        schema=None,
        logger=captured,
    )

    # Caller got a real task handle back immediately, and nothing has been
    # logged yet.  (Previously this appended `task is not None` to an
    # `observed` list that was never asserted — dead code; assert directly.)
    assert task is not None
    assert len(captured.calls) == 0

    release.set()
    await task
    assert len(captured.calls) == 1
    assert captured.calls[0][1]["match"] is True
|