mirror of
https://github.com/Skyvern-AI/skyvern.git
synced 2026-04-28 11:40:32 +00:00
911 lines
32 KiB
Python
911 lines
32 KiB
Python
"""Unit tests for the extract-information shadow-mode correctness verification."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
from typing import Any
|
|
|
|
import pytest
|
|
|
|
from skyvern.forge.sdk.cache import extraction_shadow
|
|
|
|
# ---------------------------------------------------------------------------
# compare_results — strict equality
# ---------------------------------------------------------------------------


def test_compare_strict_identical_dicts_match() -> None:
    """Strict comparison reports a match for two field-identical dicts."""
    payload = {"title": "Invoice #123", "total": 42.5}
    outcome = extraction_shadow.compare_results(payload, dict(payload), schema=None)
    assert outcome.match is True
    assert outcome.mode == "strict"
    assert outcome.diff_summary == set()


def test_compare_strict_field_value_mismatch_reports_diff() -> None:
    outcome = extraction_shadow.compare_results(
        {"title": "Invoice #123", "total": 42.5},
        {"title": "Invoice #123", "total": 42.6},
        schema=None,
    )
    assert outcome.match is False
    assert outcome.mode == "strict"
    # The mismatching path must be named in diff_summary...
    assert "total" in outcome.diff_summary
    # ...but the raw values must stay out of it: diff_summary is destined for
    # a log line, so we record *which* path diverged, never the content.
    rendered = str(outcome.diff_summary)
    assert "42.5" not in rendered
    assert "42.6" not in rendered


def test_compare_strict_missing_field_reports_diff() -> None:
    outcome = extraction_shadow.compare_results(
        {"title": "x", "total": 1}, {"title": "x"}, schema=None
    )
    assert outcome.match is False
    assert "total" in outcome.diff_summary


def test_compare_strict_extra_field_reports_diff() -> None:
    outcome = extraction_shadow.compare_results(
        {"title": "x"}, {"title": "x", "extra": True}, schema=None
    )
    assert outcome.match is False
    assert "extra" in outcome.diff_summary


def test_compare_strict_nested_dict_mismatch_reports_path() -> None:
    outcome = extraction_shadow.compare_results(
        {"meta": {"page": 1, "count": 10}},
        {"meta": {"page": 1, "count": 11}},
        schema=None,
    )
    assert outcome.match is False
    # The nested key must surface in some reported path so regressions can be
    # bucketed per field.
    assert any("count" in path for path in outcome.diff_summary)


def test_compare_strict_list_order_matters_without_schema() -> None:
    # Without a schema hint, lists are ordered — a reorder is a mismatch.
    outcome = extraction_shadow.compare_results(
        {"docs": ["a.pdf", "b.pdf"]}, {"docs": ["b.pdf", "a.pdf"]}, schema=None
    )
    assert outcome.match is False


def test_compare_strict_list_identical_order_match() -> None:
    outcome = extraction_shadow.compare_results(
        {"docs": ["a.pdf", "b.pdf"]}, {"docs": ["a.pdf", "b.pdf"]}, schema=None
    )
    assert outcome.match is True


def test_compare_string_result_match() -> None:
    outcome = extraction_shadow.compare_results("hello world", "hello world", schema=None)
    assert outcome.match is True


def test_compare_string_result_mismatch() -> None:
    outcome = extraction_shadow.compare_results("hello world", "hello universe", schema=None)
    assert outcome.match is False
    assert "root" in outcome.diff_summary or "" in outcome.diff_summary


def test_compare_root_list_result_match() -> None:
    """Root-level lists (some extraction schemas produce them) must compare correctly."""
    outcome = extraction_shadow.compare_results(
        [{"id": 1}, {"id": 2}], [{"id": 1}, {"id": 2}], schema=None
    )
    assert outcome.match is True


def test_compare_none_results_match() -> None:
    outcome = extraction_shadow.compare_results(None, None, schema=None)
    assert outcome.match is True


def test_compare_one_none_one_populated_mismatch() -> None:
    outcome = extraction_shadow.compare_results(None, {"a": 1}, schema=None)
    assert outcome.match is False
|
|
|
# ---------------------------------------------------------------------------
# compare_results — semantic list-as-set when schema declares uniqueItems
# ---------------------------------------------------------------------------


def _object_with_string_array(field: str, *, unique: bool) -> dict[str, Any]:
    """Build an object schema with one string-array property, optionally uniqueItems."""
    array_schema: dict[str, Any] = {"type": "array", "items": {"type": "string"}}
    if unique:
        array_schema["uniqueItems"] = True
    return {"type": "object", "properties": {field: array_schema}}


def test_compare_semantic_unique_items_list_order_insensitive() -> None:
    """Reordering a uniqueItems list is a match, not a diff.

    Per the RFC, a fresh extract-information run may return list elements in
    a different order even though the set of items is identical.
    """
    outcome = extraction_shadow.compare_results(
        {"docs": ["a.pdf", "b.pdf"]},
        {"docs": ["b.pdf", "a.pdf"]},
        schema=_object_with_string_array("docs", unique=True),
    )
    assert outcome.match is True
    assert outcome.mode == "semantic"


def test_compare_semantic_unique_items_list_content_mismatch_is_diff() -> None:
    """A genuine content difference — not just order — is still a mismatch."""
    outcome = extraction_shadow.compare_results(
        {"docs": ["a.pdf", "b.pdf"]},
        {"docs": ["a.pdf", "c.pdf"]},
        schema=_object_with_string_array("docs", unique=True),
    )
    assert outcome.match is False
    assert "docs" in outcome.diff_summary


def test_compare_semantic_non_unique_list_still_order_sensitive() -> None:
    """Without uniqueItems we cannot assume set semantics — order still counts."""
    outcome = extraction_shadow.compare_results(
        {"entries": ["a", "b"]},
        {"entries": ["b", "a"]},
        schema=_object_with_string_array("entries", unique=False),
    )
    assert outcome.match is False


def test_compare_semantic_unique_items_list_of_dicts() -> None:
    """Unique-item lists of dicts must compare as sets (hashable via sorted-json)."""
    schema = {
        "type": "object",
        "properties": {
            "items": {"type": "array", "uniqueItems": True, "items": {"type": "object"}},
        },
    }
    outcome = extraction_shadow.compare_results(
        {"items": [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]},
        {"items": [{"id": 2, "name": "b"}, {"id": 1, "name": "a"}]},
        schema=schema,
    )
    assert outcome.match is True


def test_compare_semantic_root_array_with_unique_items_order_insensitive() -> None:
    """A *root-level* uniqueItems array must also get set semantics.

    Regression guard for a bug where _collect_unique_item_paths only recorded
    uniqueItems paths with a non-empty dotted prefix, leaving root arrays
    order-sensitive and inflating the shadow FP metric.
    """
    root_schema = {"type": "array", "uniqueItems": True, "items": {"type": "string"}}
    outcome = extraction_shadow.compare_results(
        ["a.pdf", "b.pdf"], ["b.pdf", "a.pdf"], schema=root_schema
    )
    assert outcome.match is True
    assert outcome.mode == "semantic"


def test_compare_semantic_root_array_with_unique_items_content_mismatch() -> None:
    root_schema = {"type": "array", "uniqueItems": True, "items": {"type": "string"}}
    outcome = extraction_shadow.compare_results(
        ["a.pdf", "b.pdf"], ["a.pdf", "c.pdf"], schema=root_schema
    )
    assert outcome.match is False
|
|
|
# ---------------------------------------------------------------------------
# compare_results — combinator schemas (allOf/anyOf/oneOf)
# ---------------------------------------------------------------------------


def test_compare_semantic_unique_items_inside_all_of_wrapper() -> None:
    """uniqueItems hidden behind an allOf wrapper must still be honored.

    Pydantic wraps array fields in allOf when Field(description=...) is used;
    without combinator traversal those uniqueItems markers would be missed
    and reorder-only diffs would inflate the shadow FP metric for most
    real-world extraction schemas.
    """
    wrapped_ids = {
        "allOf": [{"type": "array", "uniqueItems": True, "items": {"type": "integer"}}],
        "description": "unique identifiers",
    }
    schema = {"type": "object", "properties": {"ids": wrapped_ids}}
    outcome = extraction_shadow.compare_results(
        {"ids": [1, 2, 3]}, {"ids": [3, 2, 1]}, schema=schema
    )
    assert outcome.match is True
    assert outcome.mode == "semantic"


def test_compare_semantic_unique_items_inside_any_of_nullable() -> None:
    """Optional[list[...]] renders as anyOf — uniqueItems inside must still apply."""
    nullable_tags = {
        "anyOf": [
            {"type": "array", "uniqueItems": True, "items": {"type": "string"}},
            {"type": "null"},
        ],
    }
    schema = {"type": "object", "properties": {"tags": nullable_tags}}
    outcome = extraction_shadow.compare_results(
        {"tags": ["a", "b"]}, {"tags": ["b", "a"]}, schema=schema
    )
    assert outcome.match is True


def test_compare_semantic_unique_items_inside_one_of() -> None:
    one_of_vals = {
        "oneOf": [
            {"type": "array", "uniqueItems": True, "items": {"type": "integer"}},
            {"type": "string"},
        ],
    }
    schema = {"type": "object", "properties": {"vals": one_of_vals}}
    outcome = extraction_shadow.compare_results(
        {"vals": [1, 2]}, {"vals": [2, 1]}, schema=schema
    )
    assert outcome.match is True
|
|
|
# ---------------------------------------------------------------------------
# compare_results — $ref resolution for Pydantic-generated schemas
# ---------------------------------------------------------------------------


def test_compare_semantic_unique_items_behind_ref() -> None:
    """uniqueItems inside a $defs entry referenced via $ref must be detected.

    Pydantic places nested models under $defs and points at them with $ref;
    without resolution those uniqueItems would be missed and reorder-only
    diffs would inflate the shadow FP metric.
    """
    schema = {
        "$defs": {
            "Item": {
                "type": "object",
                "properties": {
                    "tags": {"type": "array", "uniqueItems": True, "items": {"type": "string"}},
                },
            },
        },
        "type": "object",
        "properties": {"item": {"$ref": "#/$defs/Item"}},
    }
    outcome = extraction_shadow.compare_results(
        {"item": {"tags": ["a", "b"]}},
        {"item": {"tags": ["b", "a"]}},
        schema=schema,
    )
    assert outcome.match is True


def test_compare_semantic_unique_items_ref_cycle_does_not_skip_siblings() -> None:
    """Hitting a $ref cycle must not short-circuit the node's remaining keys.

    Regression guard: the cycle check used to `return` early, dropping
    sibling traversal (properties/items/combinators) on any node containing
    a `$ref` already on the current expansion path.
    """
    # Both the self-referential "parent" ref and the sibling uniqueItems list
    # must be picked up by the collector.
    schema = {
        "$defs": {
            "Container": {
                "type": "object",
                "properties": {
                    # Self-referential: Container.parent → Container
                    "parent": {"$ref": "#/$defs/Container"},
                    # Sibling that depends on cycle-guard allowing traversal.
                    "ids": {"type": "array", "uniqueItems": True, "items": {"type": "integer"}},
                },
            },
        },
        "$ref": "#/$defs/Container",
    }
    # The only difference is a reorder at ids — must match once uniqueItems
    # is detected despite the cycle.
    outcome = extraction_shadow.compare_results(
        {"ids": [1, 2, 3], "parent": {"ids": [1, 2, 3]}},
        {"ids": [3, 2, 1], "parent": {"ids": [1, 2, 3]}},
        schema=schema,
    )
    assert outcome.match is True


def test_compare_semantic_unique_items_ref_circular_safe() -> None:
    """A circular $ref must not send the collector into infinite recursion."""
    schema = {
        "$defs": {
            "Node": {
                "type": "object",
                "properties": {
                    "children": {
                        "type": "array",
                        "uniqueItems": True,
                        "items": {"$ref": "#/$defs/Node"},
                    },
                },
            },
        },
        "$ref": "#/$defs/Node",
    }
    # Structurally identical trees — should match without blowing the stack.
    outcome = extraction_shadow.compare_results(
        {"children": [{"children": []}, {"children": []}]},
        {"children": [{"children": []}, {"children": []}]},
        schema=schema,
    )
    assert outcome.match is True


def test_compare_semantic_unique_items_external_ref_ignored() -> None:
    """An external $ref (non-#/) is silently skipped — it must never crash."""
    schema = {
        "type": "object",
        "properties": {"ids": {"$ref": "https://example.com/schema.json#/Foo"}},
    }
    # Unresolvable external refs fall back to strict comparison; the property
    # under test is simply that the collector does not raise.
    outcome = extraction_shadow.compare_results({"ids": [1]}, {"ids": [1]}, schema=schema)
    assert outcome.match is True
|
|
|
# ---------------------------------------------------------------------------
# compare_results — bool vs int must be a mismatch (Python treats True == 1)
# ---------------------------------------------------------------------------


def test_compare_bool_vs_int_at_field_is_mismatch() -> None:
    """True vs 1 is a diff — Python's equality collapses them, the cache metric must not."""
    outcome = extraction_shadow.compare_results({"flag": True}, {"flag": 1}, schema=None)
    assert outcome.match is False
    assert "flag" in outcome.diff_summary


def test_compare_bool_vs_int_at_root_is_mismatch() -> None:
    outcome = extraction_shadow.compare_results(True, 1, schema=None)
    assert outcome.match is False


def test_compare_false_vs_zero_is_mismatch() -> None:
    outcome = extraction_shadow.compare_results({"f": False}, {"f": 0}, schema=None)
    assert outcome.match is False


def test_compare_int_vs_float_still_allowed_when_equal() -> None:
    """Equal int and float values still match — a JSON-ism, not a real diff."""
    outcome = extraction_shadow.compare_results({"n": 1}, {"n": 1.0}, schema=None)
    assert outcome.match is True
|
|
|
# ---------------------------------------------------------------------------
# compare_results — uniqueItems set comparison must preserve multiplicity
# ---------------------------------------------------------------------------

# Shared by the multiplicity tests below: object with one uniqueItems string array.
_DOCS_UNIQUE_SCHEMA = {
    "type": "object",
    "properties": {
        "docs": {"type": "array", "uniqueItems": True, "items": {"type": "string"}},
    },
}


def test_compare_semantic_unique_items_preserves_multiplicity() -> None:
    """Set comparison for uniqueItems must not collapse duplicates.

    ['a', 'a'] vs ['a'] differ even though the underlying set is identical —
    treating them as equal would make the FP metric undercount real
    divergences.
    """
    outcome = extraction_shadow.compare_results(
        {"docs": ["a.pdf", "a.pdf"]},
        {"docs": ["a.pdf"]},
        schema=_DOCS_UNIQUE_SCHEMA,
    )
    assert outcome.match is False
    assert "docs" in outcome.diff_summary


def test_compare_semantic_unique_items_multiplicity_match() -> None:
    """The same multiset in a different order still matches."""
    outcome = extraction_shadow.compare_results(
        {"docs": ["a.pdf", "b.pdf", "a.pdf"]},
        {"docs": ["a.pdf", "a.pdf", "b.pdf"]},
        schema=_DOCS_UNIQUE_SCHEMA,
    )
    assert outcome.match is True
|
|
|
# ---------------------------------------------------------------------------
# compare_results — uniqueItems inside array items (nested arrays)
# ---------------------------------------------------------------------------

# Outer "groups" array is ordered; each inner array carries uniqueItems.
_NESTED_GROUPS_SCHEMA = {
    "type": "object",
    "properties": {
        "groups": {
            "type": "array",
            "items": {"type": "array", "uniqueItems": True, "items": {"type": "string"}},
        },
    },
}


def test_compare_semantic_unique_items_nested_inside_array_items() -> None:
    """Inner reordering inside array<array(uniqueItems)> must be a match."""
    outcome = extraction_shadow.compare_results(
        {"groups": [["a", "b"], ["c", "d"]]},
        {"groups": [["b", "a"], ["d", "c"]]},
        schema=_NESTED_GROUPS_SCHEMA,
    )
    assert outcome.match is True
    assert outcome.mode == "semantic"


def test_compare_semantic_outer_array_without_unique_stays_order_sensitive_when_inner_is_unique() -> None:
    """The outer array (no uniqueItems) must not inherit set semantics from inner arrays."""
    # Only the outer order changed — still a diff even though the inner
    # elements are identical sets.
    outcome = extraction_shadow.compare_results(
        {"groups": [["a"], ["b"]]},
        {"groups": [["b"], ["a"]]},
        schema=_NESTED_GROUPS_SCHEMA,
    )
    assert outcome.match is False


def test_compare_semantic_unique_items_preserves_large_int_precision() -> None:
    """Ints above 2**53 must not collapse to the same float in canonical form."""
    schema = {
        "type": "object",
        "properties": {
            "ids": {"type": "array", "uniqueItems": True, "items": {"type": "integer"}},
        },
    }
    # 2**53 + 1 has no exact float64 representation; a naive float(int)
    # conversion would merge these two distinct ids into one value.
    outcome = extraction_shadow.compare_results(
        {"ids": [9007199254740992]},
        {"ids": [9007199254740993]},
        schema=schema,
    )
    assert outcome.match is False
    assert "ids" in outcome.diff_summary


def test_compare_semantic_unique_items_number_array_int_vs_float_match() -> None:
    """A uniqueItems number array treats 1 and 1.0 as equal, matching _diff_paths."""
    schema = {
        "type": "object",
        "properties": {
            "vals": {"type": "array", "uniqueItems": True, "items": {"type": "number"}},
        },
    }
    outcome = extraction_shadow.compare_results(
        {"vals": [1, 2]}, {"vals": [1.0, 2.0]}, schema=schema
    )
    assert outcome.match is True


def test_compare_semantic_unique_items_close_but_unequal_number_array() -> None:
    """One differing numeric value must register as a mismatch under set-equality."""
    schema = {
        "type": "object",
        "properties": {
            "vals": {"type": "array", "uniqueItems": True, "items": {"type": "number"}},
        },
    }
    outcome = extraction_shadow.compare_results(
        {"vals": [1, 2, 3]}, {"vals": [1, 2, 4]}, schema=schema
    )
    assert outcome.match is False
    assert "vals" in outcome.diff_summary


def test_compare_semantic_unique_items_bool_still_distinct_from_int() -> None:
    """True must not equal 1 even inside a uniqueItems array."""
    schema = {
        "type": "object",
        "properties": {"flags": {"type": "array", "uniqueItems": True, "items": {}}},
    }
    outcome = extraction_shadow.compare_results(
        {"flags": [True]}, {"flags": [1]}, schema=schema
    )
    assert outcome.match is False


def test_compare_semantic_unique_items_with_nested_unique_objects_reorder_matches() -> None:
    """Recursive semantic rules must apply *inside* the set-equality comparison.

    Cached and fresh hold identical elements modulo (a) an outer reorder and
    (b) an inner uniqueItems-list reorder — both must be tolerated, so the
    overall result is a match, not just top-level set semantics.
    """
    schema = {
        "type": "object",
        "properties": {
            "items": {
                "type": "array",
                "uniqueItems": True,
                "items": {
                    "type": "object",
                    "properties": {
                        "tags": {"type": "array", "uniqueItems": True, "items": {"type": "string"}},
                    },
                },
            },
        },
    }
    outcome = extraction_shadow.compare_results(
        {"items": [{"tags": ["a", "b"]}, {"tags": ["c", "d"]}]},
        {"items": [{"tags": ["d", "c"]}, {"tags": ["b", "a"]}]},
        schema=schema,
    )
    assert outcome.match is True
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class _DummyLogCapture:
|
|
"""Structlog capture helper — records each call as (event, kwargs)."""
|
|
|
|
def __init__(self) -> None:
|
|
self.calls: list[tuple[str, dict[str, Any]]] = []
|
|
|
|
def debug(self, event: str, **kwargs: Any) -> None:
|
|
self.calls.append((event, kwargs))
|
|
|
|
def info(self, event: str, **kwargs: Any) -> None:
|
|
self.calls.append((event, kwargs))
|
|
|
|
def warning(self, event: str, **kwargs: Any) -> None:
|
|
self.calls.append((event, kwargs))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# run_shadow_comparison — exception sanitization
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_run_shadow_comparison_error_log_does_not_leak_exception_message() -> None:
    """Only the exception *type* may be logged — messages can carry raw LLM payloads."""
    log_sink = _DummyLogCapture()

    async def llm_call() -> Any:
        raise ValueError("SSN: 123-45-6789 leaked from model response")

    await extraction_shadow.run_shadow_comparison(
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"a": 1},
        cached_age_seconds=0.0,
        llm_call=llm_call,
        schema=None,
        logger=log_sink,
    )

    assert len(log_sink.calls) == 1
    _event, payload = log_sink.calls[0]
    rendered = " ".join(str(value) for value in payload.values())
    # Neither the sensitive value nor its label may appear anywhere in the log.
    assert "123-45-6789" not in rendered
    assert "SSN" not in rendered
    # The exception class name alone is safe to log.
    assert "ValueError" in rendered
|
|
|
# ---------------------------------------------------------------------------
|
|
# run_shadow_comparison — background runner
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def _fresh_ok(_result: Any) -> Any:
|
|
return _result
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_run_shadow_comparison_logs_match_event() -> None:
    log_sink = _DummyLogCapture()

    async def llm_call() -> Any:
        return {"docs": ["a.pdf"]}

    await extraction_shadow.run_shadow_comparison(
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"docs": ["a.pdf"]},
        cached_age_seconds=12.3,
        llm_call=llm_call,
        schema=None,
        logger=log_sink,
    )

    assert len(log_sink.calls) == 1
    event_name, payload = log_sink.calls[0]
    assert event_name == "extract_information.shadow_comparison"
    assert payload["status"] == "ok"
    assert payload["cache_key"] == "k1"
    assert payload["workflow_run_id"] == "wfr_1"
    assert payload["match"] is True
    assert payload["cached_age_seconds"] == 12.3
    assert "shadow_duration_ms" in payload
    assert payload["shadow_duration_ms"] >= 0
    assert payload["mode"] == "strict"


@pytest.mark.asyncio
async def test_run_shadow_comparison_logs_mismatch_with_diff() -> None:
    log_sink = _DummyLogCapture()

    async def llm_call() -> Any:
        return {"docs": ["a.pdf", "b.pdf"]}

    await extraction_shadow.run_shadow_comparison(
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"docs": ["a.pdf"]},
        cached_age_seconds=0.0,
        llm_call=llm_call,
        schema=None,
        logger=log_sink,
    )

    assert len(log_sink.calls) == 1
    event_name, payload = log_sink.calls[0]
    assert event_name == "extract_information.shadow_comparison"
    assert payload["match"] is False
    assert payload["diff_summary"]  # non-empty
    assert "docs" in payload["diff_summary"]


@pytest.mark.asyncio
async def test_run_shadow_comparison_swallows_llm_errors() -> None:
    """Shadow runs are best-effort and fire-and-forget: LLM failures must not propagate."""
    log_sink = _DummyLogCapture()

    async def llm_call() -> Any:
        raise RuntimeError("LLM down")

    # This call must return normally despite the failing LLM.
    await extraction_shadow.run_shadow_comparison(
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"a": 1},
        cached_age_seconds=0.0,
        llm_call=llm_call,
        schema=None,
        logger=log_sink,
    )

    assert len(log_sink.calls) == 1
    event_name, payload = log_sink.calls[0]
    # One consolidated event — dashboards filter status=error out of the FP metric.
    assert event_name == "extract_information.shadow_comparison"
    assert payload["status"] == "error"
    assert payload["cache_key"] == "k1"
    assert payload["error_type"] == "RuntimeError"
    assert payload["error_stage"] == "llm_call"


@pytest.mark.asyncio
async def test_run_shadow_comparison_uses_structlog_by_default(caplog: pytest.LogCaptureFixture) -> None:
    """Omitting the logger argument falls back to the module's structlog logger."""

    async def llm_call() -> Any:
        return {"a": 1}

    with caplog.at_level(logging.INFO):
        await extraction_shadow.run_shadow_comparison(
            cache_key="k1",
            workflow_run_id="wfr_1",
            cached_value={"a": 1},
            cached_age_seconds=0.0,
            llm_call=llm_call,
            schema=None,
        )

    # No assertions on caplog content — structlog routing varies by test env.
    # The success criterion is only that the default-logger path raised nothing.
|
|
|
@pytest.mark.asyncio
async def test_schedule_shadow_check_runs_gate_in_background() -> None:
    """The gate must run off the caller's stack, never be awaited inline.

    Regression guard for the P1 where handler.py used to `await` the PostHog
    flag lookup directly, blocking cache-hit returns on the flag provider.
    """
    gate_release = asyncio.Event()
    log_sink = _DummyLogCapture()

    async def slow_gate() -> bool:
        await gate_release.wait()
        return True

    async def llm_call() -> Any:
        return {"a": 1}

    task = extraction_shadow.schedule_shadow_check(
        gate=slow_gate,
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"a": 1},
        cached_age_seconds=0.0,
        llm_call=llm_call,
        schema=None,
        logger=log_sink,
    )

    # Control came back while the gate is still blocked — nothing logged yet.
    assert len(log_sink.calls) == 0

    gate_release.set()
    await task
    assert len(log_sink.calls) == 1
    assert log_sink.calls[0][1]["status"] == "ok"


@pytest.mark.asyncio
async def test_schedule_shadow_check_skips_when_gate_returns_false() -> None:
    log_sink = _DummyLogCapture()

    async def gate() -> bool:
        return False

    async def llm_call() -> Any:  # pragma: no cover — must not be called
        raise AssertionError("LLM should not be called when gate is False")

    task = extraction_shadow.schedule_shadow_check(
        gate=gate,
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"a": 1},
        cached_age_seconds=0.0,
        llm_call=llm_call,
        schema=None,
        logger=log_sink,
    )
    await task
    # Exactly one info log marks the gate evaluating to False — status:skipped
    # joins status:ok/error as the sampling-rate denominator.
    assert len(log_sink.calls) == 1
    event_name, payload = log_sink.calls[0]
    assert event_name == "extract_information.shadow_comparison"
    assert payload["status"] == "skipped"


@pytest.mark.asyncio
async def test_schedule_shadow_check_swallows_gate_errors() -> None:
    log_sink = _DummyLogCapture()

    async def gate() -> bool:
        raise RuntimeError("posthog unavailable")

    async def llm_call() -> Any:  # pragma: no cover — must not be called
        raise AssertionError("LLM should not be called when gate raises")

    task = extraction_shadow.schedule_shadow_check(
        gate=gate,
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"a": 1},
        cached_age_seconds=0.0,
        llm_call=llm_call,
        schema=None,
        logger=log_sink,
    )
    await task  # must not raise
    assert len(log_sink.calls) == 1
    event_name, payload = log_sink.calls[0]
    assert event_name == "extract_information.shadow_comparison"
    assert payload["status"] == "error"
    assert payload["error_stage"] == "gate"
    assert payload["error_type"] == "RuntimeError"
|
|
|
@pytest.mark.asyncio
async def test_schedule_returns_none_and_warns_when_cap_reached(monkeypatch: pytest.MonkeyPatch) -> None:
    """Safety valve: when _PENDING_SHADOW_TASKS is full, schedule must skip and warn.

    Protects the hot path from LLM-provider rate-limit contention when shadow
    tasks pile up (slow provider, sustained cache-hit burst).
    """

    # Fill the pending set with markers whose done() reports False, so
    # _prune_pending() cannot evict them — simulating a genuinely in-flight
    # backlog rather than leaked finished tasks.  (The previous comment said
    # "already-done tasks", contradicting the done() == False marker below.)
    class _PendingMarker:
        def done(self) -> bool:
            return False

    fake_pending: set[Any] = {_PendingMarker() for _ in range(extraction_shadow._MAX_PENDING_SHADOWS)}
    monkeypatch.setattr(extraction_shadow, "_PENDING_SHADOW_TASKS", fake_pending)

    captured = _DummyLogCapture()

    async def llm_call() -> Any:  # pragma: no cover — must not run when capped
        raise AssertionError("shadow LLM must not run when the cap is hit")

    task = extraction_shadow.schedule_shadow_comparison(
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"a": 1},
        cached_age_seconds=0.0,
        llm_call=llm_call,
        schema=None,
        logger=captured,
    )
    # Capped: no task is created, and a single warning event is emitted.
    assert task is None
    assert len(captured.calls) == 1
    event, fields = captured.calls[0]
    assert event == "shadow_task_cap_reached"
    assert fields["pending"] == extraction_shadow._MAX_PENDING_SHADOWS
|
|
|
|
|
@pytest.mark.asyncio
async def test_schedule_shadow_comparison_does_not_block_caller() -> None:
    """schedule_shadow_comparison must return immediately; background task executes after."""
    release = asyncio.Event()

    async def slow_llm_call() -> Any:
        # Block until the test signals, proving the caller isn't awaiting us.
        await release.wait()
        return {"a": 1}

    captured = _DummyLogCapture()

    task = extraction_shadow.schedule_shadow_comparison(
        cache_key="k1",
        workflow_run_id="wfr_1",
        cached_value={"a": 1},
        cached_age_seconds=0.0,
        llm_call=slow_llm_call,
        schema=None,
        logger=captured,
    )

    # Caller got a real task handle back immediately, and nothing has been
    # logged yet.  (Previously this appended `task is not None` to an
    # `observed` list that was never asserted — dead code; assert directly.)
    assert task is not None
    assert len(captured.calls) == 0

    release.set()
    await task
    assert len(captured.calls) == 1
    assert captured.calls[0][1]["match"] is True
|