Skyvern/tests/unit/test_extraction_cache.py

"""Unit tests for the extract-information result cache."""

from __future__ import annotations

import pytest

from skyvern.forge.sdk.cache import extraction_cache


@pytest.fixture(autouse=True)
def _reset_cache() -> None:
    extraction_cache._reset_for_tests()
    yield
    extraction_cache._reset_for_tests()


def _key(**overrides: object) -> str:
    defaults: dict[str, object] = {
        "call_path": "test",
        "element_tree": "<html><body>docs</body></html>",
        "extracted_text": "Document list",
        "current_url": "https://example.com/docs",
        "data_extraction_goal": "Extract document list",
        "extracted_information_schema": {"type": "object", "properties": {"docs": {"type": "array"}}},
        "navigation_payload": {"user": "alice"},
    }
    defaults.update(overrides)
    return extraction_cache.compute_cache_key(**defaults)  # type: ignore[arg-type]


def test_identical_inputs_produce_identical_key() -> None:
    assert _key() == _key()


def test_key_changes_when_element_tree_changes() -> None:
    assert _key() != _key(element_tree="<html><body>different</body></html>")


def test_key_changes_when_schema_changes() -> None:
    assert _key() != _key(extracted_information_schema={"type": "object", "properties": {}})


def test_key_changes_when_extracted_text_changes() -> None:
    assert _key() != _key(extracted_text="Something else entirely")


def test_key_changes_when_url_changes() -> None:
    assert _key() != _key(current_url="https://example.com/other")


def test_key_changes_when_error_code_mapping_changes() -> None:
    # RFC review: error_code_mapping is rendered into the prompt,
    # so it must be part of the key.
    assert _key(error_code_mapping={"E1": "oops"}) != _key(error_code_mapping={"E1": "different"})


def test_key_changes_when_previous_extracted_information_changes() -> None:
    # RFC review: previous_extracted_information is rendered into the prompt as
    # prior context. In a loop where each iteration is a fresh task so
    # this is None on step 1 — the cross-iteration cache hits still land —
    # but if an intra-task second-step extraction happens, the key must change.
    assert _key(previous_extracted_information=None) != _key(previous_extracted_information={"prior": "value"})


def test_key_changes_when_llm_key_changes() -> None:
    # RFC review: include llm_key so swapping models forces a fresh extraction
    # once this cache is backed by an off-process store.
    assert _key(llm_key="gpt-4o") != _key(llm_key="claude-sonnet-4-6")


def test_key_is_stable_across_equivalent_schema_dict_orderings() -> None:
    schema_a = {"type": "object", "properties": {"a": {"type": "string"}, "b": {"type": "string"}}}
    schema_b = {"properties": {"b": {"type": "string"}, "a": {"type": "string"}}, "type": "object"}
    assert _key(extracted_information_schema=schema_a) == _key(extracted_information_schema=schema_b)


def test_lookup_returns_miss_on_empty_cache() -> None:
    result = extraction_cache.lookup("wfr_1", _key())
    assert result.hit is False
    assert result.value is None
    assert result.age_seconds is None
    assert result.fallback_reason == extraction_cache.FALLBACK_FIRST_CALL_IN_RUN
    assert result.scope == extraction_cache.SCOPE_RUN


def test_store_then_lookup_returns_hit_with_age() -> None:
    key = _key()
    extraction_cache.store("wfr_1", key, {"docs": ["a.pdf"]})
    result = extraction_cache.lookup("wfr_1", key)
    assert result.hit is True
    assert result.value == {"docs": ["a.pdf"]}
    assert result.age_seconds is not None
    assert result.age_seconds >= 0.0
    assert result.fallback_reason is None
    assert result.scope == extraction_cache.SCOPE_RUN


def test_lookup_returns_key_not_found_when_run_exists_but_key_does_not() -> None:
    """A run with other entries but missing this key must report key_not_found,
    not first_call_in_run — downstream metrics use this split to distinguish
    unavoidable first-call misses from potential normalization opportunities."""
    extraction_cache.store("wfr_1", _key(current_url="https://example.com/A"), {"a": 1})
    result = extraction_cache.lookup("wfr_1", _key(current_url="https://example.com/B"))
    assert result.hit is False
    assert result.value is None
    assert result.fallback_reason == extraction_cache.FALLBACK_KEY_NOT_FOUND


def test_cache_is_isolated_per_workflow_run_id() -> None:
    key = _key()
    extraction_cache.store("wfr_1", key, {"docs": ["a.pdf"]})
    result = extraction_cache.lookup("wfr_2", key)
    assert result.hit is False
    assert result.fallback_reason == extraction_cache.FALLBACK_FIRST_CALL_IN_RUN


def test_empty_workflow_run_id_bypasses_cache() -> None:
    key = _key()
    extraction_cache.store(None, key, {"docs": ["a.pdf"]})
    assert extraction_cache.lookup(None, key) is None


def test_clear_workflow_run_drops_entries() -> None:
    key = _key()
    extraction_cache.store("wfr_1", key, {"docs": ["a.pdf"]})
    extraction_cache.clear_workflow_run("wfr_1")
    assert extraction_cache.lookup("wfr_1", key).hit is False


def test_fifo_eviction_when_run_cache_is_full() -> None:
    # Insert MAX + 1 distinct entries; the oldest should be evicted.
    max_entries = extraction_cache._MAX_ENTRIES_PER_RUN
    first_key = _key(current_url="https://example.com/0")
    extraction_cache.store("wfr_1", first_key, {"i": 0})
    for i in range(1, max_entries + 1):
        k = _key(current_url=f"https://example.com/{i}")
        extraction_cache.store("wfr_1", k, {"i": i})

    assert extraction_cache.lookup("wfr_1", first_key).hit is False
    last_key = _key(current_url=f"https://example.com/{max_entries}")
    last_result = extraction_cache.lookup("wfr_1", last_key)
    assert last_result.hit is True
    assert last_result.value == {"i": max_entries}


def test_store_and_lookup_list_result() -> None:
    """Extraction schemas with array roots produce list results — these must be cached too."""
    key = _key()
    extraction_cache.store("wfr_1", key, [{"doc": "a.pdf"}, {"doc": "b.pdf"}])
    result = extraction_cache.lookup("wfr_1", key)
    assert result.hit is True
    assert result.value == [{"doc": "a.pdf"}, {"doc": "b.pdf"}]


def test_store_and_lookup_string_result() -> None:
    """Some extractions return a plain string — these must be cached too."""
    key = _key()
    extraction_cache.store("wfr_1", key, "plain text extraction")
    result = extraction_cache.lookup("wfr_1", key)
    assert result.hit is True
    assert result.value == "plain text extraction"


def test_lookup_age_seconds_is_monotonic_delta(monkeypatch: pytest.MonkeyPatch) -> None:
    """age_seconds should reflect elapsed time between store() and lookup()."""
    fake_now = [1_000.0]

    def _fake_monotonic() -> float:
        return fake_now[0]

    monkeypatch.setattr(extraction_cache.time, "monotonic", _fake_monotonic)

    key = _key()
    extraction_cache.store("wfr_1", key, {"docs": []})

    fake_now[0] = 1_012.5
    result = extraction_cache.lookup("wfr_1", key)
    assert result.hit is True
    assert result.age_seconds == pytest.approx(12.5)


def test_invalidate_key_drops_single_entry() -> None:
    """Per-key eviction leaves sibling entries intact. Used by the retry self-heal path."""
    key_a = _key()
    key_b = _key(current_url="https://example.com/other")
    extraction_cache.store("wfr_1", key_a, {"v": "a"})
    extraction_cache.store("wfr_1", key_b, {"v": "b"})

    removed = extraction_cache.invalidate_key("wfr_1", key_a)
    assert removed is True
    assert extraction_cache.lookup("wfr_1", key_a).hit is False
    # Sibling entry must survive — invalidate is per-key, not per-run.
    hit_b = extraction_cache.lookup("wfr_1", key_b)
    assert hit_b.hit is True
    assert hit_b.value == {"v": "b"}


def test_invalidate_key_returns_false_for_unknown_key() -> None:
    extraction_cache.store("wfr_1", _key(), {"v": "a"})
    assert extraction_cache.invalidate_key("wfr_1", "nonexistent-key") is False


def test_invalidate_key_returns_false_for_unknown_workflow_run() -> None:
    assert extraction_cache.invalidate_key("wfr_missing", _key()) is False


def test_invalidate_key_returns_false_for_empty_workflow_run_id() -> None:
    """Falsy workflow_run_id is a no-op, matching the store/lookup contract."""
    assert extraction_cache.invalidate_key(None, _key()) is False
    assert extraction_cache.invalidate_key("", _key()) is False


def test_compute_cache_key_rejects_legacy_local_datetime_kwarg() -> None:
    """``local_datetime`` was dropped from the signature (SKY-8873): content
    hash alone defines cache identity, so callers that still try to pass it
    must fail loudly rather than silently producing a key that happens to be
    stable-for-the-wrong-reason."""
    with pytest.raises(TypeError):
        extraction_cache.compute_cache_key(call_path="test", local_datetime="2026-04-10T00:00:00")  # type: ignore[call-arg]


def test_none_and_empty_string_produce_different_keys() -> None:
    """None and '' are distinct states and must not collide in the cache key."""
    assert _key(extracted_text=None) != _key(extracted_text="")
    assert _key(current_url=None) != _key(current_url="")
    assert _key(data_extraction_goal=None) != _key(data_extraction_goal="")


def test_lookup_refreshes_lru_position() -> None:
    """A cache hit should refresh the run's LRU position, preventing eviction."""
    max_runs = extraction_cache._MAX_WORKFLOW_RUNS
    key = _key()

    # Fill the global cache to capacity: wfr_oldest first, then wfr_1..wfr_(N-1).
    extraction_cache.store("wfr_oldest", key, {"v": 0})
    for i in range(1, max_runs):
        extraction_cache.store(f"wfr_{i}", key, {"v": i})

    # Cache is at capacity. wfr_oldest is the LRU candidate.
    # A lookup() hit should refresh its position to most-recent.
    refreshed = extraction_cache.lookup("wfr_oldest", key)
    assert refreshed.hit is True
    assert refreshed.value == {"v": 0}

    # Adding one more run triggers eviction. Without the LRU refresh,
    # wfr_oldest would be evicted; with it, wfr_1 (now the oldest) goes.
    extraction_cache.store("wfr_new", key, {"v": 999})

    oldest_after = extraction_cache.lookup("wfr_oldest", key)
    assert oldest_after.hit is True
    assert oldest_after.value == {"v": 0}
    assert extraction_cache.lookup("wfr_1", key).hit is False  # evicted


# ---------------------------------------------------------------------------
# _canonical_url primitive
# ---------------------------------------------------------------------------


class TestCanonicalUrl:
    def test_returns_none_for_none(self) -> None:
        assert extraction_cache._canonical_url(None) is None

    def test_returns_empty_for_empty(self) -> None:
        assert extraction_cache._canonical_url("") == ""

    def test_leaves_simple_url_unchanged(self) -> None:
        assert extraction_cache._canonical_url("https://example.com/docs") == "https://example.com/docs"

    def test_sorts_query_params_by_key(self) -> None:
        assert extraction_cache._canonical_url("https://x/y?b=2&a=1") == "https://x/y?a=1&b=2"

    def test_redacts_nonce_param_values_and_preserves_keys(self) -> None:
        """Nonce values are replaced with a sentinel but keys are preserved so
        presence/absence of the param still differentiates cache keys.
        """
        out = extraction_cache._canonical_url("https://x/y?a=1&_csrf=abc&b=2")
        assert "_csrf=__NONCE__" in out
        assert "a=1" in out and "b=2" in out

    def test_same_nonce_key_different_values_produce_same_canonical(self) -> None:
        """Two URLs that differ only in a nonce value must hash identically."""
        a = extraction_cache._canonical_url("https://x/y?_csrf=abc")
        b = extraction_cache._canonical_url("https://x/y?_csrf=xyz")
        assert a == b

    def test_nonce_key_absent_vs_present_produce_different_canonical(self) -> None:
        """A URL with the nonce key absent must canonicalize differently than one with the key present."""
        with_nonce = extraction_cache._canonical_url("https://x/y?_csrf=abc")
        without = extraction_cache._canonical_url("https://x/y")
        assert with_nonce != without

    def test_empty_nonce_value_does_not_collide_with_populated_value(self) -> None:
        """`?_csrf=` (empty) must canonicalize differently than `?_csrf=abc`."""
        empty = extraction_cache._canonical_url("https://x/y?_csrf=")
        populated = extraction_cache._canonical_url("https://x/y?_csrf=abc")
        assert empty != populated
        assert "_csrf=__NONCE__" not in empty

    def test_bare_flag_does_not_collide_with_empty_value(self) -> None:
        """`?flag` (no `=`) must canonicalize differently than `?flag=`."""
        bare = extraction_cache._canonical_url("https://x/y?flag")
        empty = extraction_cache._canonical_url("https://x/y?flag=")
        assert bare != empty
        assert bare.endswith("?flag")
        assert empty.endswith("?flag=")

    def test_preserves_fragment(self) -> None:
        """SPAs with hash routing encode page identity in the fragment (e.g. `#/orders/123` vs
        `#/orders/456`); stripping the fragment would collapse structurally-different pages."""
        assert extraction_cache._canonical_url("https://x/y?a=1#section") == "https://x/y?a=1#section"

    def test_different_fragments_produce_different_canonical(self) -> None:
        """Hash-routed SPA URLs must canonicalize distinctly when the fragment differs."""
        a = extraction_cache._canonical_url("https://x/y#/orders/123")
        b = extraction_cache._canonical_url("https://x/y#/orders/456")
        assert a != b

    def test_preserves_duplicate_keys_in_order(self) -> None:
        # Repeated keys can be semantically ordered (first-wins handlers,
        # ordered multi-sort). Python's stable sort preserves insertion order
        # within the same key.
        url = "https://x/y?sort=price&sort=rating"
        assert extraction_cache._canonical_url(url) == "https://x/y?sort=price&sort=rating"

    def test_trailing_punctuation_is_not_stripped(self) -> None:
        # _canonical_url operates on pre-parsed URL strings, not on URLs
        # embedded in prose. Callers pass `current_url` directly.
        assert extraction_cache._canonical_url("https://x/y.") == "https://x/y."

    def test_malformed_url_returns_input_unchanged(self) -> None:
        # Never raise — cache lookup must degrade gracefully.
        assert extraction_cache._canonical_url("not a url at all") == "not a url at all"

    def test_case_insensitive_nonce_match_redacts_value(self) -> None:
        """Uppercase nonce keys are still matched; the value is redacted, the key preserved."""
        out = extraction_cache._canonical_url("https://x/y?CSRF=abc&a=1")
        assert "CSRF=__NONCE__" in out
        assert "a=1" in out


# ---------------------------------------------------------------------------
# _canonical_element_tree primitive
# ---------------------------------------------------------------------------


class TestCanonicalElementTree:
    def test_returns_none_for_none(self) -> None:
        assert extraction_cache._canonical_element_tree(None) is None

    def test_returns_empty_for_empty(self) -> None:
        assert extraction_cache._canonical_element_tree("") == ""

    def test_scrubs_uuid_in_id_attribute(self) -> None:
        h1 = '<div id="3f8a9b12-1234-4678-9abc-def012345678">x</div>'
        h2 = '<div id="fedcba98-8765-4321-abcd-123456789abc">x</div>'
        assert extraction_cache._canonical_element_tree(h1) == extraction_cache._canonical_element_tree(h2)

    def test_scrubs_random_hex_suffix_in_id_attribute(self) -> None:
        h1 = '<div id="row-abc123def">x</div>'
        h2 = '<div id="row-fedcba987">x</div>'
        assert extraction_cache._canonical_element_tree(h1) == extraction_cache._canonical_element_tree(h2)

    def test_scrubs_data_testid(self) -> None:
        h1 = '<button data-testid="btn-1a2b3c4d">go</button>'
        h2 = '<button data-testid="btn-5e6f7a8b">go</button>'
        assert extraction_cache._canonical_element_tree(h1) == extraction_cache._canonical_element_tree(h2)

    def test_leaves_class_and_href_untouched(self) -> None:
        # class and href carry semantic weight — they must differentiate pages.
        h1 = '<a class="btn primary" href="/docs">go</a>'
        h2 = '<a class="btn danger" href="/docs">go</a>'
        assert extraction_cache._canonical_element_tree(h1) != extraction_cache._canonical_element_tree(h2)

    def test_scrubs_csrf_input_value(self) -> None:
        h1 = '<input name="_csrf" value="abc123">'
        h2 = '<input name="_csrf" value="zyx987">'
        assert extraction_cache._canonical_element_tree(h1) == extraction_cache._canonical_element_tree(h2)

    def test_scrubs_csrf_meta_content(self) -> None:
        h1 = '<meta name="csrf-token" content="abc123">'
        h2 = '<meta name="csrf-token" content="zyx987">'
        assert extraction_cache._canonical_element_tree(h1) == extraction_cache._canonical_element_tree(h2)

    def test_canonical_element_tree_returns_string_for_valid_html(self) -> None:
        # selectolax is permissive enough that we can't reliably force its
        # parser to raise from pytest; the except-path fallback is exercised
        # indirectly by the None/empty-string guards at the top of the function.
        assert extraction_cache._canonical_element_tree("<div>ok</div>") is not None

    def test_preserves_text_content(self) -> None:
        out = extraction_cache._canonical_element_tree('<div id="x-abc123def">hello world</div>')
        assert "hello world" in out

    def test_different_text_produces_different_output(self) -> None:
        out1 = extraction_cache._canonical_element_tree("<div>alpha</div>")
        out2 = extraction_cache._canonical_element_tree("<div>beta</div>")
        assert out1 != out2

    def test_scrubs_csrf_input_case_insensitive(self) -> None:
        """CSRF <input name=...> match must be case-insensitive, matching prior regex behavior."""
        h1 = '<input name="CSRF_TOKEN" value="abc123">'
        h2 = '<input name="CSRF_TOKEN" value="zyx987">'
        assert extraction_cache._canonical_element_tree(h1) == extraction_cache._canonical_element_tree(h2)

    def test_scrubs_csrf_meta_case_insensitive(self) -> None:
        """CSRF <meta name=...> match must be case-insensitive, matching prior regex behavior."""
        h1 = '<meta name="CSRF-TOKEN" content="abc123">'
        h2 = '<meta name="CSRF-TOKEN" content="zyx987">'
        assert extraction_cache._canonical_element_tree(h1) == extraction_cache._canonical_element_tree(h2)

    def test_preserves_semantic_input_name_values(self) -> None:
        """<input name=...> values carry field-name semantics (not transient IDs).
        Two forms with different input names must NOT collapse to the same canonical.
        """
        h1 = '<form><input name="company_name" type="text"><button>Go</button></form>'
        h2 = '<form><input name="contact_phone" type="text"><button>Go</button></form>'
        assert extraction_cache._canonical_element_tree(h1) != extraction_cache._canonical_element_tree(h2)

    def test_preserves_stable_business_ids_in_suspect_attrs(self) -> None:
        """Semantic identifiers without transient patterns must survive canonicalization.

        Only UUIDs and random-looking hex suffixes are redacted inside suspect
        attributes; stable business IDs like id='submit-button' must differentiate.
        """
        h1 = '<button id="submit-button">go</button>'
        h2 = '<button id="cancel-button">go</button>'
        assert extraction_cache._canonical_element_tree(h1) != extraction_cache._canonical_element_tree(h2)

    def test_preserves_numeric_only_suffix_in_suspect_attrs(self) -> None:
        """Purely numeric suffixes (e.g. 'order-123456') are stable business IDs; don't collapse them."""
        h1 = '<div id="order-123456">go</div>'
        h2 = '<div id="order-987654">go</div>'
        assert extraction_cache._canonical_element_tree(h1) != extraction_cache._canonical_element_tree(h2)

    def test_preserves_hex_letter_only_english_words_in_suspect_attrs(self) -> None:
        """Hex-letter-only strings like 'facade' or 'decade' are English words, not random IDs."""
        h1 = '<div id="zone-facade">go</div>'
        h2 = '<div id="zone-decade">go</div>'
        assert extraction_cache._canonical_element_tree(h1) != extraction_cache._canonical_element_tree(h2)

    def test_preserves_non_v4_uuid_in_suspect_attrs(self) -> None:
        """v1/v3/v5 UUIDs are deterministic / namespace-based and can be stable business keys."""
        # v1 UUID (version nibble = 1) — must NOT be collapsed
        h1 = '<div id="3f8a9b12-1234-1678-9abc-def012345678">x</div>'
        h2 = '<div id="fedcba98-8765-1321-abcd-123456789abc">x</div>'
        assert extraction_cache._canonical_element_tree(h1) != extraction_cache._canonical_element_tree(h2)

    def test_scrubs_attr_case_insensitively(self) -> None:
        """Even if a parser surfaces an uppercase attribute name, it must still match the suspect set."""
        # selectolax 0.3.34 normalizes to lowercase, but we want robustness if that ever changes.
        # Exercise via direct set membership: this test pins the lower() call.
        h1 = '<div ID="3f8a9b12-1234-4678-9abc-def012345678">x</div>'
        h2 = '<div ID="fedcba98-8765-4321-abcd-123456789abc">x</div>'
        assert extraction_cache._canonical_element_tree(h1) == extraction_cache._canonical_element_tree(h2)


# ---------------------------------------------------------------------------
# compute_cache_key — structured-path canonicalization integration
# ---------------------------------------------------------------------------


def test_key_stable_across_nonce_params_in_url() -> None:
    """current_url with different nonce param values should still hit."""
    assert _key(current_url="https://x/y?a=1&_csrf=abc") == _key(current_url="https://x/y?a=1&_csrf=xyz")


def test_key_stable_across_uuid_in_element_tree() -> None:
    """element_tree with different UUIDs in id= attributes should still hit."""
    h1 = '<div id="3f8a9b12-1234-4678-9abc-def012345678">doc</div>'
    h2 = '<div id="fedcba98-8765-4321-abcd-123456789abc">doc</div>'
    assert _key(element_tree=h1) == _key(element_tree=h2)


def test_key_stable_across_csrf_token_in_element_tree() -> None:
    """element_tree with different CSRF tokens should still hit."""
    h1 = '<input name="_csrf" value="abc123">'
    h2 = '<input name="_csrf" value="zyx987">'
    assert _key(element_tree=h1) == _key(element_tree=h2)


def test_key_stable_across_iso_timestamps_in_extracted_text() -> None:
    """extracted_text with same-day ISO timestamps should still hit."""
    t1 = "Report\n2026-04-10T08:30:15.123456\nEnd"
    t2 = "Report\n2026-04-10T23:59:59.999999\nEnd"
    assert _key(extracted_text=t1) == _key(extracted_text=t2)


def test_key_changes_across_different_dates_in_extracted_text() -> None:
    """Midnight crossing in extracted_text must produce a different key."""
    t1 = "Report\n2026-04-10T23:59:59\nEnd"
    t2 = "Report\n2026-04-11T00:00:01\nEnd"
    assert _key(extracted_text=t1) != _key(extracted_text=t2)


def test_call_path_discriminator_isolates_otherwise_identical_keys() -> None:
    """Different call_paths must produce different keys even when every other
    input is identical — guards against silent cross-path cache hits (e.g.
    script path replaying an agent-path extraction result)."""
    assert _key(call_path="handler") != _key(call_path="script")
    assert _key(call_path="handler") != _key(call_path="agent")
    assert _key(call_path="script") != _key(call_path="agent")


def test_key_stable_across_iso_timestamps_in_data_extraction_goal() -> None:
    """Same-day ISO timestamps in the goal (e.g. 'extract updated after <ts>')
    must not cause per-second key churn."""
    g1 = "Extract records updated after\n2026-04-10T08:30:15.123456\nonward"
    g2 = "Extract records updated after\n2026-04-10T23:59:59.999999\nonward"
    assert _key(data_extraction_goal=g1) == _key(data_extraction_goal=g2)