mirror of
https://github.com/Skyvern-AI/skyvern.git
synced 2026-04-28 03:30:10 +00:00
510 lines
24 KiB
Python
510 lines
24 KiB
Python
"""Unit tests for the extract-information result cache."""
|
|
|
|
from __future__ import annotations

from collections.abc import Iterator

import pytest

from skyvern.forge.sdk.cache import extraction_cache
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def _reset_cache() -> Iterator[None]:
    """Reset the extraction cache before and after every test in this module.

    The fixture is ``autouse`` so tests never have to request it explicitly.
    It is a generator (yield) fixture, hence the ``Iterator[None]`` return
    annotation rather than ``None``.
    """
    # Setup: start each test from a freshly reset cache.
    extraction_cache._reset_for_tests()
    yield
    # Teardown: reset again so nothing stored by this test lingers.
    extraction_cache._reset_for_tests()
|
|
|
|
|
|
def _key(**overrides: object) -> str:
    """Compute a cache key from a baseline argument set plus per-test overrides.

    Keyword arguments replace baseline values of the same name (or add new
    arguments) before everything is forwarded to
    ``extraction_cache.compute_cache_key``.
    """
    baseline: dict[str, object] = {
        "call_path": "test",
        "element_tree": "<html><body>docs</body></html>",
        "extracted_text": "Document list",
        "current_url": "https://example.com/docs",
        "data_extraction_goal": "Extract document list",
        "extracted_information_schema": {"type": "object", "properties": {"docs": {"type": "array"}}},
        "navigation_payload": {"user": "alice"},
    }
    # Later entries win, so overrides take precedence over the baseline.
    merged: dict[str, object] = {**baseline, **overrides}
    return extraction_cache.compute_cache_key(**merged)  # type: ignore[arg-type]
|
|
|
|
|
|
def test_identical_inputs_produce_identical_key() -> None:
    """Two key computations over the same baseline inputs must agree."""
    first = _key()
    second = _key()
    assert first == second
|
|
|
|
|
|
def test_key_changes_when_element_tree_changes() -> None:
    """A different element tree must yield a different key."""
    baseline_key = _key()
    changed_key = _key(element_tree="<html><body>different</body></html>")
    assert baseline_key != changed_key
|
|
|
|
|
|
def test_key_changes_when_schema_changes() -> None:
    """A different extraction schema must yield a different key."""
    baseline_key = _key()
    changed_key = _key(extracted_information_schema={"type": "object", "properties": {}})
    assert baseline_key != changed_key
|
|
|
|
|
|
def test_key_changes_when_extracted_text_changes() -> None:
    """Different extracted text must yield a different key."""
    baseline_key = _key()
    changed_key = _key(extracted_text="Something else entirely")
    assert baseline_key != changed_key
|
|
|
|
|
|
def test_key_changes_when_url_changes() -> None:
    """A different current URL must yield a different key."""
    baseline_key = _key()
    changed_key = _key(current_url="https://example.com/other")
    assert baseline_key != changed_key
|
|
|
|
|
|
def test_key_changes_when_error_code_mapping_changes() -> None:
    """Two different error-code mappings must yield different keys."""
    # RFC review: error_code_mapping is rendered into the prompt,
    # so it must be part of the key.
    key_oops = _key(error_code_mapping={"E1": "oops"})
    key_different = _key(error_code_mapping={"E1": "different"})
    assert key_oops != key_different
|
|
|
|
|
|
def test_key_changes_when_previous_extracted_information_changes() -> None:
    """A key with no prior extraction must differ from one with prior extraction."""
    # RFC review: previous_extracted_information is rendered into the prompt as
    # prior context. In a loop where each iteration is a fresh task, this is
    # None on step 1, so cross-iteration cache hits still land; but if an
    # intra-task second-step extraction happens, the key must change.
    without_prior = _key(previous_extracted_information=None)
    with_prior = _key(previous_extracted_information={"prior": "value"})
    assert without_prior != with_prior
|
|
|
|
|
|
def test_key_changes_when_llm_key_changes() -> None:
    """Two different ``llm_key`` values must yield different keys."""
    # RFC review: include llm_key so swapping models forces a fresh extraction
    # once this cache is backed by an off-process store.
    gpt_key = _key(llm_key="gpt-4o")
    claude_key = _key(llm_key="claude-sonnet-4-6")
    assert gpt_key != claude_key
|
|
|
|
|
|
def test_key_is_stable_across_equivalent_schema_dict_orderings() -> None:
    """Schemas that differ only in dict insertion order must hash to the same key."""
    ordered_schema = {"type": "object", "properties": {"a": {"type": "string"}, "b": {"type": "string"}}}
    shuffled_schema = {"properties": {"b": {"type": "string"}, "a": {"type": "string"}}, "type": "object"}
    ordered_key = _key(extracted_information_schema=ordered_schema)
    shuffled_key = _key(extracted_information_schema=shuffled_schema)
    assert ordered_key == shuffled_key
|
|
|
|
|
|
def test_lookup_returns_miss_on_empty_cache() -> None:
    """A lookup against an empty cache is a run-scoped first-call miss with no value or age."""
    miss = extraction_cache.lookup("wfr_1", _key())
    assert miss.hit is False
    assert miss.value is None
    assert miss.age_seconds is None
    assert miss.fallback_reason == extraction_cache.FALLBACK_FIRST_CALL_IN_RUN
    assert miss.scope == extraction_cache.SCOPE_RUN
|
|
|
|
|
|
def test_store_then_lookup_returns_hit_with_age() -> None:
    """A stored value comes back as a run-scoped hit with a non-negative age."""
    cache_key = _key()
    payload = {"docs": ["a.pdf"]}
    extraction_cache.store("wfr_1", cache_key, payload)

    found = extraction_cache.lookup("wfr_1", cache_key)

    assert found.hit is True
    assert found.value == {"docs": ["a.pdf"]}
    assert found.age_seconds is not None
    assert found.age_seconds >= 0.0
    assert found.fallback_reason is None
    assert found.scope == extraction_cache.SCOPE_RUN
|
|
|
|
|
|
def test_lookup_returns_key_not_found_when_run_exists_but_key_does_not() -> None:
    """Report key_not_found, not first_call_in_run, when the run has other entries.

    Downstream metrics use this split to distinguish unavoidable first-call
    misses from potential normalization opportunities.
    """
    stored_key = _key(current_url="https://example.com/A")
    missing_key = _key(current_url="https://example.com/B")
    extraction_cache.store("wfr_1", stored_key, {"a": 1})

    miss = extraction_cache.lookup("wfr_1", missing_key)

    assert miss.hit is False
    assert miss.value is None
    assert miss.fallback_reason == extraction_cache.FALLBACK_KEY_NOT_FOUND
|
|
|
|
|
|
def test_cache_is_isolated_per_workflow_run_id() -> None:
    """An entry stored under one workflow run must not be visible from another run."""
    cache_key = _key()
    extraction_cache.store("wfr_1", cache_key, {"docs": ["a.pdf"]})

    other_run = extraction_cache.lookup("wfr_2", cache_key)

    assert other_run.hit is False
    assert other_run.fallback_reason == extraction_cache.FALLBACK_FIRST_CALL_IN_RUN
|
|
|
|
|
|
def test_empty_workflow_run_id_bypasses_cache() -> None:
    """With ``workflow_run_id=None``, a store followed by a lookup yields ``None``."""
    cache_key = _key()
    extraction_cache.store(None, cache_key, {"docs": ["a.pdf"]})
    outcome = extraction_cache.lookup(None, cache_key)
    assert outcome is None
|
|
|
|
|
|
def test_clear_workflow_run_drops_entries() -> None:
    """After ``clear_workflow_run``, a previously stored key is no longer a hit."""
    cache_key = _key()
    extraction_cache.store("wfr_1", cache_key, {"docs": ["a.pdf"]})
    extraction_cache.clear_workflow_run("wfr_1")

    after_clear = extraction_cache.lookup("wfr_1", cache_key)
    assert after_clear.hit is False
|
|
|
|
|
|
def test_fifo_eviction_when_run_cache_is_full() -> None:
    """Storing one entry past the per-run cap evicts the first-stored entry."""
    capacity = extraction_cache._MAX_ENTRIES_PER_RUN

    oldest_key = _key(current_url="https://example.com/0")
    extraction_cache.store("wfr_1", oldest_key, {"i": 0})
    # Entries 1..capacity bring the total to capacity + 1 distinct keys, so the
    # oldest one should be evicted.
    for index in range(1, capacity + 1):
        extraction_cache.store("wfr_1", _key(current_url=f"https://example.com/{index}"), {"i": index})

    oldest_lookup = extraction_cache.lookup("wfr_1", oldest_key)
    assert oldest_lookup.hit is False

    newest_key = _key(current_url=f"https://example.com/{capacity}")
    newest_lookup = extraction_cache.lookup("wfr_1", newest_key)
    assert newest_lookup.hit is True
    assert newest_lookup.value == {"i": capacity}
|
|
|
|
|
|
def test_store_and_lookup_list_result() -> None:
    """List results (from extraction schemas with array roots) must be cached too."""
    cache_key = _key()
    extraction_cache.store("wfr_1", cache_key, [{"doc": "a.pdf"}, {"doc": "b.pdf"}])

    found = extraction_cache.lookup("wfr_1", cache_key)

    assert found.hit is True
    assert found.value == [{"doc": "a.pdf"}, {"doc": "b.pdf"}]
|
|
|
|
|
|
def test_store_and_lookup_string_result() -> None:
    """Plain-string extraction results must be cached too."""
    cache_key = _key()
    extraction_cache.store("wfr_1", cache_key, "plain text extraction")

    found = extraction_cache.lookup("wfr_1", cache_key)

    assert found.hit is True
    assert found.value == "plain text extraction"
|
|
|
|
|
|
def test_lookup_age_seconds_is_monotonic_delta(monkeypatch: pytest.MonkeyPatch) -> None:
    """``age_seconds`` should equal the monotonic time elapsed between store() and lookup()."""
    # Mutable holder so the patched clock can be advanced after store().
    clock = {"now": 1_000.0}
    monkeypatch.setattr(extraction_cache.time, "monotonic", lambda: clock["now"])

    cache_key = _key()
    extraction_cache.store("wfr_1", cache_key, {"docs": []})

    # Advance the fake clock by 12.5 seconds before looking the entry up.
    clock["now"] = 1_012.5
    found = extraction_cache.lookup("wfr_1", cache_key)

    assert found.hit is True
    assert found.age_seconds == pytest.approx(12.5)
|
|
|
|
|
|
def test_invalidate_key_drops_single_entry() -> None:
    """Per-key eviction leaves sibling entries intact. Used by the retry self-heal path."""
    dropped_key = _key()
    kept_key = _key(current_url="https://example.com/other")
    extraction_cache.store("wfr_1", dropped_key, {"v": "a"})
    extraction_cache.store("wfr_1", kept_key, {"v": "b"})

    was_removed = extraction_cache.invalidate_key("wfr_1", dropped_key)

    assert was_removed is True
    dropped_lookup = extraction_cache.lookup("wfr_1", dropped_key)
    assert dropped_lookup.hit is False
    # Sibling entry must survive — invalidate is per-key, not per-run.
    kept_lookup = extraction_cache.lookup("wfr_1", kept_key)
    assert kept_lookup.hit is True
    assert kept_lookup.value == {"v": "b"}
|
|
|
|
|
|
def test_invalidate_key_returns_false_for_unknown_key() -> None:
    """Invalidating a key that was never stored in an existing run returns False."""
    extraction_cache.store("wfr_1", _key(), {"v": "a"})
    was_removed = extraction_cache.invalidate_key("wfr_1", "nonexistent-key")
    assert was_removed is False
|
|
|
|
|
|
def test_invalidate_key_returns_false_for_unknown_workflow_run() -> None:
    """Invalidating a key in a run that has no entries returns False."""
    was_removed = extraction_cache.invalidate_key("wfr_missing", _key())
    assert was_removed is False
|
|
|
|
|
|
def test_invalidate_key_returns_false_for_empty_workflow_run_id() -> None:
    """Falsy workflow_run_id is a no-op, matching the store/lookup contract."""
    removed_for_none = extraction_cache.invalidate_key(None, _key())
    assert removed_for_none is False
    removed_for_empty = extraction_cache.invalidate_key("", _key())
    assert removed_for_empty is False
|
|
|
|
|
|
def test_compute_cache_key_rejects_legacy_local_datetime_kwarg() -> None:
    """``local_datetime`` was dropped from the signature (SKY-8873).

    Content hash alone defines cache identity, so callers that still try to
    pass it must fail loudly rather than silently producing a key that happens
    to be stable-for-the-wrong-reason.
    """
    # ``match`` pins the failure to the rejected kwarg itself, so an unrelated
    # TypeError (e.g. from some other signature change) cannot make this pass.
    with pytest.raises(TypeError, match="local_datetime"):
        extraction_cache.compute_cache_key(call_path="test", local_datetime="2026-04-10T00:00:00")  # type: ignore[call-arg]
|
|
|
|
|
|
def test_none_and_empty_string_produce_different_keys() -> None:
    """None and '' are distinct states and must not collide in the cache key."""
    # Same check for each text-like input, in the same order as before.
    for field_name in ("extracted_text", "current_url", "data_extraction_goal"):
        key_for_none = _key(**{field_name: None})
        key_for_empty = _key(**{field_name: ""})
        assert key_for_none != key_for_empty
|
|
|
|
|
|
def test_lookup_refreshes_lru_position() -> None:
    """A cache hit should refresh the run's LRU position, preventing eviction."""
    run_capacity = extraction_cache._MAX_WORKFLOW_RUNS
    shared_key = _key()

    # Bring the global cache to capacity: wfr_oldest goes in first, followed by
    # wfr_1..wfr_(N-1).
    extraction_cache.store("wfr_oldest", shared_key, {"v": 0})
    for run_number in range(1, run_capacity):
        extraction_cache.store(f"wfr_{run_number}", shared_key, {"v": run_number})

    # At capacity, wfr_oldest is the LRU candidate; a lookup() hit should move
    # it to the most-recent position.
    first_hit = extraction_cache.lookup("wfr_oldest", shared_key)
    assert first_hit.hit is True
    assert first_hit.value == {"v": 0}

    # One more run triggers eviction. Without the LRU refresh, wfr_oldest would
    # be evicted; with it, wfr_1 (now the oldest) goes.
    extraction_cache.store("wfr_new", shared_key, {"v": 999})

    survivor = extraction_cache.lookup("wfr_oldest", shared_key)
    assert survivor.hit is True
    assert survivor.value == {"v": 0}
    evicted = extraction_cache.lookup("wfr_1", shared_key)
    assert evicted.hit is False  # evicted
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _canonical_url primitive
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestCanonicalUrl:
    """Tests for the ``extraction_cache._canonical_url`` primitive."""

    def test_returns_none_for_none(self) -> None:
        """None passes through as None."""
        canonical = extraction_cache._canonical_url(None)
        assert canonical is None

    def test_returns_empty_for_empty(self) -> None:
        """The empty string passes through as the empty string."""
        canonical = extraction_cache._canonical_url("")
        assert canonical == ""

    def test_leaves_simple_url_unchanged(self) -> None:
        """A URL with no query or fragment is returned as-is."""
        canonical = extraction_cache._canonical_url("https://example.com/docs")
        assert canonical == "https://example.com/docs"

    def test_sorts_query_params_by_key(self) -> None:
        """Query parameters come back ordered by key."""
        canonical = extraction_cache._canonical_url("https://x/y?b=2&a=1")
        assert canonical == "https://x/y?a=1&b=2"

    def test_redacts_nonce_param_values_and_preserves_keys(self) -> None:
        """Nonce values are replaced with a sentinel but keys are preserved so
        presence/absence of the param still differentiates cache keys.
        """
        canonical = extraction_cache._canonical_url("https://x/y?a=1&_csrf=abc&b=2")
        assert "_csrf=__NONCE__" in canonical
        assert "a=1" in canonical
        assert "b=2" in canonical

    def test_same_nonce_key_different_values_produce_same_canonical(self) -> None:
        """Two URLs that differ only in a nonce value must hash identically."""
        first = extraction_cache._canonical_url("https://x/y?_csrf=abc")
        second = extraction_cache._canonical_url("https://x/y?_csrf=xyz")
        assert first == second

    def test_nonce_key_absent_vs_present_produce_different_canonical(self) -> None:
        """A URL with the nonce key absent must canonicalize differently than one with the key present."""
        nonce_present = extraction_cache._canonical_url("https://x/y?_csrf=abc")
        nonce_absent = extraction_cache._canonical_url("https://x/y")
        assert nonce_present != nonce_absent

    def test_empty_nonce_value_does_not_collide_with_populated_value(self) -> None:
        """`?_csrf=` (empty) must canonicalize differently than `?_csrf=abc`."""
        empty_value = extraction_cache._canonical_url("https://x/y?_csrf=")
        populated_value = extraction_cache._canonical_url("https://x/y?_csrf=abc")
        assert empty_value != populated_value
        assert "_csrf=__NONCE__" not in empty_value

    def test_bare_flag_does_not_collide_with_empty_value(self) -> None:
        """`?flag` (no `=`) must canonicalize differently than `?flag=`."""
        bare_flag = extraction_cache._canonical_url("https://x/y?flag")
        empty_flag = extraction_cache._canonical_url("https://x/y?flag=")
        assert bare_flag != empty_flag
        assert bare_flag.endswith("?flag")
        assert empty_flag.endswith("?flag=")

    def test_preserves_fragment(self) -> None:
        """SPAs with hash routing encode page identity in the fragment (e.g. `#/orders/123` vs
        `#/orders/456`); stripping the fragment would collapse structurally-different pages."""
        canonical = extraction_cache._canonical_url("https://x/y?a=1#section")
        assert canonical == "https://x/y?a=1#section"

    def test_different_fragments_produce_different_canonical(self) -> None:
        """Hash-routed SPA URLs must canonicalize distinctly when the fragment differs."""
        first = extraction_cache._canonical_url("https://x/y#/orders/123")
        second = extraction_cache._canonical_url("https://x/y#/orders/456")
        assert first != second

    def test_preserves_duplicate_keys_in_order(self) -> None:
        """Repeated query keys keep their original relative order."""
        # Repeated keys can be semantically ordered (first-wins handlers,
        # ordered multi-sort). Python's stable sort preserves insertion order
        # within the same key.
        multi_sort_url = "https://x/y?sort=price&sort=rating"
        canonical = extraction_cache._canonical_url(multi_sort_url)
        assert canonical == "https://x/y?sort=price&sort=rating"

    def test_trailing_punctuation_is_not_stripped(self) -> None:
        """A trailing '.' in the URL is kept."""
        # _canonical_url operates on pre-parsed URL strings, not on URLs
        # embedded in prose. Callers pass `current_url` directly.
        canonical = extraction_cache._canonical_url("https://x/y.")
        assert canonical == "https://x/y."

    def test_malformed_url_returns_input_unchanged(self) -> None:
        """Input that is not a URL comes back unchanged."""
        # Never raise — cache lookup must degrade gracefully.
        canonical = extraction_cache._canonical_url("not a url at all")
        assert canonical == "not a url at all"

    def test_case_insensitive_nonce_match_redacts_value(self) -> None:
        """Uppercase nonce keys are still matched; the value is redacted, the key preserved."""
        canonical = extraction_cache._canonical_url("https://x/y?CSRF=abc&a=1")
        assert "CSRF=__NONCE__" in canonical
        assert "a=1" in canonical
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _canonical_element_tree primitive
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestCanonicalElementTree:
    """Tests for the ``extraction_cache._canonical_element_tree`` primitive."""

    def test_returns_none_for_none(self) -> None:
        """None passes through as None."""
        assert extraction_cache._canonical_element_tree(None) is None

    def test_returns_empty_for_empty(self) -> None:
        """The empty string passes through as the empty string."""
        assert extraction_cache._canonical_element_tree("") == ""

    def test_scrubs_uuid_in_id_attribute(self) -> None:
        """Trees differing only by a UUID in ``id`` canonicalize identically."""
        first = extraction_cache._canonical_element_tree('<div id="3f8a9b12-1234-4678-9abc-def012345678">x</div>')
        second = extraction_cache._canonical_element_tree('<div id="fedcba98-8765-4321-abcd-123456789abc">x</div>')
        assert first == second

    def test_scrubs_random_hex_suffix_in_id_attribute(self) -> None:
        """Trees differing only by a random-looking hex suffix in ``id`` canonicalize identically."""
        first = extraction_cache._canonical_element_tree('<div id="row-abc123def">x</div>')
        second = extraction_cache._canonical_element_tree('<div id="row-fedcba987">x</div>')
        assert first == second

    def test_scrubs_data_testid(self) -> None:
        """Trees differing only by a hex suffix in ``data-testid`` canonicalize identically."""
        first = extraction_cache._canonical_element_tree('<button data-testid="btn-1a2b3c4d">go</button>')
        second = extraction_cache._canonical_element_tree('<button data-testid="btn-5e6f7a8b">go</button>')
        assert first == second

    def test_leaves_class_and_href_untouched(self) -> None:
        """A differing ``class`` value must keep the canonical forms distinct."""
        # class and href carry semantic weight — they must differentiate pages.
        first = extraction_cache._canonical_element_tree('<a class="btn primary" href="/docs">go</a>')
        second = extraction_cache._canonical_element_tree('<a class="btn danger" href="/docs">go</a>')
        assert first != second

    def test_scrubs_csrf_input_value(self) -> None:
        """Differing ``value`` on an ``<input name="_csrf">`` does not change the canonical form."""
        first = extraction_cache._canonical_element_tree('<input name="_csrf" value="abc123">')
        second = extraction_cache._canonical_element_tree('<input name="_csrf" value="zyx987">')
        assert first == second

    def test_scrubs_csrf_meta_content(self) -> None:
        """Differing ``content`` on a ``<meta name="csrf-token">`` does not change the canonical form."""
        first = extraction_cache._canonical_element_tree('<meta name="csrf-token" content="abc123">')
        second = extraction_cache._canonical_element_tree('<meta name="csrf-token" content="zyx987">')
        assert first == second

    def test_canonical_element_tree_returns_string_for_valid_html(self) -> None:
        """Valid HTML input yields a ``str`` result."""
        # selectolax is permissive enough that we can't reliably force its
        # parser to raise from pytest; the except-path fallback is exercised
        # indirectly by the None/empty-string guards at the top of the function.
        canonical = extraction_cache._canonical_element_tree("<div>ok</div>")
        # Check the type rather than just "not None" so the assertion matches
        # what the test name promises.
        assert isinstance(canonical, str)

    def test_preserves_text_content(self) -> None:
        """Element text survives canonicalization even when the ``id`` is scrubbed."""
        canonical = extraction_cache._canonical_element_tree('<div id="x-abc123def">hello world</div>')
        assert "hello world" in canonical

    def test_different_text_produces_different_output(self) -> None:
        """Trees with different text content canonicalize differently."""
        alpha = extraction_cache._canonical_element_tree("<div>alpha</div>")
        beta = extraction_cache._canonical_element_tree("<div>beta</div>")
        assert alpha != beta

    def test_scrubs_csrf_input_case_insensitive(self) -> None:
        """CSRF <input name=...> match must be case-insensitive, matching prior regex behavior."""
        first = extraction_cache._canonical_element_tree('<input name="CSRF_TOKEN" value="abc123">')
        second = extraction_cache._canonical_element_tree('<input name="CSRF_TOKEN" value="zyx987">')
        assert first == second

    def test_scrubs_csrf_meta_case_insensitive(self) -> None:
        """CSRF <meta name=...> match must be case-insensitive, matching prior regex behavior."""
        first = extraction_cache._canonical_element_tree('<meta name="CSRF-TOKEN" content="abc123">')
        second = extraction_cache._canonical_element_tree('<meta name="CSRF-TOKEN" content="zyx987">')
        assert first == second

    def test_preserves_semantic_input_name_values(self) -> None:
        """<input name=...> values carry field-name semantics (not transient IDs).

        Two forms with different input names must NOT collapse to the same canonical.
        """
        company_form = '<form><input name="company_name" type="text"><button>Go</button></form>'
        phone_form = '<form><input name="contact_phone" type="text"><button>Go</button></form>'
        first = extraction_cache._canonical_element_tree(company_form)
        second = extraction_cache._canonical_element_tree(phone_form)
        assert first != second

    def test_preserves_stable_business_ids_in_suspect_attrs(self) -> None:
        """Semantic identifiers without transient patterns must survive canonicalization.

        Only UUIDs and random-looking hex suffixes are redacted inside suspect
        attributes; stable business IDs like id='submit-button' must differentiate.
        """
        first = extraction_cache._canonical_element_tree('<button id="submit-button">go</button>')
        second = extraction_cache._canonical_element_tree('<button id="cancel-button">go</button>')
        assert first != second

    def test_preserves_numeric_only_suffix_in_suspect_attrs(self) -> None:
        """Purely numeric suffixes (e.g. 'order-123456') are stable business IDs; don't collapse them."""
        first = extraction_cache._canonical_element_tree('<div id="order-123456">go</div>')
        second = extraction_cache._canonical_element_tree('<div id="order-987654">go</div>')
        assert first != second

    def test_preserves_hex_letter_only_english_words_in_suspect_attrs(self) -> None:
        """Hex-letter-only strings like 'facade' or 'decade' are English words, not random IDs."""
        first = extraction_cache._canonical_element_tree('<div id="zone-facade">go</div>')
        second = extraction_cache._canonical_element_tree('<div id="zone-decade">go</div>')
        assert first != second

    def test_preserves_non_v4_uuid_in_suspect_attrs(self) -> None:
        """v1/v3/v5 UUIDs are deterministic / namespace-based and can be stable business keys."""
        # v1 UUID (version nibble = 1) — must NOT be collapsed
        first = extraction_cache._canonical_element_tree('<div id="3f8a9b12-1234-1678-9abc-def012345678">x</div>')
        second = extraction_cache._canonical_element_tree('<div id="fedcba98-8765-1321-abcd-123456789abc">x</div>')
        assert first != second

    def test_scrubs_attr_case_insensitively(self) -> None:
        """Even if a parser surfaces an uppercase attribute name, it must still match the suspect set."""
        # selectolax 0.3.34 normalizes to lowercase, but we want robustness if that ever changes.
        # Exercise via direct set membership: this test pins the lower() call.
        first = extraction_cache._canonical_element_tree('<div ID="3f8a9b12-1234-4678-9abc-def012345678">x</div>')
        second = extraction_cache._canonical_element_tree('<div ID="fedcba98-8765-4321-abcd-123456789abc">x</div>')
        assert first == second
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# compute_cache_key — structured-path canonicalization integration
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_key_stable_across_nonce_params_in_url() -> None:
    """current_url with different nonce param values should still hit."""
    key_abc = _key(current_url="https://x/y?a=1&_csrf=abc")
    key_xyz = _key(current_url="https://x/y?a=1&_csrf=xyz")
    assert key_abc == key_xyz
|
|
|
|
|
|
def test_key_stable_across_uuid_in_element_tree() -> None:
    """element_tree with different UUIDs in id= attributes should still hit."""
    first_key = _key(element_tree='<div id="3f8a9b12-1234-4678-9abc-def012345678">doc</div>')
    second_key = _key(element_tree='<div id="fedcba98-8765-4321-abcd-123456789abc">doc</div>')
    assert first_key == second_key
|
|
|
|
|
|
def test_key_stable_across_csrf_token_in_element_tree() -> None:
    """element_tree with different CSRF tokens should still hit."""
    first_key = _key(element_tree='<input name="_csrf" value="abc123">')
    second_key = _key(element_tree='<input name="_csrf" value="zyx987">')
    assert first_key == second_key
|
|
|
|
|
|
def test_key_stable_across_iso_timestamps_in_extracted_text() -> None:
    """extracted_text with same-day ISO timestamps should still hit."""
    morning_text = "Report\n2026-04-10T08:30:15.123456\nEnd"
    late_night_text = "Report\n2026-04-10T23:59:59.999999\nEnd"
    morning_key = _key(extracted_text=morning_text)
    late_night_key = _key(extracted_text=late_night_text)
    assert morning_key == late_night_key
|
|
|
|
|
|
def test_key_changes_across_different_dates_in_extracted_text() -> None:
    """Midnight crossing in extracted_text must produce a different key."""
    before_midnight = "Report\n2026-04-10T23:59:59\nEnd"
    after_midnight = "Report\n2026-04-11T00:00:01\nEnd"
    key_before = _key(extracted_text=before_midnight)
    key_after = _key(extracted_text=after_midnight)
    assert key_before != key_after
|
|
|
|
|
|
def test_call_path_discriminator_isolates_otherwise_identical_keys() -> None:
    """Different call_paths must produce different keys when all else is identical.

    Guards against silent cross-path cache hits (e.g. script path replaying an
    agent-path extraction result).
    """
    handler_key = _key(call_path="handler")
    script_key = _key(call_path="script")
    agent_key = _key(call_path="agent")

    # Every pair of call paths must be distinct.
    assert handler_key != script_key
    assert handler_key != agent_key
    assert script_key != agent_key
|
|
|
|
|
|
def test_key_stable_across_iso_timestamps_in_data_extraction_goal() -> None:
    """Same-day ISO timestamps in the goal (e.g. 'extract updated after <ts>')
    must not cause per-second key churn."""
    morning_goal = "Extract records updated after\n2026-04-10T08:30:15.123456\nonward"
    late_night_goal = "Extract records updated after\n2026-04-10T23:59:59.999999\nonward"
    morning_key = _key(data_extraction_goal=morning_goal)
    late_night_key = _key(data_extraction_goal=late_night_goal)
    assert morning_key == late_night_key
|