feat(extraction-cache): structured telemetry fields (SKY-8870) (#5482)

This commit is contained in:
Aaron Perez 2026-04-13 19:58:33 -05:00 committed by GitHub
parent 92b10112bd
commit 55a7966193
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 306 additions and 66 deletions

View file

@ -931,25 +931,57 @@ class RealSkyvernPageAi(SkyvernPageAi):
# Share the extract-information cache with the agent path. Best-effort
# per the RFC review — any exception falls through to the full LLM
# call below.
# call below. The `try` is narrowed to just compute_cache_key + lookup
# so a downstream log failure can't re-enter the except block and
# double-count the call as both a hit/miss and a `lookup_error` in
# the Datadog miss-reason metric.
workflow_run_id = context.workflow_run_id if context else None
cache_key: str | None = None
lookup_result: extraction_cache.LookupResult | None = None
try:
cache_key = extraction_cache.compute_cache_key(
rendered_prompt=extract_information_prompt,
llm_key=None,
)
cached = extraction_cache.get(workflow_run_id, cache_key)
if cached is not None:
LOG.info(
"ai_extract cache hit — skipping LLM call",
workflow_run_id=workflow_run_id,
cache_key=cache_key,
)
return cached # type: ignore[return-value]
lookup_result = extraction_cache.lookup(workflow_run_id, cache_key)
except Exception:
LOG.warning("ai_extract cache lookup failed; falling through to LLM", exc_info=True)
cache_key = None
LOG.warning(
"ai_extract cache lookup failed; falling through to LLM",
workflow_run_id=workflow_run_id,
cache_key=cache_key,
cache_hit=False,
cache_scope=extraction_cache.SCOPE_RUN,
cache_age_seconds=None,
fallback_reason=extraction_cache.FALLBACK_LOOKUP_ERROR,
cache_path="script",
exc_info=True,
)
# Preserve cache_key so the downstream store() can still warm the cache
# for subsequent identical calls even when lookup() fails transiently.
if lookup_result is not None and lookup_result.hit:
LOG.info(
"ai_extract cache hit — skipping LLM call",
workflow_run_id=workflow_run_id,
cache_key=cache_key,
cache_hit=True,
cache_scope=lookup_result.scope,
cache_age_seconds=lookup_result.age_seconds,
fallback_reason=None,
cache_path="script",
)
return lookup_result.value # type: ignore[return-value]
if lookup_result is not None:
LOG.info(
"ai_extract cache miss",
workflow_run_id=workflow_run_id,
cache_key=cache_key,
cache_hit=False,
cache_scope=lookup_result.scope,
cache_age_seconds=None,
fallback_reason=lookup_result.fallback_reason,
cache_path="script",
)
step = None
if context and context.organization_id and context.task_id and context.step_id:
step = await app.DATABASE.tasks.get_step(

View file

@ -4968,32 +4968,71 @@ class ForgeAgent:
# Cache the summary LLM call — the inputs (goal, schema, URL) are
# identical across download-loop iterations that revisit the same page.
# The `try` is narrowed to just compute_cache_key + lookup so a
# downstream log failure can't double-count as a lookup_error.
workflow_run_id = context.workflow_run_id if context else None
cache_key: str | None = None
cached = None
lookup_result: extraction_cache.LookupResult | None = None
try:
cache_key = extraction_cache.compute_cache_key(
rendered_prompt=prompt,
llm_key=None,
)
cached = extraction_cache.get(workflow_run_id, cache_key)
lookup_result = extraction_cache.lookup(workflow_run_id, cache_key)
except Exception:
LOG.warning("data-extraction-summary cache lookup failed", exc_info=True)
LOG.warning(
"data-extraction-summary cache lookup failed",
workflow_run_id=workflow_run_id,
cache_key=cache_key,
cache_hit=False,
cache_scope=extraction_cache.SCOPE_RUN,
cache_age_seconds=None,
fallback_reason=extraction_cache.FALLBACK_LOOKUP_ERROR,
cache_path="agent",
exc_info=True,
)
# Preserve cache_key so the store() below can still warm the cache
# for subsequent identical calls even when lookup() fails transiently.
if cached is not None:
if lookup_result is not None and lookup_result.hit:
LOG.info(
"data-extraction-summary cache hit — skipping LLM call",
workflow_run_id=workflow_run_id,
cache_key=cache_key,
cache_hit=True,
cache_scope=lookup_result.scope,
cache_age_seconds=lookup_result.age_seconds,
fallback_reason=None,
cache_path="agent",
)
data_extraction_summary_resp = cached
data_extraction_summary_resp = lookup_result.value
else:
if lookup_result is not None:
LOG.info(
"data-extraction-summary cache miss",
workflow_run_id=workflow_run_id,
cache_key=cache_key,
cache_hit=False,
cache_scope=lookup_result.scope,
cache_age_seconds=None,
fallback_reason=lookup_result.fallback_reason,
cache_path="agent",
)
data_extraction_summary_resp = await app.EXTRACTION_LLM_API_HANDLER(
prompt=prompt, step=step, prompt_name="data-extraction-summary"
)
if cache_key:
# Guard on both cache_key and response to match the other two call sites
# and avoid caching None — a cached None would later produce hit=True/value=None,
# which would then trip the RuntimeError below instead of falling through to a
# fresh LLM call as the pre-telemetry get()-returns-None path did.
if cache_key and data_extraction_summary_resp is not None:
extraction_cache.store(workflow_run_id, cache_key, data_extraction_summary_resp)
if data_extraction_summary_resp is None:
raise RuntimeError(
"data_extraction_summary_resp unexpectedly None after cache/LLM block "
f"(workflow_run_id={workflow_run_id!r}, cache_key={cache_key!r})"
)
return ExtractAction(
reasoning=data_extraction_summary_resp.get("summary", "Extracting information from the page"),
data_extraction_goal=task.data_extraction_goal,

View file

@ -45,7 +45,9 @@ from __future__ import annotations
import hashlib
import json
import re
import time
from collections import OrderedDict
from dataclasses import dataclass
from typing import Any
import structlog
@ -57,8 +59,51 @@ _MAX_WORKFLOW_RUNS = 256
# Sentinel hashed in place of None so that None and "" produce different keys.
_NULL_SENTINEL = "\x00__NULL__"
# workflow_run_id -> ordered dict of {cache_key: extraction_result}
_CACHE: OrderedDict[str, OrderedDict[str, Any]] = OrderedDict()
# Cache scope identifiers. v1 ships with "run" only; "wpid" and "global"
# are reserved for the Redis cross-run cache (SKY-8873/SKY-8874).
SCOPE_RUN = "run"
# Fallback reasons emitted on cache misses. Used by log-based metrics to
# distinguish first-call-in-run (unavoidable) from key-not-found (possible
# normalization opportunity) from lookup_error (bug or infra issue).
FALLBACK_FIRST_CALL_IN_RUN = "first_call_in_run"
FALLBACK_KEY_NOT_FOUND = "key_not_found"
# Reserved for the TTL-backed Redis cache in v4 (SKY-8874). Never emitted in v1.
FALLBACK_TTL_EXPIRED = "ttl_expired"
FALLBACK_LOOKUP_ERROR = "lookup_error"
@dataclass(frozen=True)
class _CacheEntry:
"""Internal wrapper storing a cached value alongside its insertion time.
`stored_at` is a monotonic clock reading, used only for computing the
`cache_age_seconds` field reported on cache hits.
"""
value: Any
stored_at: float
@dataclass(frozen=True)
class LookupResult:
    """Structured result of a cache lookup, carrying telemetry metadata.

    Hit:  ``hit=True``, ``value`` is the cached result, ``age_seconds`` is
    the elapsed seconds since ``store``, and ``fallback_reason`` is None.
    Miss: ``hit=False``, ``value`` and ``age_seconds`` are both None, and
    ``fallback_reason`` identifies why the lookup missed.
    """

    hit: bool
    value: Any | None
    age_seconds: float | None
    fallback_reason: str | None
    # Always SCOPE_RUN in v1; other scopes are reserved for the Redis cache.
    scope: str
# workflow_run_id -> ordered dict of {cache_key: _CacheEntry}
_CACHE: OrderedDict[str, OrderedDict[str, _CacheEntry]] = OrderedDict()
# Simple hit/miss counters for post-deploy observability.
_hits = 0
@ -161,28 +206,66 @@ def compute_cache_key(
return hashlib.sha256(joined).hexdigest()
def get(workflow_run_id: str | None, cache_key: str) -> Any | None:
"""Return a cached extraction result, or None on miss."""
global _hits, _misses # noqa: PLW0603
def _miss(fallback_reason: str) -> LookupResult:
    """Record a run-scoped cache miss and build the matching ``LookupResult``.

    Sole owner of ``_misses`` mutations — ``lookup()`` routes every miss
    through here so the counter is never bumped without also ticking the
    hit-rate logger.
    """
    global _misses  # noqa: PLW0603
    _misses += 1
    _maybe_log_hit_rate()
    return LookupResult(
        hit=False,
        value=None,
        age_seconds=None,
        fallback_reason=fallback_reason,
        scope=SCOPE_RUN,
    )
def lookup(workflow_run_id: str | None, cache_key: str) -> LookupResult | None:
    """Look up a cached extraction result, returning structured telemetry.

    Returns a :class:`LookupResult` on a genuine hit or miss, or ``None``
    when the cache path is bypassed entirely (no ``workflow_run_id``).
    Call sites should treat ``None`` as "cache not applicable here" — no
    hit/miss log should be emitted and no metric counter bumped. ``None``
    is intentionally distinct from a miss so that Datadog dashboards are
    not inflated with non-actionable zero-run lookups.

    On a genuine miss, the returned ``LookupResult`` carries
    :attr:`~LookupResult.fallback_reason` so log-based metrics can
    distinguish first-call-in-run (unavoidable) from key-not-found
    (possible normalization opportunity) from lookup-error (infra issue).
    """
    global _hits  # noqa: PLW0603
    if not workflow_run_id:
        # No run id: cache not applicable — callers emit no hit/miss telemetry.
        return None

    run_cache = _CACHE.get(workflow_run_id)
    if not run_cache:
        return _miss(FALLBACK_FIRST_CALL_IN_RUN)

    entry = run_cache.get(cache_key)
    if entry is None:
        return _miss(FALLBACK_KEY_NOT_FOUND)

    _hits += 1
    # Refresh LRU position so actively-read runs aren't evicted.
    _CACHE.move_to_end(workflow_run_id)
    _maybe_log_hit_rate()
    # Clamp to zero to guard against monotonic clock edge cases.
    elapsed = max(0.0, time.monotonic() - entry.stored_at)
    return LookupResult(
        hit=True,
        value=entry.value,
        age_seconds=elapsed,
        fallback_reason=None,
        scope=SCOPE_RUN,
    )
def _maybe_log_hit_rate() -> None:
@ -210,7 +293,7 @@ def store(workflow_run_id: str | None, cache_key: str, result: Any) -> None:
_CACHE[workflow_run_id] = run_cache
if cache_key in run_cache:
run_cache.move_to_end(cache_key)
run_cache[cache_key] = result
run_cache[cache_key] = _CacheEntry(value=result, stored_at=time.monotonic())
while len(run_cache) > _MAX_ENTRIES_PER_RUN:
evicted_key, _ = run_cache.popitem(last=False)
LOG.debug(

View file

@ -4288,25 +4288,59 @@ async def extract_information_for_navigation_goal(
local_datetime=local_datetime_str,
)
# Best-effort cache lookup — any failure falls through to LLM.
# Best-effort cache lookup — any failure falls through to LLM. The `try`
# is narrowed to just compute_cache_key + lookup so a downstream log
# failure can't re-enter the except block and double-count the call as
# both a hit/miss and a `lookup_error` in the Datadog miss-reason metric.
cache_key: str | None = None
lookup_result: extraction_cache.LookupResult | None = None
try:
cache_key = extraction_cache.compute_cache_key(
rendered_prompt=extract_information_prompt,
llm_key=llm_key_override,
)
cached = extraction_cache.get(task.workflow_run_id, cache_key)
if cached is not None:
LOG.info(
"extract_information cache hit — skipping LLM call",
task_id=task.task_id,
workflow_run_id=task.workflow_run_id,
cache_key=cache_key,
)
return ScrapeResult(scraped_data=cached)
lookup_result = extraction_cache.lookup(task.workflow_run_id, cache_key)
except Exception:
LOG.warning("extract_information cache lookup failed; falling through to LLM", exc_info=True)
cache_key = None
LOG.warning(
"extract_information cache lookup failed; falling through to LLM",
task_id=task.task_id,
workflow_run_id=task.workflow_run_id,
cache_key=cache_key,
cache_hit=False,
cache_scope=extraction_cache.SCOPE_RUN,
cache_age_seconds=None,
fallback_reason=extraction_cache.FALLBACK_LOOKUP_ERROR,
cache_path="agent",
exc_info=True,
)
# Preserve cache_key so the downstream store() can still warm the cache
# for subsequent identical calls even when lookup() fails transiently.
if lookup_result is not None and lookup_result.hit:
LOG.info(
"extract_information cache hit — skipping LLM call",
task_id=task.task_id,
workflow_run_id=task.workflow_run_id,
cache_key=cache_key,
cache_hit=True,
cache_scope=lookup_result.scope,
cache_age_seconds=lookup_result.age_seconds,
fallback_reason=None,
cache_path="agent",
)
return ScrapeResult(scraped_data=lookup_result.value)
if lookup_result is not None:
LOG.info(
"extract_information cache miss",
task_id=task.task_id,
workflow_run_id=task.workflow_run_id,
cache_key=cache_key,
cache_hit=False,
cache_scope=lookup_result.scope,
cache_age_seconds=None,
fallback_reason=lookup_result.fallback_reason,
cache_path="agent",
)
# Use the appropriate LLM handler based on the feature flag
llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(

View file

@ -73,33 +73,57 @@ def test_key_is_stable_across_equivalent_schema_dict_orderings() -> None:
assert _key(extracted_information_schema=schema_a) == _key(extracted_information_schema=schema_b)
def test_get_returns_none_on_miss() -> None:
assert extraction_cache.get("wfr_1", _key()) is None
def test_lookup_returns_miss_on_empty_cache() -> None:
    """An empty cache yields a first_call_in_run miss with no value or age."""
    res = extraction_cache.lookup("wfr_1", _key())
    assert res.hit is False
    assert res.value is None
    assert res.age_seconds is None
    assert res.fallback_reason == extraction_cache.FALLBACK_FIRST_CALL_IN_RUN
    assert res.scope == extraction_cache.SCOPE_RUN
def test_set_then_get_returns_stored_value() -> None:
def test_store_then_lookup_returns_hit_with_age() -> None:
    """A stored value comes back as a hit carrying a non-negative age."""
    key = _key()
    extraction_cache.store("wfr_1", key, {"docs": ["a.pdf"]})
    res = extraction_cache.lookup("wfr_1", key)
    assert res.hit is True
    assert res.value == {"docs": ["a.pdf"]}
    assert res.age_seconds is not None
    assert res.age_seconds >= 0.0
    assert res.fallback_reason is None
    assert res.scope == extraction_cache.SCOPE_RUN
def test_lookup_returns_key_not_found_when_run_exists_but_key_does_not() -> None:
    """A run with other entries but without this key must report key_not_found,
    not first_call_in_run — downstream metrics use this split to distinguish
    unavoidable first-call misses from potential normalization opportunities."""
    extraction_cache.store("wfr_1", _key(current_url="https://example.com/A"), {"a": 1})
    res = extraction_cache.lookup("wfr_1", _key(current_url="https://example.com/B"))
    assert res.hit is False
    assert res.value is None
    assert res.fallback_reason == extraction_cache.FALLBACK_KEY_NOT_FOUND
def test_cache_is_isolated_per_workflow_run_id() -> None:
    """Entries stored under one run must not be visible to another run."""
    shared_key = _key()
    extraction_cache.store("wfr_1", shared_key, {"docs": ["a.pdf"]})
    other_run = extraction_cache.lookup("wfr_2", shared_key)
    assert other_run.hit is False
    assert other_run.fallback_reason == extraction_cache.FALLBACK_FIRST_CALL_IN_RUN
def test_empty_workflow_run_id_bypasses_cache() -> None:
    """With no workflow_run_id the cache is bypassed entirely: lookup()
    returns None ("not applicable") rather than a miss record."""
    key = _key()
    extraction_cache.store(None, key, {"docs": ["a.pdf"]})
    assert extraction_cache.lookup(None, key) is None
def test_clear_workflow_run_drops_entries() -> None:
    """clear_workflow_run() must evict every entry stored for that run."""
    key = _key()
    extraction_cache.store("wfr_1", key, {"docs": ["a.pdf"]})
    extraction_cache.clear_workflow_run("wfr_1")
    assert extraction_cache.lookup("wfr_1", key).hit is False
def test_fifo_eviction_when_run_cache_is_full() -> None:
@ -111,23 +135,47 @@ def test_fifo_eviction_when_run_cache_is_full() -> None:
k = _key(current_url=f"https://example.com/{i}")
extraction_cache.store("wfr_1", k, {"i": i})
assert extraction_cache.get("wfr_1", first_key) is None
assert extraction_cache.lookup("wfr_1", first_key).hit is False
last_key = _key(current_url=f"https://example.com/{max_entries}")
assert extraction_cache.get("wfr_1", last_key) == {"i": max_entries}
last_result = extraction_cache.lookup("wfr_1", last_key)
assert last_result.hit is True
assert last_result.value == {"i": max_entries}
def test_store_and_get_list_result() -> None:
def test_store_and_lookup_list_result() -> None:
    """Extraction schemas with array roots produce list results — these must be cached too."""
    key = _key()
    stored = [{"doc": "a.pdf"}, {"doc": "b.pdf"}]
    extraction_cache.store("wfr_1", key, stored)
    res = extraction_cache.lookup("wfr_1", key)
    assert res.hit is True
    assert res.value == [{"doc": "a.pdf"}, {"doc": "b.pdf"}]
def test_store_and_get_string_result() -> None:
def test_store_and_lookup_string_result() -> None:
    """Some extractions return a plain string — these must be cached too."""
    key = _key()
    extraction_cache.store("wfr_1", key, "plain text extraction")
    res = extraction_cache.lookup("wfr_1", key)
    assert res.hit is True
    assert res.value == "plain text extraction"
def test_lookup_age_seconds_is_monotonic_delta(monkeypatch: pytest.MonkeyPatch) -> None:
    """age_seconds must equal the monotonic-clock delta between store() and lookup()."""
    fake_clock = [1_000.0]
    monkeypatch.setattr(extraction_cache.time, "monotonic", lambda: fake_clock[0])
    key = _key()
    extraction_cache.store("wfr_1", key, {"docs": []})
    fake_clock[0] = 1_012.5
    res = extraction_cache.lookup("wfr_1", key)
    assert res.hit is True
    assert res.age_seconds == pytest.approx(12.5)
def test_key_changes_when_local_date_changes() -> None:
@ -190,7 +238,7 @@ def test_rendered_prompt_llm_key_affects_hash() -> None:
assert k1 != k2
def test_get_refreshes_lru_position() -> None:
def test_lookup_refreshes_lru_position() -> None:
"""A cache hit should refresh the run's LRU position, preventing eviction."""
max_runs = extraction_cache._MAX_WORKFLOW_RUNS
key = _key()
@ -201,12 +249,16 @@ def test_get_refreshes_lru_position() -> None:
extraction_cache.store(f"wfr_{i}", key, {"v": i})
# Cache is at capacity. wfr_oldest is the LRU candidate.
# A get() hit should refresh its position to most-recent.
assert extraction_cache.get("wfr_oldest", key) == {"v": 0}
# A lookup() hit should refresh its position to most-recent.
refreshed = extraction_cache.lookup("wfr_oldest", key)
assert refreshed.hit is True
assert refreshed.value == {"v": 0}
# Adding one more run triggers eviction. Without the LRU refresh,
# wfr_oldest would be evicted; with it, wfr_1 (now the oldest) goes.
extraction_cache.store("wfr_new", key, {"v": 999})
assert extraction_cache.get("wfr_oldest", key) == {"v": 0}
assert extraction_cache.get("wfr_1", key) is None # evicted
oldest_after = extraction_cache.lookup("wfr_oldest", key)
assert oldest_after.hit is True
assert oldest_after.value == {"v": 0}
assert extraction_cache.lookup("wfr_1", key).hit is False # evicted