feat(extraction-cache): structured telemetry fields (SKY-8870) (#5482)

This commit is contained in:
Aaron Perez 2026-04-13 19:58:33 -05:00 committed by GitHub
parent 92b10112bd
commit 55a7966193
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 306 additions and 66 deletions

View file

@ -931,25 +931,57 @@ class RealSkyvernPageAi(SkyvernPageAi):
# Share the extract-information cache with the agent path. Best-effort
# per the RFC review — any exception falls through to the full LLM
# call below.
# call below. The `try` is narrowed to just compute_cache_key + lookup
# so a downstream log failure can't re-enter the except block and
# double-count the call as both a hit/miss and a `lookup_error` in
# the Datadog miss-reason metric.
workflow_run_id = context.workflow_run_id if context else None
cache_key: str | None = None
lookup_result: extraction_cache.LookupResult | None = None
try:
cache_key = extraction_cache.compute_cache_key(
rendered_prompt=extract_information_prompt,
llm_key=None,
)
cached = extraction_cache.get(workflow_run_id, cache_key)
if cached is not None:
LOG.info(
"ai_extract cache hit — skipping LLM call",
workflow_run_id=workflow_run_id,
cache_key=cache_key,
)
return cached # type: ignore[return-value]
lookup_result = extraction_cache.lookup(workflow_run_id, cache_key)
except Exception:
LOG.warning("ai_extract cache lookup failed; falling through to LLM", exc_info=True)
cache_key = None
LOG.warning(
"ai_extract cache lookup failed; falling through to LLM",
workflow_run_id=workflow_run_id,
cache_key=cache_key,
cache_hit=False,
cache_scope=extraction_cache.SCOPE_RUN,
cache_age_seconds=None,
fallback_reason=extraction_cache.FALLBACK_LOOKUP_ERROR,
cache_path="script",
exc_info=True,
)
# Preserve cache_key so the downstream store() can still warm the cache
# for subsequent identical calls even when lookup() fails transiently.
if lookup_result is not None and lookup_result.hit:
LOG.info(
"ai_extract cache hit — skipping LLM call",
workflow_run_id=workflow_run_id,
cache_key=cache_key,
cache_hit=True,
cache_scope=lookup_result.scope,
cache_age_seconds=lookup_result.age_seconds,
fallback_reason=None,
cache_path="script",
)
return lookup_result.value # type: ignore[return-value]
if lookup_result is not None:
LOG.info(
"ai_extract cache miss",
workflow_run_id=workflow_run_id,
cache_key=cache_key,
cache_hit=False,
cache_scope=lookup_result.scope,
cache_age_seconds=None,
fallback_reason=lookup_result.fallback_reason,
cache_path="script",
)
step = None
if context and context.organization_id and context.task_id and context.step_id:
step = await app.DATABASE.tasks.get_step(

View file

@ -4968,32 +4968,71 @@ class ForgeAgent:
# Cache the summary LLM call — the inputs (goal, schema, URL) are
# identical across download-loop iterations that revisit the same page.
# The `try` is narrowed to just compute_cache_key + lookup so a
# downstream log failure can't double-count as a lookup_error.
workflow_run_id = context.workflow_run_id if context else None
cache_key: str | None = None
cached = None
lookup_result: extraction_cache.LookupResult | None = None
try:
cache_key = extraction_cache.compute_cache_key(
rendered_prompt=prompt,
llm_key=None,
)
cached = extraction_cache.get(workflow_run_id, cache_key)
lookup_result = extraction_cache.lookup(workflow_run_id, cache_key)
except Exception:
LOG.warning("data-extraction-summary cache lookup failed", exc_info=True)
LOG.warning(
"data-extraction-summary cache lookup failed",
workflow_run_id=workflow_run_id,
cache_key=cache_key,
cache_hit=False,
cache_scope=extraction_cache.SCOPE_RUN,
cache_age_seconds=None,
fallback_reason=extraction_cache.FALLBACK_LOOKUP_ERROR,
cache_path="agent",
exc_info=True,
)
# Preserve cache_key so the store() below can still warm the cache
# for subsequent identical calls even when lookup() fails transiently.
if cached is not None:
if lookup_result is not None and lookup_result.hit:
LOG.info(
"data-extraction-summary cache hit — skipping LLM call",
workflow_run_id=workflow_run_id,
cache_key=cache_key,
cache_hit=True,
cache_scope=lookup_result.scope,
cache_age_seconds=lookup_result.age_seconds,
fallback_reason=None,
cache_path="agent",
)
data_extraction_summary_resp = cached
data_extraction_summary_resp = lookup_result.value
else:
if lookup_result is not None:
LOG.info(
"data-extraction-summary cache miss",
workflow_run_id=workflow_run_id,
cache_key=cache_key,
cache_hit=False,
cache_scope=lookup_result.scope,
cache_age_seconds=None,
fallback_reason=lookup_result.fallback_reason,
cache_path="agent",
)
data_extraction_summary_resp = await app.EXTRACTION_LLM_API_HANDLER(
prompt=prompt, step=step, prompt_name="data-extraction-summary"
)
if cache_key:
# Guard on both cache_key and response to match the other two call sites
# and avoid caching None — a cached None would later produce hit=True/value=None,
# which would then trip the RuntimeError below instead of falling through to a
# fresh LLM call as the pre-telemetry get()-returns-None path did.
if cache_key and data_extraction_summary_resp is not None:
extraction_cache.store(workflow_run_id, cache_key, data_extraction_summary_resp)
if data_extraction_summary_resp is None:
raise RuntimeError(
"data_extraction_summary_resp unexpectedly None after cache/LLM block "
f"(workflow_run_id={workflow_run_id!r}, cache_key={cache_key!r})"
)
return ExtractAction(
reasoning=data_extraction_summary_resp.get("summary", "Extracting information from the page"),
data_extraction_goal=task.data_extraction_goal,

View file

@ -45,7 +45,9 @@ from __future__ import annotations
import hashlib
import json
import re
import time
from collections import OrderedDict
from dataclasses import dataclass
from typing import Any
import structlog
@ -57,8 +59,51 @@ _MAX_WORKFLOW_RUNS = 256
# Sentinel hashed in place of None so that None and "" produce different keys.
_NULL_SENTINEL = "\x00__NULL__"
# workflow_run_id -> ordered dict of {cache_key: extraction_result}
_CACHE: OrderedDict[str, OrderedDict[str, Any]] = OrderedDict()
# Cache scope identifiers. v1 ships with "run" only; "wpid" and "global"
# are reserved for the Redis cross-run cache (SKY-8873/SKY-8874).
SCOPE_RUN = "run"
# Fallback reasons emitted on cache misses. Used by log-based metrics to
# distinguish first-call-in-run (unavoidable) from key-not-found (possible
# normalization opportunity) from lookup_error (bug or infra issue).
FALLBACK_FIRST_CALL_IN_RUN = "first_call_in_run"
FALLBACK_KEY_NOT_FOUND = "key_not_found"
# Reserved for the TTL-backed Redis cache in v4 (SKY-8874). Never emitted in v1.
FALLBACK_TTL_EXPIRED = "ttl_expired"
FALLBACK_LOOKUP_ERROR = "lookup_error"
@dataclass(frozen=True)
class _CacheEntry:
"""Internal wrapper storing a cached value alongside its insertion time.
`stored_at` is a monotonic clock reading, used only for computing the
`cache_age_seconds` field reported on cache hits.
"""
value: Any
stored_at: float
@dataclass(frozen=True)
class LookupResult:
    """Structured result of a cache lookup, carrying telemetry metadata.

    Hit:  ``hit=True``, ``value`` is the cached result, ``age_seconds`` is
    the elapsed seconds since ``store``, and ``fallback_reason`` is None.
    Miss: ``hit=False``, ``value`` and ``age_seconds`` are both None, and
    ``fallback_reason`` identifies why the lookup missed.
    """

    hit: bool
    value: Any | None
    age_seconds: float | None
    fallback_reason: str | None
    # Always SCOPE_RUN in v1; other scopes are reserved for the Redis cache.
    scope: str
# workflow_run_id -> ordered dict of {cache_key: _CacheEntry}
_CACHE: OrderedDict[str, OrderedDict[str, _CacheEntry]] = OrderedDict()
# Simple hit/miss counters for post-deploy observability.
_hits = 0
@ -161,28 +206,66 @@ def compute_cache_key(
return hashlib.sha256(joined).hexdigest()
def get(workflow_run_id: str | None, cache_key: str) -> Any | None:
"""Return a cached extraction result, or None on miss."""
global _hits, _misses # noqa: PLW0603
def _miss(fallback_reason: str) -> LookupResult:
    """Record a run-scoped cache miss and build the matching ``LookupResult``.

    Sole owner of ``_misses`` mutations — ``lookup()`` routes every miss
    through here so the counter is never bumped without also ticking the
    hit-rate logger.
    """
    global _misses  # noqa: PLW0603
    _misses += 1
    _maybe_log_hit_rate()
    return LookupResult(
        hit=False,
        value=None,
        age_seconds=None,
        fallback_reason=fallback_reason,
        scope=SCOPE_RUN,
    )
def lookup(workflow_run_id: str | None, cache_key: str) -> LookupResult | None:
    """Look up a cached extraction result, returning structured telemetry.

    Returns a :class:`LookupResult` on a genuine hit or miss, or ``None``
    when the cache path is bypassed entirely (no ``workflow_run_id``).
    Call sites should treat ``None`` as "cache not applicable here" — no
    hit/miss log should be emitted and no metric counter bumped. ``None``
    is intentionally distinct from a miss so that Datadog dashboards are
    not inflated with non-actionable zero-run lookups.

    On a genuine miss, the returned ``LookupResult`` carries
    :attr:`~LookupResult.fallback_reason` so log-based metrics can
    distinguish first-call-in-run (unavoidable) from key-not-found
    (possible normalization opportunity) from lookup-error (infra issue).
    """
    global _hits  # noqa: PLW0603
    if not workflow_run_id:
        # No run id: cache not applicable — callers emit no hit/miss telemetry.
        return None

    run_cache = _CACHE.get(workflow_run_id)
    if not run_cache:
        return _miss(FALLBACK_FIRST_CALL_IN_RUN)

    entry = run_cache.get(cache_key)
    if entry is None:
        return _miss(FALLBACK_KEY_NOT_FOUND)

    _hits += 1
    # Refresh LRU position so actively-read runs aren't evicted.
    _CACHE.move_to_end(workflow_run_id)
    _maybe_log_hit_rate()
    # Clamp to zero to guard against monotonic clock edge cases.
    elapsed = max(0.0, time.monotonic() - entry.stored_at)
    return LookupResult(
        hit=True,
        value=entry.value,
        age_seconds=elapsed,
        fallback_reason=None,
        scope=SCOPE_RUN,
    )
def _maybe_log_hit_rate() -> None:
@ -210,7 +293,7 @@ def store(workflow_run_id: str | None, cache_key: str, result: Any) -> None:
_CACHE[workflow_run_id] = run_cache
if cache_key in run_cache:
run_cache.move_to_end(cache_key)
run_cache[cache_key] = result
run_cache[cache_key] = _CacheEntry(value=result, stored_at=time.monotonic())
while len(run_cache) > _MAX_ENTRIES_PER_RUN:
evicted_key, _ = run_cache.popitem(last=False)
LOG.debug(

View file

@ -4288,25 +4288,59 @@ async def extract_information_for_navigation_goal(
local_datetime=local_datetime_str,
)
# Best-effort cache lookup — any failure falls through to LLM.
# Best-effort cache lookup — any failure falls through to LLM. The `try`
# is narrowed to just compute_cache_key + lookup so a downstream log
# failure can't re-enter the except block and double-count the call as
# both a hit/miss and a `lookup_error` in the Datadog miss-reason metric.
cache_key: str | None = None
lookup_result: extraction_cache.LookupResult | None = None
try:
cache_key = extraction_cache.compute_cache_key(
rendered_prompt=extract_information_prompt,
llm_key=llm_key_override,
)
cached = extraction_cache.get(task.workflow_run_id, cache_key)
if cached is not None:
LOG.info(
"extract_information cache hit — skipping LLM call",
task_id=task.task_id,
workflow_run_id=task.workflow_run_id,
cache_key=cache_key,
)
return ScrapeResult(scraped_data=cached)
lookup_result = extraction_cache.lookup(task.workflow_run_id, cache_key)
except Exception:
LOG.warning("extract_information cache lookup failed; falling through to LLM", exc_info=True)
cache_key = None
LOG.warning(
"extract_information cache lookup failed; falling through to LLM",
task_id=task.task_id,
workflow_run_id=task.workflow_run_id,
cache_key=cache_key,
cache_hit=False,
cache_scope=extraction_cache.SCOPE_RUN,
cache_age_seconds=None,
fallback_reason=extraction_cache.FALLBACK_LOOKUP_ERROR,
cache_path="agent",
exc_info=True,
)
# Preserve cache_key so the downstream store() can still warm the cache
# for subsequent identical calls even when lookup() fails transiently.
if lookup_result is not None and lookup_result.hit:
LOG.info(
"extract_information cache hit — skipping LLM call",
task_id=task.task_id,
workflow_run_id=task.workflow_run_id,
cache_key=cache_key,
cache_hit=True,
cache_scope=lookup_result.scope,
cache_age_seconds=lookup_result.age_seconds,
fallback_reason=None,
cache_path="agent",
)
return ScrapeResult(scraped_data=lookup_result.value)
if lookup_result is not None:
LOG.info(
"extract_information cache miss",
task_id=task.task_id,
workflow_run_id=task.workflow_run_id,
cache_key=cache_key,
cache_hit=False,
cache_scope=lookup_result.scope,
cache_age_seconds=None,
fallback_reason=lookup_result.fallback_reason,
cache_path="agent",
)
# Use the appropriate LLM handler based on the feature flag
llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(

View file

@ -73,33 +73,57 @@ def test_key_is_stable_across_equivalent_schema_dict_orderings() -> None:
assert _key(extracted_information_schema=schema_a) == _key(extracted_information_schema=schema_b)
def test_get_returns_none_on_miss() -> None:
assert extraction_cache.get("wfr_1", _key()) is None
def test_lookup_returns_miss_on_empty_cache() -> None:
    """An empty cache yields a first_call_in_run miss with no value or age."""
    res = extraction_cache.lookup("wfr_1", _key())
    assert res.hit is False
    assert res.value is None
    assert res.age_seconds is None
    assert res.fallback_reason == extraction_cache.FALLBACK_FIRST_CALL_IN_RUN
    assert res.scope == extraction_cache.SCOPE_RUN
def test_set_then_get_returns_stored_value() -> None:
def test_store_then_lookup_returns_hit_with_age() -> None:
    """A stored value comes back as a hit carrying a non-negative age."""
    key = _key()
    extraction_cache.store("wfr_1", key, {"docs": ["a.pdf"]})
    res = extraction_cache.lookup("wfr_1", key)
    assert res.hit is True
    assert res.value == {"docs": ["a.pdf"]}
    assert res.age_seconds is not None
    assert res.age_seconds >= 0.0
    assert res.fallback_reason is None
    assert res.scope == extraction_cache.SCOPE_RUN
def test_lookup_returns_key_not_found_when_run_exists_but_key_does_not() -> None:
    """A run with other entries but without this key must report key_not_found,
    not first_call_in_run — downstream metrics use this split to distinguish
    unavoidable first-call misses from potential normalization opportunities."""
    extraction_cache.store("wfr_1", _key(current_url="https://example.com/A"), {"a": 1})
    res = extraction_cache.lookup("wfr_1", _key(current_url="https://example.com/B"))
    assert res.hit is False
    assert res.value is None
    assert res.fallback_reason == extraction_cache.FALLBACK_KEY_NOT_FOUND
def test_cache_is_isolated_per_workflow_run_id() -> None:
    """Entries stored under one run must not be visible to another run."""
    shared_key = _key()
    extraction_cache.store("wfr_1", shared_key, {"docs": ["a.pdf"]})
    other_run = extraction_cache.lookup("wfr_2", shared_key)
    assert other_run.hit is False
    assert other_run.fallback_reason == extraction_cache.FALLBACK_FIRST_CALL_IN_RUN
def test_empty_workflow_run_id_bypasses_cache() -> None:
    """With no workflow_run_id the cache is bypassed entirely: lookup()
    returns None ("not applicable") rather than a miss record."""
    key = _key()
    extraction_cache.store(None, key, {"docs": ["a.pdf"]})
    assert extraction_cache.lookup(None, key) is None
def test_clear_workflow_run_drops_entries() -> None:
    """clear_workflow_run() must evict every entry stored for that run."""
    key = _key()
    extraction_cache.store("wfr_1", key, {"docs": ["a.pdf"]})
    extraction_cache.clear_workflow_run("wfr_1")
    assert extraction_cache.lookup("wfr_1", key).hit is False
def test_fifo_eviction_when_run_cache_is_full() -> None:
@ -111,23 +135,47 @@ def test_fifo_eviction_when_run_cache_is_full() -> None:
k = _key(current_url=f"https://example.com/{i}")
extraction_cache.store("wfr_1", k, {"i": i})
assert extraction_cache.get("wfr_1", first_key) is None
assert extraction_cache.lookup("wfr_1", first_key).hit is False
last_key = _key(current_url=f"https://example.com/{max_entries}")
assert extraction_cache.get("wfr_1", last_key) == {"i": max_entries}
last_result = extraction_cache.lookup("wfr_1", last_key)
assert last_result.hit is True
assert last_result.value == {"i": max_entries}
def test_store_and_get_list_result() -> None:
def test_store_and_lookup_list_result() -> None:
    """Extraction schemas with array roots produce list results — these must be cached too."""
    key = _key()
    stored = [{"doc": "a.pdf"}, {"doc": "b.pdf"}]
    extraction_cache.store("wfr_1", key, stored)
    res = extraction_cache.lookup("wfr_1", key)
    assert res.hit is True
    assert res.value == [{"doc": "a.pdf"}, {"doc": "b.pdf"}]
def test_store_and_get_string_result() -> None:
def test_store_and_lookup_string_result() -> None:
    """Some extractions return a plain string — these must be cached too."""
    key = _key()
    extraction_cache.store("wfr_1", key, "plain text extraction")
    res = extraction_cache.lookup("wfr_1", key)
    assert res.hit is True
    assert res.value == "plain text extraction"
def test_lookup_age_seconds_is_monotonic_delta(monkeypatch: pytest.MonkeyPatch) -> None:
    """age_seconds must equal the monotonic-clock delta between store() and lookup()."""
    fake_clock = [1_000.0]
    monkeypatch.setattr(extraction_cache.time, "monotonic", lambda: fake_clock[0])
    key = _key()
    extraction_cache.store("wfr_1", key, {"docs": []})
    fake_clock[0] = 1_012.5
    res = extraction_cache.lookup("wfr_1", key)
    assert res.hit is True
    assert res.age_seconds == pytest.approx(12.5)
def test_key_changes_when_local_date_changes() -> None:
@ -190,7 +238,7 @@ def test_rendered_prompt_llm_key_affects_hash() -> None:
assert k1 != k2
def test_get_refreshes_lru_position() -> None:
def test_lookup_refreshes_lru_position() -> None:
"""A cache hit should refresh the run's LRU position, preventing eviction."""
max_runs = extraction_cache._MAX_WORKFLOW_RUNS
key = _key()
@ -201,12 +249,16 @@ def test_get_refreshes_lru_position() -> None:
extraction_cache.store(f"wfr_{i}", key, {"v": i})
# Cache is at capacity. wfr_oldest is the LRU candidate.
# A get() hit should refresh its position to most-recent.
assert extraction_cache.get("wfr_oldest", key) == {"v": 0}
# A lookup() hit should refresh its position to most-recent.
refreshed = extraction_cache.lookup("wfr_oldest", key)
assert refreshed.hit is True
assert refreshed.value == {"v": 0}
# Adding one more run triggers eviction. Without the LRU refresh,
# wfr_oldest would be evicted; with it, wfr_1 (now the oldest) goes.
extraction_cache.store("wfr_new", key, {"v": 999})
assert extraction_cache.get("wfr_oldest", key) == {"v": 0}
assert extraction_cache.get("wfr_1", key) is None # evicted
oldest_after = extraction_cache.lookup("wfr_oldest", key)
assert oldest_after.hit is True
assert oldest_after.value == {"v": 0}
assert extraction_cache.lookup("wfr_1", key).hit is False # evicted