mirror of
https://github.com/Skyvern-AI/skyvern.git
synced 2026-04-28 03:30:10 +00:00
217 lines
8.4 KiB
Python
217 lines
8.4 KiB
Python
from __future__ import annotations
|
|
|
|
|
|
def classify_from_failure_reason(
    failure_reason: str | None,
    exception: Exception | None = None,
    fallback_to_unknown: bool = False,
) -> list[dict] | None:
    """Classify a failure from its failure_reason text and/or exception type.

    Matching is plain substring search: keywords are looked up in the
    lower-cased ``failure_reason`` and (case-sensitively) in the exception's
    class name. Several categories may match the same failure.

    Taxonomy (16):
        ANTI_BOT_DETECTION, PROXY_ERROR, BROWSER_ERROR, NAVIGATION_FAILURE,
        PAGE_LOAD_TIMEOUT, AUTH_FAILURE, LLM_ERROR, CREDENTIAL_ERROR,
        DATA_EXTRACTION_FAILURE, ELEMENT_NOT_FOUND, WRONG_PAGE_STATE,
        MAX_STEPS_EXCEEDED, LLM_REASONING_ERROR, INFRASTRUCTURE_ERROR,
        PARAMETER_BINDING_ERROR, UNKNOWN

    Note: no rule in this function emits INFRASTRUCTURE_ERROR.

    Args:
        failure_reason: Free-text description of the failure, or None.
        exception: The exception that caused the failure, if any. Only its
            class name is inspected.
        fallback_to_unknown: When True and no rule matches, return a single
            UNKNOWN category instead of None. Use True for paths that are
            *always* failures (exception, max_steps, max_retries). Use False
            (the default) for terminate paths where the absence of a
            classification may simply mean the termination was user-guided /
            expected.

    Returns:
        A list of ``{"category", "confidence_float", "reasoning"}`` dicts sorted
        by confidence (highest first; ties keep rule order), or None if nothing
        matched. None is also returned, regardless of ``fallback_to_unknown``,
        when there is neither a failure reason nor an exception to inspect.
    """
    if not failure_reason and exception is None:
        return None

    reason = (failure_reason or "").lower()
    exc_name = type(exception).__name__ if exception is not None else ""

    categories: list[dict] = []

    def reason_has(*keywords: str) -> bool:
        """Return True if any keyword occurs in the lower-cased failure reason."""
        return any(kw in reason for kw in keywords)

    def exc_has(*fragments: str) -> bool:
        """Return True if any fragment occurs in the exception class name."""
        return any(fragment in exc_name for fragment in fragments)

    def add(
        category: str,
        confidence: float,
        *,
        from_exception: bool = False,
        keyword_reasoning: str = "Keywords matched",
    ) -> None:
        """Record a match, attributing it to the exception only if it triggered the rule."""
        categories.append(
            {
                "category": category,
                "confidence_float": confidence,
                "reasoning": f"Exception: {exc_name}" if from_exception else keyword_reasoning,
            }
        )

    # Bot detection / CAPTCHA — use specific phrases to avoid false positives
    has_auth_context = reason_has("login", "auth", "password", "permission", "credential")
    antibot_keywords = [
        "captcha",
        "cloudflare",
        "bot detect",
        "bot block",
        "ip block",
        "request block",
        "anti-bot",
        "human verification",
    ]
    # "access denied" is ambiguous: it can be bot blocking OR auth failure.
    # Only treat it as bot detection when there are no auth-related keywords nearby.
    # Note: in Skyvern's context, failure_reason is LLM-generated from page observations,
    # so RBAC-style messages like "Access denied: insufficient privileges" are unlikely.
    # If this becomes a false-positive source, consider further narrowing (e.g. requiring
    # that "access denied" appears without ANY qualifier, or adding more exclusion keywords).
    if not has_auth_context:
        antibot_keywords.append("access denied")

    if reason_has(*antibot_keywords):
        add("ANTI_BOT_DETECTION", 0.7, keyword_reasoning="Keywords matched in failure reason")

    # Proxy errors — check before browser errors so proxy failures don't fall into BROWSER_ERROR.
    # The exception name may contain "Browser" (e.g. UnknownErrorWhileCreatingBrowserContext) but the
    # root cause is proxy pool exhaustion.
    proxy_exc = exc_has("NoProxy", "ProxyError")
    browser_exc = exc_has("Browser", "CDP", "TargetClosed")
    if proxy_exc or reason_has("no proxy available", "proxy unavailable"):
        add("PROXY_ERROR", 0.9, from_exception=proxy_exc)
    # Browser errors — only match if not already classified as PROXY_ERROR above
    elif browser_exc or reason_has("browser context closed", "page closed", "browser crash"):
        add("BROWSER_ERROR", 0.9, from_exception=browser_exc)

    # Navigation failure
    navigation_exc = exc_has("FailedToNavigateToUrl")
    if navigation_exc or reason_has("failed to navigate", "404", "redirect loop"):
        add("NAVIGATION_FAILURE", 0.9, from_exception=navigation_exc)

    # Page load timeout
    timeout_exc = exc_has("Timeout")
    if timeout_exc or reason_has("timeout"):
        add("PAGE_LOAD_TIMEOUT", 0.8, from_exception=timeout_exc, keyword_reasoning="Timeout in failure reason")

    # Auth failure — also catches "access denied" when auth context is present
    if reason_has("login fail", "authentication fail", "auth fail", "mfa", "password") or (
        has_auth_context and reason_has("access denied")
    ):
        add("AUTH_FAILURE", 0.7)

    # Credential error
    credential_exc = exc_has("Bitwarden")
    if credential_exc or reason_has("credential not found", "missing credential"):
        add("CREDENTIAL_ERROR", 0.8, from_exception=credential_exc)

    # LLM error
    llm_exc = exc_has("LLM", "APIError", "RateLimit")
    if llm_exc or reason_has("rate limit"):
        add("LLM_ERROR", 0.9, from_exception=llm_exc)

    # Scraping / data extraction failure
    scraping_exc = exc_has("ScrapingFailed")
    if scraping_exc or reason_has("scraping", "extraction fail", "empty extraction"):
        add("DATA_EXTRACTION_FAILURE", 0.7, from_exception=scraping_exc)

    # Element not found
    element_exc = exc_has("ElementNotFound")
    if element_exc or reason_has("element not found", "no matching element"):
        add("ELEMENT_NOT_FOUND", 0.8, from_exception=element_exc)

    # Wrong page state
    if reason_has("unexpected page", "wrong page", "blank page"):
        add("WRONG_PAGE_STATE", 0.6)

    # Max steps exceeded
    if reason_has("max steps", "maximum steps", "max number of", "step limit"):
        add("MAX_STEPS_EXCEEDED", 0.9)

    # LLM reasoning error (wrong action, hallucination)
    if reason_has("wrong action", "invalid action", "hallucin"):
        add("LLM_REASONING_ERROR", 0.6)

    # Internal configuration mismatch — not a site/selector failure.
    if reason_has(
        "should have already been set through workflow run parameters",
        "should have already been set through workflow run context init",
        "pre-run invariant: workflow_definition and persisted parameter rows disagree",
    ):
        add("PARAMETER_BINDING_ERROR", 0.95)

    if not categories:
        if fallback_to_unknown:
            return [{"category": "UNKNOWN", "confidence_float": 0.5, "reasoning": "No keyword match found"}]
        return None

    # Sort by confidence descending; list.sort is stable, so ties keep rule order.
    categories.sort(key=lambda entry: entry["confidence_float"], reverse=True)
    return categories
|