fix: fail immediately on non-retriable navigation errors (SKY-8875) (#4976)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Andrew Neilson 2026-04-13 10:53:30 -07:00 committed by GitHub
parent fbc8e2245d
commit ef1665bfe4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 283 additions and 76 deletions

View file

@ -16,6 +16,18 @@ DOWNLOAD_FILE_PREFIX = "downloads"
SAVE_DOWNLOADED_FILES_TIMEOUT = 180
GET_DOWNLOADED_FILES_TIMEOUT = 30
NAVIGATION_MAX_RETRY_TIME = 5
PERMANENT_NAV_ERRORS = ("net::ERR_INVALID_URL",)
PROXY_SENSITIVE_NAV_ERRORS = (
"net::ERR_NAME_NOT_RESOLVED",
"net::ERR_NAME_RESOLUTION_FAILED",
"net::ERR_CERT_",
"net::ERR_SSL_",
)
# Errors that should not be retried within the same browser context/proxy.
# The outer context-recreation retry in get_or_create_page may still attempt
# recovery for proxy-sensitive errors by picking a different proxy node.
SKIP_INNER_NAV_RETRY_ERRORS = PERMANENT_NAV_ERRORS + PROXY_SENSITIVE_NAV_ERRORS
AUTO_COMPLETION_POTENTIAL_VALUES_COUNT = 3
DROPDOWN_MENU_MAX_DISTANCE = 100
BROWSER_DOWNLOADING_SUFFIX = ".crdownload"

View file

@ -904,7 +904,9 @@ class BaseTaskBlock(Block):
# self.block_type (which does not change between retries).
warn_if_file_download_max_steps_low(self, workflow_run_id=workflow_run_id)
_is_file_download = self.block_type == BlockType.FILE_DOWNLOAD
_navigate_wait_until = "domcontentloaded" if _is_file_download else "load"
_navigate_wait_until: Literal["load", "domcontentloaded", "commit"] = (
"domcontentloaded" if _is_file_download else "load"
)
# TODO (kerem) we should always retry on terminated. We should make a distinction between retriable and
# non-retryable terminations

View file

@ -1136,12 +1136,8 @@ def _parse_single_action(action_str: str) -> dict[str, Any] | None:
# Get arguments
action_inputs = {}
for kw in call.keywords:
if kw.arg and isinstance(kw.value, (ast.Constant, ast.Str)):
if isinstance(kw.value, ast.Constant):
value = kw.value.value
else: # ast.Str for older Python versions
value = kw.value.s
action_inputs[kw.arg] = value
if kw.arg and isinstance(kw.value, ast.Constant):
action_inputs[kw.arg] = kw.value.value
return {
"action_type": func_name,

View file

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import Protocol
from typing import Literal, Protocol
from playwright.async_api import BrowserContext, Page, Playwright
@ -43,7 +43,7 @@ class BrowserState(Protocol):
page: Page,
url: str,
retry_times: int = NAVIGATION_MAX_RETRY_TIME,
wait_until: str = "load",
wait_until: Literal["load", "domcontentloaded", "commit"] = "load",
) -> None: ...
async def get_or_create_page(

View file

@ -0,0 +1,86 @@
from __future__ import annotations
import asyncio
import time
from collections.abc import Awaitable, Callable
from typing import Literal
import structlog
from skyvern.constants import PERMANENT_NAV_ERRORS, SKIP_INNER_NAV_RETRY_ERRORS
from skyvern.exceptions import FailedToNavigateToUrl
LOG = structlog.get_logger()
NavigateFunc = Callable[[str], Awaitable[object]]
SettleFunc = Callable[[], Awaitable[None]]
SleepFunc = Callable[[float], Awaitable[None]]
# Progressive wait_until degradation. Degrading to `domcontentloaded` and
# then `commit` lets navigation succeed once the DOM or response is ready.
_DEGRADATION_MAP: dict[str, list[str]] = {
"load": ["load", "domcontentloaded", "commit"],
"domcontentloaded": ["domcontentloaded", "commit"],
"commit": ["commit"],
}
def is_skip_inner_retry_error(error_message: str) -> bool:
return any(pattern in error_message for pattern in SKIP_INNER_NAV_RETRY_ERRORS)
def is_permanent_navigation_error(error_message: str) -> bool:
return any(pattern in error_message for pattern in PERMANENT_NAV_ERRORS)
async def navigate_with_retry(
navigate: NavigateFunc,
url: str,
retry_times: int,
settle: SettleFunc,
wait_until: Literal["load", "domcontentloaded", "commit"] = "load",
sleep: SleepFunc = asyncio.sleep,
) -> None:
degradation = _DEGRADATION_MAP.get(wait_until, [wait_until])
for attempt in range(retry_times):
strategy = degradation[min(attempt, len(degradation) - 1)]
LOG.info("Trying to navigate to url", url=url, retry_time=attempt, wait_until=strategy)
try:
start_time = time.monotonic()
await navigate(strategy)
elapsed = time.monotonic() - start_time
LOG.info("Page loading time", loading_time=elapsed, url=url, wait_until=strategy)
await settle()
LOG.info("Successfully navigated to url", url=url, retry_time=attempt, wait_until=strategy)
return
except Exception as error:
error_str = str(error)
if is_skip_inner_retry_error(error_str):
LOG.warning(
"Non-retriable navigation error, failing immediately",
url=url,
error=error_str,
)
raise FailedToNavigateToUrl(url=url, error_message=error_str) from error
if attempt >= retry_times - 1:
LOG.exception(
"Failed to navigate after retries",
url=url,
retry_times=retry_times,
error=error_str,
)
raise FailedToNavigateToUrl(url=url, error_message=error_str) from error
LOG.warning(
"Error while navigating to url, retrying",
exc_info=True,
url=url,
retry_time=attempt,
wait_until=strategy,
error=error_str,
)
await sleep(1)

View file

@ -3,6 +3,7 @@ from __future__ import annotations
import asyncio
import random
import time
from typing import Literal
from urllib.parse import urlparse
import structlog
@ -21,6 +22,7 @@ from skyvern.schemas.runs import ProxyLocationInput
from skyvern.webeye.browser_artifacts import BrowserArtifacts, VideoArtifact
from skyvern.webeye.browser_factory import BrowserCleanupFunc, BrowserContextFactory
from skyvern.webeye.browser_state import BrowserState
from skyvern.webeye.navigation import is_permanent_navigation_error, navigate_with_retry
from skyvern.webeye.scraper import scraper
from skyvern.webeye.scraper.scraped_page import CleanupElementTreeFunc, ScrapedPage, ScrapeExcludeFunc
from skyvern.webeye.utils.page import ScreenshotMode, SkyvernFrame
@ -135,71 +137,15 @@ class RealBrowserState(BrowserState):
page: Page,
url: str,
retry_times: int = NAVIGATION_MAX_RETRY_TIME,
wait_until: str = "load",
wait_until: Literal["load", "domcontentloaded", "commit"] = "load",
) -> None:
# SKY-8818: progressive wait_until degradation. Many pages never fire
# `load` because a subresource stalls; degrading to `domcontentloaded` and
# then `commit` lets navigation succeed once the DOM or response is ready.
# Monotonic: a retry never upgrades to a STRONGER wait state than the caller asked for.
_degradation_map: dict[str, list[str]] = {
"load": ["load", "domcontentloaded", "commit"],
"domcontentloaded": ["domcontentloaded", "commit"],
"commit": ["commit"],
}
degradation = _degradation_map.get(wait_until, [wait_until])
try:
for retry_time in range(retry_times):
strategy = degradation[min(retry_time, len(degradation) - 1)]
LOG.info(
"Trying to navigate to url",
url=url,
retry_time=retry_time,
wait_until=strategy,
)
try:
start_time = time.time()
await page.goto(
url,
timeout=settings.BROWSER_LOADING_TIMEOUT_MS,
wait_until=strategy,
)
end_time = time.time()
LOG.info(
"Page loading time",
loading_time=end_time - start_time,
url=url,
wait_until=strategy,
)
await self._wait_for_settle()
LOG.info(
"Successfully navigated to url",
url=url,
retry_time=retry_time,
wait_until=strategy,
)
return
except Exception as e:
if retry_time >= retry_times - 1:
raise FailedToNavigateToUrl(url=url, error_message=str(e))
LOG.warning(
f"Error while navigating to url: {str(e)}",
exc_info=True,
url=url,
retry_time=retry_time,
wait_until=strategy,
)
# Wait for 1 seconds before retrying
await asyncio.sleep(1)
except Exception as e:
LOG.exception(
f"Failed to navigate to {url} after {retry_times} retries: {str(e)}",
url=url,
)
raise e
await navigate_with_retry(
navigate=lambda strategy: page.goto(url, timeout=settings.BROWSER_LOADING_TIMEOUT_MS, wait_until=strategy),
url=url,
retry_times=retry_times,
settle=self._wait_for_settle,
wait_until=wait_until,
)
async def get_working_page(self) -> Page | None:
# HACK: currently, assuming the last page is always the working page.
@ -348,12 +294,14 @@ class RealBrowserState(BrowserState):
browser_profile_id=browser_profile_id,
)
except Exception as e:
error_message = str(e)
error_message = e.error_message if isinstance(e, FailedToNavigateToUrl) else str(e)
if is_permanent_navigation_error(error_message):
raise
if "net::ERR" not in error_message:
raise e
raise
if not await self.close_current_open_page():
LOG.warning("Failed to close the current open page")
raise e
raise
await self.check_and_fix_state(
url=url,
proxy_location=proxy_location,

View file

View file

@ -0,0 +1,163 @@
from __future__ import annotations
from unittest.mock import AsyncMock
import pytest
from skyvern.exceptions import FailedToNavigateToUrl
from skyvern.webeye.navigation import navigate_with_retry
from skyvern.webeye.real_browser_state import RealBrowserState
@pytest.mark.parametrize(
"error_message",
[
pytest.param("net::ERR_NAME_NOT_RESOLVED", id="dns-not-resolved"),
pytest.param("net::ERR_NAME_RESOLUTION_FAILED", id="dns-resolution-failed"),
pytest.param("net::ERR_INVALID_URL", id="invalid-url"),
pytest.param("net::ERR_CERT_AUTHORITY_INVALID", id="cert-authority-invalid"),
pytest.param("net::ERR_CERT_DATE_INVALID", id="cert-date-invalid"),
pytest.param("net::ERR_SSL_PROTOCOL_ERROR", id="ssl-protocol-error"),
],
)
@pytest.mark.asyncio
async def test_skip_inner_retry_error_fails_immediately(error_message: str) -> None:
page = AsyncMock()
page.goto = AsyncMock(side_effect=Exception(error_message))
settle = AsyncMock()
sleep = AsyncMock()
with pytest.raises(FailedToNavigateToUrl):
await navigate_with_retry(
navigate=lambda strategy: page.goto("http://example.invalid", timeout=30000, wait_until=strategy),
url="http://example.invalid",
retry_times=5,
settle=settle,
sleep=sleep,
)
assert page.goto.call_count == 1
settle.assert_not_awaited()
sleep.assert_not_awaited()
@pytest.mark.parametrize(
"error_message, retry_times",
[
pytest.param("net::ERR_TIMED_OUT", 3, id="timeout"),
pytest.param("net::ERR_CONNECTION_RESET", 2, id="connection-reset"),
],
)
@pytest.mark.asyncio
async def test_retriable_error_exhausts_all_attempts(
error_message: str,
retry_times: int,
) -> None:
page = AsyncMock()
page.goto = AsyncMock(side_effect=Exception(error_message))
settle = AsyncMock()
sleep = AsyncMock()
with pytest.raises(FailedToNavigateToUrl):
await navigate_with_retry(
navigate=lambda strategy: page.goto("http://example.com", timeout=30000, wait_until=strategy),
url="http://example.com",
retry_times=retry_times,
settle=settle,
sleep=sleep,
)
assert page.goto.call_count == retry_times
assert sleep.await_count == retry_times - 1
settle.assert_not_awaited()
@pytest.mark.asyncio
async def test_transient_error_recovers_on_retry() -> None:
page = AsyncMock()
page.goto = AsyncMock(side_effect=[Exception("net::ERR_CONNECTION_RESET"), None])
settle = AsyncMock()
sleep = AsyncMock()
await navigate_with_retry(
navigate=lambda strategy: page.goto("http://example.com", timeout=30000, wait_until=strategy),
url="http://example.com",
retry_times=3,
settle=settle,
sleep=sleep,
)
assert page.goto.call_count == 2
assert sleep.await_count == 1
settle.assert_awaited_once()
@pytest.mark.asyncio
async def test_get_or_create_page_does_not_retry_permanent_failed_navigation() -> None:
browser_state = RealBrowserState(pw=AsyncMock())
browser_state.get_working_page = AsyncMock(return_value=None)
browser_state.check_and_fix_state = AsyncMock(
side_effect=FailedToNavigateToUrl(
url="http://example.invalid",
error_message="net::ERR_INVALID_URL",
)
)
browser_state.close_current_open_page = AsyncMock(return_value=True)
with pytest.raises(FailedToNavigateToUrl):
await browser_state.get_or_create_page(url="http://example.invalid")
assert browser_state.check_and_fix_state.await_count == 1
browser_state.close_current_open_page.assert_not_awaited()
@pytest.mark.asyncio
async def test_get_or_create_page_retries_dns_error_with_context_recreation() -> None:
page = AsyncMock()
browser_state = RealBrowserState(pw=AsyncMock())
browser_state.get_working_page = AsyncMock(side_effect=[None, page])
browser_state.check_and_fix_state = AsyncMock(
side_effect=[
FailedToNavigateToUrl(
url="http://example.com",
error_message="net::ERR_NAME_NOT_RESOLVED",
),
None,
]
)
browser_state.close_current_open_page = AsyncMock(return_value=True)
browser_state.validate_browser_context = AsyncMock(return_value=True)
browser_state._RealBrowserState__assert_page = AsyncMock(return_value=page)
result = await browser_state.get_or_create_page(url="http://example.com")
assert result is page
assert browser_state.check_and_fix_state.await_count == 2
browser_state.close_current_open_page.assert_awaited_once()
browser_state.validate_browser_context.assert_awaited_once_with(page)
@pytest.mark.asyncio
async def test_get_or_create_page_retries_retriable_failed_navigation() -> None:
page = AsyncMock()
browser_state = RealBrowserState(pw=AsyncMock())
browser_state.get_working_page = AsyncMock(side_effect=[None, page])
browser_state.check_and_fix_state = AsyncMock(
side_effect=[
FailedToNavigateToUrl(
url="http://example.com",
error_message="net::ERR_CONNECTION_RESET",
),
None,
]
)
browser_state.close_current_open_page = AsyncMock(return_value=True)
browser_state.validate_browser_context = AsyncMock(return_value=True)
browser_state._RealBrowserState__assert_page = AsyncMock(return_value=page)
result = await browser_state.get_or_create_page(url="http://example.com")
assert result is page
assert browser_state.check_and_fix_state.await_count == 2
browser_state.close_current_open_page.assert_awaited_once()
browser_state.validate_browser_context.assert_awaited_once_with(page)