From e9cbab5b7f6e49687102b7952d36cd9d56516ebb Mon Sep 17 00:00:00 2001 From: A <258483684+la14-1@users.noreply.github.com> Date: Mon, 23 Mar 2026 21:47:58 -0700 Subject: [PATCH] fix(sprite): add retry for list failures, increase timeout, refresh auth on expiry (#2936) Three fixes for Sprite E2E failures in long-running batches (73+ min): 1. Retry `_sprite_provision_verify`: list failures (and instance-not-found results) are now attempted up to 3 times with exponential backoff between attempts (5s, then 10s) instead of failing immediately. Fixes kilocode batch 6 "Could not list Sprite instances" errors. 2. Increase `CREATE_TIMEOUT_SECS` default from 300s to 600s and add `Client.Timeout`, `request canceled`, and `authentication failed` to the transient error retry pattern in `spriteRetry`. Also uses linear backoff (3s * attempt) instead of fixed 3s delay. Fixes hermes batch 7 HTTP timeout errors. 3. Add `_sprite_refresh_auth` + `cloud_refresh_auth` interface. The E2E orchestrator calls `cloud_refresh_auth` before each provisioning batch. For Sprite, this re-validates the token via `sprite org list` and attempts `sprite auth refresh` if expired. Fixes junie batch 8 "authentication failed" errors. Fixes #2934 Agent: ux-engineer Co-authored-by: B <6723574+louisgv@users.noreply.github.com> Co-authored-by: Claude Sonnet 4.6 --- packages/cli/src/sprite/sprite.ts | 16 +++-- sh/e2e/e2e.sh | 7 ++ sh/e2e/lib/clouds/sprite.sh | 107 +++++++++++++++++++++++++----- sh/e2e/lib/common.sh | 9 +++ 4 files changed, 116 insertions(+), 23 deletions(-) diff --git a/packages/cli/src/sprite/sprite.ts b/packages/cli/src/sprite/sprite.ts index d9232336..907e7a51 100644 --- a/packages/cli/src/sprite/sprite.ts +++ b/packages/cli/src/sprite/sprite.ts @@ -24,8 +24,10 @@ import { const CONNECTIVITY_POLL_DELAY = Number.parseInt(process.env.SPRITE_CONNECTIVITY_POLL_DELAY || "5", 10); -/** Timeout for the `sprite create` API call (seconds). Prevents indefinite hangs. 
*/ -const CREATE_TIMEOUT_SECS = Number.parseInt(process.env.SPRITE_CREATE_TIMEOUT || "300", 10); +/** Timeout for the `sprite create` API call (seconds). Prevents indefinite hangs. + * Raised from 300s to 600s to accommodate slower Sprite API responses in long + * E2E runs where HTTP timeouts were observed (net/http: Client.Timeout). #2934 */ +const CREATE_TIMEOUT_SECS = Number.parseInt(process.env.SPRITE_CREATE_TIMEOUT || "600", 10); // ─── State ─────────────────────────────────────────────────────────────────── @@ -84,10 +86,14 @@ async function spriteRetry(desc: string, fn: () => Promise): Promise { break; } - // Only retry on transient network errors - if (/TLS handshake timeout|connection closed|connection reset|connection refused|i\/o timeout/i.test(msg)) { + // Only retry on transient network errors and auth expiry (#2934) + if ( + /TLS handshake timeout|connection closed|connection reset|connection refused|i\/o timeout|Client\.Timeout|request canceled|authentication failed/i.test( + msg, + ) + ) { logWarn(`${desc}: Transient error, retrying (${attempt}/${maxRetries})...`); - await sleep(3000); + await sleep(3000 * attempt); continue; } diff --git a/sh/e2e/e2e.sh b/sh/e2e/e2e.sh index d2d3731c..44a30a21 100755 --- a/sh/e2e/e2e.sh +++ b/sh/e2e/e2e.sh @@ -390,6 +390,10 @@ run_agents_for_cloud() { batch_num=$((batch_num + 1)) log_header "Batch ${batch_num} (${cloud})" + # Refresh auth before each batch — prevents token expiry in long + # E2E runs (60+ min). No-op for clouds without refresh support. #2934 + cloud_refresh_auth || log_warn "Auth refresh failed before batch ${batch_num}" + pids="" for ba in ${batch_agents}; do local_result_file="${log_dir}/${cloud}-${ba}.result" @@ -421,6 +425,9 @@ run_agents_for_cloud() { batch_num=$((batch_num + 1)) log_header "Batch ${batch_num} (${cloud})" + # Refresh auth before partial batch too — same reason as above. 
#2934 + cloud_refresh_auth || log_warn "Auth refresh failed before batch ${batch_num}" + pids="" for ba in ${batch_agents}; do local_result_file="${log_dir}/${cloud}-${ba}.result" diff --git a/sh/e2e/lib/clouds/sprite.sh b/sh/e2e/lib/clouds/sprite.sh index 7f0200be..45b1cba8 100644 --- a/sh/e2e/lib/clouds/sprite.sh +++ b/sh/e2e/lib/clouds/sprite.sh @@ -137,12 +137,59 @@ _sprite_headless_env() { fi } +# --------------------------------------------------------------------------- +# _sprite_refresh_auth +# +# Re-validate Sprite credentials by running `sprite org list`. If the token +# has expired (common after ~60 min), run `sprite auth refresh` +# to obtain a fresh token. Updates _SPRITE_ORG on success. +# +# Called before each E2E provisioning batch to prevent auth expiry failures +# in long-running E2E suites (73+ min). See #2934. +# --------------------------------------------------------------------------- +_sprite_refresh_auth() { + local org_output + org_output=$(sprite org list 2>/dev/null || true) + + if [ -n "${org_output}" ]; then + # Token is still valid — update org in case it changed + local refreshed_org + refreshed_org=$(printf '%s' "${org_output}" | sed -n 's/.*Currently selected org: *//p' | awk '{print $1}') + if [ -n "${refreshed_org}" ]; then + _SPRITE_ORG="${refreshed_org}" + fi + log_info "Sprite auth token is still valid" + return 0 + fi + + # Token expired — attempt re-auth via sprite auth refresh + log_warn "Sprite auth token expired — attempting refresh..." 
+ if sprite auth refresh >/dev/null 2>&1; then + org_output=$(sprite org list 2>/dev/null || true) + if [ -n "${org_output}" ]; then + local refreshed_org + refreshed_org=$(printf '%s' "${org_output}" | sed -n 's/.*Currently selected org: *//p' | awk '{print $1}') + if [ -n "${refreshed_org}" ]; then + _SPRITE_ORG="${refreshed_org}" + fi + log_ok "Sprite auth token refreshed successfully" + return 0 + fi + fi + + log_err "Sprite auth refresh failed — subsequent operations may fail" + return 1 +} + # --------------------------------------------------------------------------- # _sprite_provision_verify APP LOG_DIR # # Verify sprite VM exists after provisioning by checking `sprite list` output # for the APP name. Write sentinel and metadata files for downstream steps. # +# Makes up to 3 attempts with exponential backoff between them (5s, then 10s) to handle +# transient list failures from CLI rate-limiting or config corruption (#2934). +# # Writes: # $LOG_DIR/$APP.ip — "sprite-cli" sentinel (no IP — Sprite uses names) # $LOG_DIR/$APP.meta — instance metadata (JSON) @@ -150,31 +197,55 @@ _sprite_headless_env() { _sprite_provision_verify() { local app="$1" local log_dir="$2" + local _max_retries=3 + local _retry_delay=5 - # Check instance exists in sprite list - _sprite_fix_config - local sprite_output - sprite_output=$(_sprite_cmd list 2>/dev/null || true) + local _attempt=0 + while [ "${_attempt}" -lt "${_max_retries}" ]; do + # Fix config before each attempt (concurrent writes may corrupt it) + _sprite_fix_config + local sprite_output + sprite_output=$(_sprite_cmd list 2>/dev/null || true) - if [ -z "${sprite_output}" ]; then - log_err "Could not list Sprite instances" - return 1 - fi + if [ -z "${sprite_output}" ]; then + _attempt=$((_attempt + 1)) + if [ "${_attempt}" -lt "${_max_retries}" ]; then + log_warn "Could not list Sprite instances — retrying in ${_retry_delay}s (${_attempt}/${_max_retries})" + sleep "${_retry_delay}" + _retry_delay=$((_retry_delay * 2)) + continue + fi 
+ log_err "Could not list Sprite instances after ${_max_retries} attempts" + return 1 + fi - if ! printf '%s' "${sprite_output}" | grep -qF "${app}"; then - log_err "Sprite instance ${app} not found in sprite list" - return 1 - fi + if ! printf '%s' "${sprite_output}" | grep -qF "${app}"; then + _attempt=$((_attempt + 1)) + if [ "${_attempt}" -lt "${_max_retries}" ]; then + log_warn "Sprite instance ${app} not found — retrying in ${_retry_delay}s (${_attempt}/${_max_retries})" + sleep "${_retry_delay}" + _retry_delay=$((_retry_delay * 2)) + continue + fi + log_err "Sprite instance ${app} not found in sprite list after ${_max_retries} attempts" + return 1 + fi - log_ok "Sprite instance ${app} exists" + # Found the instance + log_ok "Sprite instance ${app} exists" - # Write sentinel — Sprite has no IP; use "sprite-cli" as marker - printf '%s' "sprite-cli" > "${log_dir}/${app}.ip" + # Write sentinel — Sprite has no IP; use "sprite-cli" as marker + printf '%s' "sprite-cli" > "${log_dir}/${app}.ip" - # Write metadata file - printf '{"name":"%s"}\n' "${app}" > "${log_dir}/${app}.meta" + # Write metadata file + printf '{"name":"%s"}\n' "${app}" > "${log_dir}/${app}.meta" - return 0 + return 0 + done + + # Should not reach here, but guard against it + log_err "Sprite instance ${app} verification exhausted retries" + return 1 } # --------------------------------------------------------------------------- diff --git a/sh/e2e/lib/common.sh b/sh/e2e/lib/common.sh index 589725dc..dea36233 100644 --- a/sh/e2e/lib/common.sh +++ b/sh/e2e/lib/common.sh @@ -133,6 +133,15 @@ cloud_install_wait() { fi } +# Refresh auth token if the cloud driver supports it (e.g. Sprite tokens +# expire after ~60 min). Called before each provisioning batch to prevent +# auth expiry failures in long-running E2E suites. See #2934. 
+cloud_refresh_auth() { + if type "_${ACTIVE_CLOUD}_refresh_auth" >/dev/null 2>&1; then + "_${ACTIVE_CLOUD}_refresh_auth" "$@" + fi +} + # --------------------------------------------------------------------------- # Per-agent provision timeout overrides #