fix(sprite): add retry for list failures, increase timeout, refresh auth on expiry (#2936)

Three fixes for Sprite E2E failures in long-running batches (73+ min):

1. Retry `_sprite_provision_verify`: list failures are now attempted up to
   3 times with exponential backoff between attempts (5s, then 10s) instead
   of failing immediately.
   Fixes kilocode batch 6 "Could not list Sprite instances" errors.

2. Increase `CREATE_TIMEOUT_SECS` default from 300s to 600s and add
   `Client.Timeout`, `request canceled`, and `authentication failed`
   to the transient error retry pattern in `spriteRetry`. Also uses
   linear backoff (3s * attempt) instead of fixed 3s delay.
   Fixes hermes batch 7 HTTP timeout errors.

3. Add `_sprite_refresh_auth` + `cloud_refresh_auth` interface. The
   E2E orchestrator calls `cloud_refresh_auth` before each provisioning
   batch. For Sprite, this re-validates the token via `sprite org list`
   and attempts `sprite auth refresh` if expired.
   Fixes junie batch 8 "authentication failed" errors.

Fixes #2934

Agent: ux-engineer

Co-authored-by: B <6723574+louisgv@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
A 2026-03-23 21:47:58 -07:00 committed by GitHub
parent 50319e0d39
commit e9cbab5b7f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 116 additions and 23 deletions

View file

@ -24,8 +24,10 @@ import {
const CONNECTIVITY_POLL_DELAY = Number.parseInt(process.env.SPRITE_CONNECTIVITY_POLL_DELAY || "5", 10);
/** Timeout for the `sprite create` API call (seconds). Prevents indefinite hangs. */
const CREATE_TIMEOUT_SECS = Number.parseInt(process.env.SPRITE_CREATE_TIMEOUT || "300", 10);
/** Timeout for the `sprite create` API call (seconds). Prevents indefinite hangs.
* Raised from 300s to 600s to accommodate slower Sprite API responses in long
* E2E runs where HTTP timeouts were observed (net/http: Client.Timeout). #2934 */
const CREATE_TIMEOUT_SECS = Number.parseInt(process.env.SPRITE_CREATE_TIMEOUT || "600", 10);
// ─── State ───────────────────────────────────────────────────────────────────
@ -84,10 +86,14 @@ async function spriteRetry<T>(desc: string, fn: () => Promise<T>): Promise<T> {
break;
}
// Only retry on transient network errors
if (/TLS handshake timeout|connection closed|connection reset|connection refused|i\/o timeout/i.test(msg)) {
// Only retry on transient network errors and auth expiry (#2934)
if (
/TLS handshake timeout|connection closed|connection reset|connection refused|i\/o timeout|Client\.Timeout|request canceled|authentication failed/i.test(
msg,
)
) {
logWarn(`${desc}: Transient error, retrying (${attempt}/${maxRetries})...`);
await sleep(3000);
await sleep(3000 * attempt);
continue;
}

View file

@ -390,6 +390,10 @@ run_agents_for_cloud() {
batch_num=$((batch_num + 1))
log_header "Batch ${batch_num} (${cloud})"
# Refresh auth before each batch — prevents token expiry in long
# E2E runs (60+ min). No-op for clouds without refresh support. #2934
cloud_refresh_auth || log_warn "Auth refresh failed before batch ${batch_num}"
pids=""
for ba in ${batch_agents}; do
local_result_file="${log_dir}/${cloud}-${ba}.result"
@ -421,6 +425,9 @@ run_agents_for_cloud() {
batch_num=$((batch_num + 1))
log_header "Batch ${batch_num} (${cloud})"
# Refresh auth before partial batch too — same reason as above. #2934
cloud_refresh_auth || log_warn "Auth refresh failed before batch ${batch_num}"
pids=""
for ba in ${batch_agents}; do
local_result_file="${log_dir}/${cloud}-${ba}.result"

View file

@ -137,12 +137,59 @@ _sprite_headless_env() {
fi
}
# ---------------------------------------------------------------------------
# _sprite_refresh_auth
#
# Re-validate Sprite credentials by running `sprite org list`. If the token
# has expired (common after ~60 min), run `sprite auth refresh` to obtain a
# fresh token. Updates _SPRITE_ORG when the selected org can be parsed.
#
# Called before each E2E provisioning batch to prevent auth expiry failures
# in long-running E2E suites (73+ min). See #2934.
# ---------------------------------------------------------------------------
# Print the currently selected org name parsed from `sprite org list` output
# passed as $1. Only the first "Currently selected org:" line is used, so the
# result is never a multi-line value. Prints nothing when no such line exists.
_sprite_selected_org() {
  printf '%s' "$1" \
    | sed -n 's/.*Currently selected org: *//p' \
    | awk 'NR == 1 { print $1 }'
}

# Re-validate Sprite credentials and refresh them when they appear expired.
#
# Non-empty `sprite org list` output is treated as proof that the token works.
# When the output is empty, `sprite auth refresh` is attempted once and the
# org list is queried again.
#
# Side effects: sets _SPRITE_ORG when a selected org can be parsed.
# Returns: 0 when the token is valid (possibly after a refresh), 1 otherwise.
_sprite_refresh_auth() {
  local org_output
  local refreshed_org
  local _did_refresh=0

  org_output=$(sprite org list 2>/dev/null || true)

  if [ -z "${org_output}" ]; then
    # Token expired — attempt re-auth via sprite auth refresh
    log_warn "Sprite auth token expired — attempting refresh..."
    if sprite auth refresh >/dev/null 2>&1; then
      org_output=$(sprite org list 2>/dev/null || true)
    fi
    if [ -z "${org_output}" ]; then
      log_err "Sprite auth refresh failed — subsequent operations may fail"
      return 1
    fi
    _did_refresh=1
  fi

  # Update the org in case it changed; keep the old value if parsing fails.
  refreshed_org=$(_sprite_selected_org "${org_output}")
  if [ -n "${refreshed_org}" ]; then
    _SPRITE_ORG="${refreshed_org}"
  fi

  if [ "${_did_refresh}" -eq 1 ]; then
    log_ok "Sprite auth token refreshed successfully"
  else
    log_info "Sprite auth token is still valid"
  fi
  return 0
}
# ---------------------------------------------------------------------------
# _sprite_provision_verify APP LOG_DIR
#
# Verify sprite VM exists after provisioning by checking `sprite list` output
# for the APP name. Write sentinel and metadata files for downstream steps.
#
# Makes up to 3 attempts with exponential backoff between them (5s, then 10s)
# to handle transient list failures from CLI rate-limiting or config corruption (#2934).
#
# Writes:
# $LOG_DIR/$APP.ip — "sprite-cli" sentinel (no IP — Sprite uses names)
# $LOG_DIR/$APP.meta — instance metadata (JSON)
@ -150,31 +197,55 @@ _sprite_headless_env() {
# Verify that Sprite instance APP ($1) appears in `sprite list` output and
# write the sentinel/metadata files into LOG_DIR ($2).
#
# Makes up to 3 attempts, sleeping 5s and then 10s between them, so a
# transient empty or incomplete listing does not fail the run immediately.
#
# Returns: 0 once the instance is found, 1 when APP is empty or every attempt
# fails.
_sprite_provision_verify() {
  local app="$1"
  local log_dir="$2"
  local _max_retries=3
  local _retry_delay=5
  local _attempt=0
  local sprite_output

  # An empty pattern makes `grep -F` match every line, which would report
  # success for any non-empty listing.
  if [ -z "${app}" ]; then
    log_err "Sprite instance name is empty — cannot verify"
    return 1
  fi

  while [ "${_attempt}" -lt "${_max_retries}" ]; do
    # Fix config before each attempt (concurrent writes may corrupt it)
    _sprite_fix_config
    sprite_output=$(_sprite_cmd list 2>/dev/null || true)

    if [ -z "${sprite_output}" ]; then
      _attempt=$((_attempt + 1))
      if [ "${_attempt}" -lt "${_max_retries}" ]; then
        log_warn "Could not list Sprite instances — retrying in ${_retry_delay}s (${_attempt}/${_max_retries})"
        sleep "${_retry_delay}"
        _retry_delay=$((_retry_delay * 2))
        continue
      fi
      log_err "Could not list Sprite instances after ${_max_retries} attempts"
      return 1
    fi

    # `--` keeps an app name that starts with "-" from being read as an option.
    if ! printf '%s' "${sprite_output}" | grep -qF -- "${app}"; then
      _attempt=$((_attempt + 1))
      if [ "${_attempt}" -lt "${_max_retries}" ]; then
        log_warn "Sprite instance ${app} not found — retrying in ${_retry_delay}s (${_attempt}/${_max_retries})"
        sleep "${_retry_delay}"
        _retry_delay=$((_retry_delay * 2))
        continue
      fi
      log_err "Sprite instance ${app} not found in sprite list after ${_max_retries} attempts"
      return 1
    fi

    # Found the instance
    log_ok "Sprite instance ${app} exists"

    # Write sentinel — Sprite has no IP; use "sprite-cli" as marker
    printf '%s' "sprite-cli" > "${log_dir}/${app}.ip"

    # Write metadata file
    printf '{"name":"%s"}\n' "${app}" > "${log_dir}/${app}.meta"

    return 0
  done

  # Should not reach here, but guard against it
  log_err "Sprite instance ${app} verification exhausted retries"
  return 1
}
# ---------------------------------------------------------------------------

View file

@ -133,6 +133,15 @@ cloud_install_wait() {
fi
}
# Refresh auth token if the cloud driver supports it (e.g. Sprite tokens
# expire after ~60 min). Called before each provisioning batch to prevent
# auth expiry failures in long-running E2E suites. See #2934.
cloud_refresh_auth() {
  # Name of the optional per-cloud hook, derived from the active cloud driver.
  local _hook="_${ACTIVE_CLOUD}_refresh_auth"

  # Drivers that define no such hook have nothing to refresh: succeed quietly.
  type "${_hook}" >/dev/null 2>&1 || return 0

  # Forward all arguments; the hook's exit status becomes ours.
  "${_hook}" "$@"
}
# ---------------------------------------------------------------------------
# Per-agent provision timeout overrides
#