mirror of
https://github.com/OpenRouterTeam/spawn.git
synced 2026-04-30 12:59:32 +00:00
fix(sprite): add retry for list failures, increase timeout, refresh auth on expiry (#2936)
Three fixes for Sprite E2E failures in long-running batches (73+ min): 1. Retry `_sprite_provision_verify`: list failures now retry 3x with exponential backoff (5s, 10s, 20s) instead of failing immediately. Fixes kilocode batch 6 "Could not list Sprite instances" errors. 2. Increase `CREATE_TIMEOUT_SECS` default from 300s to 600s and add `Client.Timeout`, `request canceled`, and `authentication failed` to the transient error retry pattern in `spriteRetry`. Also uses linear backoff (3s * attempt) instead of fixed 3s delay. Fixes hermes batch 7 HTTP timeout errors. 3. Add `_sprite_refresh_auth` + `cloud_refresh_auth` interface. The E2E orchestrator calls `cloud_refresh_auth` before each provisioning batch. For Sprite, this re-validates the token via `sprite org list` and attempts `sprite auth refresh` if expired. Fixes junie batch 8 "authentication failed" errors. Fixes #2934 Agent: ux-engineer Co-authored-by: B <6723574+louisgv@users.noreply.github.com> Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
50319e0d39
commit
e9cbab5b7f
4 changed files with 116 additions and 23 deletions
|
|
@ -137,12 +137,59 @@ _sprite_headless_env() {
|
|||
fi
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# _sprite_refresh_auth
#
# Re-validate Sprite credentials by running `sprite org list`. If that prints
# nothing, the token is treated as expired (common after ~60 min) and
# `sprite auth refresh` is run, followed by a second `sprite org list` to
# confirm the refresh actually worked. Updates _SPRITE_ORG whenever the
# listing names a currently selected org.
#
# Called before each E2E provisioning batch to prevent auth expiry failures
# in long-running E2E suites (73+ min). See #2934.
#
# Globals:  _SPRITE_ORG (written)
# Returns:  0 if credentials are valid (possibly after a refresh), 1 otherwise
# ---------------------------------------------------------------------------

# Set _SPRITE_ORG from the "Currently selected org:" line of LISTING ($1).
# Leaves _SPRITE_ORG untouched when the listing has no such line.
_sprite_sync_selected_org() {
  local selected_org
  selected_org=$(printf '%s' "$1" | sed -n 's/.*Currently selected org: *//p' | awk '{print $1}')
  if [ -n "${selected_org}" ]; then
    _SPRITE_ORG="${selected_org}"
  fi
}

_sprite_refresh_auth() {
  local org_output
  # stderr is dropped and failure tolerated on purpose: empty stdout is the
  # signal that the token is no longer accepted.
  org_output=$(sprite org list 2>/dev/null || true)

  if [ -n "${org_output}" ]; then
    # Token is still valid — update org in case it changed
    _sprite_sync_selected_org "${org_output}"
    log_info "Sprite auth token is still valid"
    return 0
  fi

  # Token expired — attempt re-auth via sprite auth refresh
  log_warn "Sprite auth token expired — attempting refresh..."
  if sprite auth refresh >/dev/null 2>&1; then
    # A zero exit from the refresh is not trusted alone; list again to confirm.
    org_output=$(sprite org list 2>/dev/null || true)
    if [ -n "${org_output}" ]; then
      _sprite_sync_selected_org "${org_output}"
      log_ok "Sprite auth token refreshed successfully"
      return 0
    fi
  fi

  log_err "Sprite auth refresh failed — subsequent operations may fail"
  return 1
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _sprite_provision_verify APP LOG_DIR
|
||||
#
|
||||
# Verify sprite VM exists after provisioning by checking `sprite list` output
|
||||
# for the APP name. Write sentinel and metadata files for downstream steps.
|
||||
#
|
||||
# Makes up to 3 attempts, sleeping 5s and then 10s between them (the delay doubles after each failure), to handle
|
||||
# transient list failures from CLI rate-limiting or config corruption (#2934).
|
||||
#
|
||||
# Writes:
|
||||
# $LOG_DIR/$APP.ip — "sprite-cli" sentinel (no IP — Sprite uses names)
|
||||
# $LOG_DIR/$APP.meta — instance metadata (JSON)
|
||||
|
|
@ -150,31 +197,55 @@ _sprite_headless_env() {
|
|||
_sprite_provision_verify() {
  # Arguments: $1 - app (Sprite instance name), $2 - log_dir (output directory)
  # Returns:   0 once the instance shows up in `sprite list`, 1 after all
  #            attempts fail (list unavailable or instance missing).
  local app="$1"
  local log_dir="$2"
  local _max_retries=3
  local _retry_delay=5
  local _attempt=0
  local _failure=""
  local sprite_output

  while [ "${_attempt}" -lt "${_max_retries}" ]; do
    # Fix config before each attempt (concurrent writes may corrupt it)
    _sprite_fix_config
    # stderr is dropped and failure tolerated on purpose: empty output is the
    # signal that the list call did not work.
    sprite_output=$(_sprite_cmd list 2>/dev/null || true)

    if [ -z "${sprite_output}" ]; then
      _failure="list"
    else
      # Literal substring match done by the shell itself: no pipeline, and a
      # leading '-' in the name cannot be mistaken for a grep option.
      case "${sprite_output}" in
        *"${app}"*)
          # Found the instance
          log_ok "Sprite instance ${app} exists"

          # Write sentinel — Sprite has no IP; use "sprite-cli" as marker
          printf '%s' "sprite-cli" > "${log_dir}/${app}.ip"

          # Write metadata file
          printf '{"name":"%s"}\n' "${app}" > "${log_dir}/${app}.meta"

          return 0
          ;;
        *)
          _failure="missing"
          ;;
      esac
    fi

    _attempt=$((_attempt + 1))
    if [ "${_attempt}" -ge "${_max_retries}" ]; then
      break
    fi

    if [ "${_failure}" = "list" ]; then
      log_warn "Could not list Sprite instances — retrying in ${_retry_delay}s (${_attempt}/${_max_retries})"
    else
      log_warn "Sprite instance ${app} not found — retrying in ${_retry_delay}s (${_attempt}/${_max_retries})"
    fi
    sleep "${_retry_delay}"
    # Double the wait for the next attempt (exponential backoff)
    _retry_delay=$((_retry_delay * 2))
  done

  # All attempts used up — report whichever failure the last attempt hit
  if [ "${_failure}" = "list" ]; then
    log_err "Could not list Sprite instances after ${_max_retries} attempts"
  else
    log_err "Sprite instance ${app} not found in sprite list after ${_max_retries} attempts"
  fi
  return 1
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue