mirror of
https://github.com/OpenRouterTeam/spawn.git
synced 2026-04-28 11:59:29 +00:00
GCP, Sprite, and DigitalOcean had commented-out code `# local agent="$2"` in their `_headless_env` functions. Hetzner already used the cleaner style `# $2 = agent (unused but part of the interface)`. Normalize to match. Co-authored-by: spawn-qa-bot <qa@openrouter.ai> Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
394 lines
14 KiB
Bash
394 lines
14 KiB
Bash
#!/bin/bash
|
|
# e2e/lib/clouds/sprite.sh — Sprite cloud driver for multi-cloud E2E
|
|
#
|
|
# Implements the standard cloud driver interface (_sprite_* prefixed functions).
|
|
# Sourced by common.sh's load_cloud_driver() which wires these to generic names.
|
|
#
|
|
# Sprite uses its own CLI for execution — NO SSH is used.
|
|
# All remote commands run via: printf CMD | sprite exec -s NAME -- bash
|
|
#
|
|
# Depends on: log_step, log_ok, log_err, log_warn, log_info, format_duration,
|
|
# untrack_app (provided by common.sh)
|
|
set -eo pipefail
|
|
|
|
# Detected org — set during _sprite_validate_env.
|
|
# Passed as -o flag to all sprite CLI calls to avoid config file races
|
|
# from concurrent sprite exec calls corrupting ~/.sprites/sprites.json.
|
|
_SPRITE_ORG=""
|
|
|
|
# Helper: build org flags for sprite CLI calls.
|
|
# Outputs "-o" and the org name as separate lines for use with _sprite_cmd.
|
|
_sprite_org_flags() {
|
|
if [ -n "${_SPRITE_ORG}" ]; then
|
|
printf '%s\n%s' "-o" "${_SPRITE_ORG}"
|
|
fi
|
|
}
|
|
|
|
# Helper: run sprite CLI with org flags safely (no word-splitting).
|
|
# Usage: _sprite_cmd [extra args...]
|
|
# Reads org flags via _sprite_org_flags and builds a proper argument array.
|
|
_sprite_cmd() {
|
|
local _args
|
|
_args=()
|
|
if [ -n "${_SPRITE_ORG}" ]; then
|
|
_args+=("-o" "${_SPRITE_ORG}")
|
|
fi
|
|
sprite "${_args[@]}" "$@"
|
|
}
|
|
|
|
# Helper: fix corrupted sprite config (double-closing-brace from concurrent writes)
|
|
_sprite_fix_config() {
|
|
local cfg="${HOME}/.sprites/sprites.json"
|
|
if [ -f "${cfg}" ]; then
|
|
# Check for double-brace corruption (most common race condition pattern).
|
|
# The sprite CLI's concurrent writes append an extra } at the end.
|
|
# Use grep on the whole file for any line that is just }}
|
|
if grep -q '^}}$' "${cfg}" 2>/dev/null; then
|
|
local tmp
|
|
tmp=$(mktemp "${cfg}.XXXXXX") || return
|
|
sed 's/^}}$/}/' "${cfg}" > "${tmp}" 2>/dev/null && mv "${tmp}" "${cfg}" 2>/dev/null || rm -f "${tmp}"
|
|
fi
|
|
# Also check if last non-empty line ends with }}
|
|
local last_content
|
|
last_content=$(tail -5 "${cfg}" | grep -v '^$' | tail -1)
|
|
if printf '%s' "${last_content}" | grep -q '}}$'; then
|
|
local tmp
|
|
tmp=$(mktemp "${cfg}.XXXXXX") || return
|
|
# Replace the LAST occurrence of }} with }
|
|
sed '$ s/}}$/}/' "${cfg}" > "${tmp}" 2>/dev/null && mv "${tmp}" "${cfg}" 2>/dev/null || rm -f "${tmp}"
|
|
fi
|
|
fi
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _sprite_max_parallel
|
|
#
|
|
# Sprite CLI gets rate-limited with too many concurrent calls.
|
|
# Cap to 2 agents at a time.
|
|
# ---------------------------------------------------------------------------
|
|
_sprite_max_parallel() {
|
|
printf '1'
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _sprite_install_wait
|
|
#
|
|
# Sprite exec is slower per-call than SSH — give installs more time to complete.
|
|
# ---------------------------------------------------------------------------
|
|
_sprite_install_wait() {
|
|
printf '300'
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _sprite_validate_env
|
|
#
|
|
# Check that the sprite CLI is installed and credentials are valid.
|
|
# Returns 0 on success, 1 on failure.
|
|
# ---------------------------------------------------------------------------
|
|
_sprite_validate_env() {
|
|
if ! command -v sprite >/dev/null 2>&1; then
|
|
log_err "sprite CLI not found. Install from https://docs.sprite.dev"
|
|
return 1
|
|
fi
|
|
|
|
local org_output
|
|
org_output=$(sprite org list 2>/dev/null || true)
|
|
if [ -z "${org_output}" ]; then
|
|
log_err "Sprite credentials are not valid. Run: sprite auth login"
|
|
return 1
|
|
fi
|
|
|
|
# Extract org name and cache it — all subsequent sprite CLI calls use -o flag
|
|
# to avoid concurrent config file reads/writes corrupting sprites.json
|
|
_SPRITE_ORG=$(printf '%s' "${org_output}" | sed -n 's/.*Currently selected org: *//p' | awk '{print $1}')
|
|
if [ -z "${_SPRITE_ORG}" ]; then
|
|
# Fallback: try SPRITE_ORG env var
|
|
_SPRITE_ORG="${SPRITE_ORG:-}"
|
|
fi
|
|
|
|
# Validate org name contains only safe characters (alphanumeric, dash, underscore)
|
|
# to prevent injection via crafted org names in subsequent CLI calls.
|
|
if [ -n "${_SPRITE_ORG}" ] && ! printf '%s' "${_SPRITE_ORG}" | grep -qE '^[A-Za-z0-9_-]+$'; then
|
|
log_err "Invalid Sprite org name: ${_SPRITE_ORG}"
|
|
return 1
|
|
fi
|
|
|
|
if [ -n "${_SPRITE_ORG}" ]; then
|
|
log_ok "Sprite credentials validated (org: ${_SPRITE_ORG})"
|
|
else
|
|
log_ok "Sprite credentials validated"
|
|
fi
|
|
return 0
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _sprite_headless_env APP AGENT
|
|
#
|
|
# Print export lines to stdout for headless provisioning.
|
|
# These are eval'd by the provisioning harness before invoking the CLI.
|
|
# ---------------------------------------------------------------------------
|
|
_sprite_headless_env() {
|
|
local app="$1"
|
|
# $2 = agent (unused but part of the interface)
|
|
|
|
printf 'export SPRITE_NAME="%s"\n' "${app}"
|
|
if [ -n "${_SPRITE_ORG}" ]; then
|
|
printf 'export SPRITE_ORG="%s"\n' "${_SPRITE_ORG}"
|
|
fi
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _sprite_refresh_auth
|
|
#
|
|
# Re-validate Sprite credentials by running `sprite org list`. If the token
|
|
# has expired (common after ~60 min), re-run `sprite auth login --headless`
|
|
# to obtain a fresh token. Updates _SPRITE_ORG on success.
|
|
#
|
|
# Called before each E2E provisioning batch to prevent auth expiry failures
|
|
# in long-running E2E suites (73+ min). See #2934.
|
|
# ---------------------------------------------------------------------------
|
|
_sprite_refresh_auth() {
|
|
local org_output
|
|
org_output=$(sprite org list 2>/dev/null || true)
|
|
|
|
if [ -n "${org_output}" ]; then
|
|
# Token is still valid — update org in case it changed
|
|
local refreshed_org
|
|
refreshed_org=$(printf '%s' "${org_output}" | sed -n 's/.*Currently selected org: *//p' | awk '{print $1}')
|
|
if [ -n "${refreshed_org}" ]; then
|
|
_SPRITE_ORG="${refreshed_org}"
|
|
fi
|
|
log_info "Sprite auth token is still valid"
|
|
return 0
|
|
fi
|
|
|
|
# Token expired — attempt re-auth via sprite auth refresh
|
|
log_warn "Sprite auth token expired — attempting refresh..."
|
|
if sprite auth refresh >/dev/null 2>&1; then
|
|
org_output=$(sprite org list 2>/dev/null || true)
|
|
if [ -n "${org_output}" ]; then
|
|
local refreshed_org
|
|
refreshed_org=$(printf '%s' "${org_output}" | sed -n 's/.*Currently selected org: *//p' | awk '{print $1}')
|
|
if [ -n "${refreshed_org}" ]; then
|
|
_SPRITE_ORG="${refreshed_org}"
|
|
fi
|
|
log_ok "Sprite auth token refreshed successfully"
|
|
return 0
|
|
fi
|
|
fi
|
|
|
|
log_err "Sprite auth refresh failed — subsequent operations may fail"
|
|
return 1
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _sprite_provision_verify APP LOG_DIR
|
|
#
|
|
# Verify sprite VM exists after provisioning by checking `sprite list` output
|
|
# for the APP name. Write sentinel and metadata files for downstream steps.
|
|
#
|
|
# Retries up to 3 times with exponential backoff (5s, 10s, 20s) to handle
|
|
# transient list failures from CLI rate-limiting or config corruption (#2934).
|
|
#
|
|
# Writes:
|
|
# $LOG_DIR/$APP.ip — "sprite-cli" sentinel (no IP — Sprite uses names)
|
|
# $LOG_DIR/$APP.meta — instance metadata (JSON)
|
|
# ---------------------------------------------------------------------------
|
|
_sprite_provision_verify() {
|
|
local app="$1"
|
|
local log_dir="$2"
|
|
local _max_retries=3
|
|
local _retry_delay=5
|
|
|
|
local _attempt=0
|
|
while [ "${_attempt}" -lt "${_max_retries}" ]; do
|
|
# Fix config before each attempt (concurrent writes may corrupt it)
|
|
_sprite_fix_config
|
|
local sprite_output
|
|
sprite_output=$(_sprite_cmd list 2>/dev/null || true)
|
|
|
|
if [ -z "${sprite_output}" ]; then
|
|
_attempt=$((_attempt + 1))
|
|
if [ "${_attempt}" -lt "${_max_retries}" ]; then
|
|
log_warn "Could not list Sprite instances — retrying in ${_retry_delay}s (${_attempt}/${_max_retries})"
|
|
sleep "${_retry_delay}"
|
|
_retry_delay=$((_retry_delay * 2))
|
|
continue
|
|
fi
|
|
log_err "Could not list Sprite instances after ${_max_retries} attempts"
|
|
return 1
|
|
fi
|
|
|
|
if ! printf '%s' "${sprite_output}" | grep -qF "${app}"; then
|
|
_attempt=$((_attempt + 1))
|
|
if [ "${_attempt}" -lt "${_max_retries}" ]; then
|
|
log_warn "Sprite instance ${app} not found — retrying in ${_retry_delay}s (${_attempt}/${_max_retries})"
|
|
sleep "${_retry_delay}"
|
|
_retry_delay=$((_retry_delay * 2))
|
|
continue
|
|
fi
|
|
log_err "Sprite instance ${app} not found in sprite list after ${_max_retries} attempts"
|
|
return 1
|
|
fi
|
|
|
|
# Found the instance
|
|
log_ok "Sprite instance ${app} exists"
|
|
|
|
# Write sentinel — Sprite has no IP; use "sprite-cli" as marker
|
|
printf '%s' "sprite-cli" > "${log_dir}/${app}.ip"
|
|
|
|
# Write metadata file
|
|
printf '{"name":"%s"}\n' "${app}" > "${log_dir}/${app}.meta"
|
|
|
|
return 0
|
|
done
|
|
|
|
# Should not reach here, but guard against it
|
|
log_err "Sprite instance ${app} verification exhausted retries"
|
|
return 1
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _sprite_exec APP CMD
|
|
#
|
|
# Execute CMD on the Sprite instance via the sprite CLI.
|
|
# Pipes CMD via stdin to bash to avoid shell injection from embedded strings.
|
|
# Retries up to 3 times when the sprite CLI itself fails (config corruption).
|
|
# Returns the exit code of the remote command.
|
|
# ---------------------------------------------------------------------------
|
|
_sprite_exec() {
|
|
local app="$1"
|
|
local cmd="$2"
|
|
local _attempt=0
|
|
local _max=3
|
|
local _stderr_tmp
|
|
_stderr_tmp=$(mktemp /tmp/sprite-exec-err.XXXXXX) || return 1
|
|
|
|
# Base64-encode the command to prevent shell injection when passed to the
|
|
# remote bash. The encoded string contains only [A-Za-z0-9+/=] characters,
|
|
# making it safe to pipe through the sprite CLI exec interface.
|
|
local encoded_cmd
|
|
encoded_cmd=$(printf '%s' "${cmd}" | base64 | tr -d '\n')
|
|
|
|
# Validate base64 output contains only safe characters (defense-in-depth).
|
|
if ! printf '%s' "${encoded_cmd}" | grep -qE '^[A-Za-z0-9+/=]+$'; then
|
|
rm -f "${_stderr_tmp}"
|
|
return 1
|
|
fi
|
|
|
|
while [ "${_attempt}" -lt "${_max}" ]; do
|
|
_sprite_fix_config
|
|
# Decode and execute on the remote side — the encoded payload is safe
|
|
# against shell metacharacters (;, |, $(), backticks).
|
|
printf '%s' "${encoded_cmd}" | _sprite_cmd exec -s "${app}" -- bash -c 'base64 -d | bash' 2>"${_stderr_tmp}"
|
|
local _rc=$?
|
|
if [ "${_rc}" -eq 0 ]; then
|
|
rm -f "${_stderr_tmp}"
|
|
return 0
|
|
fi
|
|
# Retry on sprite CLI errors (config corruption, connection issues)
|
|
if grep -qiE 'config|migrate|initialize|connection refused' "${_stderr_tmp}" 2>/dev/null; then
|
|
_attempt=$((_attempt + 1))
|
|
if [ "${_attempt}" -lt "${_max}" ]; then
|
|
sleep 2
|
|
continue
|
|
fi
|
|
fi
|
|
rm -f "${_stderr_tmp}"
|
|
return "${_rc}"
|
|
done
|
|
rm -f "${_stderr_tmp}"
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _sprite_teardown APP
|
|
#
|
|
# Destroy the Sprite instance and untrack it.
|
|
# ---------------------------------------------------------------------------
|
|
_sprite_teardown() {
|
|
local app="$1"
|
|
|
|
log_step "Tearing down ${app}..."
|
|
|
|
_sprite_cmd destroy --force "${app}" >/dev/null 2>&1 || true
|
|
|
|
# Brief wait for destruction to propagate
|
|
sleep 2
|
|
|
|
# Verify deletion
|
|
local sprite_output
|
|
sprite_output=$(_sprite_cmd list 2>/dev/null || true)
|
|
|
|
if printf '%s' "${sprite_output}" | grep -qF "${app}"; then
|
|
log_warn "Sprite instance ${app} may still exist"
|
|
else
|
|
log_ok "Sprite instance ${app} torn down"
|
|
fi
|
|
|
|
untrack_app "${app}"
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _sprite_cleanup_stale
|
|
#
|
|
# List all Sprite instances, filter for e2e-* names, and destroy any
|
|
# older than 30 minutes (based on the unix timestamp embedded in the name).
|
|
# ---------------------------------------------------------------------------
|
|
_sprite_cleanup_stale() {
|
|
local now
|
|
now=$(date +%s)
|
|
local max_age="${_CLEANUP_MAX_AGE:-1800}" # default 30 min; pre-run uses shorter
|
|
|
|
# List all sprites
|
|
local sprite_output
|
|
sprite_output=$(_sprite_cmd list 2>/dev/null || true)
|
|
|
|
if [ -z "${sprite_output}" ]; then
|
|
log_info "Could not list Sprite instances or none found — skipping cleanup"
|
|
return 0
|
|
fi
|
|
|
|
# Extract names matching e2e-* pattern (one per line)
|
|
local instance_names
|
|
instance_names=$(printf '%s\n' "${sprite_output}" | grep -oE 'e2e-[a-zA-Z0-9_-]+' || true)
|
|
|
|
if [ -z "${instance_names}" ]; then
|
|
log_ok "No stale e2e Sprite instances found"
|
|
return 0
|
|
fi
|
|
|
|
local cleaned=0
|
|
local skipped=0
|
|
|
|
for instance_name in ${instance_names}; do
|
|
# Extract timestamp from name: e2e-AGENT-TIMESTAMP
|
|
# The timestamp is the last dash-separated segment
|
|
local ts
|
|
ts=$(printf '%s' "${instance_name}" | sed 's/.*-//')
|
|
|
|
# Validate it looks like a unix timestamp (all digits, 10 chars)
|
|
if ! printf '%s' "${ts}" | grep -qE '^[0-9]{10}$'; then
|
|
log_warn "Skipping ${instance_name} — cannot parse timestamp"
|
|
skipped=$((skipped + 1))
|
|
continue
|
|
fi
|
|
|
|
local age=$((now - ts))
|
|
if [ "${age}" -gt "${max_age}" ]; then
|
|
local age_str
|
|
age_str=$(format_duration "${age}")
|
|
log_step "Destroying stale Sprite instance ${instance_name} (age: ${age_str})"
|
|
_sprite_teardown "${instance_name}" || log_warn "Failed to tear down ${instance_name}"
|
|
cleaned=$((cleaned + 1))
|
|
else
|
|
skipped=$((skipped + 1))
|
|
fi
|
|
done
|
|
|
|
if [ "${cleaned}" -gt 0 ]; then
|
|
log_ok "Cleaned ${cleaned} stale Sprite instance(s)"
|
|
fi
|
|
if [ "${skipped}" -gt 0 ]; then
|
|
log_info "Skipped ${skipped} recent Sprite instance(s)"
|
|
fi
|
|
}
|