spawn/sh/e2e/lib/clouds/sprite.sh
A b0f9f4e7af
refactor(e2e): normalize unused-arg comments in headless_env functions (#3113)
GCP, Sprite, and DigitalOcean had commented-out code `# local agent="$2"`
in their `_headless_env` functions. Hetzner already used the cleaner style
`# $2 = agent (unused but part of the interface)`. Normalize to match.

Co-authored-by: spawn-qa-bot <qa@openrouter.ai>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-31 03:51:07 +07:00

394 lines
14 KiB
Bash

#!/bin/bash
# e2e/lib/clouds/sprite.sh — Sprite cloud driver for multi-cloud E2E
#
# Implements the standard cloud driver interface (_sprite_* prefixed functions).
# Sourced by common.sh's load_cloud_driver() which wires these to generic names.
#
# Sprite uses its own CLI for execution — NO SSH is used.
# All remote commands run via: printf CMD | sprite exec -s NAME -- bash
#
# Depends on: log_step, log_ok, log_err, log_warn, log_info, format_duration,
# untrack_app (provided by common.sh)
set -eo pipefail
# Detected org — set during _sprite_validate_env.
# Passed as -o flag to all sprite CLI calls to avoid config file races
# from concurrent sprite exec calls corrupting ~/.sprites/sprites.json.
_SPRITE_ORG=""
# Helper: build org flags for sprite CLI calls.
# Outputs "-o" and the org name as separate lines for use with _sprite_cmd.
_sprite_org_flags() {
if [ -n "${_SPRITE_ORG}" ]; then
printf '%s\n%s' "-o" "${_SPRITE_ORG}"
fi
}
# Helper: run sprite CLI with org flags safely (no word-splitting).
# Usage: _sprite_cmd [extra args...]
# Reads org flags via _sprite_org_flags and builds a proper argument array.
_sprite_cmd() {
local _args
_args=()
if [ -n "${_SPRITE_ORG}" ]; then
_args+=("-o" "${_SPRITE_ORG}")
fi
sprite "${_args[@]}" "$@"
}
# Helper: fix corrupted sprite config (double-closing-brace from concurrent writes)
_sprite_fix_config() {
local cfg="${HOME}/.sprites/sprites.json"
if [ -f "${cfg}" ]; then
# Check for double-brace corruption (most common race condition pattern).
# The sprite CLI's concurrent writes append an extra } at the end.
# Use grep on the whole file for any line that is just }}
if grep -q '^}}$' "${cfg}" 2>/dev/null; then
local tmp
tmp=$(mktemp "${cfg}.XXXXXX") || return
sed 's/^}}$/}/' "${cfg}" > "${tmp}" 2>/dev/null && mv "${tmp}" "${cfg}" 2>/dev/null || rm -f "${tmp}"
fi
# Also check if last non-empty line ends with }}
local last_content
last_content=$(tail -5 "${cfg}" | grep -v '^$' | tail -1)
if printf '%s' "${last_content}" | grep -q '}}$'; then
local tmp
tmp=$(mktemp "${cfg}.XXXXXX") || return
# Replace the LAST occurrence of }} with }
sed '$ s/}}$/}/' "${cfg}" > "${tmp}" 2>/dev/null && mv "${tmp}" "${cfg}" 2>/dev/null || rm -f "${tmp}"
fi
fi
}
# ---------------------------------------------------------------------------
# _sprite_max_parallel
#
# Sprite CLI gets rate-limited with too many concurrent calls.
# Cap to 2 agents at a time.
# ---------------------------------------------------------------------------
_sprite_max_parallel() {
printf '1'
}
# ---------------------------------------------------------------------------
# _sprite_install_wait
#
# Sprite exec is slower per-call than SSH — give installs more time to complete.
# ---------------------------------------------------------------------------
_sprite_install_wait() {
printf '300'
}
# ---------------------------------------------------------------------------
# _sprite_validate_env
#
# Check that the sprite CLI is installed and credentials are valid.
# Returns 0 on success, 1 on failure.
# ---------------------------------------------------------------------------
_sprite_validate_env() {
if ! command -v sprite >/dev/null 2>&1; then
log_err "sprite CLI not found. Install from https://docs.sprite.dev"
return 1
fi
local org_output
org_output=$(sprite org list 2>/dev/null || true)
if [ -z "${org_output}" ]; then
log_err "Sprite credentials are not valid. Run: sprite auth login"
return 1
fi
# Extract org name and cache it — all subsequent sprite CLI calls use -o flag
# to avoid concurrent config file reads/writes corrupting sprites.json
_SPRITE_ORG=$(printf '%s' "${org_output}" | sed -n 's/.*Currently selected org: *//p' | awk '{print $1}')
if [ -z "${_SPRITE_ORG}" ]; then
# Fallback: try SPRITE_ORG env var
_SPRITE_ORG="${SPRITE_ORG:-}"
fi
# Validate org name contains only safe characters (alphanumeric, dash, underscore)
# to prevent injection via crafted org names in subsequent CLI calls.
if [ -n "${_SPRITE_ORG}" ] && ! printf '%s' "${_SPRITE_ORG}" | grep -qE '^[A-Za-z0-9_-]+$'; then
log_err "Invalid Sprite org name: ${_SPRITE_ORG}"
return 1
fi
if [ -n "${_SPRITE_ORG}" ]; then
log_ok "Sprite credentials validated (org: ${_SPRITE_ORG})"
else
log_ok "Sprite credentials validated"
fi
return 0
}
# ---------------------------------------------------------------------------
# _sprite_headless_env APP AGENT
#
# Print export lines to stdout for headless provisioning.
# These are eval'd by the provisioning harness before invoking the CLI.
# ---------------------------------------------------------------------------
_sprite_headless_env() {
local app="$1"
# $2 = agent (unused but part of the interface)
printf 'export SPRITE_NAME="%s"\n' "${app}"
if [ -n "${_SPRITE_ORG}" ]; then
printf 'export SPRITE_ORG="%s"\n' "${_SPRITE_ORG}"
fi
}
# ---------------------------------------------------------------------------
# _sprite_refresh_auth
#
# Re-validate Sprite credentials by running `sprite org list`. If the token
# has expired (common after ~60 min), re-run `sprite auth login --headless`
# to obtain a fresh token. Updates _SPRITE_ORG on success.
#
# Called before each E2E provisioning batch to prevent auth expiry failures
# in long-running E2E suites (73+ min). See #2934.
# ---------------------------------------------------------------------------
_sprite_refresh_auth() {
local org_output
org_output=$(sprite org list 2>/dev/null || true)
if [ -n "${org_output}" ]; then
# Token is still valid — update org in case it changed
local refreshed_org
refreshed_org=$(printf '%s' "${org_output}" | sed -n 's/.*Currently selected org: *//p' | awk '{print $1}')
if [ -n "${refreshed_org}" ]; then
_SPRITE_ORG="${refreshed_org}"
fi
log_info "Sprite auth token is still valid"
return 0
fi
# Token expired — attempt re-auth via sprite auth refresh
log_warn "Sprite auth token expired — attempting refresh..."
if sprite auth refresh >/dev/null 2>&1; then
org_output=$(sprite org list 2>/dev/null || true)
if [ -n "${org_output}" ]; then
local refreshed_org
refreshed_org=$(printf '%s' "${org_output}" | sed -n 's/.*Currently selected org: *//p' | awk '{print $1}')
if [ -n "${refreshed_org}" ]; then
_SPRITE_ORG="${refreshed_org}"
fi
log_ok "Sprite auth token refreshed successfully"
return 0
fi
fi
log_err "Sprite auth refresh failed — subsequent operations may fail"
return 1
}
# ---------------------------------------------------------------------------
# _sprite_provision_verify APP LOG_DIR
#
# Verify sprite VM exists after provisioning by checking `sprite list` output
# for the APP name. Write sentinel and metadata files for downstream steps.
#
# Retries up to 3 times with exponential backoff (5s, 10s, 20s) to handle
# transient list failures from CLI rate-limiting or config corruption (#2934).
#
# Writes:
# $LOG_DIR/$APP.ip — "sprite-cli" sentinel (no IP — Sprite uses names)
# $LOG_DIR/$APP.meta — instance metadata (JSON)
# ---------------------------------------------------------------------------
_sprite_provision_verify() {
local app="$1"
local log_dir="$2"
local _max_retries=3
local _retry_delay=5
local _attempt=0
while [ "${_attempt}" -lt "${_max_retries}" ]; do
# Fix config before each attempt (concurrent writes may corrupt it)
_sprite_fix_config
local sprite_output
sprite_output=$(_sprite_cmd list 2>/dev/null || true)
if [ -z "${sprite_output}" ]; then
_attempt=$((_attempt + 1))
if [ "${_attempt}" -lt "${_max_retries}" ]; then
log_warn "Could not list Sprite instances — retrying in ${_retry_delay}s (${_attempt}/${_max_retries})"
sleep "${_retry_delay}"
_retry_delay=$((_retry_delay * 2))
continue
fi
log_err "Could not list Sprite instances after ${_max_retries} attempts"
return 1
fi
if ! printf '%s' "${sprite_output}" | grep -qF "${app}"; then
_attempt=$((_attempt + 1))
if [ "${_attempt}" -lt "${_max_retries}" ]; then
log_warn "Sprite instance ${app} not found — retrying in ${_retry_delay}s (${_attempt}/${_max_retries})"
sleep "${_retry_delay}"
_retry_delay=$((_retry_delay * 2))
continue
fi
log_err "Sprite instance ${app} not found in sprite list after ${_max_retries} attempts"
return 1
fi
# Found the instance
log_ok "Sprite instance ${app} exists"
# Write sentinel — Sprite has no IP; use "sprite-cli" as marker
printf '%s' "sprite-cli" > "${log_dir}/${app}.ip"
# Write metadata file
printf '{"name":"%s"}\n' "${app}" > "${log_dir}/${app}.meta"
return 0
done
# Should not reach here, but guard against it
log_err "Sprite instance ${app} verification exhausted retries"
return 1
}
# ---------------------------------------------------------------------------
# _sprite_exec APP CMD
#
# Execute CMD on the Sprite instance via the sprite CLI.
# Pipes CMD via stdin to bash to avoid shell injection from embedded strings.
# Retries up to 3 times when the sprite CLI itself fails (config corruption).
# Returns the exit code of the remote command.
# ---------------------------------------------------------------------------
_sprite_exec() {
local app="$1"
local cmd="$2"
local _attempt=0
local _max=3
local _stderr_tmp
_stderr_tmp=$(mktemp /tmp/sprite-exec-err.XXXXXX) || return 1
# Base64-encode the command to prevent shell injection when passed to the
# remote bash. The encoded string contains only [A-Za-z0-9+/=] characters,
# making it safe to pipe through the sprite CLI exec interface.
local encoded_cmd
encoded_cmd=$(printf '%s' "${cmd}" | base64 | tr -d '\n')
# Validate base64 output contains only safe characters (defense-in-depth).
if ! printf '%s' "${encoded_cmd}" | grep -qE '^[A-Za-z0-9+/=]+$'; then
rm -f "${_stderr_tmp}"
return 1
fi
while [ "${_attempt}" -lt "${_max}" ]; do
_sprite_fix_config
# Decode and execute on the remote side — the encoded payload is safe
# against shell metacharacters (;, |, $(), backticks).
printf '%s' "${encoded_cmd}" | _sprite_cmd exec -s "${app}" -- bash -c 'base64 -d | bash' 2>"${_stderr_tmp}"
local _rc=$?
if [ "${_rc}" -eq 0 ]; then
rm -f "${_stderr_tmp}"
return 0
fi
# Retry on sprite CLI errors (config corruption, connection issues)
if grep -qiE 'config|migrate|initialize|connection refused' "${_stderr_tmp}" 2>/dev/null; then
_attempt=$((_attempt + 1))
if [ "${_attempt}" -lt "${_max}" ]; then
sleep 2
continue
fi
fi
rm -f "${_stderr_tmp}"
return "${_rc}"
done
rm -f "${_stderr_tmp}"
}
# ---------------------------------------------------------------------------
# _sprite_teardown APP
#
# Destroy the Sprite instance and untrack it.
# ---------------------------------------------------------------------------
_sprite_teardown() {
local app="$1"
log_step "Tearing down ${app}..."
_sprite_cmd destroy --force "${app}" >/dev/null 2>&1 || true
# Brief wait for destruction to propagate
sleep 2
# Verify deletion
local sprite_output
sprite_output=$(_sprite_cmd list 2>/dev/null || true)
if printf '%s' "${sprite_output}" | grep -qF "${app}"; then
log_warn "Sprite instance ${app} may still exist"
else
log_ok "Sprite instance ${app} torn down"
fi
untrack_app "${app}"
}
# ---------------------------------------------------------------------------
# _sprite_cleanup_stale
#
# List all Sprite instances, filter for e2e-* names, and destroy any
# older than 30 minutes (based on the unix timestamp embedded in the name).
# ---------------------------------------------------------------------------
_sprite_cleanup_stale() {
local now
now=$(date +%s)
local max_age="${_CLEANUP_MAX_AGE:-1800}" # default 30 min; pre-run uses shorter
# List all sprites
local sprite_output
sprite_output=$(_sprite_cmd list 2>/dev/null || true)
if [ -z "${sprite_output}" ]; then
log_info "Could not list Sprite instances or none found — skipping cleanup"
return 0
fi
# Extract names matching e2e-* pattern (one per line)
local instance_names
instance_names=$(printf '%s\n' "${sprite_output}" | grep -oE 'e2e-[a-zA-Z0-9_-]+' || true)
if [ -z "${instance_names}" ]; then
log_ok "No stale e2e Sprite instances found"
return 0
fi
local cleaned=0
local skipped=0
for instance_name in ${instance_names}; do
# Extract timestamp from name: e2e-AGENT-TIMESTAMP
# The timestamp is the last dash-separated segment
local ts
ts=$(printf '%s' "${instance_name}" | sed 's/.*-//')
# Validate it looks like a unix timestamp (all digits, 10 chars)
if ! printf '%s' "${ts}" | grep -qE '^[0-9]{10}$'; then
log_warn "Skipping ${instance_name} — cannot parse timestamp"
skipped=$((skipped + 1))
continue
fi
local age=$((now - ts))
if [ "${age}" -gt "${max_age}" ]; then
local age_str
age_str=$(format_duration "${age}")
log_step "Destroying stale Sprite instance ${instance_name} (age: ${age_str})"
_sprite_teardown "${instance_name}" || log_warn "Failed to tear down ${instance_name}"
cleaned=$((cleaned + 1))
else
skipped=$((skipped + 1))
fi
done
if [ "${cleaned}" -gt 0 ]; then
log_ok "Cleaned ${cleaned} stale Sprite instance(s)"
fi
if [ "${skipped}" -gt 0 ]; then
log_info "Skipped ${skipped} recent Sprite instance(s)"
fi
}