spawn/sh/e2e/lib/provision.sh
Ahmed Abushagur 45caf4b96b
fix(sprite): fix all 6 Sprite agent installs for E2E (#2057)
* fix(sprite): fix all 6 Sprite agent installs for E2E

- Use `npm install -g --prefix` instead of `npm config set prefix` to
  avoid creating .npmrc that conflicts with nvm on Sprite VMs
- Fix shell environment setup to only modify .bash_profile (not .bashrc)
  so non-interactive bash -c commands retain PATH config
- Add $HOME/.cargo/bin to PATH for zeroclaw (Sprite has no ~/.cargo/env)
- Add $HOME/.local/bin to PATH config for Sprite shell environment
- Add sprite E2E cloud driver with org detection, config corruption fix,
  direct command embedding (not $1 positional), and retry logic
- Fix provision.sh to kill full process tree after timeout (prevents
  orphaned sprite exec sessions from corrupting config)
- Fix verify.sh zeroclaw check to not rely on ~/.cargo/env existing

Tested: 6/6 Sprite agents pass E2E (claude, codex, openclaw, zeroclaw,
opencode, kilocode). Hermes is not in the Sprite manifest.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: biome format - collapse runSprite call to single line

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: L <6723574+louisgv@users.noreply.github.com>
2026-03-01 07:15:09 -05:00

134 lines
4.9 KiB
Bash

#!/bin/bash
# e2e/lib/provision.sh — Provision an agent VM via spawn CLI (cloud-agnostic)
#
# Strict mode: abort on any unhandled command failure, and make a pipeline
# fail when any of its stages fails (not only the last one).
set -o errexit -o pipefail
# ---------------------------------------------------------------------------
# _provision_kill_tree PID
#
# Sends SIGTERM to PID and to every descendant discovered via `pgrep -P`,
# deepest processes first. Best-effort: processes that have already exited
# (or a missing pgrep) are silently ignored.
# ---------------------------------------------------------------------------
_provision_kill_tree() {
  local root_pid="$1"
  local child_pid
  # Word-splitting of the pgrep output is intentional: one PID per word.
  # Children are collected before the parent is signalled so they cannot be
  # re-parented (and thus lost) in between.
  for child_pid in $(pgrep -P "${root_pid}" 2>/dev/null || true); do
    _provision_kill_tree "${child_pid}"
  done
  kill "${root_pid}" 2>/dev/null || true
}

# ---------------------------------------------------------------------------
# provision_agent AGENT APP_NAME LOG_DIR
#
# Runs spawn in headless mode with a timeout. The provision process hangs on
# the interactive SSH session (step 12 of the orchestration), so we kill it
# after PROVISION_TIMEOUT seconds. The install itself usually succeeds; we
# verify via instance existence and .spawnrc presence afterward.
#
# Arguments:
#   AGENT     — agent name passed to the spawn CLI
#   APP_NAME  — instance/app name; also the basename of the log files
#   LOG_DIR   — directory receiving APP_NAME.stdout / .stderr / .exit
#
# Globals read:
#   SPAWN_CLI_DIR (optional), ACTIVE_CLOUD, PROVISION_TIMEOUT,
#   MODEL_ID (optional), OPENROUTER_API_KEY
#
# Uses cloud driver functions:
#   cloud_headless_env     — cloud-specific env var exports
#   cloud_provision_verify — check instance exists, write IP + metadata
#   cloud_install_wait     — prints the max seconds to wait for .spawnrc
#   cloud_exec             — remote command execution
#
# Returns:
#   1 if the CLI entry point is missing or the instance cannot be verified;
#   0 otherwise (including when .spawnrc never shows up — later verification
#   is expected to catch real failures).
# ---------------------------------------------------------------------------
provision_agent() {
  local agent="$1"
  local app_name="$2"
  local log_dir="$3"

  local exit_file="${log_dir}/${app_name}.exit"
  local stdout_file="${log_dir}/${app_name}.stdout"
  local stderr_file="${log_dir}/${app_name}.stderr"

  # Resolve CLI entry point
  # SPAWN_CLI_DIR overrides auto-resolution — use this to force local source code
  local cli_entry
  if [ -n "${SPAWN_CLI_DIR:-}" ]; then
    cli_entry="${SPAWN_CLI_DIR}/packages/cli/src/index.ts"
  else
    cli_entry="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)/packages/cli/src/index.ts"
  fi

  if [ ! -f "${cli_entry}" ]; then
    log_err "CLI entry point not found: ${cli_entry}"
    return 1
  fi

  log_step "Provisioning ${agent} as ${app_name} on ${ACTIVE_CLOUD} (timeout: ${PROVISION_TIMEOUT}s)"

  # Remove stale exit file — its presence is the "CLI finished" signal below.
  rm -f "${exit_file}"

  # Environment for headless provisioning, isolated in a background subshell.
  # MODEL_ID bypasses the interactive model selection prompt (required by openclaw)
  (
    export SPAWN_NON_INTERACTIVE=1
    export SPAWN_SKIP_GITHUB_AUTH=1
    export SPAWN_SKIP_API_VALIDATION=1
    export SPAWN_NO_UPDATE_CHECK=1
    export BUN_RUNTIME_TRANSPILER_CACHE_PATH=0
    export SPAWN_CLI_DIR="${SPAWN_CLI_DIR:-}"
    export MODEL_ID="${MODEL_ID:-openrouter/auto}"
    export OPENROUTER_API_KEY="${OPENROUTER_API_KEY:-}"

    # Apply cloud-specific env vars (safe: only processes export VAR="VALUE" lines)
    while IFS= read -r _env_line; do
      if [[ "${_env_line}" =~ ^export[[:space:]]+([A-Za-z_][A-Za-z0-9_]*)=\"(.*)\"$ ]]; then
        export "${BASH_REMATCH[1]}"="${BASH_REMATCH[2]}"
      fi
    done <<CLOUD_ENV
$(cloud_headless_env "${app_name}" "${agent}")
CLOUD_ENV

    # Capture the status through `||` so that errexit (set -e) cannot abort the
    # subshell before the exit file is written. Without this a failing CLI
    # leaves no exit file and the poll loop below waits out the full timeout.
    _bun_status=0
    bun run "${cli_entry}" "${agent}" "${ACTIVE_CLOUD}" --headless --output json \
      > "${stdout_file}" 2> "${stderr_file}" || _bun_status=$?
    printf '%s' "${_bun_status}" > "${exit_file}"
  ) &
  local pid=$!

  # Poll for completion or timeout (bash 3.2 compatible — no wait -n)
  local waited=0
  while [ "${waited}" -lt "${PROVISION_TIMEOUT}" ]; do
    if [ -f "${exit_file}" ]; then
      break
    fi
    sleep 5
    waited=$((waited + 5))
  done

  if [ ! -f "${exit_file}" ]; then
    # Still running (the interactive SSH/CLI session hangs) — tear it down.
    log_warn "Provision timed out after ${PROVISION_TIMEOUT}s — killing (install may still succeed)"
    # Kill the entire process tree — the subshell spawns bun → sprite exec -tty
    # which won't die from just killing the subshell PID. Without this, orphaned
    # sprite exec sessions keep running and corrupt the sprite config file.
    _provision_kill_tree "${pid}"
    wait "${pid}" 2>/dev/null || true
    # Also kill any lingering sprite exec processes for this specific app
    pkill -f "sprite.*exec.*${app_name}" 2>/dev/null || true
    sleep 1
  else
    # CLI finished on its own: reap the subshell so it has fully exited
    # (and finished writing the exit file) before we continue.
    wait "${pid}" 2>/dev/null || true
  fi

  # Even if provision "failed" (timeout), the instance may exist and install may have completed.
  # Verify instance existence via cloud driver.
  if ! cloud_provision_verify "${app_name}" "${log_dir}"; then
    log_err "Instance ${app_name} does not exist after provisioning"
    if [ -f "${stderr_file}" ]; then
      log_err "Stderr tail:"
      tail -20 "${stderr_file}" >&2 || true
    fi
    return 1
  fi
  log_ok "Instance ${app_name} verified"

  # Wait for install to complete (.spawnrc is written near the end)
  local effective_install_wait
  effective_install_wait=$(cloud_install_wait)
  log_step "Waiting for install to complete (polling .spawnrc, up to ${effective_install_wait}s)..."

  local install_waited=0
  local install_ok=0
  while [ "${install_waited}" -lt "${effective_install_wait}" ]; do
    if cloud_exec "${app_name}" "test -f ~/.spawnrc" >/dev/null 2>&1; then
      install_ok=1
      break
    fi
    sleep 10
    install_waited=$((install_waited + 10))
  done

  if [ "${install_ok}" -eq 1 ]; then
    # Settle time for agent binary install to finish after .spawnrc is written
    sleep 5
    log_ok "Install completed (.spawnrc found)"
    return 0
  fi

  log_warn ".spawnrc not found after ${effective_install_wait}s — install may still be running"
  return 0 # Continue to verification; it will catch real failures
}