spawn/sh/e2e/lib/provision.sh

#!/bin/bash
# e2e/lib/provision.sh — Provision an agent VM via spawn CLI (cloud-agnostic)
#
# Provides provision_agent. The log_* helpers and cloud_* driver functions it
# calls are not defined in the visible part of this file; presumably the e2e
# harness sources them before this library — confirm against the caller.
#
# Shell options: abort on any unhandled non-zero status (-e) and make a
# pipeline fail when any of its stages fails (pipefail). nounset (-u) is not
# enabled here.
set -eo pipefail

# ---------------------------------------------------------------------------
# provision_agent AGENT APP_NAME LOG_DIR
#
# Runs spawn in headless mode with a timeout. The provision process hangs on
# the interactive SSH session (step 12 of the orchestration), so we kill it
# after PROVISION_TIMEOUT seconds. The install itself usually succeeds; we
# verify via instance existence and .spawnrc presence afterward.
#
# Arguments:
#   AGENT     — agent name, passed to the CLI as its first positional argument
#   APP_NAME  — instance/app name; also the basename of the log files
#   LOG_DIR   — directory receiving <APP_NAME>.stdout, .stderr and .exit
#               (this function does not create it)
#
# Globals read:
#   ACTIVE_CLOUD        — cloud name, passed to the CLI as its second argument
#   PROVISION_TIMEOUT   — seconds to wait for the CLI before killing it
#   SPAWN_CLI_DIR       — optional; overrides the auto-resolved CLI checkout
#   MODEL_ID            — optional; defaults to openrouter/auto
#   OPENROUTER_API_KEY  — forwarded to the CLI (empty when unset)
#
# Uses cloud driver functions:
#   cloud_headless_env  — cloud-specific env var exports
#   cloud_provision_verify — check instance exists, write IP + metadata
#   cloud_install_wait  — prints the max seconds to wait for .spawnrc
#   cloud_exec          — remote command execution
#
# Returns:
#   0 when the instance was verified (even if .spawnrc never appeared — later
#     verification is expected to catch real failures)
#   1 on missing arguments, a missing CLI entry point, or when the instance
#     cannot be verified after provisioning
# ---------------------------------------------------------------------------
provision_agent() {
  local agent="${1:-}"
  local app_name="${2:-}"
  local log_dir="${3:-}"

  # Refuse empty arguments: an empty APP_NAME would turn the cleanup pattern
  # further down into "sprite.*exec.*", which matches every sprite exec
  # process on the machine, not just the ones belonging to this run.
  if [ -z "${agent}" ] || [ -z "${app_name}" ] || [ -z "${log_dir}" ]; then
    log_err "Usage: provision_agent AGENT APP_NAME LOG_DIR"
    return 1
  fi

  local exit_file="${log_dir}/${app_name}.exit"
  local stdout_file="${log_dir}/${app_name}.stdout"
  local stderr_file="${log_dir}/${app_name}.stderr"

  # Resolve CLI entry point
  # SPAWN_CLI_DIR overrides auto-resolution — use this to force local source code
  local cli_entry
  if [ -n "${SPAWN_CLI_DIR:-}" ]; then
    cli_entry="${SPAWN_CLI_DIR}/packages/cli/src/index.ts"
  else
    # Default: three directories above the directory holding this file.
    cli_entry="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)/packages/cli/src/index.ts"
  fi

  if [ ! -f "${cli_entry}" ]; then
    log_err "CLI entry point not found: ${cli_entry}"
    return 1
  fi

  log_step "Provisioning ${agent} as ${app_name} on ${ACTIVE_CLOUD} (timeout: ${PROVISION_TIMEOUT}s)"

  # Remove stale exit file — its presence is the "CLI finished" signal below
  rm -f "${exit_file}"

  # Environment for headless provisioning. Runs in a background subshell so
  # the exports never leak into the caller's environment.
  # MODEL_ID bypasses the interactive model selection prompt (required by openclaw)
  (
    export SPAWN_NON_INTERACTIVE=1
    export SPAWN_SKIP_GITHUB_AUTH=1
    export SPAWN_SKIP_API_VALIDATION=1
    export SPAWN_NO_UPDATE_CHECK=1
    export BUN_RUNTIME_TRANSPILER_CACHE_PATH=0
    export SPAWN_CLI_DIR="${SPAWN_CLI_DIR:-}"
    export MODEL_ID="${MODEL_ID:-openrouter/auto}"
    export OPENROUTER_API_KEY="${OPENROUTER_API_KEY:-}"

    # Apply cloud-specific env vars (safe: only processes export VAR="VALUE" lines)
    # Lines are matched and exported literally — never eval'd or sourced.
    while IFS= read -r _env_line; do
      if [[ "${_env_line}" =~ ^export[[:space:]]+([A-Za-z_][A-Za-z0-9_]*)=\"(.*)\"$ ]]; then
        export "${BASH_REMATCH[1]}"="${BASH_REMATCH[2]}"
      fi
    done <<CLOUD_ENV
$(cloud_headless_env "${app_name}" "${agent}")
CLOUD_ENV

    # Capture the status via "|| rc=$?": with errexit active, a plain failing
    # command would terminate this subshell before the exit file is written,
    # making a fast failure indistinguishable from a hang.
    _provision_rc=0
    bun run "${cli_entry}" "${agent}" "${ACTIVE_CLOUD}" --headless --output json \
      > "${stdout_file}" 2> "${stderr_file}" || _provision_rc=$?
    printf '%s' "${_provision_rc}" > "${exit_file}"
  ) &
  local pid=$!

  # Poll for completion or timeout (bash 3.2 compatible — no wait -n).
  # Also stop as soon as the background process is gone, so a subshell that
  # died without writing the exit file does not burn the whole timeout.
  local waited=0
  while [ "${waited}" -lt "${PROVISION_TIMEOUT}" ]; do
    if [ -f "${exit_file}" ]; then
      break
    fi
    if ! kill -0 "${pid}" 2>/dev/null; then
      break
    fi
    sleep 5
    waited=$((waited + 5))
  done

  # No exit file: either the CLI is still running (the interactive SSH/CLI
  # session hangs) or the subshell died before it could record a status.
  if [ ! -f "${exit_file}" ]; then
    if kill -0 "${pid}" 2>/dev/null; then
      log_warn "Provision timed out after ${PROVISION_TIMEOUT}s — killing (install may still succeed)"
    else
      log_warn "Provision process exited without writing ${exit_file}"
    fi
    # The subshell spawns bun → sprite exec -tty, which won't die from just
    # killing the subshell PID. pkill -P only reaches the subshell's direct
    # children (bun); deeper descendants are handled by the pattern kill
    # below. Without this, orphaned sprite exec sessions keep running and
    # corrupt the sprite config file. All of these are best-effort.
    pkill -P "${pid}" 2>/dev/null || true
    kill "${pid}" 2>/dev/null || true
    wait "${pid}" 2>/dev/null || true
    # Also kill any lingering sprite exec processes for this specific app
    pkill -f "sprite.*exec.*${app_name}" 2>/dev/null || true
    sleep 1
  fi

  # Even if provision "failed" (timeout), the instance may exist and install may have completed.
  # Verify instance existence via cloud driver.
  if ! cloud_provision_verify "${app_name}" "${log_dir}"; then
    log_err "Instance ${app_name} does not exist after provisioning"
    if [ -f "${stderr_file}" ]; then
      log_err "Stderr tail:"
      tail -20 "${stderr_file}" >&2 || true
    fi
    return 1
  fi

  log_ok "Instance ${app_name} verified"

  # Wait for install to complete (.spawnrc is written near the end)
  local effective_install_wait
  effective_install_wait=$(cloud_install_wait)
  log_step "Waiting for install to complete (polling .spawnrc, up to ${effective_install_wait}s)..."
  local install_waited=0
  local install_ok=0
  while [ "${install_waited}" -lt "${effective_install_wait}" ]; do
    if cloud_exec "${app_name}" "test -f ~/.spawnrc" >/dev/null 2>&1; then
      install_ok=1
      break
    fi
    sleep 10
    install_waited=$((install_waited + 10))
  done

  if [ "${install_ok}" -eq 1 ]; then
    # Settle time for agent binary install to finish after .spawnrc is written
    sleep 5
    log_ok "Install completed (.spawnrc found)"
    return 0
  else
    log_warn ".spawnrc not found after ${effective_install_wait}s — install may still be running"
    return 0  # Continue to verification; it will catch real failures
  fi
}