spawn/sh/e2e/lib/provision.sh

#!/bin/bash
# e2e/lib/provision.sh — Provision an agent VM via spawn CLI (cloud-agnostic)
# Strict mode: exit on an unhandled command failure (-e) and make a pipeline
# fail when any of its stages fails (pipefail). If this file is sourced, these
# options also take effect in the sourcing shell.
set -eo pipefail

# ---------------------------------------------------------------------------
# provision_agent AGENT APP_NAME LOG_DIR
#
# Runs spawn in headless mode with a timeout. The provision process hangs on
# the interactive SSH session (step 12 of the orchestration), so we kill it
# after PROVISION_TIMEOUT seconds. The install itself usually succeeds; we
# verify via instance existence and .spawnrc presence afterward.
#
# Arguments:
#   AGENT     — agent name passed to the spawn CLI
#   APP_NAME  — instance/app name; also the basename of the log files
#   LOG_DIR   — directory receiving APP_NAME.stdout, .stderr and .exit
#
# Globals read:
#   ACTIVE_CLOUD       — cloud name passed to the spawn CLI
#   PROVISION_TIMEOUT  — seconds to wait for the CLI before killing it
#   SPAWN_CLI_DIR      — optional; overrides CLI entry point resolution
#   MODEL_ID           — optional; defaults to openrouter/auto
#   OPENROUTER_API_KEY — exported to the CLI (empty when unset)
#
# Uses cloud driver functions:
#   cloud_headless_env  — cloud-specific env var exports
#   cloud_provision_verify — check instance exists, write IP + metadata
#   cloud_install_wait  — prints the max seconds to poll for .spawnrc
#   cloud_exec          — remote command execution
#
# Returns:
#   1 if the CLI entry point is missing or the instance cannot be verified;
#   0 otherwise (including when .spawnrc never shows up — later verification
#   is expected to catch real failures).
# ---------------------------------------------------------------------------
provision_agent() {
  local agent="$1"
  local app_name="$2"
  local log_dir="$3"

  local exit_file="${log_dir}/${app_name}.exit"
  local stdout_file="${log_dir}/${app_name}.stdout"
  local stderr_file="${log_dir}/${app_name}.stderr"

  # Resolve CLI entry point
  # SPAWN_CLI_DIR overrides auto-resolution — use this to force local source code
  local cli_entry
  if [ -n "${SPAWN_CLI_DIR:-}" ]; then
    cli_entry="${SPAWN_CLI_DIR}/packages/cli/src/index.ts"
  else
    cli_entry="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)/packages/cli/src/index.ts"
  fi

  if [ ! -f "${cli_entry}" ]; then
    log_err "CLI entry point not found: ${cli_entry}"
    return 1
  fi

  log_step "Provisioning ${agent} as ${app_name} on ${ACTIVE_CLOUD} (timeout: ${PROVISION_TIMEOUT}s)"

  # Remove stale exit file so the poll loop below only sees this run's result
  rm -f "${exit_file}"

  # Get cloud-specific env var exports (shell code, eval'd in the subshell)
  local cloud_env
  cloud_env=$(cloud_headless_env "${app_name}" "${agent}")

  # Environment for headless provisioning. Everything is exported inside a
  # background subshell so nothing leaks into the calling shell.
  # MODEL_ID bypasses the interactive model selection prompt (required by openclaw)
  (
    export SPAWN_NON_INTERACTIVE=1
    export SPAWN_SKIP_GITHUB_AUTH=1
    export SPAWN_SKIP_API_VALIDATION=1
    export SPAWN_NO_UPDATE_CHECK=1
    export BUN_RUNTIME_TRANSPILER_CACHE_PATH=0
    export SPAWN_CLI_DIR="${SPAWN_CLI_DIR:-}"
    export MODEL_ID="${MODEL_ID:-openrouter/auto}"
    # ":-" keeps this safe if the caller runs with nounset and no key is set
    export OPENROUTER_API_KEY="${OPENROUTER_API_KEY:-}"

    # Apply cloud-specific env vars
    eval "${cloud_env}"

    # Capture the status via "|| ..." so that errexit (set -e) cannot abort
    # this subshell before the exit file is written when the CLI fails.
    cli_status=0
    bun run "${cli_entry}" "${agent}" "${ACTIVE_CLOUD}" --headless --output json \
      > "${stdout_file}" 2> "${stderr_file}" || cli_status=$?
    printf '%s' "${cli_status}" > "${exit_file}"
  ) &
  local pid=$!

  # Poll for completion or timeout (bash 3.2 compatible — no wait -n)
  local waited=0
  while [ "${waited}" -lt "${PROVISION_TIMEOUT}" ]; do
    if [ -f "${exit_file}" ]; then
      break
    fi
    # The subshell is already gone without leaving an exit file (e.g. the
    # cloud env eval aborted it): waiting out the timeout would be pointless.
    if ! kill -0 "${pid}" 2>/dev/null; then
      break
    fi
    sleep 5
    waited=$((waited + 5))
  done

  # Probe liveness BEFORE re-checking the exit file: a process found dead here
  # can no longer create the file, so the "no exit status" branch is race-free.
  local proc_alive=0
  if kill -0 "${pid}" 2>/dev/null; then
    proc_alive=1
  fi

  if [ ! -f "${exit_file}" ]; then
    if [ "${proc_alive}" -eq 1 ]; then
      # Kill if still running (the interactive SSH session hangs)
      # NOTE(review): this signals only the wrapper subshell; whether the bun
      # child should be terminated as well is worth confirming.
      log_warn "Provision timed out after ${PROVISION_TIMEOUT}s — killing (install may still succeed)"
      kill "${pid}" 2>/dev/null || true
    else
      log_warn "Provision process exited without recording an exit status"
    fi
  fi
  # Reap the background job; its status is irrelevant here (best-effort)
  wait "${pid}" 2>/dev/null || true

  # Even if provision "failed" (timeout), the instance may exist and install may have completed.
  # Verify instance existence via cloud driver.
  if ! cloud_provision_verify "${app_name}" "${log_dir}"; then
    log_err "Instance ${app_name} does not exist after provisioning"
    if [ -f "${stderr_file}" ]; then
      log_err "Stderr tail:"
      tail -n 20 "${stderr_file}" >&2 || true
    fi
    return 1
  fi

  log_ok "Instance ${app_name} verified"

  # Wait for install to complete (.spawnrc is written near the end)
  local effective_install_wait
  effective_install_wait=$(cloud_install_wait)
  log_step "Waiting for install to complete (polling .spawnrc, up to ${effective_install_wait}s)..."
  local install_waited=0
  local install_ok=0
  while [ "${install_waited}" -lt "${effective_install_wait}" ]; do
    if cloud_exec "${app_name}" "test -f ~/.spawnrc" >/dev/null 2>&1; then
      install_ok=1
      break
    fi
    sleep 10
    install_waited=$((install_waited + 10))
  done

  if [ "${install_ok}" -eq 1 ]; then
    # Settle time for agent binary install to finish after .spawnrc is written
    sleep 5
    log_ok "Install completed (.spawnrc found)"
    return 0
  else
    log_warn ".spawnrc not found after ${effective_install_wait}s — install may still be running"
    return 0  # Continue to verification; it will catch real failures
  fi
}