spawn/test/e2e.sh
L 32522882c1
feat: remove OVH cloud and make featured_cloud an array (#1474)
- Remove OVH as a cloud provider: delete ovh/ directory (lib + 11 agent
  scripts), remove from manifest.json clouds and all ovh/* matrix entries,
  update README matrix table, remove OVH destroy case in CLI commands,
  and clean up all test harness references (mock.sh, mock-curl-script.sh,
  record.sh, e2e.sh, cloud-lib-api-surface.test.ts, test-infra-sync.test.ts)

- Make featured_cloud an array (string[]) so agents can recommend multiple
  clouds; update manifest.ts type, all 10 manifest.json values, and the
  prioritizeCloudsByCredentials() comparison in commands.ts

- Sandbox OAuth in subprocess tests: add OPENROUTER_API_KEY=sk-or-test-fake
  to the default env in cli-entry-edge-cases.test.ts and
  cmdrun-resolution.test.ts so get_or_prompt_api_key() never triggers the
  real OAuth browser flow during test runs

- Fix upload-file-security.test.ts SSH cloud count (5→4) after OVH removal

- Bump CLI version 0.5.6 → 0.5.7

Co-authored-by: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
2026-02-19 11:06:27 -08:00

1377 lines
43 KiB
Bash

#!/bin/bash
set -eo pipefail
# E2E Tests — Real server provisioning, agent install, and verification
# By default runs ONE agent per cloud (smoke test). Use --all for the full matrix.
#
# Usage:
# bash test/e2e.sh # One agent per cloud (smoke test)
# bash test/e2e.sh --all # All agents on all clouds (full matrix)
# bash test/e2e.sh fly # One agent on fly
# bash test/e2e.sh fly openclaw # Single combo
# bash test/e2e.sh fly --all # All agents on fly
# bash test/e2e.sh --cleanup # Destroy stale e2e-* servers
# bash test/e2e.sh --history # Show timing history
# bash test/e2e.sh --compare openclaw # Compare agent across clouds
#
# Environment:
# OPENROUTER_API_KEY — Required for all tests
# E2E_CANARY_AGENT — Agent to use for smoke tests (default: openclaw)
# E2E_AUTO_FIX — Set to "1" to spawn Claude agents for failures (default: 0)
# E2E_OPTIMIZE — Set to "1" to spawn Claude agents for slow-but-passing tests (default: 0)
# E2E_TIMEOUT — Per-combo timeout in seconds (default: 900)
#
# Each agent script runs with SPAWN_NON_INTERACTIVE=1 so safe_read() fails
# immediately instead of hanging on /dev/tty. Cloud-specific env vars
# (HETZNER_LOCATION, FLY_REGION, etc.) are auto-set to sane defaults.
# Absolute path of the repository root: the parent of the directory holding this script.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# The tunables below honour a value already present in the environment and
# otherwise fall back to the default written after ":-".
E2E_TIMEOUT="${E2E_TIMEOUT:-900}"
E2E_AUTO_FIX="${E2E_AUTO_FIX:-0}"
E2E_OPTIMIZE="${E2E_OPTIMIZE:-0}"
# Switched to 1 when main() sees the --all flag.
E2E_ALL=0
E2E_CANARY_AGENT="${E2E_CANARY_AGENT:-openclaw}"
# Scratch directory for logs and result files; main() creates it with mktemp -d
# and _cleanup_e2e removes it on exit.
E2E_RESULTS_DIR=""
# Prefix of every server name generated by run_e2e_test; stale-server cleanup
# matches names starting with "<prefix>-".
E2E_SERVER_PREFIX="e2e"
# Whitespace-separated PIDs that _cleanup_e2e kills on exit.
E2E_PIDS=""
# JSON file holding the per-combo timing history (read and written via python3).
E2E_TIMINGS_FILE="${REPO_ROOT}/.docs/e2e-timings.json"
E2E_SLOW_THRESHOLD=180 # seconds — flag as slow even if passing
# --- Logging ---
# Print one "[HH:MM:SS] [e2e] <message>" line on stdout.
# All arguments are joined into a single message.
_e2e_log() {
  local message="$*"
  printf '[%s] [e2e] %s\n' "$(date +'%H:%M:%S')" "${message}"
}
# Print a green check mark followed by the message (all arguments joined).
_e2e_pass() {
  local green='\033[32m' reset='\033[0m'
  printf " ${green}✓${reset} %s\n" "$*"
}
# Print a red cross followed by the message (all arguments joined).
_e2e_fail() {
  local red='\033[31m' reset='\033[0m'
  printf " ${red}✗${reset} %s\n" "$*"
}
# --- Cloud config lookup (bash 3.2 compatible — no associative arrays) ---
# Map a cloud name ($1) to the environment variable that carries the
# server/app name for that cloud. Prints an empty line for unknown clouds.
_get_name_env_var() {
  local cloud="$1" var_name=""
  case "${cloud}" in
    fly)          var_name="FLY_APP_NAME" ;;
    hetzner)      var_name="HETZNER_SERVER_NAME" ;;
    digitalocean) var_name="DO_DROPLET_NAME" ;;
    aws)          var_name="LIGHTSAIL_SERVER_NAME" ;;
    daytona)      var_name="DAYTONA_SANDBOX_NAME" ;;
    gcp)          var_name="GCP_INSTANCE_NAME" ;;
    sprite)       var_name="SPRITE_NAME" ;;
  esac
  printf '%s\n' "${var_name}"
}
# Map a cloud name ($1) to the environment variable that carries its API
# token. Prints an empty line for clouds without a token variable.
_get_token_env_var() {
  local cloud="$1" var_name=""
  case "${cloud}" in
    fly)          var_name="FLY_API_TOKEN" ;;
    hetzner)      var_name="HCLOUD_TOKEN" ;;
    digitalocean) var_name="DO_API_TOKEN" ;;
    daytona)      var_name="DAYTONA_API_KEY" ;;
  esac
  printf '%s\n' "${var_name}"
}
# --- Credential helpers ---
# Try to load a token from the spawn config file into the cloud's env var.
# Arguments: $1 - cloud name
# Reads:     ~/.config/spawn/<cloud>.json, keys "api_key" then "token"
# Returns:   0 if the env var is (or already was) set, 1 otherwise
_load_token_from_config() {
  local cloud="$1"
  local token_var
  token_var=$(_get_token_env_var "$cloud")
  [[ -n "$token_var" ]] || return 1
  # Already set — nothing to do
  [[ -z "${!token_var:-}" ]] || return 0
  local config_file="${HOME}/.config/spawn/${cloud}.json"
  [[ -f "$config_file" ]] || return 1
  local saved
  # The trailing `or ""` keeps a JSON null from being printed as the literal
  # string "None" and then exported as if it were a credential.
  # A malformed or unreadable file yields an empty value instead of aborting
  # the caller under `set -e`.
  saved=$(python3 -c '
import json, sys

with open(sys.argv[1]) as fh:
    data = json.load(fh)
print(data.get("api_key") or data.get("token") or "")
' "$config_file" 2>/dev/null) || saved=""
  if [[ -n "$saved" ]]; then
    export "$token_var=$saved"
    return 0
  fi
  return 1
}
# Interactive credential collection — runs BEFORE the non-interactive tests.
# For each token-based cloud, makes sure the token env var is set by:
#   1. Checking the env var
#   2. Loading from ~/.config/spawn/{cloud}.json
#   3. (fly only) asking the fly CLI
#   4. Prompting the user on /dev/tty (Enter to skip)
# Arguments: $1 - whitespace-separated list of cloud names
# Outputs:   log lines for loaded tokens and a summary of skipped clouds
_collect_credentials() {
  local clouds="$1"
  local skipped=""
  # Declared local so the loop does not clobber a caller's variables.
  local cloud token_var token
  # $clouds is a whitespace-separated list, so word splitting is intended.
  for cloud in $clouds; do
    token_var=$(_get_token_env_var "$cloud")
    # CLI-auth clouds (aws, gcp, sprite) — no token to collect
    [[ -n "$token_var" ]] || continue
    # Already in env?
    [[ -z "${!token_var:-}" ]] || continue
    # Try config file
    if _load_token_from_config "$cloud"; then
      _e2e_log "Loaded ${token_var} from ~/.config/spawn/${cloud}.json"
      continue
    fi
    # Fly: try CLI auth (fly auth token)
    if [[ "$cloud" == "fly" ]] && _try_fly_cli_token; then
      _e2e_log "Loaded FLY_API_TOKEN from fly CLI auth"
      continue
    fi
    # No TTY? Can't prompt — skip. The braces make 2>/dev/null also cover the
    # shell's own "cannot open /dev/tty" message.
    if ! { : > /dev/tty; } 2>/dev/null; then
      skipped="${skipped} ${cloud}"
      continue
    fi
    # Interactive prompt
    printf ' %s: paste %s (Enter to skip): ' "$cloud" "$token_var"
    token=""
    # EOF on the terminal counts as "skip" rather than aborting under set -e.
    read -r token </dev/tty || true
    if [[ -n "$token" ]]; then
      export "$token_var=$token"
    else
      skipped="${skipped} ${cloud}"
    fi
  done
  if [[ -n "$skipped" ]]; then
    _e2e_log "Skipped (no credentials):${skipped}"
  fi
}
# Ask the Fly CLI ("fly" or, failing that, "flyctl") for an auth token and
# export it as FLY_API_TOKEN.
# Returns 0 when a non-empty token was exported, 1 otherwise.
_try_fly_cli_token() {
  local candidate cli=""
  for candidate in fly flyctl; do
    if command -v "${candidate}" &>/dev/null; then
      cli="${candidate}"
      break
    fi
  done
  [[ -n "${cli}" ]] || return 1

  local fetched
  fetched=$("${cli}" auth token 2>/dev/null) || return 1
  [[ -n "${fetched}" ]] || return 1
  export FLY_API_TOKEN="${fetched}"
  return 0
}
# --- Credential check ---
# Non-interactive check for whether credentials exist for a cloud ($1).
# Returns 0 when credentials look available, 1 otherwise.
_cloud_has_credentials() {
  local cloud="$1"
  local token_var
  token_var=$(_get_token_env_var "$cloud")

  # Clouds authenticated through their own CLI rather than a token env var
  case "$cloud" in
    aws)
      if command -v aws &>/dev/null; then
        aws sts get-caller-identity &>/dev/null
        return $?
      fi
      return 1
      ;;
    gcp)
      if command -v gcloud &>/dev/null; then
        gcloud auth print-access-token &>/dev/null
        return $?
      fi
      return 1
      ;;
    sprite)
      command -v sprite &>/dev/null
      return $?
      ;;
    local)
      return 0
      ;;
  esac

  # Token-based clouds: env var first, then the spawn config file, then
  # (fly only) the CLI.
  [[ -n "$token_var" ]] || return 1
  [[ -z "${!token_var:-}" ]] || return 0
  # Only the presence of the config file is checked here, not its contents.
  [[ ! -f "${HOME}/.config/spawn/${cloud}.json" ]] || return 0
  if [[ "$cloud" == "fly" ]] && _try_fly_cli_token &>/dev/null; then
    return 0
  fi
  return 1
}
# --- Cleanup ---
# Exit handler: kill leftover background test jobs, remove the results
# directory, and exit with the status the script was ending with.
# Globals: E2E_PIDS (read), E2E_RESULTS_DIR (read)
_cleanup_e2e() {
  local exit_code=$?
  # Disarm first so the `exit` at the bottom cannot re-enter this handler.
  trap - EXIT
  local pid
  # Kill any remaining background test jobs
  if [[ -n "${E2E_PIDS:-}" ]]; then
    for pid in ${E2E_PIDS}; do
      kill "$pid" 2>/dev/null || true
    done
  fi
  # Clean up results dir
  if [[ -n "${E2E_RESULTS_DIR:-}" && -d "${E2E_RESULTS_DIR}" ]]; then
    rm -rf -- "${E2E_RESULTS_DIR}"
  fi
  exit "$exit_code"
}
trap _cleanup_e2e EXIT
# Signals are turned into an `exit` with the conventional 128+N status; the
# EXIT trap above then does the cleanup once. Routing the signals straight to
# _cleanup_e2e would make it exit with the status of whatever command last
# finished, which can be 0.
trap 'exit 143' SIGTERM
trap 'exit 130' SIGINT
# --- macOS-compatible timeout ---
# Run a command in the background and poll it once per second.
# Arguments: $1 - time limit in seconds; remaining args - the command
# Returns:   the command's exit status, or 124 if it was still running when
#            the limit was reached (it is then sent TERM, and KILL a second later)
_run_with_timeout() {
  local limit="$1"
  shift
  "$@" &
  local child=$!
  local waited=0
  while kill -0 "${child}" 2>/dev/null; do
    if [[ "${waited}" -ge "${limit}" ]]; then
      # Polite TERM first, then force-kill after a one second grace period.
      kill "${child}" 2>/dev/null
      sleep 1
      kill -9 "${child}" 2>/dev/null || true
      wait "${child}" 2>/dev/null || true
      return 124
    fi
    sleep 1
    waited=$((waited + 1))
  done
  # The child is gone; reap it and pass its status through.
  wait "${child}" 2>/dev/null
}
# --- Stale server cleanup ---
# Read a JSON listing on stdin and print one field of every entry whose
# "name" starts with "${E2E_SERVER_PREFIX}-".
# Arguments: $1 - key of the list inside the JSON object (a top-level JSON
#                 array is also accepted)
#            $2 - field to print for each match
# Values travel through argv, never by splicing shell text into Python source.
# Parse errors produce no output (best effort).
_e2e_list_stale_resources() {
  local list_key="$1" out_field="$2"
  python3 -c '
import json, sys

list_key, out_field, prefix = sys.argv[1:4]
data = json.loads(sys.stdin.read())
items = data if isinstance(data, list) else data.get(list_key, [])
for item in items:
    if item.get("name", "").startswith(prefix):
        print(item.get(out_field, ""))
' "$list_key" "$out_field" "${E2E_SERVER_PREFIX}-" 2>/dev/null || true
}

# Destroy every server on a cloud whose name starts with "${E2E_SERVER_PREFIX}-".
# Arguments: $1 - cloud name (fly, hetzner, digitalocean; others are no-ops)
# Sources the cloud's lib/common.sh and relies on its API helper and
# destroy_server. Listing or destroy failures are ignored (best effort).
_cleanup_stale_servers() {
  local cloud="$1"
  local listing stale_ids resource
  _e2e_log "Cleaning up stale ${E2E_SERVER_PREFIX}-* servers on ${cloud}..."
  case "$cloud" in
    fly)
      source "${REPO_ROOT}/fly/lib/common.sh"
      local org
      org=$(get_fly_org 2>/dev/null) || return 0
      listing=$(fly_api GET "/apps?org_slug=$org" 2>/dev/null) || return 0
      stale_ids=$(printf '%s' "$listing" | _e2e_list_stale_resources apps name)
      for resource in $stale_ids; do
        _e2e_log " Destroying stale app: $resource"
        destroy_server "$resource" 2>/dev/null || true
      done
      ;;
    hetzner)
      source "${REPO_ROOT}/hetzner/lib/common.sh"
      listing=$(hetzner_api GET "/servers" 2>/dev/null) || return 0
      stale_ids=$(printf '%s' "$listing" | _e2e_list_stale_resources servers id)
      for resource in $stale_ids; do
        _e2e_log " Destroying stale server: $resource"
        destroy_server "$resource" 2>/dev/null || true
      done
      ;;
    digitalocean)
      source "${REPO_ROOT}/digitalocean/lib/common.sh"
      listing=$(do_api GET "/droplets" 2>/dev/null) || return 0
      stale_ids=$(printf '%s' "$listing" | _e2e_list_stale_resources droplets id)
      for resource in $stale_ids; do
        _e2e_log " Destroying stale droplet: $resource"
        destroy_server "$resource" 2>/dev/null || true
      done
      ;;
  esac
}
# Read a JSON object on stdin and print the "id" of the first entry in list
# $1 whose "name" equals $2 exactly. The name travels through argv, so quotes
# or other special characters in it cannot alter the Python program.
# Exits non-zero when the input cannot be parsed.
_e2e_lookup_id_by_name() {
  python3 -c '
import json, sys

list_key, wanted = sys.argv[1:3]
data = json.loads(sys.stdin.read())
for item in data.get(list_key, []):
    if item.get("name") == wanted:
        print(item["id"])
        break
' "$1" "$2"
}

# Destroy a specific e2e test server by name.
# Clouds that take a name directly are easy; others need a name→ID lookup.
# Arguments: $1 - cloud name, $2 - server name
# Every failure (missing lib, API error, unknown server) is ignored so that
# cleanup never fails the test run.
_destroy_e2e_server() {
  local cloud="$1" server_name="$2"
  local listing resource_id
  case "$cloud" in
    fly | aws | gcp | daytona)
      # destroy_server takes the name as-is on these clouds
      source "${REPO_ROOT}/${cloud}/lib/common.sh" 2>/dev/null || return 0
      destroy_server "$server_name" 2>/dev/null || true
      ;;
    hetzner)
      source "${REPO_ROOT}/hetzner/lib/common.sh" 2>/dev/null || return 0
      listing=$(hetzner_api GET "/servers?name=${server_name}" 2>/dev/null) || return 0
      resource_id=$(printf '%s' "$listing" \
        | _e2e_lookup_id_by_name servers "$server_name" 2>/dev/null) || return 0
      if [[ -n "$resource_id" ]]; then
        destroy_server "$resource_id" 2>/dev/null || true
      fi
      ;;
    digitalocean)
      source "${REPO_ROOT}/digitalocean/lib/common.sh" 2>/dev/null || return 0
      # NOTE(review): the query filters by tag_name while the match below is on
      # the droplet name — confirm droplets are tagged with their own name,
      # otherwise this lookup finds nothing and the droplet is leaked.
      listing=$(do_api GET "/droplets?tag_name=${server_name}" 2>/dev/null) || return 0
      resource_id=$(printf '%s' "$listing" \
        | _e2e_lookup_id_by_name droplets "$server_name" 2>/dev/null) || return 0
      if [[ -n "$resource_id" ]]; then
        destroy_server "$resource_id" 2>/dev/null || true
      fi
      ;;
  esac
}
# --- Non-interactive env setup ---
# Export everything an agent script needs to run without prompting.
# Used by both the pre-flight step and the per-combo tests.
# Arguments: $1 - cloud name
# Values already present (and non-empty) in the environment are kept; only
# missing ones receive the defaults below.
_setup_noninteractive_env() {
  local target="$1"
  : "${MODEL_ID:=openrouter/auto}"
  export SPAWN_NON_INTERACTIVE=1 SPAWN_SKIP_GITHUB_AUTH=1 MODEL_ID

  if [[ "${target}" == "hetzner" ]]; then
    : "${HETZNER_LOCATION:=fsn1}" "${HETZNER_SERVER_TYPE:=cx23}"
    export HETZNER_LOCATION HETZNER_SERVER_TYPE
  elif [[ "${target}" == "fly" ]]; then
    : "${FLY_REGION:=iad}" "${FLY_VM_SIZE:=shared-cpu-1x}" "${FLY_VM_MEMORY:=1024}"
    export FLY_REGION FLY_VM_SIZE FLY_VM_MEMORY
  elif [[ "${target}" == "gcp" ]]; then
    : "${GCP_ZONE:=us-central1-a}" "${GCP_MACHINE_TYPE:=e2-micro}"
    export GCP_ZONE GCP_MACHINE_TYPE
  fi
}
# --- Per-cloud preflight ---
# Run cloud_authenticate() once per cloud BEFORE the parallel agent tests, so
# the parallel agent scripts don't race on the same shared resources.
# Arguments: $1 - cloud name
# Globals:   E2E_RESULTS_DIR (read), REPO_ROOT (read); exports the cloud's
#            token env var when authentication leaves one set
# Returns:   0 if authentication succeeded, 1 otherwise
_preflight_cloud() {
  local cloud="$1"
  local log_file="${E2E_RESULTS_DIR}/preflight_${cloud}.log"
  local env_file="${E2E_RESULTS_DIR}/preflight_${cloud}.env"
  _e2e_log "Pre-flight: ${cloud}..."
  # Run cloud_authenticate in a subshell, then dump the validated token
  # so the parent can export it for agent scripts.
  local token_var
  token_var=$(_get_token_env_var "$cloud")
  local rc=0
  (
    _setup_noninteractive_env "$cloud"
    # Fail explicitly: this function is called from an `if`, where errexit is
    # suspended, so without these exits the subshell would report the status
    # of its last statement and a failed authentication would look successful.
    source "${REPO_ROOT}/${cloud}/lib/common.sh" || exit 1
    cloud_authenticate || exit $?
    # Write validated token to env file for parent to pick up
    if [[ -n "$token_var" ]] && [[ -n "${!token_var:-}" ]]; then
      printf '%s' "${!token_var}" > "$env_file"
    fi
  ) > "$log_file" 2>&1 || rc=$?
  if [[ $rc -ne 0 ]]; then
    local last_err
    last_err=$(grep -iE "error|fail|cannot|not found|invalid" "$log_file" 2>/dev/null | tail -1 || true)
    _e2e_fail "pre-flight ${cloud}: ${last_err:-exit code $rc}"
    return 1
  fi
  # Import validated token into parent so agent scripts skip re-validation
  if [[ -n "$token_var" ]] && [[ -f "$env_file" ]] && [[ -s "$env_file" ]]; then
    local token_val
    token_val=$(cat "$env_file")
    export "$token_var=$token_val"
    rm -f "$env_file"
  fi
  _e2e_pass "pre-flight ${cloud}"
  return 0
}
# --- Per-combo test function ---
# Run one cloud/agent combo end to end and record the outcome.
# Arguments: $1 - cloud name, $2 - agent name
# Writes <cloud>_<agent>.{log,result,timing,reason} under E2E_RESULTS_DIR,
# destroys the test server afterwards, and prints a pass/fail line.
run_e2e_test() {
  local cloud="$1" agent="$2"
  local server_name="${E2E_SERVER_PREFIX}-${agent}-$(date +%s)-$$"
  local out_base="${E2E_RESULTS_DIR}/${cloud}_${agent}"
  local log_file="${out_base}.log"
  local began
  began=$(date +%s)
  _e2e_log "${cloud}/${agent} starting..."

  # Pre-set the cloud-specific name variable so the agent script does not
  # prompt for a server name.
  local name_var
  name_var=$(_get_name_env_var "$cloud")
  [[ -z "$name_var" ]] || export "${name_var}=${server_name}"
  _setup_noninteractive_env "$cloud"

  # stdin comes from /dev/null so nothing can wait on interactive input.
  local exit_code=0
  _run_with_timeout "$E2E_TIMEOUT" bash "${REPO_ROOT}/${cloud}/${agent}.sh" \
    < /dev/null > "$log_file" 2>&1 || exit_code=$?
  local elapsed=$(( $(date +%s) - began ))

  # The script always "fails" at the interactive session step (no TTY), so
  # success is judged by the "setup completed successfully" marker in the log.
  local result reason
  if [[ "$exit_code" -eq 124 ]]; then
    result="fail"
    reason="timeout (${E2E_TIMEOUT}s)"
  elif grep -q "setup completed successfully" "$log_file" 2>/dev/null; then
    result="pass"
    reason="setup complete (session expected to fail without TTY)"
  else
    result="fail"
    reason="exit code ${exit_code}"
    # Append the first of the last three error-looking log lines, if any.
    local first_hit
    first_hit=$(grep -iE "error|fail|fatal|cannot|not found" "$log_file" 2>/dev/null | tail -3 | head -1 || true)
    [[ -z "$first_hit" ]] || reason="${reason}: ${first_hit}"
  fi

  printf '%s\n' "$result" > "${out_base}.result"
  printf '%s\n' "$elapsed" > "${out_base}.timing"
  printf '%s\n' "$reason" > "${out_base}.reason"

  # Destroy the test server — don't leak cloud resources
  _destroy_e2e_server "$cloud" "$server_name"

  case "$result" in
    pass) _e2e_pass "${cloud}/${agent} ${elapsed}s" ;;
    *)    _e2e_fail "${cloud}/${agent} ${elapsed}s (${reason})" ;;
  esac
}
# --- Auto-fix function ---
# Find the same agent's script under another top-level directory of the repo.
# Arguments: $1 - agent name, $2 - cloud directory name to skip
# Outputs:   path of the first match in glob order, without a trailing newline
# Returns:   0 if a script was found, 1 otherwise
_find_working_reference() {
  local agent="$1" exclude_cloud="$2"
  # Declared local so the loop does not leak into the caller's scope.
  local cloud_dir cloud_name
  for cloud_dir in "${REPO_ROOT}"/*/; do
    # Each glob result ends in "/"; strip it and keep the last path component.
    cloud_name="${cloud_dir%/}"
    cloud_name="${cloud_name##*/}"
    [[ "$cloud_name" != "$exclude_cloud" ]] || continue
    [[ -f "${cloud_dir}${agent}.sh" ]] || continue
    printf '%s' "${cloud_dir}${agent}.sh"
    return 0
  done
  return 1
}
# Emit a markdown fragment describing one failing combo: the tail of its log,
# the failing script, and (when one exists) the same agent's script from
# another cloud as a reference.
# Arguments: $1 - cloud name, $2 - agent name
_build_failure_context() {
  local cloud="$1" agent="$2"
  local combo_log="${E2E_RESULTS_DIR}/${cloud}_${agent}.log"
  local agent_script="${REPO_ROOT}/${cloud}/${agent}.sh"
  local fence='```'

  printf '### %s/%s\n\n' "$cloud" "$agent"

  printf 'Last 50 lines of output:\n%s\n' "$fence"
  if [[ ! -f "$combo_log" ]]; then
    printf '(no log file)\n'
  else
    tail -50 "$combo_log"
  fi
  printf '%s\n\n' "$fence"

  printf 'Script (%s/%s.sh):\n%sbash\n' "$cloud" "$agent" "$fence"
  if [[ -f "$agent_script" ]]; then
    cat "$agent_script"
  fi
  printf '%s\n\n' "$fence"

  local reference=""
  reference=$(_find_working_reference "$agent" "$cloud" 2>/dev/null) || true
  if [[ -n "$reference" ]] && [[ -f "$reference" ]]; then
    local reference_cloud
    reference_cloud=$(basename "$(dirname "$reference")")
    printf 'Reference (working on another cloud — %s):\n%sbash\n' "$reference_cloud" "$fence"
    cat "$reference"
    printf '%s\n\n' "$fence"
  fi
}
# Hand one failing combo to the claude CLI with the cloud library, the
# failure context and fixing instructions as the prompt.
# Arguments: $1 - cloud name, $2 - agent name
# Returns:   1 when the claude CLI is not installed, 0 otherwise
# The CLI's output is shown and appended to autofix_<cloud>_<agent>.log
# under E2E_RESULTS_DIR.
auto_fix_combo() {
  local cloud="$1" agent="$2"
  if ! command -v claude &>/dev/null; then
    _e2e_log "claude CLI not found — skipping auto-fix for ${cloud}/${agent}"
    return 1
  fi

  local failure_context
  failure_context=$(_build_failure_context "$cloud" "$agent")

  local lib_path="${REPO_ROOT}/${cloud}/lib/common.sh"
  local lib_source=""
  if [[ -f "$lib_path" ]]; then
    lib_source=$(<"$lib_path")
  fi

  local fence='```'
  local request="You are fixing an E2E test failure for **${cloud}/${agent}**.
## Cloud Library (${cloud}/lib/common.sh)
${fence}bash
${lib_source}
${fence}
## Failure
${failure_context}
## Instructions
Fix the failing script: ${cloud}/${agent}.sh
1. Read the error output to understand what went wrong
2. Compare with the reference script (working on another cloud) if available
3. Fix the issue — common problems: wrong install command, missing PATH, timeout in non-TTY
4. Run \`bash -n\` on every modified file
Only modify files under ${cloud}/. Do not modify lib/common.sh or shared/."

  _e2e_log "Spawning Claude agent for ${cloud}/${agent}..."
  claude -p "$request" 2>&1 | tee -a "${E2E_RESULTS_DIR}/autofix_${cloud}_${agent}.log" || true
}
# --- Timing history ---
# Save a test result to the timings JSON file
# Usage: _save_timing cloud/agent elapsed status
# Keeps the 10 most recent runs per combo (newest first) and tracks the
# fastest passing run under "best". Best effort: Python failures are ignored.
_save_timing() {
  local combo="$1" elapsed="$2" status="$3"
  local today
  today=$(date +%Y-%m-%d)
  mkdir -p "$(dirname "$E2E_TIMINGS_FILE")"
  python3 -c "
import json, sys, os

combo = sys.argv[1]
elapsed = int(sys.argv[2])
status = sys.argv[3]
today = sys.argv[4]
path = sys.argv[5]

data = {}
if os.path.exists(path):
    try:
        with open(path) as f:
            data = json.load(f)
    except (json.JSONDecodeError, IOError):
        data = {}
# Anything that is not a JSON object is treated as no history
if not isinstance(data, dict):
    data = {}

record = data.setdefault(combo, {'runs': [], 'best': {}})
runs = record.setdefault('runs', [])
runs.insert(0, {'date': today, 'total': elapsed, 'status': status})
# Keep last 10 runs
record['runs'] = runs[:10]

# Update best if this is a pass and faster
if status == 'pass':
    best = record.get('best') or {}
    if not best.get('total') or elapsed < best['total']:
        record['best'] = {'total': elapsed, 'date': today}

# Write to a sibling file and rename, so an interrupted run cannot leave a
# half-written history behind
tmp_path = path + '.tmp'
with open(tmp_path, 'w') as f:
    json.dump(data, f, indent=2)
os.replace(tmp_path, path)
" "$combo" "$elapsed" "$status" "$today" "$E2E_TIMINGS_FILE" 2>/dev/null || true
}
# Show timing history from the JSON file: for every combo, the best run and
# up to five of the most recent runs.
# Globals: E2E_TIMINGS_FILE (read)
_show_history() {
  if [[ ! -f "$E2E_TIMINGS_FILE" ]]; then
    _e2e_log "No timing history found at ${E2E_TIMINGS_FILE}"
    return 0
  fi
  python3 -c "
import json, sys

path = sys.argv[1]
with open(path) as f:
    data = json.load(f)
if not data:
    print('No timing data recorded yet.')
    sys.exit(0)
for combo in sorted(data.keys()):
    info = data[combo]
    best = info.get('best', {})
    best_total = best.get('total', '-')
    best_date = best.get('date', '-')
    runs = info.get('runs', [])
    print(f'\\n━━━ {combo} ━━━')
    print(f' Best: {best_total}s ({best_date})')
    print(f' Recent runs:')
    for r in runs[:5]:
        status_icon = '✓' if r['status'] == 'pass' else '✗'
        print(f' {status_icon} {r[\"date\"]} {r[\"total\"]}s ({r[\"status\"]})')
" "$E2E_TIMINGS_FILE"
}
# Compare a single agent across all clouds: one table row per recorded
# "<cloud>/<agent>" combo with its best and latest timings.
# Arguments: $1 - agent name
# Globals:   E2E_TIMINGS_FILE (read)
_show_compare() {
  local agent="$1"
  if [[ ! -f "$E2E_TIMINGS_FILE" ]]; then
    _e2e_log "No timing history found at ${E2E_TIMINGS_FILE}"
    return 0
  fi
  python3 -c "
import json, sys

agent = sys.argv[1]
path = sys.argv[2]
with open(path) as f:
    data = json.load(f)
matches = {k: v for k, v in data.items() if k.endswith('/' + agent)}
if not matches:
    print(f'No timing data for agent: {agent}')
    sys.exit(0)
print(f'\\n━━━ {agent} across clouds ━━━')
print(f'{\"CLOUD\":<15} {\"BEST\":<10} {\"LATEST\":<10} {\"STATUS\":<8}')
print('-' * 45)
for combo in sorted(matches.keys()):
    cloud = combo.split('/')[0]
    info = matches[combo]
    best = info.get('best', {}).get('total', '-')
    runs = info.get('runs', [])
    if runs:
        latest = runs[0]['total']
        status = runs[0]['status']
    else:
        latest = '-'
        status = '-'
    # Only real numbers get the seconds suffix; the '-' placeholder stays bare
    best_s = f'{best}s' if isinstance(best, int) else best
    latest_s = f'{latest}s' if isinstance(latest, int) else latest
    print(f'{cloud:<15} {best_s:<10} {latest_s:<10} {status:<8}')
" "$agent" "$E2E_TIMINGS_FILE"
}
# Check if a passing combo is slow and needs optimization
# Arguments: $1 - "cloud/agent" combo, $2 - elapsed seconds
# Globals:   E2E_SLOW_THRESHOLD (read), E2E_TIMINGS_FILE (read)
# Returns 0 (true) if optimization is needed, 1 if not
# Prints the reasons to stdout, joined with "|"
_check_slow() {
  local combo="$1" elapsed="$2"
  python3 -c "
import json, sys, os

combo = sys.argv[1]
elapsed = int(sys.argv[2])
threshold = int(sys.argv[3])
path = sys.argv[4]
# Agent is whatever follows the first slash
agent = combo.split('/', 1)[-1]
reasons = []

# Trigger 1: Absolute slow
if elapsed > threshold:
    reasons.append(f'absolute_slow: {elapsed}s exceeds {threshold}s threshold')

# Load history for regression + peer comparison
data = {}
if os.path.exists(path):
    try:
        with open(path) as f:
            data = json.load(f)
    except (json.JSONDecodeError, IOError):
        pass
if not isinstance(data, dict):
    data = {}

# Trigger 2: Regression vs best
if combo in data:
    best = data[combo].get('best', {}).get('total')
    if best and elapsed > best * 1.5:
        # A single percent sign: f-strings do not collapse a doubled one
        reasons.append(f'regression: {elapsed}s is >50% slower than best {best}s')

# Trigger 3: Slow vs peers (same agent on other clouds)
peer_times = []
for key, val in data.items():
    if key.endswith('/' + agent) and key != combo:
        peer_best = val.get('best', {}).get('total')
        if peer_best:
            peer_times.append((key.split('/')[0], peer_best))
if peer_times:
    fastest_cloud, fastest_time = min(peer_times, key=lambda x: x[1])
    if elapsed > fastest_time * 2:
        reasons.append(f'slow_vs_peers: {elapsed}s is >2x slower than {fastest_cloud} ({fastest_time}s)')

if reasons:
    print('|'.join(reasons))
    sys.exit(0)
else:
    sys.exit(1)
" "$combo" "$elapsed" "$E2E_SLOW_THRESHOLD" "$E2E_TIMINGS_FILE" 2>/dev/null
}
# Build context for optimization agent (peer timings, history)
# Arguments: $1 - "cloud/agent" combo, $2 - elapsed seconds
# Globals:   E2E_TIMINGS_FILE (read)
# Outputs:   a markdown bullet list; empty when Python fails (best effort)
_build_optimization_context() {
  local combo="$1" elapsed="$2"
  python3 -c "
import json, sys, os

combo = sys.argv[1]
elapsed = int(sys.argv[2])
path = sys.argv[3]
# Agent is whatever follows the first slash
agent = combo.split('/', 1)[-1]

data = {}
if os.path.exists(path):
    try:
        with open(path) as f:
            data = json.load(f)
    except (json.JSONDecodeError, IOError):
        pass
if not isinstance(data, dict):
    data = {}

lines = []
# Best time
best = '-'
if combo in data:
    b = data[combo].get('best', {}).get('total')
    if b:
        best = f'{b}s'
lines.append(f'- Total time: {elapsed}s (best ever: {best})')

# Peer timings
lines.append(f'- Same agent on other clouds:')
for key in sorted(data.keys()):
    if key.endswith('/' + agent) and key != combo:
        peer_cloud = key.split('/')[0]
        peer_best = data[key].get('best', {}).get('total', '?')
        lines.append(f' - {peer_cloud}: {peer_best}s')

# History
if combo in data:
    runs = data[combo].get('runs', [])
    if runs:
        lines.append(f'- History:')
        for r in runs[:5]:
            lines.append(f' - {r[\"date\"]}: {r[\"total\"]}s ({r[\"status\"]})')

print('\\n'.join(lines))
" "$combo" "$elapsed" "$E2E_TIMINGS_FILE" 2>/dev/null || true
}
# Build optimization context for a single slow combo (used by per-cloud agent)
_build_slow_context() {
local cloud="$1" agent="$2" elapsed="$3" reasons="$4"
local script="${REPO_ROOT}/${cloud}/${agent}.sh"
printf '### %s/%s (%ss)\n\n' "$cloud" "$agent" "$elapsed"
printf 'Why flagged:\n'
printf '%s\n' "$reasons" | while IFS= read -r r; do
printf '- %s\n' "$r"
done
printf '\n'
local timing_context
timing_context=$(_build_optimization_context "${cloud}/${agent}" "$elapsed")
printf 'Timings:\n%s\n\n' "$timing_context"
printf 'Script (%s/%s.sh):\n```bash\n' "$cloud" "$agent"
if [[ -f "$script" ]]; then
cat "$script"
fi
printf '```\n\n'
local ref_script=""
ref_script=$(_find_working_reference "$agent" "$cloud" 2>/dev/null) || true
if [[ -n "$ref_script" ]] && [[ -f "$ref_script" ]]; then
printf 'Reference (fastest peer — %s):\n```bash\n' "$(basename "$(dirname "$ref_script")")"
cat "$ref_script"
printf '```\n\n'
fi
}
# Hand one slow-but-passing combo to the claude CLI with the cloud library,
# the slow-script context and optimization instructions as the prompt.
# Arguments: $1 - cloud, $2 - agent, $3 - elapsed seconds, $4 - reasons
# Returns:   1 when the claude CLI is not installed, 0 otherwise
# The CLI's output is shown and appended to optimize_<cloud>_<agent>.log
# under E2E_RESULTS_DIR.
optimize_slow_combo() {
  local cloud="$1" agent="$2" elapsed="$3" reasons="$4"
  if ! command -v claude &>/dev/null; then
    _e2e_log "claude CLI not found — skipping optimization for ${cloud}/${agent}"
    return 1
  fi

  local slow_context
  slow_context=$(_build_slow_context "$cloud" "$agent" "$elapsed" "$reasons")

  local lib_path="${REPO_ROOT}/${cloud}/lib/common.sh"
  local lib_source=""
  if [[ -f "$lib_path" ]]; then
    lib_source=$(<"$lib_path")
  fi

  local fence='```'
  local request="You are optimizing a slow E2E test for **${cloud}/${agent}**.
The script PASSES but is too slow.
## Cloud Library (${cloud}/lib/common.sh)
${fence}bash
${lib_source}
${fence}
## Slow Script
${slow_context}
## Instructions
Optimize the script: ${cloud}/${agent}.sh
1. Compare timings with the fastest peer cloud for the same agent
2. Identify what makes it slow (heavy installer, compiling native deps, unnecessary steps)
3. Make it faster — use lighter install methods, skip unnecessary setup, parallelize where possible
4. Run \`bash -n\` on every modified file
5. Don't break anything — the script must still pass E2E
Only modify files under ${cloud}/. Do not modify lib/common.sh or shared/."

  _e2e_log "Spawning Claude agent for ${cloud}/${agent} (${elapsed}s)..."
  claude -p "$request" 2>&1 | tee -a "${E2E_RESULTS_DIR}/optimize_${cloud}_${agent}.log" || true
}
# --- Main ---
#######################################
# Print one recorded field for a cloud/agent combo.
# Globals:   E2E_RESULTS_DIR (read)
# Arguments: $1 - cloud, $2 - agent,
#            $3 - file extension (result, timing or reason),
#            $4 - fallback text printed when the file does not exist
# Outputs:   file contents (or the fallback) on stdout
#######################################
_main_result_field() {
    local field_file="${E2E_RESULTS_DIR}/${1}_${2}.${3}"
    if [[ -f "$field_file" ]]; then
        cat "$field_file"
    else
        printf '%s' "$4"
    fi
}

#######################################
# E2E driver: parse arguments, obtain credentials, pre-flight each cloud,
# run the selected cloud/agent combos in parallel and report the results.
# Optionally runs the optimization and auto-fix phases afterwards.
#
# Arguments:
#   --all              every agent on each selected cloud
#   --cleanup          destroy stale servers, then return
#   --history          show timing history, then return
#   --compare AGENT    compare one agent across clouds, then return
#   [CLOUD] [AGENT]    restrict the run to one cloud and/or one agent
# Globals:
#   read:    REPO_ROOT, TMPDIR, E2E_CANARY_AGENT, E2E_OPTIMIZE,
#            E2E_AUTO_FIX, E2E_TIMEOUT
#   written: E2E_ALL, E2E_RESULTS_DIR, E2E_PIDS,
#            OPENROUTER_API_KEY (exported when collected interactively)
# Returns:
#   0 when every combo passed (or every failure passed again after auto-fix),
#   1 on usage/credential/pre-flight errors or remaining test failures.
#######################################
main() {
    local filter_cloud="" filter_agent=""
    # Loop/scratch variables are local so they do not leak into global scope.
    local arg cloud agent script pid combo entry
    local result elapsed reason

    # Parse args: --all is a flag; the first other word is the cloud (or a
    # mode flag such as --cleanup), the next one is the agent.
    for arg in "$@"; do
        case "$arg" in
            --all) E2E_ALL=1 ;;
            *)
                if [[ -z "$filter_cloud" ]]; then
                    filter_cloud="$arg"
                else
                    filter_agent="$arg"
                fi
                ;;
        esac
    done

    # Mode flags that short-circuit the test run.
    case "$filter_cloud" in
        --cleanup)
            _e2e_log "Running stale server cleanup..."
            # NOTE(review): only these three clouds are cleaned here although
            # testable_clouds below lists six — confirm this is intended.
            for cloud in fly hetzner digitalocean; do
                if _cloud_has_credentials "$cloud"; then
                    _cleanup_stale_servers "$cloud"
                fi
            done
            _e2e_log "Cleanup complete"
            return 0
            ;;
        --history)
            _show_history
            return 0
            ;;
        --compare)
            if [[ -z "$filter_agent" ]]; then
                _e2e_log "Usage: bash test/e2e.sh --compare AGENT_NAME"
                return 1
            fi
            _show_compare "$filter_agent"
            return 0
            ;;
    esac

    # Get OPENROUTER_API_KEY
    if [[ -z "${OPENROUTER_API_KEY:-}" ]]; then
        # Non-interactive: fail fast with a clear message. The probe is
        # wrapped in a group so that the shell's own "cannot open /dev/tty"
        # diagnostic is silenced as well (redirections apply left to right).
        if ! { : >/dev/tty; } 2>/dev/null; then
            _e2e_log "ERROR: OPENROUTER_API_KEY not set and no TTY available"
            _e2e_log "Export it before running: export OPENROUTER_API_KEY=sk-or-v1-..."
            return 1
        fi

        # Interactive: offer OAuth or paste
        source "${REPO_ROOT}/shared/common.sh" 2>/dev/null || true
        _e2e_log "OPENROUTER_API_KEY not set — let's grab one"
        echo ""
        printf ' 1) Open browser (OAuth) — quickest, logs you in via openrouter.ai\n'
        printf ' 2) Paste a key — get one from https://openrouter.ai/settings/keys\n'
        printf ' 3) Quit\n'
        echo ""
        printf ' Pick [1/2/3]: '
        local _choice=""
        # "|| true": EOF on the prompt must reach the explicit messages below
        # instead of killing the script silently under `set -e`.
        read -r _choice </dev/tty || true
        case "${_choice}" in
            1)
                _e2e_log "Starting OAuth flow..."
                OPENROUTER_API_KEY=$(try_oauth_flow 5180) || {
                    _e2e_log "OAuth failed — falling back to manual paste"
                    printf ' Paste your API key: '
                    read -r OPENROUTER_API_KEY </dev/tty || true
                }
                ;;
            2)
                printf ' Paste your API key: '
                read -r OPENROUTER_API_KEY </dev/tty || true
                ;;
            *)
                _e2e_log "Aborted."
                return 1
                ;;
        esac
        if [[ -z "${OPENROUTER_API_KEY:-}" ]]; then
            _e2e_log "ERROR: No API key provided"
            return 1
        fi
        export OPENROUTER_API_KEY
        _e2e_log "API key set — continuing"
    fi

    # Create results directory (global: read by the per-combo helpers)
    E2E_RESULTS_DIR=$(mktemp -d "${TMPDIR:-/tmp}/e2e-results-XXXXXX")

    # Testable clouds (excludes local, sprite which don't provision real servers the same way)
    local testable_clouds="fly hetzner digitalocean aws daytona gcp"

    # --- Credential collection (interactive) ---
    # Load tokens from config files and prompt for any missing ones
    # BEFORE we go non-interactive. This lets the user provide tokens
    # that aren't in env vars or config files.
    echo ""
    _e2e_log "━━━ Credential Collection ━━━"
    echo ""
    _collect_credentials "$testable_clouds"
    echo ""

    # Discover clouds with available credentials
    local available_clouds=""
    if [[ -n "$filter_cloud" ]]; then
        if ! _cloud_has_credentials "$filter_cloud"; then
            _e2e_log "ERROR: No credentials found for ${filter_cloud}"
            _e2e_log "Set the appropriate token env var or configure via the cloud's CLI"
            return 1
        fi
        available_clouds="$filter_cloud"
    else
        for cloud in $testable_clouds; do
            if _cloud_has_credentials "$cloud"; then
                available_clouds="${available_clouds} ${cloud}"
            fi
        done
        # Drop the leading separator left by the append above.
        available_clouds="${available_clouds# }"
    fi
    if [[ -z "$available_clouds" ]]; then
        _e2e_log "No cloud credentials available. Set token env vars for at least one cloud."
        _e2e_log "Supported clouds: ${testable_clouds}"
        return 1
    fi
    _e2e_log "Available clouds: ${available_clouds}"

    # --- Pre-flight: validate each cloud once ---
    # Installs CLIs, imports SSH keys, validates tokens sequentially so that
    # the parallel agent tests don't race on shared resources.
    echo ""
    _e2e_log "━━━ Pre-flight ━━━"
    echo ""
    local ready_clouds=""
    local preflight_skipped=""
    for cloud in $available_clouds; do
        if _preflight_cloud "$cloud"; then
            ready_clouds="${ready_clouds} ${cloud}"
        else
            preflight_skipped="${preflight_skipped} ${cloud}"
        fi
    done
    ready_clouds="${ready_clouds# }"
    if [[ -n "$preflight_skipped" ]]; then
        echo ""
        _e2e_log "Skipped clouds (pre-flight failed):${preflight_skipped}"
        _e2e_log "Check logs in ${E2E_RESULTS_DIR}/preflight_*.log"
    fi
    if [[ -z "$ready_clouds" ]]; then
        _e2e_log "All clouds failed pre-flight. Check credentials and CLIs."
        return 1
    fi

    # Collect combos for clouds that passed pre-flight.
    # Default: one canary agent per cloud. --all or explicit agent: full set.
    local combos=""
    local combo_count=0
    local canary candidate
    for cloud in $ready_clouds; do
        if [[ -n "$filter_agent" ]]; then
            # Explicit agent requested: include it only where a script exists.
            if [[ -f "${REPO_ROOT}/${cloud}/${filter_agent}.sh" ]]; then
                combos="${combos} ${cloud}/${filter_agent}"
                combo_count=$((combo_count + 1))
            fi
        elif [[ "$E2E_ALL" == "1" ]]; then
            # --all: every agent script on this cloud (lib.sh is not an agent)
            for script in "${REPO_ROOT}/${cloud}"/*.sh; do
                [[ -f "$script" ]] || continue
                agent=$(basename "$script" .sh)
                [[ "$agent" == "lib" ]] && continue
                combos="${combos} ${cloud}/${agent}"
                combo_count=$((combo_count + 1))
            done
        else
            # Smoke test: one canary agent per cloud
            canary="${E2E_CANARY_AGENT}"
            if [[ ! -f "${REPO_ROOT}/${cloud}/${canary}.sh" ]]; then
                # Canary not available on this cloud — pick the first agent
                canary=""
                for script in "${REPO_ROOT}/${cloud}"/*.sh; do
                    [[ -f "$script" ]] || continue
                    candidate=$(basename "$script" .sh)
                    [[ "$candidate" == "lib" ]] && continue
                    canary="$candidate"
                    break
                done
            fi
            if [[ -n "$canary" ]]; then
                combos="${combos} ${cloud}/${canary}"
                combo_count=$((combo_count + 1))
            fi
        fi
    done
    combos="${combos# }"
    if [[ -z "$combos" ]]; then
        _e2e_log "No test combos found for ready clouds: ${ready_clouds}"
        return 1
    fi

    local mode_label="smoke test"
    if [[ "$E2E_ALL" == "1" ]]; then
        mode_label="full matrix"
    fi
    if [[ -n "$filter_agent" ]]; then
        mode_label="filtered"
    fi
    _e2e_log "Testing ${combo_count} combo(s) [${mode_label}]: ${combos}"
    echo ""

    # Pre-cleanup: destroy stale e2e-* servers (best effort)
    for cloud in $ready_clouds; do
        _cleanup_stale_servers "$cloud" 2>/dev/null || true
    done

    # Run all combos in parallel (background subshells)
    E2E_PIDS=""
    for combo in $combos; do
        cloud="${combo%%/*}"
        agent="${combo##*/}"
        (
            run_e2e_test "$cloud" "$agent"
        ) &
        E2E_PIDS="${E2E_PIDS} $!"
    done

    # Wait for all to finish. Exit statuses are ignored on purpose: the
    # verdict for each combo is read from its result file below.
    _e2e_log "Waiting for ${combo_count} test(s) to complete (timeout: ${E2E_TIMEOUT}s each)..."
    for pid in ${E2E_PIDS}; do
        wait "$pid" 2>/dev/null || true
    done
    E2E_PIDS=""

    # Collect and report results
    echo ""
    _e2e_log "━━━ E2E Results ━━━"
    echo ""
    local total_pass=0
    local total_fail=0
    local failed_combos=""
    for combo in $combos; do
        cloud="${combo%%/*}"
        agent="${combo##*/}"
        # A missing result file counts as a failure.
        result=$(_main_result_field "$cloud" "$agent" result "fail")
        elapsed=$(_main_result_field "$cloud" "$agent" timing "?")
        reason=$(_main_result_field "$cloud" "$agent" reason "no result file")
        if [[ "$result" == "pass" ]]; then
            _e2e_pass "${cloud}/${agent} ${elapsed}s"
            total_pass=$((total_pass + 1))
        else
            _e2e_fail "${cloud}/${agent} ${elapsed}s (${reason})"
            total_fail=$((total_fail + 1))
            failed_combos="${failed_combos} ${combo}"
        fi
    done
    echo ""
    local summary="Total: ${total_pass} passed, ${total_fail} failed out of ${combo_count}"
    if [[ -n "${preflight_skipped:-}" ]]; then
        summary="${summary} (skipped:${preflight_skipped})"
    fi
    _e2e_log "$summary"

    # Save timings to history (a missing timing file is recorded as 0)
    for combo in $combos; do
        cloud="${combo%%/*}"
        agent="${combo##*/}"
        result=$(_main_result_field "$cloud" "$agent" result "fail")
        elapsed=$(_main_result_field "$cloud" "$agent" timing "0")
        _save_timing "$combo" "$elapsed" "$result"
    done

    # Optimization phase: check passing combos for slowness.
    # Each entry has the form "<cloud>/<agent>:<elapsed>:<reason>|<reason>...".
    local slow_combos=""
    local slow_reasons
    if [[ "$E2E_OPTIMIZE" == "1" ]]; then
        for combo in $combos; do
            cloud="${combo%%/*}"
            agent="${combo##*/}"
            result=$(_main_result_field "$cloud" "$agent" result "fail")
            elapsed=$(_main_result_field "$cloud" "$agent" timing "0")
            if [[ "$result" == "pass" ]]; then
                slow_reasons=$(_check_slow "$combo" "$elapsed") || true
                if [[ -n "$slow_reasons" ]]; then
                    slow_combos="${slow_combos} ${combo}:${elapsed}:${slow_reasons}"
                fi
            fi
        done
    fi

    if [[ -n "${slow_combos}" ]]; then
        echo ""
        _e2e_log "━━━ Optimization Phase ━━━"
        echo ""
        local rest reasons
        # Print all slow combos
        for entry in $slow_combos; do
            combo="${entry%%:*}"
            rest="${entry#*:}"
            elapsed="${rest%%:*}"
            reasons="${rest#*:}"
            # Parameter expansion instead of `tr '|' ', '`: tr maps single
            # characters, so it would emit "," without the space.
            printf ' \033[33m⚡\033[0m %s %ss (%s)\n' "$combo" "$elapsed" "${reasons//|/, }"
        done
        echo ""

        # Spawn one Claude agent per slow combo, all in parallel
        local opt_pids=""
        for entry in $slow_combos; do
            combo="${entry%%:*}"
            rest="${entry#*:}"
            elapsed="${rest%%:*}"
            # The optimizer receives one reason per line.
            reasons=$(printf '%s' "${rest#*:}" | tr '|' '\n')
            cloud="${combo%%/*}"
            agent="${combo##*/}"
            (
                optimize_slow_combo "$cloud" "$agent" "$elapsed" "$reasons"
            ) &
            opt_pids="${opt_pids} $!"
        done
        # Wait for all optimization agents
        for pid in $opt_pids; do
            wait "$pid" 2>/dev/null || true
        done

        # Re-run optimized combos to verify
        echo ""
        _e2e_log "━━━ Re-running Optimized Combos ━━━"
        echo ""
        local old_elapsed new_elapsed
        for entry in $slow_combos; do
            combo="${entry%%:*}"
            old_elapsed="${entry#*:}"
            old_elapsed="${old_elapsed%%:*}"
            cloud="${combo%%/*}"
            agent="${combo##*/}"
            run_e2e_test "$cloud" "$agent" || true
            result=$(_main_result_field "$cloud" "$agent" result "fail")
            new_elapsed=$(_main_result_field "$cloud" "$agent" timing "?")
            if [[ "$result" == "pass" ]]; then
                _e2e_pass "${combo} ${new_elapsed}s (was ${old_elapsed}s)"
                _save_timing "$combo" "$new_elapsed" "$result"
            else
                # NOTE(review): a regression here is reported but is not added
                # to total_fail, so it does not affect the exit status —
                # confirm this is intended.
                _e2e_fail "${combo} ${new_elapsed}s (optimization broke it — was ${old_elapsed}s)"
            fi
        done
    fi

    # Auto-fix failures — one Claude agent per combo, all in parallel
    local rerun_pass=0
    local rerun_fail=0
    if [[ "$total_fail" -gt 0 ]] && [[ "$E2E_AUTO_FIX" == "1" ]]; then
        echo ""
        _e2e_log "━━━ Auto-Fix Phase ━━━"
        echo ""
        # Spawn one agent per failing combo in parallel
        local fix_pids=""
        for combo in $failed_combos; do
            cloud="${combo%%/*}"
            agent="${combo##*/}"
            (
                auto_fix_combo "$cloud" "$agent"
            ) &
            fix_pids="${fix_pids} $!"
        done
        # Wait for all fix agents
        for pid in $fix_pids; do
            wait "$pid" 2>/dev/null || true
        done

        # Re-run fixed combos (sequentially)
        echo ""
        _e2e_log "━━━ Re-running Fixed Combos ━━━"
        echo ""
        for combo in $failed_combos; do
            cloud="${combo%%/*}"
            agent="${combo##*/}"
            run_e2e_test "$cloud" "$agent" || true
            result=$(_main_result_field "$cloud" "$agent" result "fail")
            elapsed=$(_main_result_field "$cloud" "$agent" timing "?")
            if [[ "$result" == "pass" ]]; then
                _e2e_pass "${cloud}/${agent} ${elapsed}s (FIXED)"
                rerun_pass=$((rerun_pass + 1))
            else
                _e2e_fail "${cloud}/${agent} ${elapsed}s (still failing)"
                rerun_fail=$((rerun_fail + 1))
            fi
        done
        echo ""
        _e2e_log "Auto-fix: ${rerun_pass} fixed, ${rerun_fail} still failing"
    fi

    echo ""
    _e2e_log "━━━ E2E Complete ━━━"

    # Exit with failure if any tests failed (and weren't fixed)
    if [[ "$total_fail" -gt 0 ]]; then
        if [[ "$E2E_AUTO_FIX" == "1" ]] && [[ "$rerun_fail" -eq 0 ]]; then
            return 0
        fi
        return 1
    fi
    return 0
}
# Run the E2E driver, forwarding the script's command-line arguments unchanged.
main "$@"