mirror of
https://github.com/OpenRouterTeam/spawn.git
synced 2026-04-28 03:49:31 +00:00
Set umask 077 before mktemp so the temp .ts file is created with 0600 permissions, preventing other users on shared systems from reading it. Umask is restored immediately after file creation. Agent: code-health Co-authored-by: B <6723574+louisgv@users.noreply.github.com> Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
898 lines
30 KiB
Bash
Executable file
898 lines
30 KiB
Bash
Executable file
#!/bin/bash
# sh/e2e/e2e.sh — Unified multi-cloud E2E test orchestrator
#
# Usage:
#   e2e.sh --cloud aws                      # AWS only, all agents
#   e2e.sh --cloud hetzner claude codex     # Hetzner, specific agents
#   e2e.sh --cloud aws --cloud hetzner      # Both clouds IN PARALLEL
#   e2e.sh --cloud all                      # ALL clouds IN PARALLEL
#   e2e.sh --cloud all --parallel 3         # All clouds, 3 agents parallel per cloud
#   e2e.sh --cloud aws --skip-input-test    # Skip live input tests
#   e2e.sh --cloud aws --sequential         # Force sequential agents (no parallelism)

# Abort on errors and on any failing pipeline stage. NOTE(review): -u is
# deliberately absent — the sourced libraries below may rely on unset
# variables expanding to empty; confirm before tightening.
set -eo pipefail

# ---------------------------------------------------------------------------
# Resolve script directory and source libraries
# ---------------------------------------------------------------------------
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Auto-set SPAWN_CLI_DIR to repo root so shell scripts use local source instead
# of downloading pre-bundled .js from GitHub releases. Can be overridden by env.
if [ -z "${SPAWN_CLI_DIR:-}" ]; then
  _repo_root="$(cd "${SCRIPT_DIR}/../.." && pwd)"
  # Only claim the repo root when the CLI source tree is actually present.
  if [ -f "${_repo_root}/packages/cli/src/index.ts" ]; then
    export SPAWN_CLI_DIR="${_repo_root}"
  fi
  unset _repo_root
fi

# Shared helpers. NOTE(review): ALL_AGENTS, log_* / track_app / make_app_name
# etc. are assumed to come from these libs (used below but not defined here)
# — confirm in lib/common.sh and friends.
source "${SCRIPT_DIR}/lib/common.sh"
source "${SCRIPT_DIR}/lib/provision.sh"
source "${SCRIPT_DIR}/lib/verify.sh"
source "${SCRIPT_DIR}/lib/teardown.sh"
source "${SCRIPT_DIR}/lib/soak.sh"
source "${SCRIPT_DIR}/lib/interactive.sh"
source "${SCRIPT_DIR}/lib/ai-review.sh"

# ---------------------------------------------------------------------------
# Auto-load Resend email credentials when not already set.
# Sources /etc/spawn-key-server-auth.env (QA VM) or ~/.config/spawn/resend.env
# (local dev) to populate RESEND_API_KEY and KEY_REQUEST_EMAIL.
# This ensures send_matrix_email fires on manual runs, not just QA-cycle runs.
# ---------------------------------------------------------------------------
if [ -z "${RESEND_API_KEY:-}" ] || [ -z "${KEY_REQUEST_EMAIL:-}" ]; then
  for _cred_file in /etc/spawn-key-server-auth.env "${HOME}/.config/spawn/resend.env"; do
    if [ -f "${_cred_file}" ]; then
      # shellcheck source=/dev/null # path is dynamic
      # set -a auto-exports every assignment in the cred file; first match wins.
      set -a; source "${_cred_file}" 2>/dev/null; set +a
      break
    fi
  done
  unset _cred_file
fi

# ---------------------------------------------------------------------------
# All supported clouds (excluding local — no infra to provision)
# ---------------------------------------------------------------------------
ALL_CLOUDS="aws hetzner digitalocean gcp daytona sprite"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Parse arguments
|
|
# ---------------------------------------------------------------------------
|
|
CLOUDS=""
AGENTS_TO_TEST=""
# 99 acts as an "unlimited" sentinel: all agents at once, still subject to
# per-cloud caps resolved later in run_agents_for_cloud.
PARALLEL_COUNT=99
SKIP_CLEANUP=0
# May be pre-seeded from the environment; --skip-input-test also sets it.
SKIP_INPUT_TEST="${SKIP_INPUT_TEST:-0}"
SEQUENTIAL_MODE=0
SOAK_MODE=0
INTERACTIVE_MODE=0
FAST_MODE=0

while [ $# -gt 0 ]; do
  case "$1" in
    --cloud)
      shift
      if [ $# -eq 0 ]; then
        printf "Error: --cloud requires a cloud name\n" >&2
        exit 1
      fi
      if [ "$1" = "all" ]; then
        CLOUDS="${ALL_CLOUDS}"
      else
        # Validate cloud name.
        # NOTE: plain variable despite the name — this runs at top level,
        # not inside a function, so 'local' would not apply here.
        local_valid=0
        for c in ${ALL_CLOUDS}; do
          if [ "$1" = "${c}" ]; then
            local_valid=1
            break
          fi
        done
        if [ "${local_valid}" -eq 0 ]; then
          printf "Unknown cloud: %s\nAvailable: %s all\n" "$1" "${ALL_CLOUDS}" >&2
          exit 1
        fi
        # Append to the space-separated cloud list (repeatable flag).
        if [ -z "${CLOUDS}" ]; then
          CLOUDS="$1"
        else
          CLOUDS="${CLOUDS} $1"
        fi
      fi
      shift
      ;;
    --parallel)
      shift
      if [ $# -eq 0 ]; then
        printf "Error: --parallel requires a number\n" >&2
        exit 1
      fi
      PARALLEL_COUNT="$1"
      # Must be a positive integer in [1, 50]; grep guards against non-numeric
      # input before the arithmetic comparisons run.
      if ! printf '%s' "${PARALLEL_COUNT}" | grep -qE '^[0-9]+$' || [ "${PARALLEL_COUNT}" -lt 1 ] || [ "${PARALLEL_COUNT}" -gt 50 ]; then
        printf "Error: --parallel must be between 1 and 50\n" >&2
        exit 1
      fi
      shift
      ;;
    --sequential)
      SEQUENTIAL_MODE=1
      shift
      ;;
    --skip-cleanup)
      SKIP_CLEANUP=1
      shift
      ;;
    --skip-input-test)
      SKIP_INPUT_TEST=1
      shift
      ;;
    --soak)
      SOAK_MODE=1
      shift
      ;;
    --interactive)
      INTERACTIVE_MODE=1
      shift
      ;;
    --fast)
      FAST_MODE=1
      shift
      ;;
    --help|-h)
      printf "Usage: %s --cloud CLOUD [--cloud CLOUD2 ...] [agents...] [options]\n\n" "$0"
      printf "Clouds: %s\n" "${ALL_CLOUDS}"
      printf " Use --cloud all for all clouds in parallel.\n\n"
      printf "Agents: %s\n\n" "${ALL_AGENTS}"
      printf "Options:\n"
      printf " --cloud CLOUD Cloud to test (repeatable, or 'all')\n"
      printf " --parallel N Run N agents in parallel per cloud (default: all at once)\n"
      printf " --sequential Force sequential agent execution\n"
      printf " --skip-cleanup Skip stale e2e-* instance cleanup\n"
      printf " --skip-input-test Skip live input tests\n"
      printf " --fast Provision with --fast flag (images + tarballs + parallel)\n"
      printf " --soak Run Telegram soak test (OpenClaw on Sprite)\n"
      printf " --interactive AI-driven interactive test (requires ANTHROPIC_API_KEY)\n"
      printf " --help Show this help\n"
      exit 0
      ;;
    -*)
      printf "Unknown option: %s\n" "$1" >&2
      exit 1
      ;;
    *)
      # Bare word: must be a known agent name (ALL_AGENTS from the libs).
      local_valid=0
      for a in ${ALL_AGENTS}; do
        if [ "$1" = "${a}" ]; then
          local_valid=1
          break
        fi
      done
      if [ "${local_valid}" -eq 0 ]; then
        printf "Unknown agent: %s\nAvailable: %s\n" "$1" "${ALL_AGENTS}" >&2
        exit 1
      fi
      # Append to the space-separated agent list.
      if [ -z "${AGENTS_TO_TEST}" ]; then
        AGENTS_TO_TEST="$1"
      else
        AGENTS_TO_TEST="${AGENTS_TO_TEST} $1"
      fi
      shift
      ;;
  esac
done
|
|
|
|
# Soak mode: run Telegram soak test and exit (no --cloud required).
# The EXIT-trap cleanup is not armed yet at this point, so the soak log dir
# is left behind for post-run inspection.
if [ "${SOAK_MODE}" -eq 1 ]; then
  LOG_DIR=$(mktemp -d "${TMPDIR:-/tmp}/spawn-e2e.XXXXXX")
  export LOG_DIR
  run_soak_test "${LOG_DIR}"
  exit $?
fi

# Require at least one cloud
if [ -z "${CLOUDS}" ]; then
  printf "Error: --cloud is required. Use --cloud aws, --cloud all, etc.\n" >&2
  printf "Run %s --help for usage.\n" "$0" >&2
  exit 1
fi

# Default to all agents
if [ -z "${AGENTS_TO_TEST}" ]; then
  AGENTS_TO_TEST="${ALL_AGENTS}"
fi

# Sanity-check list sizes to prevent unbounded string growth (#3190)
_cloud_count=$(printf '%s\n' "${CLOUDS}" | wc -w | tr -d ' ')
_agent_count=$(printf '%s\n' "${AGENTS_TO_TEST}" | wc -w | tr -d ' ')
if [ "${_cloud_count}" -gt 50 ]; then
  printf "Error: too many clouds (%s) — max 50\n" "${_cloud_count}" >&2
  exit 1
fi
if [ "${_agent_count}" -gt 100 ]; then
  printf "Error: too many agents (%s) — max 100\n" "${_agent_count}" >&2
  exit 1
fi
unset _cloud_count _agent_count

# ---------------------------------------------------------------------------
# Count clouds to decide single vs multi-cloud mode
# (single: run in-process; multi: one background subprocess per cloud)
# ---------------------------------------------------------------------------
cloud_count=$(printf '%s\n' "${CLOUDS}" | wc -w | tr -d ' ')
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# run_single_agent AGENT
|
|
#
|
|
# Provisions, verifies, and tears down a single agent.
|
|
# Sets result in a temp file for parallel collection.
|
|
# ---------------------------------------------------------------------------
|
|
# run_single_agent AGENT [RESULT_FILE]
#
# Provisions, verifies, input-tests, and tears down one agent, enforcing a
# per-agent wall-clock timeout. Writes "pass"/"fail" to RESULT_FILE when
# given (used for parallel collection). Always returns 0 so a failing agent
# does not trip set -e in sequential callers.
run_single_agent() {
  local agent="$1"
  # Optional: where the final verdict is written for the caller to collect.
  local result_file="${2:-}"
  local agent_start
  agent_start=$(date +%s)

  log_header "Testing agent: ${agent}"

  local app_name
  app_name=$(make_app_name "${agent}")
  # Register for teardown by the EXIT trap in case we die mid-run.
  track_app "${app_name}"

  local status="fail"

  # -------------------------------------------------------------------------
  # Per-agent timeout: run provision/verify/input_test in a subshell with a
  # wall-clock timeout. This prevents any single step from hanging indefinitely
  # and ensures a result file is always written (pass, fail, or timeout).
  # Fixes #2714: digitalocean-opencode stalling with no result.
  # -------------------------------------------------------------------------
  local effective_agent_timeout
  effective_agent_timeout=$(get_agent_timeout "${agent}")
  log_info "Agent timeout: ${effective_agent_timeout}s"

  local status_file="${LOG_DIR}/${app_name}.agent-status"
  rm -f "${status_file}"

  # Run core logic in a subshell so we can kill it on timeout.
  # Writing the status file at the end is the completion signal the poll
  # loop below watches for.
  (
    local _inner_status="fail"
    if [ "${INTERACTIVE_MODE}" -eq 1 ]; then
      # AI-driven interactive mode: harness drives the CLI through PTY.
      # After harness exits (on "Starting agent..." marker), the install is still
      # running on the remote VM. Run verify_agent to wait for .spawnrc before
      # the input test — same as headless mode.
      if interactive_provision "${agent}" "${app_name}" "${LOG_DIR}"; then
        if verify_agent "${agent}" "${app_name}"; then
          if run_input_test "${agent}" "${app_name}"; then
            _inner_status="pass"
          fi
        fi
      fi
    else
      # Standard headless mode
      if provision_agent "${agent}" "${app_name}" "${LOG_DIR}"; then
        # AI review of provision logs — advisory only, runs regardless of verify result
        ai_review_logs "${agent}" "${app_name}" "${LOG_DIR}" || true
        if verify_agent "${agent}" "${app_name}"; then
          if run_input_test "${agent}" "${app_name}"; then
            _inner_status="pass"
          fi
        fi
      fi
    fi
    printf '%s' "${_inner_status}" > "${status_file}"
  ) &
  local agent_pid=$!

  # Poll for completion or timeout (bash 3.2 compatible — no wait -n)
  local agent_waited=0
  while [ "${agent_waited}" -lt "${effective_agent_timeout}" ]; do
    if [ -f "${status_file}" ]; then
      break
    fi
    # Also break if the subshell exited without writing (crash/error)
    if ! kill -0 "${agent_pid}" 2>/dev/null; then
      break
    fi
    sleep 5
    agent_waited=$((agent_waited + 5))
  done

  # Collect result or handle timeout — three exits from the poll loop:
  # status file written (normal), subshell still alive (timeout), or
  # subshell dead with no status file (crash).
  if [ -f "${status_file}" ]; then
    status=$(cat "${status_file}")
    wait "${agent_pid}" 2>/dev/null || true
  elif kill -0 "${agent_pid}" 2>/dev/null; then
    # Timed out — kill the subshell and its children
    log_err "${agent} timed out after ${effective_agent_timeout}s — killing"
    pkill -P "${agent_pid}" 2>/dev/null || true
    kill "${agent_pid}" 2>/dev/null || true
    wait "${agent_pid}" 2>/dev/null || true
    status="fail"
  else
    # Subshell exited without writing status file (unexpected error)
    log_err "${agent} subshell exited without writing status"
    wait "${agent_pid}" 2>/dev/null || true
    status="fail"
  fi

  rm -f "${status_file}"

  # Teardown (always attempt, even after timeout)
  teardown_agent "${app_name}" || log_warn "Teardown failed for ${app_name}"

  local agent_end
  agent_end=$(date +%s)
  local agent_duration=$((agent_end - agent_start))
  local duration_str
  duration_str=$(format_duration "${agent_duration}")

  if [ "${status}" = "pass" ]; then
    log_ok "${agent} PASSED (${duration_str})"
  else
    log_err "${agent} FAILED (${duration_str})"
  fi

  # Write result to file (for parallel collection)
  if [ -n "${result_file}" ]; then
    printf '%s' "${status}" > "${result_file}"
  fi

  # Always 0 — callers read pass/fail from result_file; a non-zero return
  # here would abort sequential runs under set -e.
  return 0
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# run_agents_for_cloud CLOUD LOG_DIR
|
|
#
|
|
# Runs all agents for a single cloud. Supports parallel batching.
|
|
# Writes per-agent results to LOG_DIR/{cloud}-{agent}.result.
|
|
# Writes cloud summary to LOG_DIR/{cloud}.summary.
|
|
# ---------------------------------------------------------------------------
|
|
# ---------------------------------------------------------------------------
# _record_agent_result RESULT_FILE AGENT
#
# Folds one agent's verdict into the caller's cloud_passed / cloud_failed
# accumulators (bash dynamic scoping: both are locals of run_agents_for_cloud
# and remain writable from functions it calls). A missing or non-"pass"
# result file counts as fail.
# ---------------------------------------------------------------------------
_record_agent_result() {
  local rf="$1"
  local agent="$2"
  if [ -f "${rf}" ] && [ "$(cat "${rf}")" = "pass" ]; then
    if [ -z "${cloud_passed}" ]; then cloud_passed="${agent}"; else cloud_passed="${cloud_passed} ${agent}"; fi
  else
    if [ -z "${cloud_failed}" ]; then cloud_failed="${agent}"; else cloud_failed="${cloud_failed} ${agent}"; fi
  fi
}

# ---------------------------------------------------------------------------
# _run_agent_batch CLOUD LOG_DIR BATCH_NUM AGENT...
#
# Runs one batch of agents in parallel, waits for all of them, then records
# each result via _record_agent_result. Factored out of run_agents_for_cloud,
# which previously duplicated this block verbatim for full and partial
# batches.
# ---------------------------------------------------------------------------
_run_agent_batch() {
  local cloud="$1"
  local log_dir="$2"
  local batch_num="$3"
  shift 3

  log_header "Batch ${batch_num} (${cloud})"

  # Refresh auth before each batch — prevents token expiry in long
  # E2E runs (60+ min). No-op for clouds without refresh support. #2934
  cloud_refresh_auth || log_warn "Auth refresh failed before batch ${batch_num}"

  # Fan out: one background run_single_agent per agent in this batch.
  local pids=""
  local ba
  for ba in "$@"; do
    run_single_agent "${ba}" "${log_dir}/${cloud}-${ba}.result" &
    if [ -z "${pids}" ]; then pids="$!"; else pids="${pids} $!"; fi
  done

  # Barrier: wait for the whole batch before collecting results.
  local p
  for p in ${pids}; do
    wait "${p}" 2>/dev/null || true
  done

  for ba in "$@"; do
    _record_agent_result "${log_dir}/${cloud}-${ba}.result" "${ba}"
  done
}

# ---------------------------------------------------------------------------
# run_agents_for_cloud CLOUD LOG_DIR
#
# Runs all agents for a single cloud. Supports parallel batching.
# Writes per-agent results to LOG_DIR/{cloud}-{agent}.result.
# Writes cloud summary to LOG_DIR/{cloud}.summary.
# Returns non-zero when any agent failed (or on zero capacity).
# ---------------------------------------------------------------------------
run_agents_for_cloud() {
  local cloud="$1"
  local log_dir="$2"
  local cloud_start
  cloud_start=$(date +%s)

  # Load the cloud driver (provides require_env and the cloud_* hooks)
  load_cloud_driver "${cloud}"

  # Set log prefix for multi-cloud output
  if [ "${cloud_count}" -gt 1 ]; then
    CLOUD_LOG_PREFIX="[${cloud}] "
  fi

  log_header "E2E Tests: ${cloud}"
  log_info "Agents: ${AGENTS_TO_TEST}"

  # Validate environment for this cloud
  if ! require_env; then
    log_warn "Credentials not configured for ${cloud} — skipping"
    printf 'SKIPPED (no credentials)' > "${log_dir}/${cloud}.summary"
    return 0
  fi

  # Accumulators mutated by _record_agent_result via dynamic scoping.
  local cloud_passed=""
  local cloud_failed=""

  # Pre-run stale cleanup: remove orphaned e2e instances from previous
  # interrupted runs before starting new agents. Uses a shorter max_age (5 min)
  # than the default (30 min) so that orphans from recently-failed runs are
  # cleaned before they can exhaust the account's instance quota (#2793).
  if [ "${SKIP_CLEANUP}" -eq 0 ]; then
    _CLEANUP_MAX_AGE=300 cloud_cleanup_stale || log_warn "Pre-run stale cleanup encountered errors"
  fi

  # Resolve effective parallelism (respect per-cloud cap)
  local effective_parallel="${PARALLEL_COUNT}"
  if [ "${SEQUENTIAL_MODE}" -eq 0 ]; then
    local cloud_max
    cloud_max=$(cloud_max_parallel)
    if [ "${effective_parallel}" -gt "${cloud_max}" ]; then
      effective_parallel="${cloud_max}"
    fi
  fi

  # Bail out early if the cloud reports zero capacity (e.g. droplet limit reached).
  # All agents would fail anyway — skip with an actionable error instead of wasting
  # time on retries that cannot succeed. (#3059)
  if [ "${effective_parallel}" -eq 0 ] && [ "${SEQUENTIAL_MODE}" -eq 0 ]; then
    log_err "No capacity available on ${cloud} — all ${cloud} agents will be marked as failed."
    log_err "Delete existing instances or request a limit increase, then re-run."
    local agent
    for agent in ${AGENTS_TO_TEST}; do
      printf 'fail' > "${log_dir}/${cloud}-${agent}.result"
      _record_agent_result "${log_dir}/${cloud}-${agent}.result" "${agent}"
    done
    printf '%s %s %s %s %s' "0" "$(printf '%s\n' "${AGENTS_TO_TEST}" | wc -w | tr -d ' ')" "0s" "" "|${cloud_failed}" \
      > "${log_dir}/${cloud}.summary"
    return 1
  fi

  local agent
  if [ "${effective_parallel}" -gt 0 ] && [ "${SEQUENTIAL_MODE}" -eq 0 ]; then
    # Parallel mode: batch agents
    log_info "Running agents in parallel (batch size: ${effective_parallel})"

    local batch_agents=""
    local batch_count=0
    local batch_num=0

    for agent in ${AGENTS_TO_TEST}; do
      batch_agents="${batch_agents} ${agent}"
      batch_count=$((batch_count + 1))

      if [ "${batch_count}" -ge "${effective_parallel}" ]; then
        batch_num=$((batch_num + 1))
        # Word-splitting of batch_agents is intentional here.
        _run_agent_batch "${cloud}" "${log_dir}" "${batch_num}" ${batch_agents}
        batch_agents=""
        batch_count=0
      fi
    done

    # Handle remaining agents in last partial batch
    if [ -n "${batch_agents}" ]; then
      batch_num=$((batch_num + 1))
      _run_agent_batch "${cloud}" "${log_dir}" "${batch_num}" ${batch_agents}
    fi

  else
    # Sequential mode: one agent at a time, same result bookkeeping.
    for agent in ${AGENTS_TO_TEST}; do
      run_single_agent "${agent}" "${log_dir}/${cloud}-${agent}.result"
      _record_agent_result "${log_dir}/${cloud}-${agent}.result" "${agent}"
    done
  fi

  # Stale cleanup
  if [ "${SKIP_CLEANUP}" -eq 0 ]; then
    cloud_cleanup_stale || log_warn "Stale cleanup encountered errors"
  fi

  # Write cloud summary: "<pass_count> <fail_count> <duration> <passed> |<failed>"
  local cloud_end
  cloud_end=$(date +%s)
  local cloud_duration=$((cloud_end - cloud_start))
  local cloud_duration_str
  cloud_duration_str=$(format_duration "${cloud_duration}")

  local pass_count=0
  local fail_count=0
  if [ -n "${cloud_passed}" ]; then pass_count=$(printf '%s\n' "${cloud_passed}" | wc -w | tr -d ' '); fi
  if [ -n "${cloud_failed}" ]; then fail_count=$(printf '%s\n' "${cloud_failed}" | wc -w | tr -d ' '); fi

  printf '%s %s %s %s %s' "${pass_count}" "${fail_count}" "${cloud_duration_str}" "${cloud_passed}" "|${cloud_failed}" \
    > "${log_dir}/${cloud}.summary"

  if [ "${fail_count}" -gt 0 ]; then
    return 1
  fi
  return 0
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# send_matrix_email LOG_DIR CLOUDS AGENTS TOTAL_PASS TOTAL_FAIL DURATION_STR
|
|
#
|
|
# Sends an agent x cloud matrix report via Resend.
|
|
# Requires: RESEND_API_KEY, KEY_REQUEST_EMAIL env vars (silently skips if absent).
|
|
# ---------------------------------------------------------------------------
|
|
# send_matrix_email LOG_DIR CLOUDS AGENTS TOTAL_PASS TOTAL_FAIL DURATION_STR
#
# Sends an agent x cloud matrix report via Resend. Silently no-ops when
# RESEND_API_KEY / KEY_REQUEST_EMAIL are absent or SPAWN_E2E_SKIP_EMAIL=1.
send_matrix_email() {
  local log_dir="$1"
  local clouds="$2"
  local agents="$3"
  local total_pass="$4"
  local total_fail="$5"
  local duration_str="$6"

  # Skip email for targeted re-runs (partial agent/cloud subset).
  # Set SPAWN_E2E_SKIP_EMAIL=1 to suppress the email (used by quality cycle
  # when re-running only failed agents — a partial email looks like all-passed).
  if [ "${SPAWN_E2E_SKIP_EMAIL:-0}" = "1" ]; then
    log_info "Matrix email skipped (SPAWN_E2E_SKIP_EMAIL=1)"
    return 0
  fi

  local resend_key="${RESEND_API_KEY:-}"
  local to_email="${KEY_REQUEST_EMAIL:-}"

  if [ -z "${resend_key}" ] || [ -z "${to_email}" ]; then
    log_info "Matrix email skipped (RESEND_API_KEY or KEY_REQUEST_EMAIL not set)"
    return 0
  fi

  # Build results string: "cloud:agent:result,..." for bun to process
  # Sanitize cloud/agent names to alphanumeric, dash, underscore only (#3189)
  local results=""
  local cloud agent
  for cloud in ${clouds}; do
    local safe_cloud
    safe_cloud=$(printf '%s' "${cloud}" | tr -cd 'a-zA-Z0-9_-')
    for agent in ${agents}; do
      local safe_agent
      safe_agent=$(printf '%s' "${agent}" | tr -cd 'a-zA-Z0-9_-')
      local result="skip"
      local result_file="${log_dir}/${cloud}-${agent}.result"
      if [ -f "${result_file}" ]; then
        result=$(cat "${result_file}")
      fi
      # Sanitize result to known values only
      case "${result}" in
        pass|fail|skip) ;;
        *) result="skip" ;;
      esac
      if [ -n "${results}" ]; then results="${results},"; fi
      results="${results}${safe_cloud}:${safe_agent}:${result}"
    done
  done

  # Write the report script into a private temp directory created under
  # umask 077 (dir mode 0700, so the .ts file inside is unreadable by other
  # users on shared systems). A directory is used instead of the previous
  # "mktemp /tmp/e2e-email-XXXXXX.ts" template because BSD/macOS mktemp
  # requires the Xs to be trailing — the ".ts" suffix fails there (GNU mktemp
  # only tolerates it by implying --suffix). Also honors TMPDIR instead of
  # hardcoding /tmp.
  local ts_dir ts_file old_umask
  old_umask=$(umask)
  umask 077
  if ! ts_dir=$(mktemp -d "${TMPDIR:-/tmp}/e2e-email-XXXXXX"); then
    umask "${old_umask}"
    log_warn "mktemp failed — matrix email skipped"
    return 0
  fi
  umask "${old_umask}"
  ts_file="${ts_dir}/email.ts"

  # Quoted delimiter: the TS below is written literally, no shell expansion.
  # All dynamic values flow in via the _E2E_* environment variables.
  cat > "${ts_file}" << 'TS_EOF'
const results = (process.env._E2E_RESULTS ?? "").split(",").filter(Boolean);
const clouds = (process.env._E2E_CLOUDS ?? "").split(" ").filter(Boolean);
const agents = (process.env._E2E_AGENTS ?? "").split(" ").filter(Boolean);
const totalPass = process.env._E2E_TOTAL_PASS ?? "0";
const totalFail = process.env._E2E_TOTAL_FAIL ?? "0";
const duration = process.env._E2E_DURATION ?? "?";
const toEmail = process.env.KEY_REQUEST_EMAIL ?? "";
const resendKey = process.env.RESEND_API_KEY ?? "";
const timestamp = new Date().toUTCString();

// Build lookup map: "cloud:agent" -> result
const resultMap: Record<string, string> = {};
for (const entry of results) {
  const parts = entry.split(":");
  resultMap[`${parts[0]}:${parts[1]}`] = parts[2] ?? "skip";
}

// Cell styles per result
const cellStyle = (result: string): string => {
  if (result === "pass") return "background:#22c55e;color:#fff;font-weight:bold;padding:4px 10px;border-radius:4px;";
  if (result === "fail") return "background:#ef4444;color:#fff;font-weight:bold;padding:4px 10px;border-radius:4px;";
  return "background:#e2e8f0;color:#94a3b8;padding:4px 10px;border-radius:4px;";
};

const headerCells = clouds
  .map(c => `<th style="padding:8px 14px;background:#1e293b;color:#fff;text-transform:uppercase;font-size:11px;letter-spacing:.05em;">${c}</th>`)
  .join("");

const bodyRows = agents
  .map(agent => {
    const cells = clouds
      .map(cloud => {
        const r = resultMap[`${cloud}:${agent}`] ?? "skip";
        return `<td style="padding:6px 14px;text-align:center;"><span style="${cellStyle(r)}">${r.toUpperCase()}</span></td>`;
      })
      .join("");
    return `<tr><td style="padding:6px 14px;font-weight:600;white-space:nowrap;color:#1e293b;">${agent}</td>${cells}</tr>`;
  })
  .join("");

const status = totalFail === "0" ? "✅ All Passed" : `❌ ${totalFail} Failed`;

const html = `<!DOCTYPE html>
<html><body style="font-family:system-ui,-apple-system,sans-serif;max-width:860px;margin:0 auto;padding:24px;color:#1e293b;">
<h2 style="margin:0 0 4px;">${status} — Spawn E2E Matrix</h2>
<p style="margin:0 0 20px;color:#64748b;font-size:14px;">Completed ${timestamp}</p>
<table style="border-collapse:collapse;width:100%;">
<thead>
<tr>
<th style="padding:8px 14px;background:#1e293b;color:#fff;text-align:left;font-size:11px;text-transform:uppercase;letter-spacing:.05em;">Agent</th>
${headerCells}
</tr>
</thead>
<tbody>
${bodyRows}
</tbody>
</table>
<p style="margin-top:18px;color:#64748b;font-size:13px;">
<strong style="color:#1e293b;">Total:</strong> ${totalPass} passed, ${totalFail} failed
·
<strong style="color:#1e293b;">Duration:</strong> ${duration}
</p>
</body></html>`;

const subject = totalFail === "0"
  ? `✅ E2E Matrix: ${totalPass} passed · ${duration}`
  : `❌ E2E Matrix: ${totalFail} failed, ${totalPass} passed · ${duration}`;

const res = await fetch("https://api.resend.com/emails", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    "Authorization": `Bearer ${resendKey}`,
  },
  body: JSON.stringify({
    from: "Spawn QA <onboarding@resend.dev>",
    to: [toEmail],
    subject,
    html,
  }),
});

if (!res.ok) {
  const body = await res.text();
  console.error(`Resend API error ${res.status}: ${body}`);
  process.exit(1);
}
console.log(`Matrix email sent to ${toEmail}`);
TS_EOF

  log_info "Sending matrix email to ${to_email}..."
  # Pass data via env (not argv) so values never leak through ps listings.
  _E2E_RESULTS="${results}" \
    _E2E_CLOUDS="${clouds}" \
    _E2E_AGENTS="${agents}" \
    _E2E_TOTAL_PASS="${total_pass}" \
    _E2E_TOTAL_FAIL="${total_fail}" \
    _E2E_DURATION="${duration_str}" \
    bun run "${ts_file}" 2>&1 || log_warn "Failed to send matrix email"

  rm -rf "${ts_dir}" 2>/dev/null || true
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Final cleanup trap
|
|
# ---------------------------------------------------------------------------
|
|
# EXIT-trap handler: tears down any still-tracked cloud instances, then
# removes LOG_DIR — but only when it can prove LOG_DIR is the exact temp
# directory this script created (symlink-free, owned by us, under the temp
# root).
final_cleanup() {
  # _TRACKED_APPS is appended to by track_app for every instance this
  # process provisioned (see run_single_agent).
  if [ -n "${_TRACKED_APPS}" ]; then
    printf "\n"
    log_warn "Cleaning up tracked instances on exit..."
    for app in ${_TRACKED_APPS}; do
      log_step "Tearing down ${app}..."
      teardown_agent "${app}" 2>/dev/null || log_warn "Failed to tear down ${app}"
    done
  fi
  if [ -n "${LOG_DIR:-}" ] && [ -d "${LOG_DIR:-}" ]; then
    # Only delete a LOG_DIR we created ourselves — guards against an
    # inherited/exported LOG_DIR pointing somewhere precious.
    if [ "${LOG_DIR}" != "${_E2E_CREATED_LOG_DIR:-}" ]; then
      log_warn "Refusing to rm -rf LOG_DIR not created by this script: ${LOG_DIR}"
    else
      # Reject symlinks to prevent TOCTOU races (CWE-367, #3233):
      # Previous code resolved symlinks then operated on the resolved path,
      # but an attacker could swap the symlink target between resolve and rm.
      # Fix: refuse to delete symlinks entirely — LOG_DIR should never be one.
      if [ -L "${LOG_DIR}" ]; then
        log_warn "LOG_DIR is a symlink, refusing deletion to prevent symlink attacks: ${LOG_DIR}"
        return
      fi
      SAFE_TMP_ROOT="${TMP_ROOT:-${TMPDIR:-/tmp}}"
      SAFE_TMP_ROOT="${SAFE_TMP_ROOT%/}"
      # Use realpath -P to resolve, then verify the original path matches
      # (ensures LOG_DIR is not inside a symlinked parent directory).
      # NOTE(review): realpath may be absent on some systems (e.g. older
      # macOS) — the empty-result branch below then skips cleanup rather
      # than failing; confirm this trade-off is intended.
      local resolved_log_dir
      resolved_log_dir=$(realpath -P "${LOG_DIR}" 2>/dev/null)
      if [ -z "${resolved_log_dir}" ]; then
        log_warn "Failed to resolve LOG_DIR path, skipping cleanup"
        return
      fi
      # Re-check symlink after resolve to narrow the TOCTOU window
      if [ -L "${LOG_DIR}" ]; then
        log_warn "LOG_DIR became a symlink during cleanup, aborting: ${LOG_DIR}"
        return
      fi
      # Verify ownership on the original path (not the resolved one)
      if [ ! -O "${LOG_DIR}" ]; then
        log_warn "LOG_DIR not owned by current user, refusing deletion: ${LOG_DIR}"
      else
        # Final gate: resolved path must live directly under the temp root
        # with the spawn-e2e. prefix used by mktemp below in main.
        case "${resolved_log_dir}" in
          "${SAFE_TMP_ROOT}"/spawn-e2e.*)
            # Delete the original path — if it became a symlink between check
            # and here, rm -rf on a symlink just removes the link itself when
            # the target no longer matches. The double -L check above minimizes
            # this window.
            rm -rf "${LOG_DIR}"
            ;;
          *)
            log_warn "Refusing to rm -rf unexpected path: ${resolved_log_dir}"
            ;;
        esac
      fi
    fi
  fi
}
trap final_cleanup EXIT
|
|
|
|
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
log_header "Spawn E2E Test Suite (Multi-Cloud)"
log_info "Clouds: ${CLOUDS}"
log_info "Agents: ${AGENTS_TO_TEST}"
# Describe the effective parallelism (99 is the "unlimited" sentinel default).
if [ "${SEQUENTIAL_MODE}" -eq 1 ]; then
  log_info "Agent parallelism: sequential"
elif [ "${PARALLEL_COUNT}" -ge 99 ]; then
  log_info "Agent parallelism: all at once (per-cloud caps may apply)"
else
  log_info "Agent parallelism: ${PARALLEL_COUNT} per cloud"
fi
if [ "${SKIP_INPUT_TEST}" -eq 1 ]; then
  log_info "Input tests: SKIPPED"
fi
if [ "${FAST_MODE}" -eq 1 ]; then
  log_info "Fast mode: ENABLED (--fast passed to spawn)"
fi

# Export FAST_MODE so provision.sh can read it
export E2E_FAST_MODE="${FAST_MODE}"

# Create temp log directory. Trailing slash is stripped from TMP_ROOT so the
# path-prefix check in final_cleanup compares consistently.
TMP_ROOT="${TMPDIR:-/tmp}"
TMP_ROOT="${TMP_ROOT%/}"
LOG_DIR=$(mktemp -d "${TMP_ROOT}/spawn-e2e.XXXXXX")
# Remember the exact dir we created — final_cleanup refuses to delete any
# other LOG_DIR value.
_E2E_CREATED_LOG_DIR="${LOG_DIR}"
export LOG_DIR
log_info "Log directory: ${LOG_DIR}"

START_TIME=$(date +%s)
|
|
|
|
# ---------------------------------------------------------------------------
# Execute: single-cloud or multi-cloud
# ---------------------------------------------------------------------------
if [ "${cloud_count}" -eq 1 ]; then
  # Single cloud — run directly in this process
  run_agents_for_cloud "${CLOUDS}" "${LOG_DIR}" || true

else
  # Multi-cloud — each cloud runs as a separate background process
  cloud_pids=""
  for cloud in ${CLOUDS}; do
    (
      # Reset parent's EXIT trap — the main process handles LOG_DIR cleanup
      trap - EXIT
      # Child starts with an empty tracked-apps list so the parent's EXIT
      # trap never tears down instances a child created (and vice versa).
      _TRACKED_APPS=""
      run_agents_for_cloud "${cloud}" "${LOG_DIR}"
    ) > "${LOG_DIR}/${cloud}.log" 2>&1 &
    cloud_pid=$!
    if [ -z "${cloud_pids}" ]; then
      cloud_pids="${cloud_pid}"
    else
      cloud_pids="${cloud_pids} ${cloud_pid}"
    fi
    log_info "Started ${cloud} tests (PID: ${cloud_pid})"
  done

  # Wait for all clouds to finish
  # NOTE(review): any_failed is set here but never read afterwards — the
  # summary below derives failure from per-agent .result files instead.
  any_failed=0
  for pid in ${cloud_pids}; do
    wait "${pid}" 2>/dev/null || any_failed=1
  done

  # Print per-cloud logs (buffered per cloud to avoid interleaved output)
  for cloud in ${CLOUDS}; do
    if [ -f "${LOG_DIR}/${cloud}.log" ]; then
      printf "\n"
      log_header "Output: ${cloud}"
      cat "${LOG_DIR}/${cloud}.log"
    fi
  done
fi
|
|
|
|
# ---------------------------------------------------------------------------
# Unified Summary
# ---------------------------------------------------------------------------
END_TIME=$(date +%s)
TOTAL_DURATION=$((END_TIME - START_TIME))
DURATION_STR=$(format_duration "${TOTAL_DURATION}")

printf "\n"
log_header "E2E Test Summary"

total_pass=0
total_fail=0
any_cloud_failed=0

for cloud in ${CLOUDS}; do
  printf "\n ${BOLD}%s:${NC}\n" "${cloud}"

  # NOTE(review): cloud_skip is initialized but never incremented or read.
  cloud_pass=0
  cloud_fail=0
  cloud_skip=0

  # Check if this cloud was skipped (no credentials) — no result files written
  cloud_has_results=0
  for agent in ${AGENTS_TO_TEST}; do
    if [ -f "${LOG_DIR}/${cloud}-${agent}.result" ]; then
      cloud_has_results=1
      break
    fi
  done

  if [ "${cloud_has_results}" -eq 0 ]; then
    printf " ${YELLOW}(skipped — credentials not configured)${NC}\n"
    continue
  fi

  # A missing or non-"pass" result file counts as FAIL.
  for agent in ${AGENTS_TO_TEST}; do
    result_file="${LOG_DIR}/${cloud}-${agent}.result"
    if [ -f "${result_file}" ] && [ "$(cat "${result_file}")" = "pass" ]; then
      printf " ${GREEN}%-12s PASS${NC}\n" "${agent}"
      cloud_pass=$((cloud_pass + 1))
      total_pass=$((total_pass + 1))
    else
      printf " ${RED}%-12s FAIL${NC}\n" "${agent}"
      cloud_fail=$((cloud_fail + 1))
      total_fail=$((total_fail + 1))
    fi
  done

  if [ "${cloud_fail}" -gt 0 ]; then
    printf " ${RED}%d passed, %d failed${NC}\n" "${cloud_pass}" "${cloud_fail}"
    # NOTE(review): any_cloud_failed is written but never read — the exit
    # decision below uses total_fail only.
    any_cloud_failed=1
  else
    printf " ${GREEN}%d passed, 0 failed${NC}\n" "${cloud_pass}"
  fi
done

printf "\n"
printf " ${BOLD}Total:${NC} ${GREEN}%d passed${NC}" "${total_pass}"
if [ "${total_fail}" -gt 0 ]; then
  printf ", ${RED}%d failed${NC}" "${total_fail}"
fi
printf "\n Duration: %s\n" "${DURATION_STR}"

# Send matrix email report
send_matrix_email "${LOG_DIR}" "${CLOUDS}" "${AGENTS_TO_TEST}" "${total_pass}" "${total_fail}" "${DURATION_STR}"

# Exit with failure if any agent on any cloud failed
if [ "${total_fail}" -gt 0 ]; then
  exit 1
fi

# All tests passed — advance the e2e-last-green tag for diff-aware reviews
mark_e2e_green

exit 0
|