mirror of
https://github.com/OpenRouterTeam/spawn.git
synced 2026-04-28 03:49:31 +00:00
* fix(security): array-based agent detection and GCP instance name validation Replace shell string concatenation in detectAgent() with individual `command -v` calls per agent, eliminating the compound shell command. Add _gcp_validate_instance_name() to validate GCP instance names match [a-z][a-z0-9-]*[a-z0-9] before passing to gcloud commands. Fixes #3151 Fixes #3149 Agent: security-auditor Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com> * fix: add instance name validation in _gcp_cleanup_stale() Defense-in-depth: validate instance names from GCP API before passing to gcloud delete, consistent with validation at other call sites. Agent: pr-maintainer Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com> --------- Co-authored-by: B <6723574+louisgv@users.noreply.github.com> Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
406 lines
14 KiB
Bash
406 lines
14 KiB
Bash
#!/bin/bash
|
|
# e2e/lib/clouds/gcp.sh — GCP Compute Engine cloud driver for multi-cloud E2E
|
|
#
|
|
# Implements the standard cloud driver interface (_gcp_* prefixed functions).
|
|
# Sourced by common.sh's load_cloud_driver() which wires these to generic names.
|
|
#
|
|
# Depends on: log_step, log_ok, log_err, log_warn, log_info, format_duration,
|
|
# untrack_app (provided by common.sh)
|
|
set -eo pipefail
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Instance IP cache (avoid repeated API calls within a single run)
|
|
# ---------------------------------------------------------------------------
|
|
_GCP_INSTANCE_IP=""
|
|
_GCP_INSTANCE_APP=""
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _gcp_validate_instance_name NAME
|
|
#
|
|
# Validate that a GCP instance name contains only safe characters.
|
|
# GCP requires: lowercase letters, digits, and hyphens; must start with a
|
|
# letter and not end with a hyphen; max 63 chars.
|
|
# Returns 0 on valid, 1 on invalid.
|
|
# ---------------------------------------------------------------------------
|
|
_gcp_validate_instance_name() {
|
|
local name="$1"
|
|
if [ -z "${name}" ]; then
|
|
log_err "Instance name is empty"
|
|
return 1
|
|
fi
|
|
if ! printf '%s' "${name}" | grep -qE '^[a-z][a-z0-9-]{0,61}[a-z0-9]$'; then
|
|
log_err "Invalid GCP instance name: ${name} (must match [a-z][a-z0-9-]*[a-z0-9], max 63 chars)"
|
|
return 1
|
|
fi
|
|
return 0
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _gcp_validate_env
|
|
#
|
|
# Check that the gcloud CLI is installed and credentials are valid.
|
|
# Requires GCP_PROJECT to be set. Loads GCP_PROJECT and GCP_ZONE from
|
|
# ~/.config/spawn/gcp.json if not already in the environment.
|
|
# Returns 0 on success, 1 on failure.
|
|
# ---------------------------------------------------------------------------
|
|
_gcp_validate_env() {
|
|
local missing=0
|
|
|
|
# Load GCP_PROJECT and GCP_ZONE from ~/.config/spawn/gcp.json if not set.
|
|
# This allows the QA VM to configure the correct zone without env var exports.
|
|
local _gcp_config="${HOME}/.config/spawn/gcp.json"
|
|
if [ -f "${_gcp_config}" ]; then
|
|
if [ -z "${GCP_PROJECT:-}" ]; then
|
|
local _proj
|
|
if command -v jq >/dev/null 2>&1; then
|
|
_proj=$(jq -r '.GCP_PROJECT // "" | select(. != null)' "${_gcp_config}" 2>/dev/null)
|
|
else
|
|
_proj=$(_FILE="${_gcp_config}" bun -e "
|
|
import fs from 'fs';
|
|
const d = JSON.parse(fs.readFileSync(process.env._FILE, 'utf8'));
|
|
process.stdout.write(d.GCP_PROJECT || '');
|
|
" 2>/dev/null)
|
|
fi
|
|
if [ -n "${_proj}" ]; then
|
|
export GCP_PROJECT="${_proj}"
|
|
fi
|
|
fi
|
|
if [ -z "${GCP_ZONE:-}" ]; then
|
|
local _zone
|
|
if command -v jq >/dev/null 2>&1; then
|
|
_zone=$(jq -r '.GCP_ZONE // "" | select(. != null)' "${_gcp_config}" 2>/dev/null)
|
|
else
|
|
_zone=$(_FILE="${_gcp_config}" bun -e "
|
|
import fs from 'fs';
|
|
const d = JSON.parse(fs.readFileSync(process.env._FILE, 'utf8'));
|
|
process.stdout.write(d.GCP_ZONE || '');
|
|
" 2>/dev/null)
|
|
fi
|
|
if [ -n "${_zone}" ]; then
|
|
export GCP_ZONE="${_zone}"
|
|
fi
|
|
fi
|
|
fi
|
|
|
|
if ! command -v gcloud >/dev/null 2>&1; then
|
|
log_err "gcloud CLI not found. Install from https://cloud.google.com/sdk/docs/install"
|
|
missing=1
|
|
fi
|
|
|
|
if [ -z "${GCP_PROJECT:-}" ]; then
|
|
log_err "GCP_PROJECT is not set"
|
|
missing=1
|
|
fi
|
|
|
|
if [ "${missing}" -eq 1 ]; then
|
|
return 1
|
|
fi
|
|
|
|
if ! gcloud auth print-identity-token >/dev/null 2>&1; then
|
|
log_err "GCP credentials are not valid. Run: gcloud auth login"
|
|
return 1
|
|
fi
|
|
|
|
# Check if billing is enabled on the project. Without billing, instance
|
|
# creation always fails — skip early so the orchestrator reports "skipped"
|
|
# instead of failing every agent individually. See #3091.
|
|
local _billing_enabled
|
|
_billing_enabled=$(gcloud billing projects describe "${GCP_PROJECT}" \
|
|
--format="value(billingEnabled)" 2>/dev/null || true)
|
|
if [ "${_billing_enabled}" = "False" ]; then
|
|
log_err "Billing is disabled on GCP project '${GCP_PROJECT}' — cannot create instances"
|
|
log_err "Re-enable billing at: https://console.cloud.google.com/billing/linkedaccount?project=${GCP_PROJECT}"
|
|
return 1
|
|
fi
|
|
|
|
log_ok "GCP credentials validated (project: ${GCP_PROJECT}, zone: ${GCP_ZONE:-us-central1-a})"
|
|
return 0
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _gcp_headless_env APP AGENT
|
|
#
|
|
# Print export lines to stdout for headless provisioning.
|
|
# These are eval'd by the provisioning harness before invoking the CLI.
|
|
# ---------------------------------------------------------------------------
|
|
_gcp_headless_env() {
|
|
local app="$1"
|
|
# $2 = agent (unused but part of the interface)
|
|
_gcp_validate_instance_name "${app}" || return 1
|
|
|
|
printf 'export GCP_INSTANCE_NAME="%s"\n' "${app}"
|
|
printf 'export GCP_PROJECT="%s"\n' "${GCP_PROJECT:-}"
|
|
printf 'export GCP_ZONE="%s"\n' "${GCP_ZONE:-us-central1-a}"
|
|
printf 'export GCP_MACHINE_TYPE="%s"\n' "${GCP_MACHINE_TYPE:-e2-standard-2}"
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _gcp_provision_verify APP LOG_DIR
|
|
#
|
|
# Verify instance exists after provisioning, resolve external IP, and write
|
|
# metadata files for downstream steps (verify, teardown).
|
|
#
|
|
# Writes:
|
|
# $LOG_DIR/$APP.ip — external IPv4 address (plain text)
|
|
# $LOG_DIR/$APP.meta — instance metadata (JSON: name, zone, project)
|
|
# ---------------------------------------------------------------------------
|
|
_gcp_provision_verify() {
|
|
local app="$1"
|
|
local log_dir="$2"
|
|
local zone="${GCP_ZONE:-us-central1-a}"
|
|
local project="${GCP_PROJECT:-}"
|
|
_gcp_validate_instance_name "${app}" || return 1
|
|
|
|
# Check instance exists
|
|
if ! gcloud compute instances describe "${app}" \
|
|
--zone="${zone}" \
|
|
--project="${project}" >/dev/null 2>&1; then
|
|
log_err "Instance ${app} does not exist after provisioning"
|
|
return 1
|
|
fi
|
|
|
|
log_ok "Instance ${app} exists"
|
|
|
|
# Resolve external IP from networkInterfaces
|
|
local instance_ip
|
|
instance_ip=$(gcloud compute instances describe "${app}" \
|
|
--zone="${zone}" \
|
|
--project="${project}" \
|
|
--format=json 2>/dev/null \
|
|
| jq -r '.networkInterfaces[0].accessConfigs[0].natIP // empty' 2>/dev/null || true)
|
|
|
|
if [ -z "${instance_ip}" ]; then
|
|
log_err "Could not resolve external IP for ${app}"
|
|
return 1
|
|
fi
|
|
|
|
log_ok "Instance IP: ${instance_ip}"
|
|
|
|
# Write IP file for downstream steps
|
|
printf '%s' "${instance_ip}" > "${log_dir}/${app}.ip"
|
|
|
|
# Write metadata file
|
|
printf '{"name":"%s","zone":"%s","project":"%s"}\n' \
|
|
"${app}" "${zone}" "${project}" \
|
|
> "${log_dir}/${app}.meta"
|
|
|
|
return 0
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _gcp_exec APP CMD
|
|
#
|
|
# Resolve instance IP (cached), then run CMD via SSH.
|
|
# Returns the exit code of the remote command.
|
|
# ---------------------------------------------------------------------------
|
|
_gcp_exec() {
|
|
local app="$1"
|
|
local cmd="$2"
|
|
local ssh_user="${GCP_SSH_USER:-$(whoami)}"
|
|
_gcp_validate_instance_name "${app}" || return 1
|
|
|
|
# Validate SSH user contains only safe characters (defense-in-depth)
|
|
if ! printf '%s' "${ssh_user}" | grep -qE '^[a-zA-Z0-9._-]+$'; then
|
|
log_err "Invalid SSH user for instance ${app}: ${ssh_user}"
|
|
return 1
|
|
fi
|
|
|
|
# Resolve instance IP (cached per app)
|
|
if [ "${_GCP_INSTANCE_APP}" != "${app}" ] || [ -z "${_GCP_INSTANCE_IP}" ]; then
|
|
# Try reading from the IP file first (written by _gcp_provision_verify)
|
|
if [ -n "${LOG_DIR:-}" ] && [ -f "${LOG_DIR}/${app}.ip" ]; then
|
|
_GCP_INSTANCE_IP=$(cat "${LOG_DIR}/${app}.ip")
|
|
else
|
|
_GCP_INSTANCE_IP=$(gcloud compute instances describe "${app}" \
|
|
--zone="${GCP_ZONE:-us-central1-a}" \
|
|
--project="${GCP_PROJECT:-}" \
|
|
--format=json 2>/dev/null \
|
|
| jq -r '.networkInterfaces[0].accessConfigs[0].natIP // empty' 2>/dev/null || true)
|
|
fi
|
|
_GCP_INSTANCE_APP="${app}"
|
|
if [ -z "${_GCP_INSTANCE_IP}" ]; then
|
|
log_err "Could not resolve IP for instance ${app}"
|
|
return 1
|
|
fi
|
|
# Validate IP looks like an IPv4 address (defense-in-depth against API/file tampering)
|
|
if ! printf '%s' "${_GCP_INSTANCE_IP}" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$'; then
|
|
log_err "Invalid IP address for instance ${app}: ${_GCP_INSTANCE_IP}"
|
|
_GCP_INSTANCE_IP=""
|
|
_GCP_INSTANCE_APP=""
|
|
return 1
|
|
fi
|
|
fi
|
|
|
|
# Base64-encode the command and pipe it via stdin to avoid any shell
|
|
# interpolation on the remote side. This is structurally immune to
|
|
# injection regardless of the command content.
|
|
local encoded_cmd
|
|
encoded_cmd=$(printf '%s' "${cmd}" | base64 | tr -d '\n')
|
|
|
|
# Validate base64 output contains only safe characters (defense-in-depth).
|
|
# Standard base64 only produces [A-Za-z0-9+/=]. This rejects any corruption.
|
|
if ! printf '%s' "${encoded_cmd}" | grep -qE '^[A-Za-z0-9+/=]+$'; then
|
|
log_err "Invalid base64 encoding of command for SSH exec"
|
|
return 1
|
|
fi
|
|
|
|
# Pass encoded command via stdin instead of shell interpolation.
|
|
# This completely avoids command injection — the remote side only sees
|
|
# stdin data, never an interpolated shell string.
|
|
printf '%s' "${encoded_cmd}" | ssh -o StrictHostKeyChecking=accept-new -o UserKnownHostsFile=/dev/null \
|
|
-o ConnectTimeout=10 -o LogLevel=ERROR -o BatchMode=yes \
|
|
"${ssh_user}@${_GCP_INSTANCE_IP}" "base64 -d | bash"
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _gcp_teardown APP
|
|
#
|
|
# Delete the GCP Compute Engine instance, verify deletion, and untrack it.
|
|
# Reads zone/project from metadata file if available, falls back to env vars.
|
|
# ---------------------------------------------------------------------------
|
|
_gcp_teardown() {
|
|
local app="$1"
|
|
local zone="${GCP_ZONE:-us-central1-a}"
|
|
local project="${GCP_PROJECT:-}"
|
|
_gcp_validate_instance_name "${app}" || return 1
|
|
|
|
# Try reading zone/project from metadata file
|
|
if [ -n "${LOG_DIR:-}" ] && [ -f "${LOG_DIR}/${app}.meta" ]; then
|
|
local meta_zone
|
|
local meta_project
|
|
meta_zone=$(jq -r '.zone // empty' "${LOG_DIR}/${app}.meta" 2>/dev/null || true)
|
|
meta_project=$(jq -r '.project // empty' "${LOG_DIR}/${app}.meta" 2>/dev/null || true)
|
|
if [ -n "${meta_zone}" ]; then
|
|
zone="${meta_zone}"
|
|
fi
|
|
if [ -n "${meta_project}" ]; then
|
|
project="${meta_project}"
|
|
fi
|
|
fi
|
|
|
|
log_step "Tearing down ${app}..."
|
|
|
|
# Delete the instance
|
|
gcloud compute instances delete "${app}" \
|
|
--zone="${zone}" \
|
|
--project="${project}" \
|
|
--quiet >/dev/null 2>&1 || true
|
|
|
|
# Brief wait for deletion to propagate
|
|
sleep 2
|
|
|
|
# Verify deletion
|
|
if gcloud compute instances describe "${app}" \
|
|
--zone="${zone}" \
|
|
--project="${project}" >/dev/null 2>&1; then
|
|
log_warn "Instance ${app} may still exist (GCP still reports it)"
|
|
else
|
|
log_ok "Instance ${app} torn down"
|
|
fi
|
|
|
|
# Clear IP cache if this was the cached instance
|
|
if [ "${_GCP_INSTANCE_APP}" = "${app}" ]; then
|
|
_GCP_INSTANCE_IP=""
|
|
_GCP_INSTANCE_APP=""
|
|
fi
|
|
|
|
untrack_app "${app}"
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _gcp_cleanup_stale
|
|
#
|
|
# List all GCP Compute Engine instances matching e2e-* in the project,
|
|
# and destroy any older than 30 minutes (based on the unix timestamp
|
|
# embedded in the name).
|
|
# ---------------------------------------------------------------------------
|
|
_gcp_cleanup_stale() {
|
|
local project="${GCP_PROJECT:-}"
|
|
local now
|
|
now=$(date +%s)
|
|
local max_age="${_CLEANUP_MAX_AGE:-1800}" # default 30 min; pre-run uses shorter
|
|
|
|
if [ -z "${project}" ]; then
|
|
log_warn "GCP_PROJECT not set — skipping stale cleanup"
|
|
return 0
|
|
fi
|
|
|
|
# List all instances matching e2e-* across all zones
|
|
local instances_json
|
|
instances_json=$(gcloud compute instances list \
|
|
--filter="name~^e2e-" \
|
|
--project="${project}" \
|
|
--format=json 2>/dev/null || true)
|
|
|
|
if [ -z "${instances_json}" ] || [ "${instances_json}" = "null" ] || [ "${instances_json}" = "[]" ]; then
|
|
log_info "Could not list instances or no e2e instances found — skipping cleanup"
|
|
return 0
|
|
fi
|
|
|
|
# Extract instance names and zones
|
|
local instance_entries
|
|
instance_entries=$(printf '%s' "${instances_json}" | jq -r '.[]? | "\(.name) \(.zone)"' 2>/dev/null || true)
|
|
|
|
if [ -z "${instance_entries}" ]; then
|
|
log_ok "No stale e2e instances found"
|
|
return 0
|
|
fi
|
|
|
|
local cleaned=0
|
|
local skipped=0
|
|
|
|
while IFS= read -r entry; do
|
|
local instance_name
|
|
local instance_zone_url
|
|
instance_name=$(printf '%s' "${entry}" | awk '{print $1}')
|
|
instance_zone_url=$(printf '%s' "${entry}" | awk '{print $2}')
|
|
|
|
if ! _gcp_validate_instance_name "${instance_name}"; then
|
|
log_warn "Skipping ${instance_name} — invalid name format"
|
|
skipped=$((skipped + 1))
|
|
continue
|
|
fi
|
|
|
|
# Extract zone name from full URL (zones/us-central1-a -> us-central1-a)
|
|
local instance_zone
|
|
instance_zone=$(printf '%s' "${instance_zone_url}" | sed 's|.*/||')
|
|
|
|
# Extract timestamp from name: e2e-AGENT-TIMESTAMP
|
|
# The timestamp is the last dash-separated segment
|
|
local ts
|
|
ts=$(printf '%s' "${instance_name}" | sed 's/.*-//')
|
|
|
|
# Validate it looks like a unix timestamp (all digits, 10 chars)
|
|
if ! printf '%s' "${ts}" | grep -qE '^[0-9]{10}$'; then
|
|
log_warn "Skipping ${instance_name} — cannot parse timestamp"
|
|
skipped=$((skipped + 1))
|
|
continue
|
|
fi
|
|
|
|
local age=$((now - ts))
|
|
if [ "${age}" -gt "${max_age}" ]; then
|
|
local age_str
|
|
age_str=$(format_duration "${age}")
|
|
log_step "Destroying stale instance ${instance_name} (zone: ${instance_zone}, age: ${age_str})"
|
|
|
|
# Directly delete with the instance's actual zone
|
|
gcloud compute instances delete "${instance_name}" \
|
|
--zone="${instance_zone}" \
|
|
--project="${project}" \
|
|
--quiet >/dev/null 2>&1 || log_warn "Failed to tear down ${instance_name}"
|
|
|
|
cleaned=$((cleaned + 1))
|
|
else
|
|
skipped=$((skipped + 1))
|
|
fi
|
|
done <<EOF
|
|
${instance_entries}
|
|
EOF
|
|
|
|
if [ "${cleaned}" -gt 0 ]; then
|
|
log_ok "Cleaned ${cleaned} stale instance(s)"
|
|
fi
|
|
if [ "${skipped}" -gt 0 ]; then
|
|
log_info "Skipped ${skipped} recent instance(s)"
|
|
fi
|
|
}
|