spawn/sh/e2e/lib/clouds/gcp.sh
A e3278578ee
fix(e2e): skip GCP tests when billing is disabled (#3146)
Add a billing pre-check to _gcp_validate_env so the E2E orchestrator
skips GCP gracefully ("skipped — credentials not configured") instead
of failing every agent individually when billing is disabled.

Fixes #3091

Agent: test-engineer

Co-authored-by: B <6723574+louisgv@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-02 19:26:42 +07:00

375 lines
13 KiB
Bash

#!/bin/bash
# e2e/lib/clouds/gcp.sh — GCP Compute Engine cloud driver for multi-cloud E2E
#
# Implements the standard cloud driver interface (_gcp_* prefixed functions).
# Sourced by common.sh's load_cloud_driver() which wires these to generic names.
#
# Depends on: log_step, log_ok, log_err, log_warn, log_info, format_duration,
# untrack_app (provided by common.sh)
set -eo pipefail
# ---------------------------------------------------------------------------
# Instance IP cache (avoid repeated API calls within a single run)
# ---------------------------------------------------------------------------
_GCP_INSTANCE_IP=""
_GCP_INSTANCE_APP=""
# ---------------------------------------------------------------------------
# _gcp_validate_env
#
# Check that the gcloud CLI is installed and credentials are valid.
# Requires GCP_PROJECT to be set. Loads GCP_PROJECT and GCP_ZONE from
# ~/.config/spawn/gcp.json if not already in the environment.
# Returns 0 on success, 1 on failure.
# ---------------------------------------------------------------------------
_gcp_validate_env() {
local missing=0
# Load GCP_PROJECT and GCP_ZONE from ~/.config/spawn/gcp.json if not set.
# This allows the QA VM to configure the correct zone without env var exports.
local _gcp_config="${HOME}/.config/spawn/gcp.json"
if [ -f "${_gcp_config}" ]; then
if [ -z "${GCP_PROJECT:-}" ]; then
local _proj
if command -v jq >/dev/null 2>&1; then
_proj=$(jq -r '.GCP_PROJECT // "" | select(. != null)' "${_gcp_config}" 2>/dev/null)
else
_proj=$(_FILE="${_gcp_config}" bun -e "
import fs from 'fs';
const d = JSON.parse(fs.readFileSync(process.env._FILE, 'utf8'));
process.stdout.write(d.GCP_PROJECT || '');
" 2>/dev/null)
fi
if [ -n "${_proj}" ]; then
export GCP_PROJECT="${_proj}"
fi
fi
if [ -z "${GCP_ZONE:-}" ]; then
local _zone
if command -v jq >/dev/null 2>&1; then
_zone=$(jq -r '.GCP_ZONE // "" | select(. != null)' "${_gcp_config}" 2>/dev/null)
else
_zone=$(_FILE="${_gcp_config}" bun -e "
import fs from 'fs';
const d = JSON.parse(fs.readFileSync(process.env._FILE, 'utf8'));
process.stdout.write(d.GCP_ZONE || '');
" 2>/dev/null)
fi
if [ -n "${_zone}" ]; then
export GCP_ZONE="${_zone}"
fi
fi
fi
if ! command -v gcloud >/dev/null 2>&1; then
log_err "gcloud CLI not found. Install from https://cloud.google.com/sdk/docs/install"
missing=1
fi
if [ -z "${GCP_PROJECT:-}" ]; then
log_err "GCP_PROJECT is not set"
missing=1
fi
if [ "${missing}" -eq 1 ]; then
return 1
fi
if ! gcloud auth print-identity-token >/dev/null 2>&1; then
log_err "GCP credentials are not valid. Run: gcloud auth login"
return 1
fi
# Check if billing is enabled on the project. Without billing, instance
# creation always fails — skip early so the orchestrator reports "skipped"
# instead of failing every agent individually. See #3091.
local _billing_enabled
_billing_enabled=$(gcloud billing projects describe "${GCP_PROJECT}" \
--format="value(billingEnabled)" 2>/dev/null || true)
if [ "${_billing_enabled}" = "False" ]; then
log_err "Billing is disabled on GCP project '${GCP_PROJECT}' — cannot create instances"
log_err "Re-enable billing at: https://console.cloud.google.com/billing/linkedaccount?project=${GCP_PROJECT}"
return 1
fi
log_ok "GCP credentials validated (project: ${GCP_PROJECT}, zone: ${GCP_ZONE:-us-central1-a})"
return 0
}
# ---------------------------------------------------------------------------
# _gcp_headless_env APP AGENT
#
# Print export lines to stdout for headless provisioning.
# These are eval'd by the provisioning harness before invoking the CLI.
# ---------------------------------------------------------------------------
_gcp_headless_env() {
local app="$1"
# $2 = agent (unused but part of the interface)
printf 'export GCP_INSTANCE_NAME="%s"\n' "${app}"
printf 'export GCP_PROJECT="%s"\n' "${GCP_PROJECT:-}"
printf 'export GCP_ZONE="%s"\n' "${GCP_ZONE:-us-central1-a}"
printf 'export GCP_MACHINE_TYPE="%s"\n' "${GCP_MACHINE_TYPE:-e2-standard-2}"
}
# ---------------------------------------------------------------------------
# _gcp_provision_verify APP LOG_DIR
#
# Verify instance exists after provisioning, resolve external IP, and write
# metadata files for downstream steps (verify, teardown).
#
# Writes:
# $LOG_DIR/$APP.ip — external IPv4 address (plain text)
# $LOG_DIR/$APP.meta — instance metadata (JSON: name, zone, project)
# ---------------------------------------------------------------------------
_gcp_provision_verify() {
local app="$1"
local log_dir="$2"
local zone="${GCP_ZONE:-us-central1-a}"
local project="${GCP_PROJECT:-}"
# Check instance exists
if ! gcloud compute instances describe "${app}" \
--zone="${zone}" \
--project="${project}" >/dev/null 2>&1; then
log_err "Instance ${app} does not exist after provisioning"
return 1
fi
log_ok "Instance ${app} exists"
# Resolve external IP from networkInterfaces
local instance_ip
instance_ip=$(gcloud compute instances describe "${app}" \
--zone="${zone}" \
--project="${project}" \
--format=json 2>/dev/null \
| jq -r '.networkInterfaces[0].accessConfigs[0].natIP // empty' 2>/dev/null || true)
if [ -z "${instance_ip}" ]; then
log_err "Could not resolve external IP for ${app}"
return 1
fi
log_ok "Instance IP: ${instance_ip}"
# Write IP file for downstream steps
printf '%s' "${instance_ip}" > "${log_dir}/${app}.ip"
# Write metadata file
printf '{"name":"%s","zone":"%s","project":"%s"}\n' \
"${app}" "${zone}" "${project}" \
> "${log_dir}/${app}.meta"
return 0
}
# ---------------------------------------------------------------------------
# _gcp_exec APP CMD
#
# Resolve instance IP (cached), then run CMD via SSH.
# Returns the exit code of the remote command.
# ---------------------------------------------------------------------------
_gcp_exec() {
local app="$1"
local cmd="$2"
local ssh_user="${GCP_SSH_USER:-$(whoami)}"
# Validate SSH user contains only safe characters (defense-in-depth)
if ! printf '%s' "${ssh_user}" | grep -qE '^[a-zA-Z0-9._-]+$'; then
log_err "Invalid SSH user for instance ${app}: ${ssh_user}"
return 1
fi
# Resolve instance IP (cached per app)
if [ "${_GCP_INSTANCE_APP}" != "${app}" ] || [ -z "${_GCP_INSTANCE_IP}" ]; then
# Try reading from the IP file first (written by _gcp_provision_verify)
if [ -n "${LOG_DIR:-}" ] && [ -f "${LOG_DIR}/${app}.ip" ]; then
_GCP_INSTANCE_IP=$(cat "${LOG_DIR}/${app}.ip")
else
_GCP_INSTANCE_IP=$(gcloud compute instances describe "${app}" \
--zone="${GCP_ZONE:-us-central1-a}" \
--project="${GCP_PROJECT:-}" \
--format=json 2>/dev/null \
| jq -r '.networkInterfaces[0].accessConfigs[0].natIP // empty' 2>/dev/null || true)
fi
_GCP_INSTANCE_APP="${app}"
if [ -z "${_GCP_INSTANCE_IP}" ]; then
log_err "Could not resolve IP for instance ${app}"
return 1
fi
# Validate IP looks like an IPv4 address (defense-in-depth against API/file tampering)
if ! printf '%s' "${_GCP_INSTANCE_IP}" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$'; then
log_err "Invalid IP address for instance ${app}: ${_GCP_INSTANCE_IP}"
_GCP_INSTANCE_IP=""
_GCP_INSTANCE_APP=""
return 1
fi
fi
# Base64-encode the command and pipe it via stdin to avoid any shell
# interpolation on the remote side. This is structurally immune to
# injection regardless of the command content.
local encoded_cmd
encoded_cmd=$(printf '%s' "${cmd}" | base64 | tr -d '\n')
# Validate base64 output contains only safe characters (defense-in-depth).
# Standard base64 only produces [A-Za-z0-9+/=]. This rejects any corruption.
if ! printf '%s' "${encoded_cmd}" | grep -qE '^[A-Za-z0-9+/=]+$'; then
log_err "Invalid base64 encoding of command for SSH exec"
return 1
fi
# Pass encoded command via stdin instead of shell interpolation.
# This completely avoids command injection — the remote side only sees
# stdin data, never an interpolated shell string.
printf '%s' "${encoded_cmd}" | ssh -o StrictHostKeyChecking=accept-new -o UserKnownHostsFile=/dev/null \
-o ConnectTimeout=10 -o LogLevel=ERROR -o BatchMode=yes \
"${ssh_user}@${_GCP_INSTANCE_IP}" "base64 -d | bash"
}
# ---------------------------------------------------------------------------
# _gcp_teardown APP
#
# Delete the GCP Compute Engine instance, verify deletion, and untrack it.
# Reads zone/project from metadata file if available, falls back to env vars.
# ---------------------------------------------------------------------------
_gcp_teardown() {
local app="$1"
local zone="${GCP_ZONE:-us-central1-a}"
local project="${GCP_PROJECT:-}"
# Try reading zone/project from metadata file
if [ -n "${LOG_DIR:-}" ] && [ -f "${LOG_DIR}/${app}.meta" ]; then
local meta_zone
local meta_project
meta_zone=$(jq -r '.zone // empty' "${LOG_DIR}/${app}.meta" 2>/dev/null || true)
meta_project=$(jq -r '.project // empty' "${LOG_DIR}/${app}.meta" 2>/dev/null || true)
if [ -n "${meta_zone}" ]; then
zone="${meta_zone}"
fi
if [ -n "${meta_project}" ]; then
project="${meta_project}"
fi
fi
log_step "Tearing down ${app}..."
# Delete the instance
gcloud compute instances delete "${app}" \
--zone="${zone}" \
--project="${project}" \
--quiet >/dev/null 2>&1 || true
# Brief wait for deletion to propagate
sleep 2
# Verify deletion
if gcloud compute instances describe "${app}" \
--zone="${zone}" \
--project="${project}" >/dev/null 2>&1; then
log_warn "Instance ${app} may still exist (GCP still reports it)"
else
log_ok "Instance ${app} torn down"
fi
# Clear IP cache if this was the cached instance
if [ "${_GCP_INSTANCE_APP}" = "${app}" ]; then
_GCP_INSTANCE_IP=""
_GCP_INSTANCE_APP=""
fi
untrack_app "${app}"
}
# ---------------------------------------------------------------------------
# _gcp_cleanup_stale
#
# List all GCP Compute Engine instances matching e2e-* in the project,
# and destroy any older than 30 minutes (based on the unix timestamp
# embedded in the name).
# ---------------------------------------------------------------------------
_gcp_cleanup_stale() {
local project="${GCP_PROJECT:-}"
local now
now=$(date +%s)
local max_age="${_CLEANUP_MAX_AGE:-1800}" # default 30 min; pre-run uses shorter
if [ -z "${project}" ]; then
log_warn "GCP_PROJECT not set — skipping stale cleanup"
return 0
fi
# List all instances matching e2e-* across all zones
local instances_json
instances_json=$(gcloud compute instances list \
--filter="name~^e2e-" \
--project="${project}" \
--format=json 2>/dev/null || true)
if [ -z "${instances_json}" ] || [ "${instances_json}" = "null" ] || [ "${instances_json}" = "[]" ]; then
log_info "Could not list instances or no e2e instances found — skipping cleanup"
return 0
fi
# Extract instance names and zones
local instance_entries
instance_entries=$(printf '%s' "${instances_json}" | jq -r '.[]? | "\(.name) \(.zone)"' 2>/dev/null || true)
if [ -z "${instance_entries}" ]; then
log_ok "No stale e2e instances found"
return 0
fi
local cleaned=0
local skipped=0
while IFS= read -r entry; do
local instance_name
local instance_zone_url
instance_name=$(printf '%s' "${entry}" | awk '{print $1}')
instance_zone_url=$(printf '%s' "${entry}" | awk '{print $2}')
# Extract zone name from full URL (zones/us-central1-a -> us-central1-a)
local instance_zone
instance_zone=$(printf '%s' "${instance_zone_url}" | sed 's|.*/||')
# Extract timestamp from name: e2e-AGENT-TIMESTAMP
# The timestamp is the last dash-separated segment
local ts
ts=$(printf '%s' "${instance_name}" | sed 's/.*-//')
# Validate it looks like a unix timestamp (all digits, 10 chars)
if ! printf '%s' "${ts}" | grep -qE '^[0-9]{10}$'; then
log_warn "Skipping ${instance_name} — cannot parse timestamp"
skipped=$((skipped + 1))
continue
fi
local age=$((now - ts))
if [ "${age}" -gt "${max_age}" ]; then
local age_str
age_str=$(format_duration "${age}")
log_step "Destroying stale instance ${instance_name} (zone: ${instance_zone}, age: ${age_str})"
# Directly delete with the instance's actual zone
gcloud compute instances delete "${instance_name}" \
--zone="${instance_zone}" \
--project="${project}" \
--quiet >/dev/null 2>&1 || log_warn "Failed to tear down ${instance_name}"
cleaned=$((cleaned + 1))
else
skipped=$((skipped + 1))
fi
done <<EOF
${instance_entries}
EOF
if [ "${cleaned}" -gt 0 ]; then
log_ok "Cleaned ${cleaned} stale instance(s)"
fi
if [ "${skipped}" -gt 0 ]; then
log_info "Skipped ${skipped} recent instance(s)"
fi
}