spawn/sh/e2e/lib/clouds/hetzner.sh
A 81ab237efe
fix(e2e): harden shell scripts against injection in SSH commands (#2945)
- hetzner.sh: Pipe base64-encoded command via stdin to SSH instead of
  embedding it in the SSH command string via variable expansion. The
  remote bash reads stdin, base64-decodes, and executes.

- verify.sh: Add remote-side re-validation of base64 and timeout values
  in _stage_prompt_remotely and _stage_timeout_remotely. Values are
  assigned to remote shell variables and validated before writing to
  temp files, providing defense-in-depth against injection.

- provision.sh: Add explicit early rejection of dangerous shell chars
  ($, `, \) in env var values from cloud_headless_env, and add
  remote-side re-validation of base64 payload before writing.

Fixes #2937
Fixes #2938
Fixes #2939

Agent: security-auditor

Co-authored-by: B <6723574+louisgv@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-03-24 13:30:47 +07:00

385 lines
13 KiB
Bash

#!/bin/bash
# e2e/lib/clouds/hetzner.sh — Hetzner Cloud driver for multi-cloud E2E tests
set -eo pipefail
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
_HETZNER_API="https://api.hetzner.cloud/v1"
# ---------------------------------------------------------------------------
# _hetzner_curl_auth [curl-args...]
#
# Wrapper around curl that passes the HCLOUD_TOKEN via a temp config file
# instead of a command-line -H flag. This keeps the token out of `ps` output.
# All arguments are forwarded to curl.
# ---------------------------------------------------------------------------
_hetzner_curl_auth() {
local _cfg
_cfg=$(mktemp)
chmod 600 "${_cfg}"
printf 'header = "Authorization: Bearer %s"\n' "${HCLOUD_TOKEN}" > "${_cfg}"
curl -K "${_cfg}" "$@"
local _rc=$?
rm -f "${_cfg}"
return "${_rc}"
}
# ---------------------------------------------------------------------------
# _hetzner_validate_env
#
# Verify HCLOUD_TOKEN is set and credentials are valid.
# Returns 0 on success, 1 on failure.
# ---------------------------------------------------------------------------
_hetzner_validate_env() {
if [ -z "${HCLOUD_TOKEN:-}" ]; then
log_err "HCLOUD_TOKEN is not set"
return 1
fi
if ! _hetzner_curl_auth -sf \
"${_HETZNER_API}/servers?per_page=1" >/dev/null 2>&1; then
log_err "Hetzner API credentials are invalid"
return 1
fi
log_ok "Hetzner credentials validated"
return 0
}
# ---------------------------------------------------------------------------
# _hetzner_headless_env APP AGENT
#
# Print export lines for headless provisioning to stdout.
# ---------------------------------------------------------------------------
_hetzner_headless_env() {
local app="$1"
# $2 = agent (unused but part of the interface)
printf 'export HETZNER_SERVER_NAME="%s"\n' "${app}"
printf 'export HETZNER_SERVER_TYPE="%s"\n' "${HETZNER_SERVER_TYPE:-cx23}"
printf 'export HETZNER_LOCATION="%s"\n' "${HETZNER_LOCATION:-fsn1}"
}
# ---------------------------------------------------------------------------
# _hetzner_provision_verify APP LOG_DIR
#
# Verify the server exists via Hetzner API. Extract ID and IP.
# Write IP to $LOG_DIR/$APP.ip and metadata to $LOG_DIR/$APP.meta.
# ---------------------------------------------------------------------------
_hetzner_provision_verify() {
local app="$1"
local log_dir="$2"
# URL-encode the app name to prevent query parameter injection
local encoded_app
encoded_app=$(jq -rn --arg v "${app}" '$v|@uri')
local response
response=$(_hetzner_curl_auth -sf \
"${_HETZNER_API}/servers?name=${encoded_app}" 2>/dev/null || true)
if [ -z "${response}" ]; then
log_err "Failed to query Hetzner API for server ${app}"
return 1
fi
local server_count
server_count=$(printf '%s' "${response}" | jq '.servers | length' 2>/dev/null || printf '0')
if [ "${server_count}" -eq 0 ]; then
log_err "Server ${app} does not exist on Hetzner"
return 1
fi
local server_id
server_id=$(printf '%s' "${response}" | jq -r '.servers[0].id' 2>/dev/null)
local server_ip
server_ip=$(printf '%s' "${response}" | jq -r '.servers[0].public_net.ipv4.ip // empty' 2>/dev/null)
if [ -z "${server_ip}" ]; then
log_err "Could not resolve public IP for ${app}"
return 1
fi
local server_name
server_name=$(printf '%s' "${response}" | jq -r '.servers[0].name' 2>/dev/null)
local server_location
server_location=$(printf '%s' "${response}" | jq -r '.servers[0].datacenter.location.name // "unknown"' 2>/dev/null)
# Write IP for SSH access
printf '%s' "${server_ip}" > "${log_dir}/${app}.ip"
# Write metadata for teardown
printf '{"id":%s,"name":"%s","location":"%s"}\n' \
"${server_id}" "${server_name}" "${server_location}" \
> "${log_dir}/${app}.meta"
log_ok "Server ${app} verified (id=${server_id}, ip=${server_ip}, location=${server_location})"
return 0
}
# ---------------------------------------------------------------------------
# _hetzner_exec APP CMD
#
# Execute a command on the server via SSH.
# ---------------------------------------------------------------------------
_hetzner_exec() {
local app="$1"
local cmd="$2"
local log_dir="${LOG_DIR:-/tmp}"
local ip_file="${log_dir}/${app}.ip"
if [ ! -f "${ip_file}" ]; then
log_err "No IP file found for ${app} at ${ip_file}"
return 1
fi
local ip
ip=$(cat "${ip_file}")
if [ -z "${ip}" ]; then
log_err "Empty IP in ${ip_file}"
return 1
fi
# Validate IP looks like an IPv4 address (defense-in-depth against file tampering)
if ! printf '%s' "${ip}" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$'; then
log_err "Invalid IP address in ${ip_file}: ${ip}"
return 1
fi
# Base64-encode the command and pipe the payload via stdin to SSH.
# This eliminates variable expansion of the encoded command in the SSH
# command string, preventing injection even if base64 validation is bypassed.
local encoded_cmd
encoded_cmd=$(printf '%s' "${cmd}" | base64 | tr -d '\n')
# Validate base64 output contains only safe characters (defense-in-depth).
# Standard base64 only produces [A-Za-z0-9+/=]. This rejects any corruption.
if ! printf '%s' "${encoded_cmd}" | grep -qE '^[A-Za-z0-9+/=]+$'; then
log_err "Invalid base64 encoding of command for SSH exec"
return 1
fi
# Pipe the base64 payload via stdin to the remote host. The remote bash
# reads stdin, base64-decodes it, and executes the result. No user-controlled
# data is interpolated into the SSH command string.
printf '%s' "${encoded_cmd}" | ssh -o StrictHostKeyChecking=no \
-o UserKnownHostsFile=/dev/null \
-o LogLevel=ERROR \
-o BatchMode=yes \
-o ConnectTimeout=10 \
"root@${ip}" "base64 -d | bash"
}
# ---------------------------------------------------------------------------
# _hetzner_teardown APP
#
# Delete the server via Hetzner API using the stored server ID.
# ---------------------------------------------------------------------------
_hetzner_teardown() {
local app="$1"
local log_dir="${LOG_DIR:-/tmp}"
local meta_file="${log_dir}/${app}.meta"
if [ ! -f "${meta_file}" ]; then
log_warn "No metadata file for ${app} — cannot determine server ID"
untrack_app "${app}"
return 0
fi
local server_id
server_id=$(jq -r '.id' "${meta_file}" 2>/dev/null)
if [ -z "${server_id}" ] || [ "${server_id}" = "null" ]; then
log_warn "Could not parse server ID from ${meta_file}"
untrack_app "${app}"
return 0
fi
# Validate server ID is numeric (defense-in-depth against metadata tampering)
case "${server_id}" in ''|*[!0-9]*) log_warn "Non-numeric server ID: ${server_id}"; untrack_app "${app}"; return 0 ;; esac
log_step "Deleting Hetzner server ${app} (id=${server_id})"
local http_code
http_code=$(_hetzner_curl_auth -s -o /dev/null -w '%{http_code}' \
-X DELETE \
"${_HETZNER_API}/servers/${server_id}" 2>/dev/null || printf '000')
if [ "${http_code}" = "200" ] || [ "${http_code}" = "204" ]; then
log_ok "Server ${app} (id=${server_id}) deleted"
elif [ "${http_code}" = "404" ]; then
log_info "Server ${app} (id=${server_id}) already gone"
else
log_warn "Unexpected HTTP ${http_code} deleting server ${app} (id=${server_id})"
fi
untrack_app "${app}"
}
# ---------------------------------------------------------------------------
# _hetzner_cleanup_orphaned_ips
#
# Delete Hetzner Primary IPs not attached to any server. These accumulate
# from failed/interrupted provisioning runs and consume the account's
# primary_ip_limit quota, causing resource_limit_exceeded errors (#2933).
# ---------------------------------------------------------------------------
_hetzner_cleanup_orphaned_ips() {
local response
response=$(_hetzner_curl_auth -sf \
"${_HETZNER_API}/primary_ips?per_page=50" 2>/dev/null || true)
if [ -z "${response}" ]; then
log_info "Could not list Hetzner primary IPs — skipping IP cleanup"
return 0
fi
local orphaned
orphaned=$(printf '%s' "${response}" | jq -r '.primary_ips[] | select(.assignee_id == null or .assignee_id == 0) | "\(.id):\(.ip)"' 2>/dev/null || true)
if [ -z "${orphaned}" ]; then
log_ok "No orphaned Hetzner Primary IPs found"
return 0
fi
local cleaned=0
for entry in ${orphaned}; do
local ip_id
ip_id=$(printf '%s' "${entry}" | cut -d: -f1)
local ip_addr
ip_addr=$(printf '%s' "${entry}" | cut -d: -f2-)
# Validate IP ID is numeric before using it in API URL
case "${ip_id}" in ''|*[!0-9]*) log_warn "Skipping orphaned IP ${entry} — non-numeric ID"; continue ;; esac
local http_code
http_code=$(_hetzner_curl_auth -s -o /dev/null -w '%{http_code}' \
-X DELETE \
"${_HETZNER_API}/primary_ips/${ip_id}" 2>/dev/null || printf '000')
if [ "${http_code}" = "200" ] || [ "${http_code}" = "204" ]; then
log_ok "Deleted orphaned Primary IP ${ip_addr} (id=${ip_id})"
cleaned=$((cleaned + 1))
elif [ "${http_code}" = "404" ]; then
log_info "Primary IP ${ip_addr} (id=${ip_id}) already gone"
else
log_warn "Failed to delete Primary IP ${ip_addr} (id=${ip_id}, HTTP ${http_code})"
fi
done
if [ "${cleaned}" -gt 0 ]; then
log_ok "Cleaned ${cleaned} orphaned Hetzner Primary IP(s)"
fi
}
# ---------------------------------------------------------------------------
# _hetzner_cleanup_stale
#
# List all Hetzner servers, find e2e-* instances older than 30 minutes,
# and destroy them. Also cleans up orphaned Primary IPs to prevent
# resource_limit_exceeded errors (#2933).
# ---------------------------------------------------------------------------
_hetzner_cleanup_stale() {
local now
now=$(date +%s)
local max_age="${_CLEANUP_MAX_AGE:-1800}" # default 30 min; pre-run uses shorter
local response
response=$(_hetzner_curl_auth -sf \
"${_HETZNER_API}/servers?per_page=50" 2>/dev/null || true)
if [ -z "${response}" ]; then
log_info "Could not list Hetzner servers — skipping cleanup"
return 0
fi
local server_count
server_count=$(printf '%s' "${response}" | jq '.servers | length' 2>/dev/null || printf '0')
if [ "${server_count}" -eq 0 ]; then
log_ok "No Hetzner servers found"
return 0
fi
# Extract e2e-* servers as "id:name" pairs
local servers
servers=$(printf '%s' "${response}" | jq -r '.servers[] | select(.name | startswith("e2e-")) | "\(.id):\(.name)"' 2>/dev/null || true)
if [ -z "${servers}" ]; then
log_ok "No stale e2e instances found on Hetzner"
return 0
fi
local cleaned=0
local skipped=0
for entry in ${servers}; do
local server_id
server_id=$(printf '%s' "${entry}" | cut -d: -f1)
local server_name
server_name=$(printf '%s' "${entry}" | cut -d: -f2-)
# Validate server ID is numeric before using it in API URL
case "${server_id}" in ''|*[!0-9]*) log_warn "Skipping ${entry} — non-numeric server ID"; skipped=$((skipped + 1)); continue ;; esac
# Extract timestamp from name: e2e-AGENT-TIMESTAMP
local ts
ts=$(printf '%s' "${server_name}" | sed 's/.*-//')
# Validate it looks like a unix timestamp (all digits, 10 chars)
if ! printf '%s' "${ts}" | grep -qE '^[0-9]{10}$'; then
log_warn "Skipping ${server_name} — cannot parse timestamp"
skipped=$((skipped + 1))
continue
fi
local age=$((now - ts))
if [ "${age}" -gt "${max_age}" ]; then
local age_str
age_str=$(format_duration "${age}")
log_step "Destroying stale Hetzner server ${server_name} (id=${server_id}, age: ${age_str})"
local http_code
http_code=$(_hetzner_curl_auth -s -o /dev/null -w '%{http_code}' \
-X DELETE \
"${_HETZNER_API}/servers/${server_id}" 2>/dev/null || printf '000')
if [ "${http_code}" = "200" ] || [ "${http_code}" = "204" ]; then
log_ok "Deleted ${server_name}"
elif [ "${http_code}" = "404" ]; then
log_info "Server ${server_name} already gone"
else
log_warn "Failed to delete ${server_name} (HTTP ${http_code})"
fi
cleaned=$((cleaned + 1))
else
skipped=$((skipped + 1))
fi
done
if [ "${cleaned}" -gt 0 ]; then
log_ok "Cleaned ${cleaned} stale Hetzner instance(s)"
fi
if [ "${skipped}" -gt 0 ]; then
log_info "Skipped ${skipped} recent Hetzner instance(s)"
fi
# Also clean up orphaned Primary IPs to free quota for new provisioning (#2933)
_hetzner_cleanup_orphaned_ips
}
# ---------------------------------------------------------------------------
# _hetzner_max_parallel
#
# Hetzner accounts have a primary IP limit (~5 for most accounts).
# ---------------------------------------------------------------------------
_hetzner_max_parallel() {
printf '5'
}