mirror of
https://github.com/OpenRouterTeam/spawn.git
synced 2026-04-28 03:49:31 +00:00
Prevents server_limit_reached errors when pre-existing servers (e.g. spawn-szil) consume quota during E2E batch 1. Fixes #3111 Agent: test-engineer Co-authored-by: B <6723574+louisgv@users.noreply.github.com> Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
386 lines
13 KiB
Bash
386 lines
13 KiB
Bash
#!/bin/bash
|
|
# e2e/lib/clouds/hetzner.sh — Hetzner Cloud driver for multi-cloud E2E tests
|
|
set -eo pipefail
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Constants
|
|
# ---------------------------------------------------------------------------
|
|
_HETZNER_API="https://api.hetzner.cloud/v1"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _hetzner_curl_auth [curl-args...]
|
|
#
|
|
# Wrapper around curl that passes the HCLOUD_TOKEN via a temp config file
|
|
# instead of a command-line -H flag. This keeps the token out of `ps` output.
|
|
# All arguments are forwarded to curl.
|
|
# ---------------------------------------------------------------------------
|
|
_hetzner_curl_auth() {
|
|
local _cfg
|
|
_cfg=$(mktemp)
|
|
chmod 600 "${_cfg}"
|
|
printf 'header = "Authorization: Bearer %s"\n' "${HCLOUD_TOKEN}" > "${_cfg}"
|
|
curl -K "${_cfg}" "$@"
|
|
local _rc=$?
|
|
rm -f "${_cfg}"
|
|
return "${_rc}"
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _hetzner_validate_env
|
|
#
|
|
# Verify HCLOUD_TOKEN is set and credentials are valid.
|
|
# Returns 0 on success, 1 on failure.
|
|
# ---------------------------------------------------------------------------
|
|
_hetzner_validate_env() {
|
|
if [ -z "${HCLOUD_TOKEN:-}" ]; then
|
|
log_err "HCLOUD_TOKEN is not set"
|
|
return 1
|
|
fi
|
|
|
|
if ! _hetzner_curl_auth -sf \
|
|
"${_HETZNER_API}/servers?per_page=1" >/dev/null 2>&1; then
|
|
log_err "Hetzner API credentials are invalid"
|
|
return 1
|
|
fi
|
|
|
|
log_ok "Hetzner credentials validated"
|
|
return 0
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _hetzner_headless_env APP AGENT
|
|
#
|
|
# Print export lines for headless provisioning to stdout.
|
|
# ---------------------------------------------------------------------------
|
|
_hetzner_headless_env() {
|
|
local app="$1"
|
|
# $2 = agent (unused but part of the interface)
|
|
|
|
printf 'export HETZNER_SERVER_NAME="%s"\n' "${app}"
|
|
printf 'export HETZNER_SERVER_TYPE="%s"\n' "${HETZNER_SERVER_TYPE:-cx23}"
|
|
printf 'export HETZNER_LOCATION="%s"\n' "${HETZNER_LOCATION:-fsn1}"
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _hetzner_provision_verify APP LOG_DIR
|
|
#
|
|
# Verify the server exists via Hetzner API. Extract ID and IP.
|
|
# Write IP to $LOG_DIR/$APP.ip and metadata to $LOG_DIR/$APP.meta.
|
|
# ---------------------------------------------------------------------------
|
|
_hetzner_provision_verify() {
|
|
local app="$1"
|
|
local log_dir="$2"
|
|
|
|
# URL-encode the app name to prevent query parameter injection
|
|
local encoded_app
|
|
encoded_app=$(jq -rn --arg v "${app}" '$v|@uri')
|
|
|
|
local response
|
|
response=$(_hetzner_curl_auth -sf \
|
|
"${_HETZNER_API}/servers?name=${encoded_app}" 2>/dev/null || true)
|
|
|
|
if [ -z "${response}" ]; then
|
|
log_err "Failed to query Hetzner API for server ${app}"
|
|
return 1
|
|
fi
|
|
|
|
local server_count
|
|
server_count=$(printf '%s' "${response}" | jq '.servers | length' 2>/dev/null || printf '0')
|
|
|
|
if [ "${server_count}" -eq 0 ]; then
|
|
log_err "Server ${app} does not exist on Hetzner"
|
|
return 1
|
|
fi
|
|
|
|
local server_id
|
|
server_id=$(printf '%s' "${response}" | jq -r '.servers[0].id' 2>/dev/null)
|
|
|
|
local server_ip
|
|
server_ip=$(printf '%s' "${response}" | jq -r '.servers[0].public_net.ipv4.ip // empty' 2>/dev/null)
|
|
|
|
if [ -z "${server_ip}" ]; then
|
|
log_err "Could not resolve public IP for ${app}"
|
|
return 1
|
|
fi
|
|
|
|
local server_name
|
|
server_name=$(printf '%s' "${response}" | jq -r '.servers[0].name' 2>/dev/null)
|
|
|
|
local server_location
|
|
server_location=$(printf '%s' "${response}" | jq -r '.servers[0].datacenter.location.name // "unknown"' 2>/dev/null)
|
|
|
|
# Write IP for SSH access
|
|
printf '%s' "${server_ip}" > "${log_dir}/${app}.ip"
|
|
|
|
# Write metadata for teardown
|
|
printf '{"id":%s,"name":"%s","location":"%s"}\n' \
|
|
"${server_id}" "${server_name}" "${server_location}" \
|
|
> "${log_dir}/${app}.meta"
|
|
|
|
log_ok "Server ${app} verified (id=${server_id}, ip=${server_ip}, location=${server_location})"
|
|
return 0
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _hetzner_exec APP CMD
|
|
#
|
|
# Execute a command on the server via SSH.
|
|
# ---------------------------------------------------------------------------
|
|
_hetzner_exec() {
|
|
local app="$1"
|
|
local cmd="$2"
|
|
local log_dir="${LOG_DIR:-/tmp}"
|
|
|
|
local ip_file="${log_dir}/${app}.ip"
|
|
if [ ! -f "${ip_file}" ]; then
|
|
log_err "No IP file found for ${app} at ${ip_file}"
|
|
return 1
|
|
fi
|
|
|
|
local ip
|
|
ip=$(cat "${ip_file}")
|
|
|
|
if [ -z "${ip}" ]; then
|
|
log_err "Empty IP in ${ip_file}"
|
|
return 1
|
|
fi
|
|
|
|
# Validate IP looks like an IPv4 address (defense-in-depth against file tampering)
|
|
if ! printf '%s' "${ip}" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$'; then
|
|
log_err "Invalid IP address in ${ip_file}: ${ip}"
|
|
return 1
|
|
fi
|
|
|
|
# Base64-encode the command and pipe the payload via stdin to SSH.
|
|
# This eliminates variable expansion of the encoded command in the SSH
|
|
# command string, preventing injection even if base64 validation is bypassed.
|
|
local encoded_cmd
|
|
encoded_cmd=$(printf '%s' "${cmd}" | base64 | tr -d '\n')
|
|
|
|
# Validate base64 output contains only safe characters (defense-in-depth).
|
|
# Standard base64 only produces [A-Za-z0-9+/=]. This rejects any corruption.
|
|
if ! printf '%s' "${encoded_cmd}" | grep -qE '^[A-Za-z0-9+/=]+$'; then
|
|
log_err "Invalid base64 encoding of command for SSH exec"
|
|
return 1
|
|
fi
|
|
|
|
# Pipe the base64 payload via stdin to the remote host. The remote bash
|
|
# reads stdin, base64-decodes it, and executes the result. No user-controlled
|
|
# data is interpolated into the SSH command string.
|
|
printf '%s' "${encoded_cmd}" | ssh -o StrictHostKeyChecking=accept-new \
|
|
-o UserKnownHostsFile=/dev/null \
|
|
-o LogLevel=ERROR \
|
|
-o BatchMode=yes \
|
|
-o ConnectTimeout=10 \
|
|
"root@${ip}" "base64 -d | bash"
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _hetzner_teardown APP
|
|
#
|
|
# Delete the server via Hetzner API using the stored server ID.
|
|
# ---------------------------------------------------------------------------
|
|
_hetzner_teardown() {
|
|
local app="$1"
|
|
local log_dir="${LOG_DIR:-/tmp}"
|
|
|
|
local meta_file="${log_dir}/${app}.meta"
|
|
if [ ! -f "${meta_file}" ]; then
|
|
log_warn "No metadata file for ${app} — cannot determine server ID"
|
|
untrack_app "${app}"
|
|
return 0
|
|
fi
|
|
|
|
local server_id
|
|
server_id=$(jq -r '.id' "${meta_file}" 2>/dev/null)
|
|
|
|
if [ -z "${server_id}" ] || [ "${server_id}" = "null" ]; then
|
|
log_warn "Could not parse server ID from ${meta_file}"
|
|
untrack_app "${app}"
|
|
return 0
|
|
fi
|
|
|
|
# Validate server ID is numeric (defense-in-depth against metadata tampering)
|
|
case "${server_id}" in ''|*[!0-9]*) log_warn "Non-numeric server ID: ${server_id}"; untrack_app "${app}"; return 0 ;; esac
|
|
|
|
log_step "Deleting Hetzner server ${app} (id=${server_id})"
|
|
|
|
local http_code
|
|
http_code=$(_hetzner_curl_auth -s -o /dev/null -w '%{http_code}' \
|
|
-X DELETE \
|
|
"${_HETZNER_API}/servers/${server_id}" 2>/dev/null || printf '000')
|
|
|
|
if [ "${http_code}" = "200" ] || [ "${http_code}" = "204" ]; then
|
|
log_ok "Server ${app} (id=${server_id}) deleted"
|
|
elif [ "${http_code}" = "404" ]; then
|
|
log_info "Server ${app} (id=${server_id}) already gone"
|
|
else
|
|
log_warn "Unexpected HTTP ${http_code} deleting server ${app} (id=${server_id})"
|
|
fi
|
|
|
|
untrack_app "${app}"
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _hetzner_cleanup_orphaned_ips
|
|
#
|
|
# Delete Hetzner Primary IPs not attached to any server. These accumulate
|
|
# from failed/interrupted provisioning runs and consume the account's
|
|
# primary_ip_limit quota, causing resource_limit_exceeded errors (#2933).
|
|
# ---------------------------------------------------------------------------
|
|
_hetzner_cleanup_orphaned_ips() {
|
|
local response
|
|
response=$(_hetzner_curl_auth -sf \
|
|
"${_HETZNER_API}/primary_ips?per_page=50" 2>/dev/null || true)
|
|
|
|
if [ -z "${response}" ]; then
|
|
log_info "Could not list Hetzner primary IPs — skipping IP cleanup"
|
|
return 0
|
|
fi
|
|
|
|
local orphaned
|
|
orphaned=$(printf '%s' "${response}" | jq -r '.primary_ips[] | select(.assignee_id == null or .assignee_id == 0) | "\(.id):\(.ip)"' 2>/dev/null || true)
|
|
|
|
if [ -z "${orphaned}" ]; then
|
|
log_ok "No orphaned Hetzner Primary IPs found"
|
|
return 0
|
|
fi
|
|
|
|
local cleaned=0
|
|
for entry in ${orphaned}; do
|
|
local ip_id
|
|
ip_id=$(printf '%s' "${entry}" | cut -d: -f1)
|
|
|
|
local ip_addr
|
|
ip_addr=$(printf '%s' "${entry}" | cut -d: -f2-)
|
|
|
|
# Validate IP ID is numeric before using it in API URL
|
|
case "${ip_id}" in ''|*[!0-9]*) log_warn "Skipping orphaned IP ${entry} — non-numeric ID"; continue ;; esac
|
|
|
|
local http_code
|
|
http_code=$(_hetzner_curl_auth -s -o /dev/null -w '%{http_code}' \
|
|
-X DELETE \
|
|
"${_HETZNER_API}/primary_ips/${ip_id}" 2>/dev/null || printf '000')
|
|
|
|
if [ "${http_code}" = "200" ] || [ "${http_code}" = "204" ]; then
|
|
log_ok "Deleted orphaned Primary IP ${ip_addr} (id=${ip_id})"
|
|
cleaned=$((cleaned + 1))
|
|
elif [ "${http_code}" = "404" ]; then
|
|
log_info "Primary IP ${ip_addr} (id=${ip_id}) already gone"
|
|
else
|
|
log_warn "Failed to delete Primary IP ${ip_addr} (id=${ip_id}, HTTP ${http_code})"
|
|
fi
|
|
done
|
|
|
|
if [ "${cleaned}" -gt 0 ]; then
|
|
log_ok "Cleaned ${cleaned} orphaned Hetzner Primary IP(s)"
|
|
fi
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _hetzner_cleanup_stale
|
|
#
|
|
# List all Hetzner servers, find e2e-* instances older than 30 minutes,
|
|
# and destroy them. Also cleans up orphaned Primary IPs to prevent
|
|
# resource_limit_exceeded errors (#2933).
|
|
# ---------------------------------------------------------------------------
|
|
_hetzner_cleanup_stale() {
|
|
local now
|
|
now=$(date +%s)
|
|
local max_age="${_CLEANUP_MAX_AGE:-1800}" # default 30 min; pre-run uses shorter
|
|
|
|
local response
|
|
response=$(_hetzner_curl_auth -sf \
|
|
"${_HETZNER_API}/servers?per_page=50" 2>/dev/null || true)
|
|
|
|
if [ -z "${response}" ]; then
|
|
log_info "Could not list Hetzner servers — skipping cleanup"
|
|
return 0
|
|
fi
|
|
|
|
local server_count
|
|
server_count=$(printf '%s' "${response}" | jq '.servers | length' 2>/dev/null || printf '0')
|
|
|
|
if [ "${server_count}" -eq 0 ]; then
|
|
log_ok "No Hetzner servers found"
|
|
return 0
|
|
fi
|
|
|
|
# Extract e2e-* servers as "id:name" pairs
|
|
local servers
|
|
servers=$(printf '%s' "${response}" | jq -r '.servers[] | select(.name | startswith("e2e-")) | "\(.id):\(.name)"' 2>/dev/null || true)
|
|
|
|
if [ -z "${servers}" ]; then
|
|
log_ok "No stale e2e instances found on Hetzner"
|
|
return 0
|
|
fi
|
|
|
|
local cleaned=0
|
|
local skipped=0
|
|
|
|
for entry in ${servers}; do
|
|
local server_id
|
|
server_id=$(printf '%s' "${entry}" | cut -d: -f1)
|
|
|
|
local server_name
|
|
server_name=$(printf '%s' "${entry}" | cut -d: -f2-)
|
|
|
|
# Validate server ID is numeric before using it in API URL
|
|
case "${server_id}" in ''|*[!0-9]*) log_warn "Skipping ${entry} — non-numeric server ID"; skipped=$((skipped + 1)); continue ;; esac
|
|
|
|
# Extract timestamp from name: e2e-AGENT-TIMESTAMP
|
|
local ts
|
|
ts=$(printf '%s' "${server_name}" | sed 's/.*-//')
|
|
|
|
# Validate it looks like a unix timestamp (all digits, 10 chars)
|
|
if ! printf '%s' "${ts}" | grep -qE '^[0-9]{10}$'; then
|
|
log_warn "Skipping ${server_name} — cannot parse timestamp"
|
|
skipped=$((skipped + 1))
|
|
continue
|
|
fi
|
|
|
|
local age=$((now - ts))
|
|
if [ "${age}" -gt "${max_age}" ]; then
|
|
local age_str
|
|
age_str=$(format_duration "${age}")
|
|
log_step "Destroying stale Hetzner server ${server_name} (id=${server_id}, age: ${age_str})"
|
|
|
|
local http_code
|
|
http_code=$(_hetzner_curl_auth -s -o /dev/null -w '%{http_code}' \
|
|
-X DELETE \
|
|
"${_HETZNER_API}/servers/${server_id}" 2>/dev/null || printf '000')
|
|
|
|
if [ "${http_code}" = "200" ] || [ "${http_code}" = "204" ]; then
|
|
log_ok "Deleted ${server_name}"
|
|
elif [ "${http_code}" = "404" ]; then
|
|
log_info "Server ${server_name} already gone"
|
|
else
|
|
log_warn "Failed to delete ${server_name} (HTTP ${http_code})"
|
|
fi
|
|
|
|
cleaned=$((cleaned + 1))
|
|
else
|
|
skipped=$((skipped + 1))
|
|
fi
|
|
done
|
|
|
|
if [ "${cleaned}" -gt 0 ]; then
|
|
log_ok "Cleaned ${cleaned} stale Hetzner instance(s)"
|
|
fi
|
|
if [ "${skipped}" -gt 0 ]; then
|
|
log_info "Skipped ${skipped} recent Hetzner instance(s)"
|
|
fi
|
|
|
|
# Also clean up orphaned Primary IPs to free quota for new provisioning (#2933)
|
|
_hetzner_cleanup_orphaned_ips
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _hetzner_max_parallel
|
|
#
|
|
# Hetzner accounts have a primary IP limit. Reduced from 3 to 2 to avoid
|
|
# server_limit_reached when pre-existing servers consume quota (#3111).
|
|
# ---------------------------------------------------------------------------
|
|
_hetzner_max_parallel() {
|
|
printf '2'
|
|
}
|