spawn/runpod/lib/common.sh
L 2364458f36
Add RunPod GPU cloud provider (#39)
* Add RunPod GPU cloud provider with all 13 agent scripts

- runpod/lib/common.sh: GraphQL API wrapper, pod creation/termination,
  SSH connectivity (direct TCP or proxy via ssh.runpod.io)
- 13 agent scripts: claude, openclaw, nanoclaw, aider, goose, codex,
  interpreter, gemini, amazonq, cline, gptme, opencode, plandex
- runpod/README.md with usage docs and environment variable reference
- manifest.json: RunPod cloud entry + all matrix entries as implemented

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Add UpCloud cloud provider with all 13 agent scripts

- upcloud/lib/common.sh: UpCloud API wrapper with Basic Auth, server
  provisioning, SSH connectivity, base tool installation
- 13 agent scripts: claude, openclaw, nanoclaw, aider, goose, codex,
  interpreter, gemini, amazonq, cline, gptme, opencode, plandex
- upcloud/README.md with usage docs and env var reference
- manifest.json updated with UpCloud cloud entry and 13 matrix entries

UpCloud uses HTTP Basic Auth (username:password) instead of Bearer tokens.
Servers are provisioned via POST /1.3/server with SSH keys injected via
login_user. Ubuntu template UUID is dynamically resolved from the API.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Sprite <noreply@sprite.dev>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-02-07 21:43:10 -08:00

303 lines
12 KiB
Bash

#!/bin/bash
# Common bash functions for RunPod spawn scripts
# Uses RunPod GraphQL API — https://docs.runpod.io/

# Bash safety flags
set -eo pipefail

# ============================================================
# Provider-agnostic functions
# ============================================================

# Source shared provider-agnostic functions (local or remote fallback).
# The `|| SCRIPT_DIR=""` keeps `set -e` from aborting when the directory cannot
# be resolved, so the remote fallback below stays reachable.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" 2>/dev/null && pwd)" || SCRIPT_DIR=""
if [[ -n "${SCRIPT_DIR}" && -f "${SCRIPT_DIR}/../../shared/common.sh" ]]; then
    # shellcheck source=/dev/null
    source "${SCRIPT_DIR}/../../shared/common.sh"
else
    # NOTE(review): this evaluates code downloaded at run time from the main
    # branch; consider pinning to a commit or verifying a checksum.
    # Capture the download first: `eval "$(curl ...)"` would evaluate an empty
    # string and report success when curl fails, leaving every shared helper
    # undefined.
    _spawn_shared_src="$(curl -fsSL https://raw.githubusercontent.com/OpenRouterTeam/spawn/main/shared/common.sh)" || _spawn_shared_src=""
    if [[ -z "${_spawn_shared_src}" ]]; then
        printf 'ERROR: unable to load shared/common.sh (not found locally and download failed)\n' >&2
        # `return` works when this file is sourced; otherwise fall back to `exit`.
        # shellcheck disable=SC2317
        return 1 2>/dev/null || exit 1
    fi
    eval "${_spawn_shared_src}"
    unset _spawn_shared_src
fi

# Note: Provider-agnostic functions (logging, OAuth, browser, nc_listen) are now in shared/common.sh

# ============================================================
# RunPod specific functions
# ============================================================

RUNPOD_GRAPHQL_URL="https://api.runpod.io/graphql"
# SSH_OPTS is defined in shared/common.sh

# Configurable timeout/delay constants (seconds; override via environment)
INSTANCE_STATUS_POLL_DELAY=${INSTANCE_STATUS_POLL_DELAY:-10} # Delay between instance status checks
SSH_RETRY_DELAY=${SSH_RETRY_DELAY:-5} # Delay between SSH connection retry attempts
# RunPod GraphQL API wrapper
# Usage: runpod_api QUERY
#
# Wraps QUERY in a {"query": ...} JSON document and POSTs it to
# RUNPOD_GRAPHQL_URL, authenticating with RUNPOD_API_KEY.
# Outputs: the raw response body on stdout.
# Returns: curl's exit status, or 1 if the request body could not be encoded.
runpod_api() {
    local query="${1}"
    local body

    # Hand the query to Python as an argument instead of splicing it into the
    # Python source: quotes and backslashes in QUERY are then encoded verbatim
    # rather than being re-interpreted (or breaking the program).
    body=$(python3 -c 'import json, sys; print(json.dumps({"query": sys.argv[1]}))' "${query}") || return 1

    # NOTE(review): the API key is sent in the URL query string, which makes it
    # visible in the process list — confirm whether header-based auth can be used.
    curl -s -X POST \
        -H "Content-Type: application/json" \
        "${RUNPOD_GRAPHQL_URL}?api_key=${RUNPOD_API_KEY}" \
        -d "${body}"
}
# Validate RUNPOD_API_KEY by asking the API for the caller's account ID.
# Returns 0 only when the reply is error-free JSON carrying data.myself.id.
# Otherwise logs the reason plus remediation hints and returns 1. An empty or
# non-JSON reply (e.g. a transport failure) is treated as a failed check rather
# than silently accepted.
test_runpod_token() {
    local response error_msg

    # A failed request is folded into an empty response and rejected below.
    response=$(runpod_api "query { myself { id } }") || response=""

    # The Python helper exits 0 for a valid reply; otherwise it prints the
    # reason on stdout (captured into error_msg) and exits non-zero.
    if error_msg=$(printf '%s' "${response}" | python3 -c '
import json, sys

try:
    doc = json.loads(sys.stdin.read())
except ValueError:
    print("Empty or non-JSON response from RunPod API")
    sys.exit(1)
if not isinstance(doc, dict):
    print("Unexpected response from RunPod API")
    sys.exit(1)

errors = doc.get("errors")
if errors:
    first = errors[0] if isinstance(errors, list) else {}
    msg = first.get("message") if isinstance(first, dict) else None
    print(msg or "Unknown error")
    sys.exit(1)

me = (doc.get("data") or {}).get("myself")
if not (isinstance(me, dict) and me.get("id")):
    print("Response did not include account details")
    sys.exit(1)
' 2>/dev/null); then
        return 0
    fi

    log_error "API Error: ${error_msg:-Unable to parse error}"
    log_warn "Remediation steps:"
    log_warn " 1. Verify API key at: https://www.runpod.io/console/user/settings"
    log_warn " 2. Ensure the key has read/write permissions"
    log_warn " 3. Check key hasn't been revoked"
    return 1
}
# Make sure RUNPOD_API_KEY is available by delegating to the shared
# ensure_api_token_with_provider helper (env var -> config file -> prompt+save).
# The helper receives, in order: provider label, env var name, config file
# path, the URL where a key can be obtained, and the validator function name.
# Returns: whatever the shared helper returns.
ensure_runpod_token() {
    local -a _runpod_token_args=(
        "RunPod"
        "RUNPOD_API_KEY"
        "${HOME}/.config/spawn/runpod.json"
        "https://www.runpod.io/console/user/settings"
        "test_runpod_token"
    )
    ensure_api_token_with_provider "${_runpod_token_args[@]}"
}
# RunPod manages SSH keys at the account level via the web console.
# Users must add their SSH public key at https://www.runpod.io/console/user/settings
# The key is automatically injected into all new pods.
#
# Ensures a local ed25519 key pair exists (via generate_ssh_key_if_missing),
# then reminds the user to register it and prints the public key to stderr.
ensure_ssh_key() {
    local private_key="${HOME}/.ssh/id_ed25519"
    local notice

    generate_ssh_key_if_missing "${private_key}"

    for notice in \
        "RunPod requires SSH keys to be added via the web console." \
        "Ensure your public key is added at: https://www.runpod.io/console/user/settings" \
        "" \
        "Your public key:"; do
        log_warn "${notice}"
    done

    # Public key followed by one blank line, both on stderr.
    cat "${private_key}.pub" >&2
    printf '\n' >&2
}
# Resolve the pod name through get_resource_name (given the RUNPOD_SERVER_NAME
# variable name and an interactive prompt), check it with validate_server_name,
# and print it on stdout.
# Returns: 1 if the name cannot be obtained or fails validation.
get_server_name() {
    local pod_name

    if ! pod_name=$(get_resource_name "RUNPOD_SERVER_NAME" "Enter pod name: "); then
        return 1
    fi
    validate_server_name "${pod_name}" || return 1

    echo "${pod_name}"
}
# Parse a `pod` status response (JSON on stdin) into a single stdout line:
#   running|HOST:PORT  runtime is up and maps container port 22/tcp
#   running|proxy      runtime is up but no usable direct SSH mapping exists
#   pending|STATUS     no runtime yet; STATUS is desiredStatus or UNKNOWN
_runpod_parse_pod_state() {
    python3 -c '
import json, sys

try:
    pod = json.loads(sys.stdin.read())["data"]["pod"] or {}
except (ValueError, KeyError, TypeError):
    pod = {}

runtime = pod.get("runtime")
if not runtime:
    print("pending|" + str(pod.get("desiredStatus") or "UNKNOWN"))
    sys.exit(0)

for port in runtime.get("ports") or []:
    is_ssh = port.get("privatePort") == 22 and port.get("type") == "tcp"
    if is_ssh and port.get("ip") and port.get("publicPort"):
        print("running|{}:{}".format(port["ip"], port["publicPort"]))
        sys.exit(0)

print("running|proxy")
' 2>/dev/null
}

# Create an on-demand RunPod pod and wait until it is reachable over SSH.
# Arguments: $1 - pod name
# Environment (all optional): RUNPOD_GPU_TYPE, RUNPOD_GPU_COUNT, RUNPOD_IMAGE,
#   RUNPOD_VOLUME_GB, RUNPOD_CONTAINER_DISK_GB, RUNPOD_CLOUD_TYPE
# Globals:   reads INSTANCE_STATUS_POLL_DELAY; exports RUNPOD_POD_ID,
#            RUNPOD_SSH_HOST, RUNPOD_SSH_PORT, RUNPOD_SSH_USER
# Returns:   0 once the pod reports a runtime; 1 if creation fails, the API
#            returns no pod ID, or the pod is not ready after 60 polls.
create_server() {
    local name="${1}"
    local gpu_type="${RUNPOD_GPU_TYPE:-NVIDIA RTX A4000}"
    local gpu_count="${RUNPOD_GPU_COUNT:-1}"
    local image="${RUNPOD_IMAGE:-runpod/pytorch:2.1.0-py3.10-cuda11.8.0-devel-ubuntu22.04}"
    local volume_gb="${RUNPOD_VOLUME_GB:-50}"
    local container_disk_gb="${RUNPOD_CONTAINER_DISK_GB:-20}"
    local cloud_type="${RUNPOD_CLOUD_TYPE:-ALL}"

    log_warn "Creating RunPod pod '${name}' (GPU: ${gpu_type}, image: ${image})..."

    local query='mutation { podFindAndDeployOnDemand(input: { name: "'"${name}"'", imageName: "'"${image}"'", gpuTypeId: "'"${gpu_type}"'", cloudType: '"${cloud_type}"', gpuCount: '"${gpu_count}"', volumeInGb: '"${volume_gb}"', containerDiskInGb: '"${container_disk_gb}"', ports: "22/tcp", volumeMountPath: "/workspace", dockerArgs: "" }) { id imageName machineId } }'

    local response
    # `|| true`: a transport failure must reach the explicit checks below
    # instead of aborting the caller under `set -e`.
    response=$(runpod_api "${query}") || true

    if [[ "${response}" == *'"errors"'* ]]; then
        log_error "Failed to create RunPod pod"
        local error_msg
        error_msg=$(printf '%s' "${response}" | python3 -c 'import json, sys; d = json.loads(sys.stdin.read()); print(d.get("errors", [{}])[0].get("message", "Unknown error"))' 2>/dev/null) || error_msg="${response}"
        log_error "API Error: ${error_msg}"
        log_warn "Common issues:"
        log_warn " - Insufficient account balance"
        log_warn " - GPU type unavailable (try different RUNPOD_GPU_TYPE)"
        log_warn " - GPU count unavailable"
        log_warn "Remediation: Check https://www.runpod.io/console/pods"
        return 1
    fi

    # Fail fast when the reply carries no pod ID (empty/garbled response or a
    # null payload); otherwise the loop below would poll an empty ID until it
    # times out.
    local pod_id
    pod_id=$(printf '%s' "${response}" | python3 -c '
import json, sys
pod = (json.loads(sys.stdin.read()).get("data") or {}).get("podFindAndDeployOnDemand") or {}
print(pod.get("id") or "")
' 2>/dev/null) || pod_id=""
    if [[ -z "${pod_id}" ]]; then
        log_error "Failed to create RunPod pod"
        log_error "API response did not include a pod ID: ${response:-<empty response>}"
        return 1
    fi

    RUNPOD_POD_ID="${pod_id}"
    export RUNPOD_POD_ID
    log_info "Pod created: ID=${RUNPOD_POD_ID}"

    # Wait for pod to become ready and get SSH connection info
    log_warn "Waiting for pod to become ready..."
    local max_attempts=60
    local attempt=1
    local status_query='query { pod(input: { podId: "'"${RUNPOD_POD_ID}"'" }) { id name desiredStatus runtime { uptimeInSeconds ports { ip isIpPublic privatePort publicPort type } } } }'
    local status_response state phase detail

    while [[ "${attempt}" -le "${max_attempts}" ]]; do
        # A transient API failure counts as "still pending" for this attempt.
        status_response=$(runpod_api "${status_query}") || true
        state=$(printf '%s' "${status_response}" | _runpod_parse_pod_state) || state=""
        state="${state:-pending|UNKNOWN}"
        phase="${state%%|*}"
        detail="${state#*|}"

        if [[ "${phase}" == "running" ]]; then
            if [[ "${detail}" == "proxy" ]]; then
                # Use RunPod SSH proxy
                RUNPOD_SSH_HOST="ssh.runpod.io"
                RUNPOD_SSH_PORT="22"
                RUNPOD_SSH_USER="${RUNPOD_POD_ID}"
                export RUNPOD_SSH_HOST RUNPOD_SSH_PORT RUNPOD_SSH_USER
                log_info "Pod ready (using SSH proxy: ${RUNPOD_SSH_USER}@${RUNPOD_SSH_HOST})"
            else
                # Split on the last colon so the host part may itself contain colons.
                RUNPOD_SSH_HOST="${detail%:*}"
                RUNPOD_SSH_PORT="${detail##*:}"
                RUNPOD_SSH_USER="root"
                export RUNPOD_SSH_HOST RUNPOD_SSH_PORT RUNPOD_SSH_USER
                log_info "Pod ready: SSH at ${RUNPOD_SSH_HOST}:${RUNPOD_SSH_PORT}"
            fi
            return 0
        fi

        log_warn "Pod status: ${detail}/pending (${attempt}/${max_attempts})"
        sleep "${INSTANCE_STATUS_POLL_DELAY}"
        attempt=$((attempt + 1))
    done

    log_error "Pod did not become ready in time"
    return 1
}
# Print the option string used for ssh connections to the pod: the shared
# SSH_OPTS, a 10-second connect timeout, and the pod's (possibly non-standard)
# port from RUNPOD_SSH_PORT. Callers expand the result unquoted so it
# word-splits into separate arguments.
_runpod_ssh_opts() {
    printf '%s -o ConnectTimeout=10 -p %s\n' "${SSH_OPTS}" "${RUNPOD_SSH_PORT}"
}
# Poll the pod over SSH until a trivial remote command succeeds.
# Arguments: $1 - maximum number of attempts (default 30)
# Globals:   reads RUNPOD_SSH_USER, RUNPOD_SSH_HOST, RUNPOD_SSH_PORT,
#            SSH_RETRY_DELAY (sleep between attempts)
# Returns:   0 as soon as SSH answers; 1 after the final failed attempt.
verify_server_connectivity() {
    local max_attempts=${1:-30}
    local destination="${RUNPOD_SSH_USER}@${RUNPOD_SSH_HOST}"
    local try

    log_warn "Waiting for SSH connectivity to ${destination}:${RUNPOD_SSH_PORT}..."

    for ((try = 1; try <= max_attempts; try++)); do
        # Options are intentionally word-split into separate arguments.
        # shellcheck disable=SC2046,SC2086
        if ssh $(_runpod_ssh_opts) "${destination}" "echo ok" >/dev/null 2>&1; then
            log_info "SSH connection established"
            return 0
        fi
        log_warn "Waiting for SSH... (${try}/${max_attempts})"
        sleep "${SSH_RETRY_DELAY}"
    done

    log_error "Pod failed to respond via SSH after ${max_attempts} attempts"
    return 1
}
# Run one provisioning command on the pod over SSH, discarding its output.
# Arguments: $1 - human-readable step name, $2 - remote command line
# Returns:   0 on success; 1 (after logging a warning) when the step fails.
_runpod_try_remote_step() {
    local label="${1}"
    local remote_cmd="${2}"
    # Options are intentionally word-split into separate arguments.
    # shellcheck disable=SC2046,SC2086
    if ssh $(_runpod_ssh_opts) "${RUNPOD_SSH_USER}@${RUNPOD_SSH_HOST}" "${remote_cmd}" >/dev/null 2>&1; then
        return 0
    fi
    log_warn "${label} failed (continuing)"
    return 1
}

# Install base tools (RunPod pods are Docker containers, no cloud-init)
#
# Best-effort: every step is attempted even if an earlier one fails, and the
# function always returns 0. Failed steps are reported with a warning instead
# of being silently swallowed, and the final message reflects whether anything
# went wrong.
install_base_tools() {
    local failed=0
    local rc_file
    local path_cmd=""
    # Line appended to the remote shell rc files; it must reach the pod
    # unexpanded, hence the single quotes.
    # shellcheck disable=SC2016
    local path_export='export PATH="${HOME}/.claude/local/bin:${HOME}/.bun/bin:${PATH}"'

    log_warn "Installing base tools..."
    _runpod_try_remote_step "Base package installation" \
        "apt-get update -y && apt-get install -y curl unzip git zsh npm" || failed=$((failed + 1))

    # Install Bun
    log_warn "Installing Bun..."
    _runpod_try_remote_step "Bun installation" \
        "curl -fsSL https://bun.sh/install | bash" || failed=$((failed + 1))

    # Install Claude Code
    log_warn "Installing Claude Code..."
    _runpod_try_remote_step "Claude Code installation" \
        "curl -fsSL https://claude.ai/install.sh | bash" || failed=$((failed + 1))

    # Configure PATH in .bashrc and .zshrc (append only if not already present)
    for rc_file in .bashrc .zshrc; do
        path_cmd+="${path_cmd:+; }grep -q '.bun/bin' ~/${rc_file} 2>/dev/null || printf '%s\n' '${path_export}' >> ~/${rc_file}"
    done
    _runpod_try_remote_step "PATH configuration" "${path_cmd}" || failed=$((failed + 1))

    if [[ "${failed}" -gt 0 ]]; then
        log_warn "Base tools installed with ${failed} failed step(s); some tools may be missing"
    else
        log_info "Base tools installed"
    fi
    return 0
}
# RunPod uses root user (or pod ID for proxy SSH)
# These functions follow the IP-first arg pattern for compatibility with inject_env_vars_ssh
# The "ip" arg is ignored since RunPod uses RUNPOD_SSH_USER@RUNPOD_SSH_HOST
#
# Run a command on the pod over SSH.
# Arguments: $1 - ignored (kept for the IP-first calling convention)
#            $2 - command line to execute remotely
# Returns:   the exit status of ssh.
run_server() {
    local _ip="${1}"
    local remote_cmd="${2}"
    local endpoint="${RUNPOD_SSH_USER}@${RUNPOD_SSH_HOST}"

    # Options are intentionally word-split into separate arguments.
    # shellcheck disable=SC2046,SC2086
    ssh $(_runpod_ssh_opts) "${endpoint}" "${remote_cmd}"
}
# Copy a local file to the pod with scp.
# Arguments: $1 - ignored (kept for the IP-first calling convention)
#            $2 - local source path
#            $3 - destination path on the pod
# Globals:   reads SSH_OPTS, RUNPOD_SSH_PORT, RUNPOD_SSH_USER, RUNPOD_SSH_HOST
# Returns:   the exit status of scp.
upload_file() {
    local _ip="${1}"
    local local_path="${2}"
    local remote_path="${3}"

    # scp selects the port with uppercase -P. The ssh-style "-p PORT" means
    # "preserve times" to scp, which would then treat the port number as a
    # source file, so the ssh option string cannot be reused here.
    # SSH_OPTS is intentionally word-split into separate arguments.
    # shellcheck disable=SC2086
    scp ${SSH_OPTS} -o ConnectTimeout=10 -P "${RUNPOD_SSH_PORT}" \
        "${local_path}" "${RUNPOD_SSH_USER}@${RUNPOD_SSH_HOST}:${remote_path}"
}
# Open an SSH session on the pod with a TTY allocated (-t) and run a command
# in it.
# Arguments: $1 - ignored (kept for the IP-first calling convention)
#            $2 - command line to execute remotely
# Returns:   the exit status of ssh.
interactive_session() {
    local _ip="${1}"
    local remote_cmd="${2}"
    local endpoint="${RUNPOD_SSH_USER}@${RUNPOD_SSH_HOST}"

    # Options are intentionally word-split into separate arguments.
    # shellcheck disable=SC2046,SC2086
    ssh -t $(_runpod_ssh_opts) "${endpoint}" "${remote_cmd}"
}
# Terminate a pod via the podTerminate mutation.
# Arguments: $1 - pod ID
# Returns:   0 when the API accepts the request; 1 when no pod ID is given,
#            the API cannot be reached, or the API reports an error. Success is
#            only logged after the response has been checked, so a failed
#            termination is never reported as done.
destroy_server() {
    local pod_id="${1}"

    if [[ -z "${pod_id}" ]]; then
        log_error "No pod ID given; nothing to terminate"
        return 1
    fi

    log_warn "Terminating pod ${pod_id}..."

    local query='mutation { podTerminate(input: { podId: "'"${pod_id}"'" }) }'
    local response
    if ! response=$(runpod_api "${query}") || [[ -z "${response}" ]]; then
        log_error "Failed to terminate pod ${pod_id}: no response from RunPod API"
        return 1
    fi

    if [[ "${response}" == *'"errors"'* ]]; then
        local error_msg
        error_msg=$(printf '%s' "${response}" | python3 -c 'import json, sys; d = json.loads(sys.stdin.read()); print(d.get("errors", [{}])[0].get("message", "Unknown error"))' 2>/dev/null) || error_msg="${response}"
        log_error "Failed to terminate pod ${pod_id}"
        log_error "API Error: ${error_msg}"
        return 1
    fi

    log_info "Pod ${pod_id} terminated"
}
# Print a table of the account's pods: name, ID, desired status, and SSH
# endpoint (direct IP:PORT when container port 22/tcp is mapped, otherwise the
# ssh.runpod.io proxy address).
# Outputs: the table, or "No pods found", on stdout; API/parse failures are
#          reported as a one-line message on stderr.
# Returns: 0 on success; non-zero when the API reply is unusable.
list_servers() {
    local query='query { myself { pods { id name desiredStatus runtime { uptimeInSeconds ports { ip isIpPublic privatePort publicPort type } } } } }'
    local response
    # `|| true`: an empty reply is reported by the parser below rather than
    # aborting the caller under `set -e`.
    response=$(runpod_api "${query}") || true

    python3 -c '
import json, sys

try:
    doc = json.loads(sys.stdin.read())
except ValueError:
    sys.exit("Failed to list RunPod pods: empty or non-JSON API response")
if not isinstance(doc, dict):
    sys.exit("Failed to list RunPod pods: unexpected API response")

# "data" and "myself" may be present but null on error responses.
me = (doc.get("data") or {}).get("myself")
errors = doc.get("errors")
if errors and not me:
    first = errors[0] if isinstance(errors, list) else {}
    msg = first.get("message") if isinstance(first, dict) else None
    sys.exit("Failed to list RunPod pods: " + (msg or "Unknown error"))

pods = (me or {}).get("pods") or []
if not pods:
    print("No pods found")
    sys.exit(0)

row = "{:<25} {:<25} {:<12} {:<30}"
print(row.format("NAME", "ID", "STATUS", "SSH"))
print("-" * 92)
for pod in pods:
    # `or "N/A"` also covers fields that are present but null.
    name = pod.get("name") or "N/A"
    pid = pod.get("id") or "N/A"
    status = pod.get("desiredStatus") or "N/A"
    ssh_info = "{}@ssh.runpod.io".format(pid)
    for port in (pod.get("runtime") or {}).get("ports") or []:
        if port.get("privatePort") == 22 and port.get("type") == "tcp":
            ssh_info = "{}:{}".format(port.get("ip"), port.get("publicPort"))
            break
    print(row.format(name, pid, status, ssh_info))
' <<< "${response}"
}