Reformatted patch: one diff line per row (the source had its newlines collapsed).
Leading indentation and "=" alignment of context/removed lines are reconstructed
from the YAML/HCL structure and checked against the hunk line counts; verify
them against the target tree before applying.

Changes relative to the original patch (sh/shared/sprite-keep-running.sh only):
  * the usage text names the required <command> argument
  * sprite_url extraction tolerates whitespace around the JSON colon
  * the keep-alive curl is bounded with --max-time so a hung request cannot
    stall the ping loop
The blob id on that file's "index" line predates these three edits.

diff --git a/.github/workflows/packer-snapshots.yml b/.github/workflows/packer-snapshots.yml
index 7a6ba6c2..e5ca6788 100644
--- a/.github/workflows/packer-snapshots.yml
+++ b/.github/workflows/packer-snapshots.yml
@@ -10,14 +10,6 @@ on:
         description: "Single agent to build (leave empty for all)"
         required: false
         type: string
-      cloud:
-        description: "Cloud to build for (leave empty for all)"
-        required: false
-        type: choice
-        options:
-          - ""
-          - digitalocean
-          - hetzner
 
 permissions:
   contents: read
@@ -33,30 +25,22 @@ jobs:
       - id: set
         run: |
           SINGLE_AGENT="${SINGLE_AGENT_INPUT}"
-          SINGLE_CLOUD="${SINGLE_CLOUD_INPUT}"
 
           if [ -n "$SINGLE_AGENT" ]; then
-            AGENTS="[\"${SINGLE_AGENT}\"]"
+            AGENTS=$(jq -nc --arg agent "$SINGLE_AGENT" '[$agent]')
           else
             AGENTS=$(jq -c 'keys' packer/agents.json)
           fi
 
-          if [ -n "$SINGLE_CLOUD" ]; then
-            CLOUDS="[\"${SINGLE_CLOUD}\"]"
-          else
-            CLOUDS='["digitalocean","hetzner"]'
-          fi
-
           # Build a flat include array: [{agent, cloud}, ...]
-          INCLUDE=$(jq -nc --argjson agents "$AGENTS" --argjson clouds "$CLOUDS" \
-            '[$agents[] as $a | $clouds[] as $c | {agent: $a, cloud: $c}]')
+          INCLUDE=$(jq -nc --argjson agents "$AGENTS" \
+            '[$agents[] as $a | {agent: $a, cloud: "digitalocean"}]')
           echo "include=${INCLUDE}" >> "$GITHUB_OUTPUT"
         env:
           SINGLE_AGENT_INPUT: ${{ inputs.agent }}
-          SINGLE_CLOUD_INPUT: ${{ inputs.cloud }}
 
   build:
-    name: "${{ matrix.cloud }}/${{ matrix.agent }}"
+    name: "digitalocean/${{ matrix.agent }}"
     needs: matrix
     runs-on: ubuntu-latest
     strategy:
@@ -82,10 +66,9 @@ jobs:
           version: latest
 
       - name: Init Packer plugins
-        run: packer init packer/${{ matrix.cloud }}.pkr.hcl
+        run: packer init packer/digitalocean.pkr.hcl
 
-      - name: Generate variables file (DigitalOcean)
-        if: matrix.cloud == 'digitalocean'
+      - name: Generate variables file
         run: |
           jq -n \
             --arg token "$DO_API_TOKEN" \
@@ -104,31 +87,34 @@ jobs:
           TIER: ${{ steps.config.outputs.tier }}
           INSTALL_COMMANDS: ${{ steps.config.outputs.install }}
 
-      - name: Generate variables file (Hetzner)
-        if: matrix.cloud == 'hetzner'
-        run: |
-          jq -n \
-            --arg token "$HCLOUD_TOKEN" \
-            --arg agent "$AGENT_NAME" \
-            --arg tier "$TIER" \
-            --argjson install "$INSTALL_COMMANDS" \
-            '{
-              hcloud_token: $token,
-              agent_name: $agent,
-              cloud_init_tier: $tier,
-              install_commands: $install
-            }' > packer/auto.pkrvars.json
-        env:
-          HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
-          AGENT_NAME: ${{ matrix.agent }}
-          TIER: ${{ steps.config.outputs.tier }}
-          INSTALL_COMMANDS: ${{ steps.config.outputs.install }}
-
       - name: Build snapshot
-        run: packer build -var-file=packer/auto.pkrvars.json packer/${{ matrix.cloud }}.pkr.hcl
+        run: packer build -var-file=packer/auto.pkrvars.json packer/digitalocean.pkr.hcl
 
-      - name: Cleanup old DO snapshots
-        if: success() && matrix.cloud == 'digitalocean'
+      # When a workflow is cancelled, Packer is killed before it can destroy
+      # the temporary builder droplet — leaving orphaned instances.
+      - name: Destroy orphaned builder droplets
+        if: cancelled()
+        run: |
+          # Filter by spawn-packer tag to avoid destroying builder droplets from other workflows
+          DROPLET_IDS=$(curl -s -H "Authorization: Bearer ${DO_API_TOKEN}" \
+            "https://api.digitalocean.com/v2/droplets?per_page=200&tag_name=spawn-packer" \
+            | jq -r '.droplets[].id')
+
+          if [ -z "$DROPLET_IDS" ]; then
+            echo "No orphaned packer builder droplets found"
+            exit 0
+          fi
+
+          for ID in $DROPLET_IDS; do
+            echo "Destroying orphaned builder droplet: ${ID}"
+            curl -s -X DELETE -H "Authorization: Bearer ${DO_API_TOKEN}" \
+              "https://api.digitalocean.com/v2/droplets/${ID}" || true
+          done
+        env:
+          DO_API_TOKEN: ${{ secrets.DO_API_TOKEN }}
+
+      - name: Cleanup old snapshots
+        if: success()
         run: |
           PREFIX="spawn-${AGENT_NAME}-"
           SNAPSHOTS=$(curl -s -H "Authorization: Bearer ${DO_API_TOKEN}" \
@@ -145,27 +131,8 @@ jobs:
           DO_API_TOKEN: ${{ secrets.DO_API_TOKEN }}
           AGENT_NAME: ${{ matrix.agent }}
 
-      - name: Cleanup old Hetzner snapshots
-        if: success() && matrix.cloud == 'hetzner'
-        run: |
-          PREFIX="spawn-${AGENT_NAME}-"
-          # Hetzner Packer sets snapshot_name → description field in the API
-          SNAPSHOTS=$(curl -s -H "Authorization: Bearer ${HCLOUD_TOKEN}" \
-            "https://api.hetzner.cloud/v1/images?type=snapshot&per_page=100" \
-            | jq -r --arg prefix "$PREFIX" \
-              '[.images[] | select(.description | startswith($prefix))] | sort_by(.created) | reverse | .[1:] | .[].id')
-
-          for ID in $SNAPSHOTS; do
-            echo "Deleting old snapshot: ${ID}"
-            curl -s -X DELETE -H "Authorization: Bearer ${HCLOUD_TOKEN}" \
-              "https://api.hetzner.cloud/v1/images/${ID}" || true
-          done
-        env:
-          HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
-          AGENT_NAME: ${{ matrix.agent }}
-
       - name: Submit to DO Marketplace
-        if: success() && matrix.cloud == 'digitalocean'
+        if: success()
         run: |
           # Skip if no marketplace app IDs configured
           if [ -z "$MARKETPLACE_APP_IDS" ]; then
diff --git a/packer/digitalocean.pkr.hcl b/packer/digitalocean.pkr.hcl
index 3eff5b35..2329be01 100644
--- a/packer/digitalocean.pkr.hcl
+++ b/packer/digitalocean.pkr.hcl
@@ -40,6 +40,9 @@ source "digitalocean" "spawn" {
   size         = "s-2vcpu-2gb"
   ssh_username = "root"
 
+  # Tag the temporary builder droplet so cancel-cleanup can target only our builds
+  tags = ["spawn-packer"]
+
   snapshot_name = local.image_name
   snapshot_regions = [
     "nyc1", "nyc3", "sfo3", "tor1", "ams3",
diff --git a/packer/hetzner.pkr.hcl b/packer/hetzner.pkr.hcl
deleted file mode 100644
index 22543dda..00000000
--- a/packer/hetzner.pkr.hcl
+++ /dev/null
@@ -1,165 +0,0 @@
-packer {
-  required_plugins {
-    hcloud = {
-      version = ">= 1.6.0"
-      source  = "github.com/hetznercloud/hcloud"
-    }
-  }
-}
-
-variable "hcloud_token" {
-  type      = string
-  sensitive = true
-}
-
-variable "agent_name" {
-  type = string
-}
-
-variable "cloud_init_tier" {
-  type    = string
-  default = "minimal"
-}
-
-variable "install_commands" {
-  type    = list(string)
-  default = []
-}
-
-locals {
-  timestamp  = formatdate("YYYYMMDD-hhmm", timestamp())
-  image_name = "spawn-${var.agent_name}-${local.timestamp}"
-}
-
-source "hcloud" "spawn" {
-  token    = var.hcloud_token
-  image    = "ubuntu-24.04"
-  location = "nbg1"
-  # cpx22 (AMD) available in nbg1/hel1/sin — cx23 only available in hel1.
-  # 4 GB RAM needed — Claude's installer and zeroclaw's Rust build get OOM-killed on smaller instances.
-  server_type  = "cpx22"
-  ssh_username = "root"
-
-  snapshot_name = local.image_name
-  snapshot_labels = {
-    managed-by = "packer"
-    project    = "spawn"
-    agent      = var.agent_name
-  }
-}
-
-build {
-  sources = ["source.hcloud.spawn"]
-
-  # Wait for cloud-init to finish (Hetzner base images run it on first boot)
-  provisioner "shell" {
-    inline = [
-      "cloud-init status --wait || true",
-    ]
-  }
-
-  # Wait for any apt locks to be released (cloud-init may hold them)
-  provisioner "shell" {
-    inline = [
-      "for i in $(seq 1 30); do fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1 || break; echo 'Waiting for apt lock...'; sleep 2; done",
-    ]
-  }
-
-  # Run the tier script (installs base packages: curl, git, node, bun, etc.)
-  provisioner "shell" {
-    script = "packer/scripts/tier-${var.cloud_init_tier}.sh"
-  }
-
-  # Install the agent
-  provisioner "shell" {
-    inline = var.install_commands
-    environment_vars = [
-      "HOME=/root",
-      "DEBIAN_FRONTEND=noninteractive",
-      "PATH=/root/.local/bin:/root/.bun/bin:/root/.npm-global/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
-    ]
-  }
-
-  # Leave a marker so the CLI knows this is a pre-baked snapshot
-  provisioner "shell" {
-    inline = [
-      "echo 'spawn-${var.agent_name}' > /root/.spawn-snapshot",
-      "date -u '+%Y-%m-%dT%H:%M:%SZ' >> /root/.spawn-snapshot",
-      "touch /root/.cloud-init-complete",
-    ]
-    environment_vars = [
-      "HOME=/root",
-      "PATH=/root/.local/bin:/root/.bun/bin:/root/.npm-global/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
-    ]
-  }
-
-  # Install security updates and clean up
-  provisioner "shell" {
-    inline = [
-      "apt-get update -y",
-      "apt-get -o Dpkg::Options::='--force-confold' dist-upgrade -y",
-      "apt-get -y autoremove",
-      "apt-get -y autoclean",
-    ]
-    environment_vars = [
-      "DEBIAN_FRONTEND=noninteractive",
-    ]
-  }
-
-  # Cleanup — clear secrets, keys, history, logs so each server gets a fresh identity.
-  # cloud-init re-runs on first boot to re-inject SSH keys.
-  provisioner "shell" {
-    inline = [
-      # Ensure /tmp exists with correct permissions
-      "mkdir -p /tmp",
-      "chmod 1777 /tmp",
-
-      # Remove SSH authorized keys (cloud-init re-injects on first boot)
-      "rm -f /root/.ssh/authorized_keys",
-      "find /home -name authorized_keys -delete",
-
-      # Remove SSH host keys (regenerated on first boot)
-      "rm -f /etc/ssh/ssh_host_*",
-      "touch /etc/ssh/revoked_keys",
-      "chmod 600 /etc/ssh/revoked_keys",
-
-      # Clear bash history
-      "rm -f /root/.bash_history",
-      "find /home -name .bash_history -delete",
-
-      # Truncate recent log files and remove archived logs
-      "find /var/log -mtime -1 -type f -exec truncate -s 0 {} \\;",
-      "rm -rf /var/log/*.gz /var/log/*.[0-9] /var/log/*-????????",
-
-      # Clear apt cache
-      "apt-get clean",
-      "rm -rf /var/lib/apt/lists/*",
-
-      # Clear tmp
-      "rm -rf /tmp/* /var/tmp/*",
-
-      # Remove cloud-init instance data so it re-runs on first boot
-      "rm -rf /var/lib/cloud/instances/*",
-
-      # Remove machine-id so each server gets a unique one
-      "truncate -s 0 /etc/machine-id",
-      "rm -f /var/lib/dbus/machine-id",
-      "ln -sf /etc/machine-id /var/lib/dbus/machine-id",
-
-      # Reset cloud-init so it runs again on first boot
-      "cloud-init clean --logs",
-
-      # Zero-fill free disk space to reduce snapshot size
-      "dd if=/dev/zero of=/zerofile bs=4096 || true",
-      "rm -f /zerofile",
-
-      "sync",
-    ]
-  }
-
-  # Write Packer manifest for CI
-  post-processor "manifest" {
-    output     = "packer/manifest.json"
-    strip_path = true
-  }
-}
diff --git a/sh/shared/sprite-keep-running.sh b/sh/shared/sprite-keep-running.sh
new file mode 100755
index 00000000..3996a1cb
--- /dev/null
+++ b/sh/shared/sprite-keep-running.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+set -eo pipefail
+
+# sprite-keep-running — Wraps a command and keeps the sprite alive by pinging
+# its own public URL every 30 seconds. Prevents inactivity shutdown while an
+# agent session is running.
+#
+# Usage: sprite-keep-running <command> [args...]
+#
+# The keep-alive loop runs in the background and is killed when the wrapped
+# command exits. Exit code is preserved from the wrapped command.
+
+if [ $# -eq 0 ]; then
+  echo "Usage: sprite-keep-running <command> [args...]" >&2
+  exit 1
+fi
+
+# Resolve sprite's own public URL via sprite-env (available on all sprites)
+SPRITE_URL=""
+if command -v sprite-env >/dev/null 2>&1; then
+  SPRITE_URL=$(sprite-env info 2>/dev/null | grep -o '"sprite_url"[[:space:]]*:[[:space:]]*"[^"]*"' | cut -d'"' -f4) || true
+fi
+
+if [ -z "${SPRITE_URL}" ]; then
+  # Can't determine URL — just run the command without keep-alive
+  exec "$@"
+fi
+
+# Start background keep-alive loop
+(
+  while true; do
+    curl -sf --max-time 10 "${SPRITE_URL}" >/dev/null 2>&1 || true
+    sleep 30
+  done
+) &
+KEEPALIVE_PID=$!
+
+# Ensure keep-alive is killed on exit
+cleanup() {
+  kill "${KEEPALIVE_PID}" 2>/dev/null || true
+  wait "${KEEPALIVE_PID}" 2>/dev/null || true
+}
+trap cleanup EXIT INT TERM
+
+# Run the wrapped command
+"$@"