# Retry a run_server command up to N times with a sleep between attempts.
#
# Intended for package-manager / installer commands where transient failures
# (network timeouts, apt lock contention) should not abort the whole sequence.
#
# Usage: _fly_run_with_retry MAX_ATTEMPTS SLEEP_SEC TIMEOUT CMD
#
# Arguments:
#   $1 - MAX_ATTEMPTS: positive integer, total number of tries (default: 3)
#   $2 - SLEEP_SEC:    delay handed to `sleep` between tries (default: 5)
#   $3 - TIMEOUT:      forwarded unchanged as the second argument of
#                      run_server (default: 120)
#   $4 - CMD:          command string forwarded as the first argument of
#                      run_server (required, must be non-empty)
#
# Returns:
#   0 as soon as one run_server call succeeds;
#   1 if every attempt fails, or if the arguments are invalid (in which case
#     run_server is never invoked).
_fly_run_with_retry() {
    local max_attempts="${1:-3}"
    local sleep_sec="${2:-5}"
    local timeout_secs="${3:-120}"
    # ":-" keeps a missing 4th argument from tripping `set -u`; the emptiness
    # check below turns it into a clear error instead.
    local cmd="${4:-}"
    local attempt=1

    if [ -z "$cmd" ]; then
        log_error "_fly_run_with_retry: no command given"
        return 1
    fi

    # Reject anything that is not a plain positive integer. Without this guard
    # a value such as "abc" or "0" would skip the loop entirely and report a
    # failure for a command that was never actually run.
    case "$max_attempts" in
        *[!0-9]*)
            log_error "_fly_run_with_retry: invalid attempt count: $max_attempts"
            return 1
            ;;
    esac
    if [ "$max_attempts" -lt 1 ]; then
        log_error "_fly_run_with_retry: invalid attempt count: $max_attempts"
        return 1
    fi

    while [ "$attempt" -le "$max_attempts" ]; do
        if run_server "$cmd" "$timeout_secs"; then
            return 0
        fi
        log_warn "Command failed (attempt $attempt/$max_attempts): $cmd"
        attempt=$((attempt + 1))
        # Only pause when another attempt will follow; no point sleeping after
        # the final failure.
        if [ "$attempt" -le "$max_attempts" ]; then
            sleep "$sleep_sec"
        fi
    done

    log_error "Command failed after $max_attempts attempts: $cmd"
    return 1
}
- run_server "apt-get update -y && apt-get install -y curl unzip git zsh python3 python3-pip build-essential" 600 || { - log_warn "Package install timed out or failed, retrying..." - run_server "apt-get install -y curl unzip git zsh python3 python3-pip build-essential" 300 || true + _fly_run_with_retry 3 10 600 "apt-get update -y && apt-get install -y curl unzip git zsh python3 python3-pip build-essential" || { + log_warn "Full package install failed after retries, trying minimal set..." + _fly_run_with_retry 2 5 300 "apt-get install -y curl git" || true } log_step "Installing Node.js..." - run_server "curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && apt-get install -y nodejs" 120 || { - log_warn "Node.js install failed, npm-based agents may not work" + _fly_run_with_retry 3 10 120 "curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && apt-get install -y nodejs" || { + log_warn "Node.js install failed after retries, npm-based agents may not work" } log_step "Installing bun..." - run_server "curl -fsSL https://bun.sh/install | bash" 120 || true + _fly_run_with_retry 2 5 120 "curl -fsSL https://bun.sh/install | bash" || true run_server 'echo "export PATH=\"\$HOME/.local/bin:\$HOME/.bun/bin:\$PATH\"" >> ~/.bashrc' 30 || true run_server 'echo "export PATH=\"\$HOME/.local/bin:\$HOME/.bun/bin:\$PATH\"" >> ~/.zshrc' 30 || true log_info "Base tools installed"