fix: add retry logic to wait_for_cloud_init for error recovery (#1575) (#1588)

Add _fly_run_with_retry helper that wraps run_server with configurable
retry count, sleep interval, and timeout. Apply it to package manager
and installer commands in wait_for_cloud_init so transient failures
(network timeouts, apt lock contention) no longer abort the entire
cloud-init sequence.

Agent: complexity-hunter

Co-authored-by: B <6723574+louisgv@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
A 2026-02-21 07:45:32 -08:00 committed by GitHub
parent cbb6198258
commit aa4174db9e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -483,6 +483,26 @@ create_server() {
save_vm_connection "fly-ssh" "root" "${FLY_MACHINE_ID}" "$name" "fly"
}
# Retry a run_server command up to N times with sleep between attempts.
# Usage: _fly_run_with_retry MAX_ATTEMPTS SLEEP_SEC TIMEOUT CMD
_fly_run_with_retry() {
local max_attempts="${1:-3}"
local sleep_sec="${2:-5}"
local timeout_secs="${3:-120}"
local cmd="${4}"
local attempt=1
while [ "$attempt" -le "$max_attempts" ]; do
if run_server "$cmd" "$timeout_secs"; then
return 0
fi
log_warn "Command failed (attempt $attempt/$max_attempts): $cmd"
attempt=$((attempt + 1))
[ "$attempt" -le "$max_attempts" ] && sleep "$sleep_sec"
done
log_error "Command failed after $max_attempts attempts: $cmd"
return 1
}
# Wait for SSH to be reachable on the Fly.io machine
_fly_wait_for_ssh() {
local max_attempts="${1:-20}"
@ -509,16 +529,16 @@ wait_for_cloud_init() {
_fly_wait_for_ssh || return 1
log_step "Installing packages (this may take 1-2 minutes)..."
run_server "apt-get update -y && apt-get install -y curl unzip git zsh python3 python3-pip build-essential" 600 || {
log_warn "Package install timed out or failed, retrying..."
run_server "apt-get install -y curl unzip git zsh python3 python3-pip build-essential" 300 || true
_fly_run_with_retry 3 10 600 "apt-get update -y && apt-get install -y curl unzip git zsh python3 python3-pip build-essential" || {
log_warn "Full package install failed after retries, trying minimal set..."
_fly_run_with_retry 2 5 300 "apt-get install -y curl git" || true
}
log_step "Installing Node.js..."
run_server "curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && apt-get install -y nodejs" 120 || {
log_warn "Node.js install failed, npm-based agents may not work"
_fly_run_with_retry 3 10 120 "curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && apt-get install -y nodejs" || {
log_warn "Node.js install failed after retries, npm-based agents may not work"
}
log_step "Installing bun..."
run_server "curl -fsSL https://bun.sh/install | bash" 120 || true
_fly_run_with_retry 2 5 120 "curl -fsSL https://bun.sh/install | bash" || true
run_server 'echo "export PATH=\"\$HOME/.local/bin:\$HOME/.bun/bin:\$PATH\"" >> ~/.bashrc' 30 || true
run_server 'echo "export PATH=\"\$HOME/.local/bin:\$HOME/.bun/bin:\$PATH\"" >> ~/.zshrc' 30 || true
log_info "Base tools installed"