From 97992dc6a2e625a26858c6870c04e8ab19fabc23 Mon Sep 17 00:00:00 2001 From: A <258483684+la14-1@users.noreply.github.com> Date: Sun, 22 Feb 2026 15:20:16 -0800 Subject: [PATCH] feat: add retry logic for failure-prone orchestration operations (#1764) Agent installation, config upload, env setup, and agent configuration can all fail transiently due to network flakiness or SSH instability on fresh VMs. Add a shared withRetry() helper and wrap these operations with 2-attempt retries to improve reliability without over-engineering. Co-authored-by: lab <6723574+louisgv@users.noreply.github.com> Co-authored-by: Claude Opus 4.6 (1M context) --- cli/src/shared/agent-setup.ts | 24 ++++++++++++++++++------ cli/src/shared/orchestrate.ts | 18 ++++++++++++------ cli/src/shared/ui.ts | 19 +++++++++++++++++++ 3 files changed, 49 insertions(+), 12 deletions(-) diff --git a/cli/src/shared/agent-setup.ts b/cli/src/shared/agent-setup.ts index b1366cb6..8df68fc4 100644 --- a/cli/src/shared/agent-setup.ts +++ b/cli/src/shared/agent-setup.ts @@ -4,7 +4,7 @@ import { writeFileSync, unlinkSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; -import { logInfo, logWarn, logError, logStep, prompt, jsonEscape } from "./ui"; +import { logInfo, logWarn, logError, logStep, prompt, jsonEscape, withRetry } from "./ui"; import type { AgentConfig } from "./agents"; // Re-export so cloud modules can re-export from here @@ -28,7 +28,12 @@ export async function installAgent( ): Promise { logStep(`Installing ${agentName}...`); try { - await runner.runServer(installCmd, timeoutSecs); + await withRetry( + `${agentName} install`, + () => runner.runServer(installCmd, timeoutSecs), + 2, + 10, + ); } catch { logError(`${agentName} installation failed`); throw new Error(`${agentName} install failed`); @@ -45,11 +50,18 @@ export async function uploadConfigFile(runner: CloudRunner, content: string, rem mode: 0o600, }); - const tempRemote = `/tmp/spawn_config_${Date.now()}`; try { - await runner.uploadFile(tmpFile, tempRemote); - await runner.runServer( - `mkdir -p $(dirname "${remotePath}") && chmod 600 '${tempRemote}' && mv '${tempRemote}' "${remotePath}"`, + await withRetry( + "config upload", + async () => { + const tempRemote = `/tmp/spawn_config_${Date.now()}`; + await runner.uploadFile(tmpFile, tempRemote); + await runner.runServer( + `mkdir -p $(dirname "${remotePath}") && chmod 600 '${tempRemote}' && mv '${tempRemote}' "${remotePath}"`, + ); + }, + 2, + 5, ); } finally { try { diff --git a/cli/src/shared/orchestrate.ts b/cli/src/shared/orchestrate.ts index dc66b8f2..7ef1ade3 100644 --- a/cli/src/shared/orchestrate.ts +++ b/cli/src/shared/orchestrate.ts @@ -3,7 +3,7 @@ import type { AgentConfig } from "./agents"; import { generateEnvConfig } from "./agents"; -import { logInfo, logStep, logWarn } from "./ui"; +import { logInfo, logStep, logWarn, withRetry } from "./ui"; import { getOrPromptApiKey, getModelIdInteractive } from "./oauth"; import type { CloudRunner } from "./agent-setup"; import { offerGithubAuth } from "./agent-setup"; @@ -64,10 +64,16 @@ export async function runOrchestration(cloud: CloudOrchestrator, agent: AgentCon const envContent = generateEnvConfig(agent.envVars(apiKey)); const envB64 = Buffer.from(envContent).toString("base64"); try { - await cloud.runner.runServer( - `printf '%s' '${envB64}' | base64 -d > ~/.spawnrc && chmod 600 ~/.spawnrc; ` + - `grep -q 'source ~/.spawnrc' ~/.bashrc 2>/dev/null || echo '[ -f ~/.spawnrc ] && source ~/.spawnrc' >> ~/.bashrc; ` + - `grep -q 'source ~/.spawnrc' ~/.zshrc 2>/dev/null || echo '[ -f ~/.spawnrc ] && source ~/.spawnrc' >> ~/.zshrc`, + await withRetry( + "env setup", + () => + cloud.runner.runServer( + `printf '%s' '${envB64}' | base64 -d > ~/.spawnrc && chmod 600 ~/.spawnrc; ` + + `grep -q 'source ~/.spawnrc' ~/.bashrc 2>/dev/null || echo '[ -f ~/.spawnrc ] && source ~/.spawnrc' >> ~/.bashrc; ` + + `grep -q 'source ~/.spawnrc' ~/.zshrc 2>/dev/null || echo '[ -f ~/.spawnrc ] && source ~/.spawnrc' >> ~/.zshrc`, + ), + 2, + 5, ); } catch { logWarn("Environment setup had errors"); @@ -79,7 +85,7 @@ export async function runOrchestration(cloud: CloudOrchestrator, agent: AgentCon // 10. Agent-specific configuration if (agent.configure) { try { - await agent.configure(apiKey, modelId); + await withRetry("agent config", () => agent.configure!(apiKey, modelId), 2, 5); } catch { logWarn("Agent configuration failed (continuing with defaults)"); } diff --git a/cli/src/shared/ui.ts b/cli/src/shared/ui.ts index e8fbafba..6c8507dc 100644 --- a/cli/src/shared/ui.ts +++ b/cli/src/shared/ui.ts @@ -149,6 +149,25 @@ export function openBrowser(url: string): void { logStep(`Please open: ${url}`); } +/** Generic async retry helper. Retries `fn` up to `maxAttempts` times with a delay between attempts. */ +export async function withRetry( + label: string, + fn: () => Promise, + maxAttempts = 3, + delaySec = 5, +): Promise { + for (let attempt = 1; attempt <= maxAttempts; attempt++) { + try { + return await fn(); + } catch (err) { + if (attempt >= maxAttempts) throw err; + logWarn(`${label} failed (attempt ${attempt}/${maxAttempts}), retrying in ${delaySec}s...`); + await new Promise((r) => setTimeout(r, delaySec * 1000)); + } + } + throw new Error("unreachable"); +} + /** JSON-escape a string (returns the quoted JSON string). */ export function jsonEscape(s: string): string { return JSON.stringify(s);