feat: add retry logic for failure-prone orchestration operations (#1764)

Agent installation, config upload, env setup, and agent configuration
can all fail transiently due to network flakiness or SSH instability
on fresh VMs. Add a shared withRetry() helper and wrap these operations
with 2-attempt retries to improve reliability without over-engineering.

Co-authored-by: lab <6723574+louisgv@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
A 2026-02-22 15:20:16 -08:00 committed by GitHub
parent 63bce1bd04
commit 97992dc6a2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 49 additions and 12 deletions

View file

@ -4,7 +4,7 @@
import { writeFileSync, unlinkSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { logInfo, logWarn, logError, logStep, prompt, jsonEscape } from "./ui";
import { logInfo, logWarn, logError, logStep, prompt, jsonEscape, withRetry } from "./ui";
import type { AgentConfig } from "./agents";
// Re-export so cloud modules can re-export from here
@ -28,7 +28,12 @@ export async function installAgent(
): Promise<void> {
logStep(`Installing ${agentName}...`);
try {
await runner.runServer(installCmd, timeoutSecs);
await withRetry(
`${agentName} install`,
() => runner.runServer(installCmd, timeoutSecs),
2,
10,
);
} catch {
logError(`${agentName} installation failed`);
throw new Error(`${agentName} install failed`);
@ -45,11 +50,18 @@ export async function uploadConfigFile(runner: CloudRunner, content: string, rem
mode: 0o600,
});
const tempRemote = `/tmp/spawn_config_${Date.now()}`;
try {
await runner.uploadFile(tmpFile, tempRemote);
await runner.runServer(
`mkdir -p $(dirname "${remotePath}") && chmod 600 '${tempRemote}' && mv '${tempRemote}' "${remotePath}"`,
await withRetry(
"config upload",
async () => {
const tempRemote = `/tmp/spawn_config_${Date.now()}`;
await runner.uploadFile(tmpFile, tempRemote);
await runner.runServer(
`mkdir -p $(dirname "${remotePath}") && chmod 600 '${tempRemote}' && mv '${tempRemote}' "${remotePath}"`,
);
},
2,
5,
);
} finally {
try {

View file

@ -3,7 +3,7 @@
import type { AgentConfig } from "./agents";
import { generateEnvConfig } from "./agents";
import { logInfo, logStep, logWarn } from "./ui";
import { logInfo, logStep, logWarn, withRetry } from "./ui";
import { getOrPromptApiKey, getModelIdInteractive } from "./oauth";
import type { CloudRunner } from "./agent-setup";
import { offerGithubAuth } from "./agent-setup";
@ -64,10 +64,16 @@ export async function runOrchestration(cloud: CloudOrchestrator, agent: AgentCon
const envContent = generateEnvConfig(agent.envVars(apiKey));
const envB64 = Buffer.from(envContent).toString("base64");
try {
await cloud.runner.runServer(
`printf '%s' '${envB64}' | base64 -d > ~/.spawnrc && chmod 600 ~/.spawnrc; ` +
`grep -q 'source ~/.spawnrc' ~/.bashrc 2>/dev/null || echo '[ -f ~/.spawnrc ] && source ~/.spawnrc' >> ~/.bashrc; ` +
`grep -q 'source ~/.spawnrc' ~/.zshrc 2>/dev/null || echo '[ -f ~/.spawnrc ] && source ~/.spawnrc' >> ~/.zshrc`,
await withRetry(
"env setup",
() =>
cloud.runner.runServer(
`printf '%s' '${envB64}' | base64 -d > ~/.spawnrc && chmod 600 ~/.spawnrc; ` +
`grep -q 'source ~/.spawnrc' ~/.bashrc 2>/dev/null || echo '[ -f ~/.spawnrc ] && source ~/.spawnrc' >> ~/.bashrc; ` +
`grep -q 'source ~/.spawnrc' ~/.zshrc 2>/dev/null || echo '[ -f ~/.spawnrc ] && source ~/.spawnrc' >> ~/.zshrc`,
),
2,
5,
);
} catch {
logWarn("Environment setup had errors");
@ -79,7 +85,7 @@ export async function runOrchestration(cloud: CloudOrchestrator, agent: AgentCon
// 10. Agent-specific configuration
if (agent.configure) {
try {
await agent.configure(apiKey, modelId);
await withRetry("agent config", () => agent.configure!(apiKey, modelId), 2, 5);
} catch {
logWarn("Agent configuration failed (continuing with defaults)");
}

View file

@ -149,6 +149,25 @@ export function openBrowser(url: string): void {
logStep(`Please open: ${url}`);
}
/**
 * Generic async retry helper: runs `fn` and, if it rejects, waits and runs it again.
 *
 * @param label - Short name of the operation, used in the warning logged before each retry.
 * @param fn - Operation to run; it is invoked afresh for every attempt.
 * @param maxAttempts - Total number of attempts, including the first one (default 3).
 *   Fractional values are rounded down; values below 1 (or NaN) are treated as 1, so
 *   `fn` always runs at least once.
 * @param delaySec - Seconds to wait between attempts (default 5). Negative or NaN
 *   values are treated as 0.
 * @returns The value produced by the first attempt that resolves.
 * @throws Whatever the final attempt rejected with, rethrown unchanged.
 */
export async function withRetry<T>(
  label: string,
  fn: () => Promise<T>,
  maxAttempts = 3,
  delaySec = 5,
): Promise<T> {
  // Normalise the inputs up front. Without this, a count such as 0, NaN or 2.5 lets the
  // loop finish without returning or rethrowing, hiding the real failure from the caller.
  const attempts = maxAttempts >= 1 ? Math.floor(maxAttempts) : 1;
  const waitSec = delaySec > 0 ? delaySec : 0;

  // The loop has no exit condition of its own: every iteration either returns the
  // result, rethrows on the last attempt, or sleeps and goes around again.
  for (let attempt = 1; ; attempt++) {
    try {
      return await fn();
    } catch (err) {
      if (attempt >= attempts) throw err;
      logWarn(`${label} failed (attempt ${attempt}/${attempts}), retrying in ${waitSec}s...`);
      await new Promise((r) => setTimeout(r, waitSec * 1000));
    }
  }
}
/** JSON-escape a string (returns the quoted JSON string). */
export function jsonEscape(s: string): string {
return JSON.stringify(s);