mirror of
https://github.com/OpenRouterTeam/spawn.git
synced 2026-05-21 18:52:56 +00:00
feat: add retry logic for failure-prone orchestration operations (#1764)
Agent installation, config upload, env setup, and agent configuration can all fail transiently due to network flakiness or SSH instability on fresh VMs. Add a shared withRetry() helper and wrap these operations with 2-attempt retries to improve reliability without over-engineering.

Co-authored-by: lab <6723574+louisgv@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
63bce1bd04
commit
97992dc6a2
3 changed files with 49 additions and 12 deletions
|
|
@ -4,7 +4,7 @@
|
|||
import { writeFileSync, unlinkSync } from "node:fs";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { logInfo, logWarn, logError, logStep, prompt, jsonEscape } from "./ui";
|
||||
import { logInfo, logWarn, logError, logStep, prompt, jsonEscape, withRetry } from "./ui";
|
||||
import type { AgentConfig } from "./agents";
|
||||
|
||||
// Re-export so cloud modules can re-export from here
|
||||
|
|
@ -28,7 +28,12 @@ export async function installAgent(
|
|||
): Promise<void> {
|
||||
logStep(`Installing ${agentName}...`);
|
||||
try {
|
||||
await runner.runServer(installCmd, timeoutSecs);
|
||||
await withRetry(
|
||||
`${agentName} install`,
|
||||
() => runner.runServer(installCmd, timeoutSecs),
|
||||
2,
|
||||
10,
|
||||
);
|
||||
} catch {
|
||||
logError(`${agentName} installation failed`);
|
||||
throw new Error(`${agentName} install failed`);
|
||||
|
|
@ -45,11 +50,18 @@ export async function uploadConfigFile(runner: CloudRunner, content: string, rem
|
|||
mode: 0o600,
|
||||
});
|
||||
|
||||
const tempRemote = `/tmp/spawn_config_${Date.now()}`;
|
||||
try {
|
||||
await runner.uploadFile(tmpFile, tempRemote);
|
||||
await runner.runServer(
|
||||
`mkdir -p $(dirname "${remotePath}") && chmod 600 '${tempRemote}' && mv '${tempRemote}' "${remotePath}"`,
|
||||
await withRetry(
|
||||
"config upload",
|
||||
async () => {
|
||||
const tempRemote = `/tmp/spawn_config_${Date.now()}`;
|
||||
await runner.uploadFile(tmpFile, tempRemote);
|
||||
await runner.runServer(
|
||||
`mkdir -p $(dirname "${remotePath}") && chmod 600 '${tempRemote}' && mv '${tempRemote}' "${remotePath}"`,
|
||||
);
|
||||
},
|
||||
2,
|
||||
5,
|
||||
);
|
||||
} finally {
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
import type { AgentConfig } from "./agents";
|
||||
import { generateEnvConfig } from "./agents";
|
||||
import { logInfo, logStep, logWarn } from "./ui";
|
||||
import { logInfo, logStep, logWarn, withRetry } from "./ui";
|
||||
import { getOrPromptApiKey, getModelIdInteractive } from "./oauth";
|
||||
import type { CloudRunner } from "./agent-setup";
|
||||
import { offerGithubAuth } from "./agent-setup";
|
||||
|
|
@ -64,10 +64,16 @@ export async function runOrchestration(cloud: CloudOrchestrator, agent: AgentCon
|
|||
const envContent = generateEnvConfig(agent.envVars(apiKey));
|
||||
const envB64 = Buffer.from(envContent).toString("base64");
|
||||
try {
|
||||
await cloud.runner.runServer(
|
||||
`printf '%s' '${envB64}' | base64 -d > ~/.spawnrc && chmod 600 ~/.spawnrc; ` +
|
||||
`grep -q 'source ~/.spawnrc' ~/.bashrc 2>/dev/null || echo '[ -f ~/.spawnrc ] && source ~/.spawnrc' >> ~/.bashrc; ` +
|
||||
`grep -q 'source ~/.spawnrc' ~/.zshrc 2>/dev/null || echo '[ -f ~/.spawnrc ] && source ~/.spawnrc' >> ~/.zshrc`,
|
||||
await withRetry(
|
||||
"env setup",
|
||||
() =>
|
||||
cloud.runner.runServer(
|
||||
`printf '%s' '${envB64}' | base64 -d > ~/.spawnrc && chmod 600 ~/.spawnrc; ` +
|
||||
`grep -q 'source ~/.spawnrc' ~/.bashrc 2>/dev/null || echo '[ -f ~/.spawnrc ] && source ~/.spawnrc' >> ~/.bashrc; ` +
|
||||
`grep -q 'source ~/.spawnrc' ~/.zshrc 2>/dev/null || echo '[ -f ~/.spawnrc ] && source ~/.spawnrc' >> ~/.zshrc`,
|
||||
),
|
||||
2,
|
||||
5,
|
||||
);
|
||||
} catch {
|
||||
logWarn("Environment setup had errors");
|
||||
|
|
@ -79,7 +85,7 @@ export async function runOrchestration(cloud: CloudOrchestrator, agent: AgentCon
|
|||
// 10. Agent-specific configuration
|
||||
if (agent.configure) {
|
||||
try {
|
||||
await agent.configure(apiKey, modelId);
|
||||
await withRetry("agent config", () => agent.configure!(apiKey, modelId), 2, 5);
|
||||
} catch {
|
||||
logWarn("Agent configuration failed (continuing with defaults)");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -149,6 +149,25 @@ export function openBrowser(url: string): void {
|
|||
logStep(`Please open: ${url}`);
|
||||
}
|
||||
|
||||
/**
 * Generic async retry helper.
 *
 * Calls `fn` and returns its resolved value. If `fn` rejects and attempts
 * remain, logs a warning via `logWarn`, waits `delaySec` seconds, and calls
 * `fn` again. Once the final attempt rejects, that attempt's error is
 * re-thrown unchanged.
 *
 * @param label - Human-readable operation name used in the retry warning.
 * @param fn - The async operation to run; invoked afresh on every attempt.
 * @param maxAttempts - Total number of attempts (not retries). Fractional
 *   values are rounded down. Defaults to 3.
 * @param delaySec - Seconds to wait between attempts. Defaults to 5.
 * @returns The value `fn` resolved with on the first successful attempt.
 * @throws RangeError if `maxAttempts` is NaN or rounds down to less than 1.
 * @throws Whatever `fn` rejected with on the last attempt.
 */
export async function withRetry<T>(
  label: string,
  fn: () => Promise<T>,
  maxAttempts = 3,
  delaySec = 5,
): Promise<T> {
  // Normalise to a whole number so the "last attempt" check below is exact.
  const attempts = Math.floor(maxAttempts);
  // Written as a negated >= so that NaN is rejected as well as 0 and negatives.
  // Without this guard `fn` would never be invoked and the caller would get a
  // meaningless error instead of a usable diagnostic.
  if (!(attempts >= 1)) {
    throw new RangeError(`withRetry(${label}): maxAttempts must be at least 1, got ${maxAttempts}`);
  }

  // No loop condition: the only exits are a successful return or re-throwing
  // the error of the final attempt, so the real failure always propagates.
  for (let attempt = 1; ; attempt++) {
    try {
      return await fn();
    } catch (err) {
      if (attempt >= attempts) throw err;
      logWarn(`${label} failed (attempt ${attempt}/${attempts}), retrying in ${delaySec}s...`);
      await new Promise<void>((resolve) => setTimeout(resolve, delaySec * 1000));
    }
  }
}
|
||||
|
||||
/** JSON-escape a string (returns the quoted JSON string). */
|
||||
export function jsonEscape(s: string): string {
|
||||
return JSON.stringify(s);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue