fix: prevent sprite idle shutdown during agent install (#2874)

The sprite was going idle and shutting down during long npm install
operations because the remote keep-alive script wasn't installed yet
and sprite exec alone doesn't count as activity.

- Add local keep-alive that pings the sprite's public URL every 30s
  from the client machine during provisioning and agent install
- Stop it when the interactive session starts (remote script takes over)
- Add i/o timeout to spriteRetry's transient error regex so connection
  timeouts are retried instead of failing immediately

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Ahmed Abushagur 2026-03-22 12:13:07 -07:00 committed by GitHub
parent 66a1749b4b
commit baf03ce47b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 58 additions and 3 deletions

View file

@ -1,6 +1,6 @@
{
"name": "@openrouter/spawn",
"version": "0.25.12",
"version": "0.25.13",
"type": "module",
"bin": {
"spawn": "cli.js"

View file

@ -19,6 +19,8 @@ import {
promptSpawnName,
runSprite,
setupShellEnvironment,
startLocalKeepAlive,
stopLocalKeepAlive,
uploadFileSprite,
verifySpriteConnectivity,
} from "./sprite.js";
@ -50,13 +52,20 @@ async function main() {
async createServer(name: string) {
await createSprite(name);
await verifySpriteConnectivity();
// Start pinging the sprite URL locally to prevent idle shutdown
// during long operations (agent install, config). Stopped when
// the interactive session starts (remote keep-alive takes over).
startLocalKeepAlive();
await setupShellEnvironment();
await installSpriteKeepAlive();
return getVmConnection();
},
getServerName,
async waitForReady() {},
interactiveSession,
async interactiveSession(cmd: string, spawnFn?: (args: string[]) => number) {
stopLocalKeepAlive();
return interactiveSession(cmd, spawnFn);
},
};
await runOrchestration(cloud, agent, agentName);

View file

@ -84,7 +84,7 @@ async function spriteRetry<T>(desc: string, fn: () => Promise<T>): Promise<T> {
}
// Only retry on transient network errors
if (/TLS handshake timeout|connection closed|connection reset|connection refused/i.test(msg)) {
if (/TLS handshake timeout|connection closed|connection reset|connection refused|i\/o timeout/i.test(msg)) {
logWarn(`${desc}: Transient error, retrying (${attempt}/${maxRetries})...`);
await sleep(3000);
continue;
@ -386,6 +386,52 @@ export async function verifySpriteConnectivity(maxAttempts = 6): Promise<void> {
throw new Error("Sprite connectivity timeout");
}
// ─── Local Keep-Alive ────────────────────────────────────────────────────────
/** Handle of the running local keep-alive interval, or null when none is active. */
let _keepAliveTimer: ReturnType<typeof setInterval> | null = null;

/**
 * Start a background keep-alive that requests the sprite's public URL every
 * 30s from the local machine. Intended to stop the sprite from going idle
 * during long operations such as agent installation, before the remote
 * keep-alive script is running.
 *
 * Does nothing when a keep-alive is already running, when the sprite command
 * or sprite name is unavailable, or when the output of the `url` subcommand
 * contains no https URL.
 *
 * Pair with {@link stopLocalKeepAlive} to cancel the interval.
 */
export function startLocalKeepAlive(): void {
  if (_keepAliveTimer) {
    return;
  }
  const cmd = getSpriteCmd();
  if (!cmd || !_state.name) {
    return;
  }
  // Ask the sprite command for the sprite's public URL.
  const urlResult = spawnSync([
    cmd,
    ...orgFlags(),
    "url",
    "-s",
    _state.name,
  ]);
  const urlMatch = urlResult.stdout.match(/https:\/\/\S+/);
  if (!urlMatch) {
    return;
  }
  const spriteUrl = urlMatch[0];
  const timer = setInterval(() => {
    // Fire-and-forget request: only the traffic matters. Cancel the response
    // body so the connection is released right away instead of lingering
    // until the unread response is garbage-collected. Failures are ignored
    // on purpose (best-effort ping).
    void fetch(spriteUrl)
      .then((res) => res.body?.cancel())
      .catch(() => {});
  }, 30_000);
  // Do not let the keep-alive by itself hold the process open: if provisioning
  // fails before stopLocalKeepAlive() runs, the process must still be able to
  // exit. Guarded at runtime because not every runtime returns a timer object
  // with unref().
  const handle: unknown = timer;
  if (
    typeof handle === "object" &&
    handle !== null &&
    "unref" in handle &&
    typeof handle.unref === "function"
  ) {
    handle.unref();
  }
  _keepAliveTimer = timer;
}
/**
 * Cancel the local keep-alive interval if one is running and forget its
 * handle. Calling this when no keep-alive is active is a no-op.
 */
export function stopLocalKeepAlive(): void {
  const activeTimer = _keepAliveTimer;
  if (!activeTimer) {
    return;
  }
  clearInterval(activeTimer);
  _keepAliveTimer = null;
}
// ─── Shell Environment Setup ─────────────────────────────────────────────────
export async function setupShellEnvironment(): Promise<void> {