fix(e2e): fix --fast mode for native binary agents on Sprite (#2965)

Add a 180s timeout to the upload process in uploadFileSprite (plus a 60s
timeout on the follow-up mv) to prevent indefinite hangs during tarball
uploads. Without a timeout, large tarballs or stalled Sprite connections
block the entire provisioning pipeline past the 720s E2E provision
timeout, causing "agent binary not found" failures for openclaw,
zeroclaw, and codex.

Also skip the redundant remote tarball download fallback when a local
tarball was already downloaded but its upload/extract failed -- the
remote download would face the same extraction issues. This saves ~150s
in the fallback chain, leaving enough time for the live install to
complete within the provision timeout.

Fixes #2960

Agent: code-health

Co-authored-by: B <6723574+louisgv@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
A 2026-03-24 12:59:11 -07:00 committed by GitHub
parent 4ee4bd71e6
commit d3889519bc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 25 additions and 5 deletions

View file

@ -251,7 +251,11 @@ export async function runOrchestration(
installed = await uploadAndExtractTarball(cloud.runner, localTarball.localPath);
localTarball.cleanup();
}
if (!installed && useTarball && !agent.skipTarball) {
// Only try remote tarball download when we didn't already have a local tarball.
// If the local tarball was available but upload/extract failed, the remote
// download would face the same extraction issues — skip it to save ~150s
// and fall through to live install immediately.
if (!installed && !localTarball && useTarball && !agent.skipTarball) {
const tarball = options?.tryTarball ?? tryTarballInstall;
installed = await tarball(cloud.runner, agentName);
}

View file

@ -576,6 +576,12 @@ export async function uploadFileSprite(localPath: string, remotePath: string): P
// Compute the parent directory in TypeScript to avoid shell interpolation
const parentDir = posixDirname(normalizedRemote);
// 180s timeout — prevents indefinite hangs during tarball uploads in fast mode.
// Without this, large file uploads (e.g. 300MB openclaw tarball) or stalled
// Sprite connections can block the entire provisioning pipeline past the
// E2E provision timeout (720s), causing agent binary not-found failures.
const UPLOAD_TIMEOUT_MS = 180_000;
await spriteRetry("sprite upload", async () => {
// Upload the file to the temp path, then mkdir + mv using array args
// to avoid shell string interpolation (command injection risk).
@ -604,8 +610,13 @@ export async function uploadFileSprite(localPath: string, remotePath: string): P
);
// Drain stderr before awaiting exit to prevent pipe buffer deadlock
const stderrText = new Response(proc.stderr).text();
const exitCode = await proc.exited;
if (exitCode !== 0) {
const uploadTimer = setTimeout(() => killWithTimeout(proc), UPLOAD_TIMEOUT_MS);
const uploadResult = await asyncTryCatch(() => proc.exited);
clearTimeout(uploadTimer);
if (!uploadResult.ok) {
throw new Error(`upload timed out for ${remotePath}`);
}
if (uploadResult.data !== 0) {
throw new Error(`upload mkdir failed for ${remotePath}: ${await stderrText}`);
}
@ -632,8 +643,13 @@ export async function uploadFileSprite(localPath: string, remotePath: string): P
},
);
const mvStderrText = new Response(mvProc.stderr).text();
const mvExitCode = await mvProc.exited;
if (mvExitCode !== 0) {
const mvTimer = setTimeout(() => killWithTimeout(mvProc), 60_000);
const mvResult = await asyncTryCatch(() => mvProc.exited);
clearTimeout(mvTimer);
if (!mvResult.ok) {
throw new Error(`upload mv timed out for ${remotePath}`);
}
if (mvResult.data !== 0) {
throw new Error(`upload mv failed for ${remotePath}: ${await mvStderrText}`);
}
});