spawn/packages/cli/src/shared/ssh.ts
A e44705d925
fix(ux): reduce SSH wait verbosity and clarify agent handoff (#3056)
- Replace repeated 'SSH port closed (N/36)' with periodic updates every 5 attempts
- Add clear 'Provisioning complete. Connecting...' line before agent attach

Fixes #3053

Agent: ux-engineer

Co-authored-by: B <6723574+louisgv@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-03-27 15:22:46 +07:00

446 lines
14 KiB
TypeScript

// shared/ssh.ts — Shared SSH wait utility with TCP pre-check and stderr capture
import { spawnSync as nodeSpawnSync } from "node:child_process";
import { connect } from "node:net";
import { normalize } from "node:path/posix";
import { asyncTryCatch, tryCatch } from "./result.js";
import { logError, logInfo, logStep, logStepDone, logStepInline } from "./ui.js";
// ─── Shared SSH Options ──────────────────────────────────────────────────────
/**
 * Baseline `ssh` command-line options shared by every cloud provider.
 *
 * Exposed as a flat argv array (each setting preceded by its own `-o` flag)
 * so it can be spread straight into a Bun.spawn argument list.
 */
export const SSH_BASE_OPTS: string[] = [
  "StrictHostKeyChecking=accept-new",
  "UserKnownHostsFile=/dev/null",
  "LogLevel=ERROR",
  "ConnectTimeout=10",
  "ServerAliveInterval=15",
  "ServerAliveCountMax=3",
  "GSSAPIAuthentication=no",
  "TCPKeepAlive=no",
  "BatchMode=yes",
].flatMap((setting) => ["-o", setting]);
/**
 * `ssh` command-line options for interactive, user-facing terminal sessions.
 *
 * Compared with SSH_BASE_OPTS, this list:
 * - omits BatchMode=yes (interactive sessions need TTY prompts to work)
 * - sets Compression=no explicitly
 * - adds IPQoS=lowdelay (mark packets for low-latency QoS treatment)
 * - adds EscapeChar=none (disable per-byte ~ escape scanning for faster keystroke echo)
 * - adds AddressFamily=inet (IPv4 only; skips IPv6 resolution to avoid intermittent stalls)
 * - ends with `-t` to force TTY allocation for the session
 *
 * The host-key, logging, timeout, keep-alive and GSSAPI settings are the same
 * in both lists.
 *
 * NOTE(review): this comment used to advertise Compression=yes while the
 * array carried Compression=no. The comment now describes the value actually
 * passed to ssh — confirm that disabling compression is what is intended.
 */
export const SSH_INTERACTIVE_OPTS: string[] = [
  ...[
    "StrictHostKeyChecking=accept-new",
    "UserKnownHostsFile=/dev/null",
    "LogLevel=ERROR",
    "ConnectTimeout=10",
    "ServerAliveInterval=15",
    "ServerAliveCountMax=3",
    "GSSAPIAuthentication=no",
    "TCPKeepAlive=no",
    "Compression=no",
    "IPQoS=lowdelay",
    "EscapeChar=none",
    "AddressFamily=inet",
  ].flatMap((setting) => ["-o", setting]),
  // Trailing flag rather than an -o setting: force pseudo-terminal allocation.
  "-t",
];
// ─── Remote Path Validation ─────────────────────────────────────────────────
/**
 * Check that a remote path is safe to pass to scp/ssh file operations and
 * return its normalized form.
 *
 * A path is rejected when it:
 * - contains `..` anywhere in the raw input (checked before normalize(), which
 *   would otherwise fold `/tmp/../../etc/passwd` into `/etc/passwd`),
 * - is empty,
 * - still contains `..` after normalization (defense in depth),
 * - has characters outside the allowlist, or
 * - has any `/`-separated segment beginning with `-` (argument injection).
 *
 * @param remotePath - The raw remote path to validate
 * @param allowedCharsPattern - Regex the normalized path must match in full
 *   (default: alphanumerics and `_`, `/`, `.`, `~`, `$`, `{`, `}`, `:`, `-`)
 * @returns The POSIX-normalized path
 * @throws Error describing the first rule the path violates
 */
export function validateRemotePath(remotePath: string, allowedCharsPattern: RegExp = /^[\w/.~${}:-]+$/): string {
  const TRAVERSAL = "..";

  // Look at the caller's text as given: once normalize() runs, the evidence
  // of a traversal attempt may be gone.
  if (remotePath.indexOf(TRAVERSAL) !== -1) {
    throw new Error(`Invalid remote path: path traversal detected ("..") in: ${remotePath}`);
  }

  if (remotePath === "") {
    throw new Error("Invalid remote path: path must not be empty");
  }

  // Resolves `.` segments and collapses repeated slashes.
  const cleaned = normalize(remotePath);

  // Should be unreachable after the raw check; kept as a second line of defense.
  if (cleaned.indexOf(TRAVERSAL) !== -1) {
    throw new Error(`Invalid remote path: path traversal detected ("..") in normalized: ${cleaned}`);
  }

  const onlyAllowedChars = allowedCharsPattern.test(cleaned);
  if (!onlyAllowedChars) {
    throw new Error(`Invalid remote path: contains unsafe characters: ${remotePath}`);
  }

  // A segment with a leading dash could be parsed as a command-line option.
  for (const segment of cleaned.split("/")) {
    if (segment.startsWith("-")) {
      throw new Error(`Invalid remote path: segments must not start with "-": ${remotePath}`);
    }
  }

  return cleaned;
}
// ─── Interactive Spawn ───────────────────────────────────────────────────────
/**
 * Run an interactive terminal program synchronously, with the parent's stdio
 * handed to the child, and report its exit status.
 *
 * spawnSync is used instead of Bun.spawn on purpose: Bun's async event loop
 * keeps polling fd 0 (stdin) even after process.stdin.pause()/destroy(), so
 * with Bun.spawn + stdio:"inherit" the parent and the child (SSH) race for
 * bytes on the same fd and keystrokes get dropped at random. spawnSync blocks
 * the event loop, leaving the child as the only reader of stdin — the same
 * situation as running SSH directly from a shell.
 *
 * @param args - Full argv: the program followed by its arguments
 * @param env - Environment for the child; defaults to process.env
 * @returns The child's exit status, or 1 when no status is available
 */
export function spawnInteractive(args: string[], env?: Record<string, string | undefined>): number {
  const [command, ...commandArgs] = args;

  // Node's spawnSync rather than Bun.spawnSync: it is more battle-tested with
  // interactive TTY programs (SIGWINCH, job control, terminal I/O forwarding).
  const session = nodeSpawnSync(command, commandArgs, {
    stdio: "inherit",
    env: env ?? process.env,
  });

  // The remote agent's TUI (e.g. Claude Code) may leave the terminal in raw
  // mode or with altered attributes, which garbles whatever is printed next.
  // Reset attributes and re-show the cursor on each stream that is a TTY.
  const RESET_ATTRS_AND_SHOW_CURSOR = "\x1b[0m\x1b[?25h";
  const ttyCandidates: NodeJS.WriteStream[] = [process.stderr, process.stdout];
  for (const stream of ttyCandidates) {
    if (stream.isTTY) {
      stream.write(RESET_ATTRS_AND_SHOW_CURSOR);
    }
  }

  // Best effort: put the terminal back into cooked mode with echo etc.
  // Any failure to run stty is deliberately ignored.
  tryCatch(() => nodeSpawnSync("stty", ["sane"], { stdio: "inherit" }));

  return session.status ?? 1;
}
// ─── Helpers ─────────────────────────────────────────────────────────────────
/** Promise-based delay shared across all cloud providers; resolves after `ms` milliseconds. */
export function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Ask a child process to terminate, then force-kill it with signal 9 (SIGKILL)
 * once a grace period has elapsed.
 *
 * SSH processes stuck in network I/O can ignore SIGTERM indefinitely, which
 * would make `await proc.exited` hang forever; the follow-up SIGKILL
 * guarantees the process goes away.
 *
 * If the initial kill() call throws, nothing further is scheduled.
 *
 * @param proc - Anything exposing a `kill(signal?)` method
 * @param gracePeriodMs - Delay before the forced kill (default 5000)
 */
export function killWithTimeout(
  proc: {
    kill(signal?: number): void;
  },
  gracePeriodMs = 5000,
): void {
  const polite = tryCatch(() => proc.kill());
  if (polite.ok) {
    const forceKill = (): void => {
      // The process may already be gone by now; ignore any resulting error.
      tryCatch(() => proc.kill(9));
    };
    // unref() so this pending timer never keeps the event loop alive — the
    // process may already be dead from SIGTERM, and exit should not be held
    // up for the length of the grace period.
    setTimeout(forceKill, gracePeriodMs).unref();
  }
}
// ─── TCP Pre-Check ───────────────────────────────────────────────────────────
/**
 * Test whether a TCP port accepts connections, using node:net.
 *
 * Far cheaper than attempting a full SSH handshake.
 *
 * @param host - Host to connect to
 * @param port - TCP port to probe
 * @param timeoutMs - How long to wait for the connection (default 2000)
 * @returns Resolves true if the connection is established within `timeoutMs`;
 *   resolves false on a socket error or when the timeout fires. Never rejects.
 */
function tcpCheck(host: string, port: number, timeoutMs = 2000): Promise<boolean> {
  return new Promise((resolve) => {
    const socket = connect({ host, port });
    const deadline = setTimeout(() => settle(false), timeoutMs);

    // Shared teardown for all three outcomes. Clearing an already-fired timer
    // is a no-op, and only the first resolve() call has any effect.
    function settle(reachable: boolean): void {
      clearTimeout(deadline);
      socket.destroy();
      resolve(reachable);
    }

    socket.on("connect", () => settle(true));
    socket.on("error", () => settle(false));
  });
}
// ─── SSH Tunnel ──────────────────────────────────────────────────────────
/** Handle for a running SSH tunnel, as returned by startSshTunnel. */
export interface SshTunnelHandle {
/** Local port the tunnel was set up to forward from (the listen side of `-L`). */
localPort: number;
/** Stops the tunnel by passing its ssh process to killWithTimeout. */
stop: () => void;
/** The spawned ssh process's `exited` promise, passed through unchanged. */
exited: Promise<number>;
}
/**
 * Start an SSH tunnel that forwards `remotePort` on the remote host's
 * loopback interface (127.0.0.1) to a port on localhost.
 *
 * The local port search starts at `opts.localPort` — or at `remotePort` when
 * no local port is given — and tries that port and the 10 after it. A port is
 * considered free when a TCP connect to 127.0.0.1 on it does not succeed
 * within 500ms.
 *
 * @param opts.host - Remote host to connect to
 * @param opts.user - Remote login user
 * @param opts.remotePort - Port on the remote side to forward
 * @param opts.localPort - First local port to try (defaults to `remotePort`)
 * @param opts.sshKeyOpts - Extra ssh arguments inserted after SSH_BASE_OPTS
 * @returns A handle exposing the chosen local port, a stop function, and the
 *   ssh process's exit promise
 * @throws Error if no local port in the searched range is free, or if ssh has
 *   already exited 1.5s after being spawned (bad auth, connection refused, …)
 */
export async function startSshTunnel(opts: {
  host: string;
  user: string;
  remotePort: number;
  localPort?: number;
  sshKeyOpts?: string[];
}): Promise<SshTunnelHandle> {
  const { host, user, remotePort, sshKeyOpts } = opts;

  // Find an available local port.
  const firstPort = opts.localPort ?? remotePort;
  const lastPort = firstPort + 10;
  let localPort: number | undefined;
  for (let p = firstPort; p <= lastPort; p++) {
    const inUse = await tcpCheck("127.0.0.1", p, 500);
    if (!inUse) {
      localPort = p;
      break;
    }
  }
  if (localPort === undefined) {
    // Report the range that was actually searched; it only coincides with
    // remotePort..remotePort+10 when the caller did not pass opts.localPort.
    throw new Error(`No available local port in range ${firstPort}-${lastPort}`);
  }

  const args = [
    "ssh",
    ...SSH_BASE_OPTS,
    ...(sshKeyOpts ?? []),
    "-N", // no remote command — forwarding only
    "-L",
    `${localPort}:127.0.0.1:${remotePort}`,
    `${user}@${host}`,
  ];
  const proc = Bun.spawn(args, {
    // Only stderr is piped, so a failure reason can be reported below.
    stdio: [
      "ignore",
      "ignore",
      "pipe",
    ],
  });

  // Wait briefly to detect immediate failures (bad auth, connection refused)
  await sleep(1500);
  if (proc.exitCode !== null) {
    const stderr = await new Response(proc.stderr).text();
    throw new Error(`SSH tunnel failed: ${stderr.trim() || `exit code ${proc.exitCode}`}`);
  }

  return {
    localPort,
    stop: () => killWithTimeout(proc),
    exited: proc.exited,
  };
}
// ─── SSH Wait ────────────────────────────────────────────────────────────────
/** Options accepted by waitForSsh. */
export interface WaitForSshOpts {
/** Host that is probed on TCP port 22 and then used as the ssh target. */
host: string;
/** Login user for the ssh handshake (`user@host`). */
user: string;
/**
* Attempt budget shared by the TCP-probe and handshake phases. Default: 36 (~3 min).
* The handshake phase always gets at least 5 attempts, so the combined total
* can exceed this value when the TCP phase uses up most of the budget.
*/
maxAttempts?: number;
/** Path to SSH identity file (e.g. ~/.ssh/id_ed25519); passed to ssh as `-i`. */
sshKeyPath?: string;
/** Extra SSH options appended after SSH_BASE_OPTS (and after `-i`, when a key path is set). */
extraSshOpts?: string[];
}
/**
 * Two-phase SSH wait with resilience improvements:
 *
 * **Phase 1 (TCP probe):** Loop with cheap TCP probes until port 22 is open.
 * Each probe has a 2s timeout and probes are spaced 2s apart. Avoids the 10s
 * ConnectTimeout overhead when sshd isn't even listening yet (VM still booting).
 *
 * **Phase 2 (SSH handshake):** Once port 22 is open, attempt a full SSH
 * `echo ok`, retrying every 3s. Captures stderr so the user sees the actual
 * error reason. Each ssh process is killed if it runs longer than 30s.
 *
 * Total budget: ~`maxAttempts` attempts spread across both phases, except that
 * the handshake phase always gets at least 5 attempts.
 * Effective timeout: ~3 min with defaults.
 *
 * Neither phase sleeps after its final failed attempt, so the timeout error
 * is raised as soon as the budget is exhausted.
 *
 * @param opts - Target host/user, attempt budget, and extra ssh arguments
 * @returns Resolves once `ssh user@host "echo ok"` exits 0 with "ok" on stdout
 * @throws Error if port 22 never opens, or if no handshake attempt succeeds
 */
export async function waitForSsh(opts: WaitForSshOpts): Promise<void> {
  const { host, user, sshKeyPath, extraSshOpts } = opts;
  const maxAttempts = opts.maxAttempts ?? 36;

  // Build SSH args
  const sshArgs: string[] = [
    ...SSH_BASE_OPTS,
  ];
  if (sshKeyPath) {
    sshArgs.push("-i", sshKeyPath);
  }
  if (extraSshOpts) {
    sshArgs.push(...extraSshOpts);
  }

  // ── Phase 1: TCP probe ────────────────────────────────────────────────────
  logStep("Waiting for SSH port to open...");
  let attempt = 0;
  let tcpOpen = false;
  while (attempt < maxAttempts) {
    attempt += 1;
    const open = await tcpCheck(host, 22, 2000);
    if (open) {
      tcpOpen = true;
      logStepDone();
      logInfo("SSH port 22 is open");
      break;
    }
    // Progress is reported on the first attempt and then every 5th, rather
    // than on every probe, to keep the output quiet.
    if (attempt % 5 === 0 || attempt === 1) {
      logStepInline(`Waiting for SSH port... (${attempt}/${maxAttempts} attempts)`);
    }
    // Pause only when another probe will follow; sleeping after the last
    // probe would just delay the timeout error.
    if (attempt < maxAttempts) {
      await sleep(2000);
    }
  }
  if (!tcpOpen) {
    logStepDone();
    logError(`SSH port 22 never opened after ${maxAttempts} attempts`);
    throw new Error("SSH connectivity timeout — port 22 never opened");
  }

  // ── Phase 2: SSH handshake ────────────────────────────────────────────────
  logStep("Waiting for SSH handshake...");
  const remaining = maxAttempts - attempt;
  // At least 5 handshake attempts even if TCP phase used most of the budget
  const handshakeAttempts = Math.max(remaining, 5);
  for (let i = 1; i <= handshakeAttempts; i++) {
    const r = await asyncTryCatch(async () => {
      const proc = Bun.spawn(
        [
          "ssh",
          ...sshArgs,
          `${user}@${host}`,
          "echo ok",
        ],
        {
          stdio: [
            "ignore",
            "pipe",
            "pipe",
          ],
        },
      );
      // Per-process timeout: ConnectTimeout=10 only covers TCP connect, not
      // the full SSH handshake. If sshd accepts the connection but stalls
      // during key exchange or auth, the process hangs indefinitely. Kill it
      // after 30s so the retry loop can continue.
      const timer = setTimeout(() => killWithTimeout(proc), 30_000);
      const inner = await asyncTryCatch(async () => {
        const [stdout, stderr] = await Promise.all([
          new Response(proc.stdout).text(),
          new Response(proc.stderr).text(),
        ]);
        const exitCode = await proc.exited;
        if (exitCode === 0 && stdout.includes("ok")) {
          return {
            stdout,
            stderr,
            exitCode,
          };
        }
        // Show the actual SSH error reason so users can debug
        const reason = stderr.trim();
        if (reason) {
          logStep(`SSH handshake failed (${i}/${handshakeAttempts}): ${reason}`);
        } else {
          logStep(`SSH handshake failed (${i}/${handshakeAttempts})`);
        }
        // null marks "ssh ran to completion but the handshake did not succeed".
        return null;
      });
      // The kill timer is cleared whether the attempt succeeded or failed.
      clearTimeout(timer);
      if (!inner.ok) {
        throw inner.error;
      }
      return inner.data;
    });
    if (r.ok && r.data !== null) {
      logInfo("SSH is ready");
      return;
    }
    if (!r.ok) {
      logStep(`SSH handshake error (${i}/${handshakeAttempts})`);
    }
    // As in phase 1, skip the pause once there are no attempts left.
    if (i < handshakeAttempts) {
      await sleep(3000);
    }
  }
  logError(`SSH handshake failed after ${handshakeAttempts} attempts`);
  throw new Error("SSH connectivity timeout — handshake never succeeded");
}
/**
 * Block until SSH is reachable on a VM booted from a snapshot, where there is
 * no cloud-init run to wait for. Used by cloud modules that support
 * snapshot-based provisioning (Hetzner, DigitalOcean).
 *
 * Connects as `root` with a budget of 36 attempts.
 *
 * @param ip - Address of the VM
 * @param extraSshOpts - Additional ssh arguments forwarded to waitForSsh
 */
export async function waitForSshSnapshotBoot(ip: string, extraSshOpts: string[]): Promise<void> {
  const rootLogin: WaitForSshOpts = {
    host: ip,
    user: "root",
    maxAttempts: 36,
    extraSshOpts,
  };
  await waitForSsh(rootLogin);
  logInfo("SSH available (snapshot boot — skipping cloud-init)");
}