spawn/packages/cli/src/shared/ssh.ts

// shared/ssh.ts — Shared SSH wait utility with TCP pre-check and stderr capture

import { spawnSync as nodeSpawnSync } from "node:child_process";
import { connect } from "node:net";
import { normalize } from "node:path/posix";
import { asyncTryCatch, tryCatch } from "./result.js";
import { logError, logInfo, logStep, logStepDone, logStepInline } from "./ui.js";

// ─── Shared SSH Options ──────────────────────────────────────────────────────

/** Base SSH options shared across all clouds (array form for Bun.spawn). */
export const SSH_BASE_OPTS: string[] = [
  "-o",
  "StrictHostKeyChecking=accept-new",
  "-o",
  "UserKnownHostsFile=/dev/null",
  "-o",
  "LogLevel=ERROR",
  "-o",
  "ConnectTimeout=10",
  "-o",
  "ServerAliveInterval=15",
  "-o",
  "ServerAliveCountMax=3",
  "-o",
  "GSSAPIAuthentication=no",
  "-o",
  "TCPKeepAlive=no",
  "-o",
  "BatchMode=yes",
];

/**
 * SSH options for interactive sessions (user-facing TTY).
 *
 * Differences from SSH_BASE_OPTS:
 * - No BatchMode (interactive sessions need TTY prompts to work)
 * - StrictHostKeyChecking=accept-new instead of =no (safer for reconnects)
 * - Compression=yes (reduces latency on slow/distant links)
 * - IPQoS=lowdelay (mark packets for low-latency QoS treatment)
 * - RequestTTY=yes (force TTY allocation for the session)
 * - EscapeChar=none (disable per-byte ~ escape scanning for faster keystroke echo)
 * - AddressFamily=inet (skip IPv6 resolution to avoid intermittent stalls)
 */
export const SSH_INTERACTIVE_OPTS: string[] = [
  "-o",
  "StrictHostKeyChecking=accept-new",
  "-o",
  "UserKnownHostsFile=/dev/null",
  "-o",
  "LogLevel=ERROR",
  "-o",
  "ConnectTimeout=10",
  "-o",
  "ServerAliveInterval=15",
  "-o",
  "ServerAliveCountMax=3",
  "-o",
  "GSSAPIAuthentication=no",
  "-o",
  "TCPKeepAlive=no",
  "-o",
  "Compression=no",
  "-o",
  "IPQoS=lowdelay",
  "-o",
  "EscapeChar=none",
  "-o",
  "AddressFamily=inet",
  "-t",
];

// ─── Remote Path Validation ─────────────────────────────────────────────────

/**
 * Validate a remote file path for use with scp/ssh file operations.
 *
 * Rejects path traversal (.. segments), argument injection (leading dashes),
 * and characters outside a safe allowlist. The `..` check is performed on the
 * RAW input before normalize() so that crafted paths like `/tmp/../../etc/passwd`
 * (which normalize to `/etc/passwd`) are still caught.
 *
 * @param remotePath - The raw remote path to validate
 * @param allowedCharsPattern - Optional regex for allowed characters
 *   (default: alphanumerics, `/`, `.`, `_`, `~`, `$`, `{`, `}`, `:`, `-`)
 * @returns The normalized path if valid
 * @throws Error if the path is unsafe
 */
export function validateRemotePath(remotePath: string, allowedCharsPattern: RegExp = /^[\w/.~${}:-]+$/): string {
  // 1. Check for ".." traversal in the RAW input BEFORE normalize() strips it
  if (remotePath.includes("..")) {
    throw new Error(`Invalid remote path: path traversal detected ("..") in: ${remotePath}`);
  }
  // 2. Reject empty paths
  if (!remotePath) {
    throw new Error("Invalid remote path: path must not be empty");
  }
  // 3. Normalize (resolve . segments, collapse slashes)
  const normalized = normalize(remotePath);
  // 4. Double-check normalized result for ".." (defense in depth)
  if (normalized.includes("..")) {
    throw new Error(`Invalid remote path: path traversal detected ("..") in normalized: ${normalized}`);
  }
  // 5. Character allowlist
  if (!allowedCharsPattern.test(normalized)) {
    throw new Error(`Invalid remote path: contains unsafe characters: ${remotePath}`);
  }
  // 6. Reject argument injection (segments starting with -)
  if (normalized.split("/").some((s) => s.startsWith("-"))) {
    throw new Error(`Invalid remote path: segments must not start with "-": ${remotePath}`);
  }
  return normalized;
}

// ─── Interactive Spawn ───────────────────────────────────────────────────────

/**
 * Spawn a child process for an interactive terminal session using spawnSync.
 *
 * Why spawnSync instead of Bun.spawn?
 * Bun's async event loop keeps polling fd 0 (stdin) even after
 * process.stdin.pause()/destroy(). With Bun.spawn + stdio:"inherit",
 * both the parent's event loop and the child (SSH) race for bytes on
 * the same fd, causing random keystroke drops.
 *
 * spawnSync blocks the event loop entirely, so the child process is the
 * sole reader of stdin. This matches the behavior of running SSH directly
 * from a shell.
 */
export function spawnInteractive(args: string[], env?: Record<string, string | undefined>): number {
  // Use Node's spawnSync (not Bun.spawnSync) — it's more battle-tested
  // with interactive TTY programs and properly handles SIGWINCH, job
  // control, and terminal I/O forwarding.
  const result = nodeSpawnSync(args[0], args.slice(1), {
    stdio: "inherit",
    env: env ?? process.env,
  });

  // Reset terminal state after the interactive session ends.
  // The remote agent's TUI (e.g. Claude Code) may leave the terminal in
  // raw mode or with altered attributes, causing garbled post-session output.
  if (process.stderr.isTTY) {
    process.stderr.write("\x1b[0m\x1b[?25h"); // reset attributes + show cursor
  }
  if (process.stdout.isTTY) {
    process.stdout.write("\x1b[0m\x1b[?25h");
  }
  // Restore sane terminal settings (cooked mode, echo, etc.)
  tryCatch(() =>
    nodeSpawnSync(
      "stty",
      [
        "sane",
      ],
      {
        stdio: "inherit",
      },
    ),
  );

  return result.status ?? 1;
}

// ─── Helpers ─────────────────────────────────────────────────────────────────

/** Async sleep — shared across all cloud providers. */
export function sleep(ms: number): Promise<void> {
  return new Promise((r) => setTimeout(r, ms));
}

/**
 * Kill a child process with SIGTERM, then escalate to SIGKILL after a grace period.
 *
 * SSH processes stuck in network I/O can ignore SIGTERM indefinitely,
 * causing `await proc.exited` to hang forever. This helper ensures the
 * process is forcefully killed if it doesn't respond to SIGTERM.
 */
export function killWithTimeout(
  proc: {
    kill(signal?: number): void;
  },
  gracePeriodMs = 5000,
): void {
  const r = tryCatch(() => proc.kill());
  if (!r.ok) {
    return;
  }
  const sigkillTimer = setTimeout(() => {
    tryCatch(() => proc.kill(9));
  }, gracePeriodMs);
  // Don't let this timer keep the event loop alive — the process may already
  // be dead from SIGTERM, so there's no reason to block exit for 5 seconds.
  sigkillTimer.unref();
}

// ─── TCP Pre-Check ───────────────────────────────────────────────────────────

/**
 * Probe whether a TCP port is open using node:net.
 * Returns true if the connection succeeds within `timeoutMs`, false otherwise.
 * This is much cheaper than a full SSH handshake attempt.
 */
function tcpCheck(host: string, port: number, timeoutMs = 2000): Promise<boolean> {
  return new Promise((resolve) => {
    const socket = connect({
      host,
      port,
    });
    const timer = setTimeout(() => {
      socket.destroy();
      resolve(false);
    }, timeoutMs);
    socket.on("connect", () => {
      clearTimeout(timer);
      socket.destroy();
      resolve(true);
    });
    socket.on("error", () => {
      clearTimeout(timer);
      socket.destroy();
      resolve(false);
    });
  });
}

// ─── SSH Tunnel ──────────────────────────────────────────────────────────

export interface SshTunnelHandle {
  localPort: number;
  stop: () => void;
  exited: Promise<number>;
}

/**
 * Start an SSH tunnel forwarding a remote port to localhost.
 * Tries local ports starting from `remotePort` up to `remotePort + 10`.
 * Throws if no port is available or the SSH connection fails immediately.
 */
export async function startSshTunnel(opts: {
  host: string;
  user: string;
  remotePort: number;
  localPort?: number;
  sshKeyOpts?: string[];
}): Promise<SshTunnelHandle> {
  const { host, user, remotePort, sshKeyOpts } = opts;

  // Find available local port
  let localPort = opts.localPort ?? remotePort;
  let found = false;
  for (let p = localPort; p <= localPort + 10; p++) {
    const inUse = await tcpCheck("127.0.0.1", p, 500);
    if (!inUse) {
      localPort = p;
      found = true;
      break;
    }
  }
  if (!found) {
    throw new Error(`No available local port in range ${remotePort}-${remotePort + 10}`);
  }

  const args = [
    "ssh",
    ...SSH_BASE_OPTS,
    ...(sshKeyOpts ?? []),
    "-N",
    "-L",
    `${localPort}:127.0.0.1:${remotePort}`,
    `${user}@${host}`,
  ];

  const proc = Bun.spawn(args, {
    stdio: [
      "ignore",
      "ignore",
      "pipe",
    ],
  });

  // Wait briefly to detect immediate failures (bad auth, connection refused)
  await sleep(1500);

  if (proc.exitCode !== null) {
    const stderr = await new Response(proc.stderr).text();
    throw new Error(`SSH tunnel failed: ${stderr.trim() || `exit code ${proc.exitCode}`}`);
  }

  return {
    localPort,
    stop: () => killWithTimeout(proc),
    exited: proc.exited,
  };
}

// ─── SSH Wait ────────────────────────────────────────────────────────────────

export interface WaitForSshOpts {
  host: string;
  user: string;
  /** Maximum total attempts across both phases. Default: 36 (~3 min). */
  maxAttempts?: number;
  /** Path to SSH identity file (e.g. ~/.ssh/id_ed25519). */
  sshKeyPath?: string;
  /** Extra SSH options appended after SSH_BASE_OPTS. */
  extraSshOpts?: string[];
}

/**
 * Two-phase SSH wait with resilience improvements:
 *
 * **Phase 1 (TCP probe):** Loop with cheap TCP probes until port 22 is open.
 *   Uses 2s intervals. Avoids the 10s ConnectTimeout overhead when sshd isn't
 *   even listening yet (VM still booting).
 *
 * **Phase 2 (SSH handshake):** Once port 22 is open, attempt full SSH `echo ok`.
 *   Uses 3s intervals. Captures stderr so the user sees the actual error reason.
 *
 * Total budget: ~`maxAttempts` attempts spread across both phases.
 * Effective timeout: ~3 min with defaults.
 */
export async function waitForSsh(opts: WaitForSshOpts): Promise<void> {
  const { host, user, sshKeyPath, extraSshOpts } = opts;
  const maxAttempts = opts.maxAttempts ?? 36;

  // Build SSH args
  const sshArgs: string[] = [
    ...SSH_BASE_OPTS,
  ];
  if (sshKeyPath) {
    sshArgs.push("-i", sshKeyPath);
  }
  if (extraSshOpts) {
    sshArgs.push(...extraSshOpts);
  }

  // ── Phase 1: TCP probe ────────────────────────────────────────────────────
  logStep("Waiting for SSH port to open...");
  let attempt = 0;
  let tcpOpen = false;
  while (attempt < maxAttempts) {
    attempt += 1;
    const open = await tcpCheck(host, 22, 2000);
    if (open) {
      tcpOpen = true;
      logStepDone();
      logInfo("SSH port 22 is open");
      break;
    }
    if (attempt % 5 === 0 || attempt === 1) {
      logStepInline(`Waiting for SSH port... (${attempt}/${maxAttempts} attempts)`);
    }
    await sleep(2000);
  }

  if (!tcpOpen) {
    logStepDone();
    logError(`SSH port 22 never opened after ${maxAttempts} attempts`);
    throw new Error("SSH connectivity timeout — port 22 never opened");
  }

  // ── Phase 2: SSH handshake ────────────────────────────────────────────────
  logStep("Waiting for SSH handshake...");
  const remaining = maxAttempts - attempt;
  // At least 5 handshake attempts even if TCP phase used most of the budget
  const handshakeAttempts = Math.max(remaining, 5);

  for (let i = 1; i <= handshakeAttempts; i++) {
    const r = await asyncTryCatch(async () => {
      const proc = Bun.spawn(
        [
          "ssh",
          ...sshArgs,
          `${user}@${host}`,
          "echo ok",
        ],
        {
          stdio: [
            "ignore",
            "pipe",
            "pipe",
          ],
        },
      );
      // Per-process timeout: ConnectTimeout=10 only covers TCP connect, not
      // the full SSH handshake. If sshd accepts the connection but stalls
      // during key exchange or auth, the process hangs indefinitely. Kill it
      // after 30s so the retry loop can continue.
      const timer = setTimeout(() => killWithTimeout(proc), 30_000);
      const inner = await asyncTryCatch(async () => {
        const [stdout, stderr] = await Promise.all([
          new Response(proc.stdout).text(),
          new Response(proc.stderr).text(),
        ]);
        const exitCode = await proc.exited;

        if (exitCode === 0 && stdout.includes("ok")) {
          return {
            stdout,
            stderr,
            exitCode,
          };
        }

        // Show the actual SSH error reason dimly so users can debug
        const reason = stderr.trim();
        if (reason) {
          logStep(`SSH handshake failed (${i}/${handshakeAttempts}): ${reason}`);
        } else {
          logStep(`SSH handshake failed (${i}/${handshakeAttempts})`);
        }
        return null;
      });
      clearTimeout(timer);
      if (!inner.ok) {
        throw inner.error;
      }
      return inner.data;
    });
    if (r.ok && r.data !== null) {
      logInfo("SSH is ready");
      return;
    }
    if (!r.ok) {
      logStep(`SSH handshake error (${i}/${handshakeAttempts})`);
    }
    await sleep(3000);
  }

  logError(`SSH handshake failed after ${handshakeAttempts} attempts`);
  throw new Error("SSH connectivity timeout — handshake never succeeded");
}

/**
 * Wait for SSH availability on a snapshot-booted VM (no cloud-init needed).
 * Used by cloud modules that support snapshot-based provisioning (Hetzner, DigitalOcean).
 */
export async function waitForSshSnapshotBoot(ip: string, extraSshOpts: string[]): Promise<void> {
  await waitForSsh({
    host: ip,
    user: "root",
    maxAttempts: 36,
    extraSshOpts,
  });
  logInfo("SSH available (snapshot boot — skipping cloud-init)");
}