feat(telemetry): funnel + lifecycle events for onboarding drop-off (#3305)
Some checks are pending
CLI Release / Build and release CLI (push) Waiting to run
Lint / macOS Compatibility (push) Waiting to run
Lint / ShellCheck (push) Waiting to run
Lint / Biome Lint (push) Waiting to run

* feat(telemetry): funnel + lifecycle events for onboarding drop-off

Adds low-volume, high-signal product events on top of the existing
errors/warnings telemetry (shared/telemetry.ts). Answers "where do users
bail before reaching a running agent" at the fleet level.

Funnel events (in orchestrate.ts, both fast and sequential paths):

  funnel_started              pipeline begins
  funnel_cloud_authed         cloud.authenticate() ok
  funnel_credentials_ready    OR key + preProvision resolved
  funnel_vm_ready             VM booted and SSH-reachable
  funnel_install_completed    agent install succeeded (tarball or live)
  funnel_configure_completed  agent.configure() ran
  funnel_prelaunch_completed  gateway / dashboard / preLaunch hooks done
  funnel_handoff              about to launch TUI (final step)

Every event carries elapsed_ms since funnel_started, plus agent and cloud
via telemetry context. Per-step counts reveal the drop-off funnel in
PostHog without touching any PII.

Lifecycle events (new shared/lifecycle-telemetry.ts):

  spawn_connected  { spawn_id, agent, cloud, connect_count, date }
    fired from list.ts when the user reconnects via the interactive picker.
    Increments connection.metadata.connect_count and writes last_connected_at
    so subsequent events and the eventual spawn_deleted have the total.

  spawn_deleted    { spawn_id, agent, cloud, lifetime_hours, connect_count, date }
    fired from delete.ts (both interactive confirmAndDelete and headless
    cmdDelete loop) after a successful cloud destroy. lifetime_hours is
    computed from SpawnRecord.timestamp to now. Clamped at 0 for corrupt
    clocks. connect_count is read from metadata.

New captureEvent(name, properties) helper in telemetry.ts:
- Respects SPAWN_TELEMETRY=0 opt-out (no new flag)
- Runs every string property through the existing scrubber (API keys,
  GitHub tokens, bearer, emails, IPs, base64 blobs, home paths)
- Non-string values pass through untouched

Tests: 20 new (15 lifecycle-telemetry + 2 captureEvent + 3 assertion
additions to disabled-telemetry). Full suite: 2129/2129 pass.

Bumps 1.0.10 -> 1.0.11. Patch bump — auto-propagates under #3296 policy.

* fix(test): replace mock.module with spyOn in lifecycle-telemetry tests

mock.module contaminates the global module registry when running under
--coverage, causing telemetry.test.ts and history-cov.test.ts to receive
mocked implementations instead of the real modules. Switch to spyOn with
mockRestore in afterEach so the real modules are preserved across files.

Agent: pr-maintainer
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: L <6723574+louisgv@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Ahmed Abushagur 2026-04-14 21:35:53 -07:00 committed by GitHub
parent 4de37274e4
commit 1e64d34e5a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 468 additions and 2 deletions

View file

@ -0,0 +1,231 @@
/**
* lifecycle-telemetry.test.ts — Verifies trackSpawnConnected /
* trackSpawnDeleted emit the right PostHog events and persist the
* connect_count + last_connected_at metadata.
*/
import type { SpawnRecord } from "../history";
import { afterEach, beforeEach, describe, expect, it, spyOn } from "bun:test";
import { isNumber, isString } from "@openrouter/spawn-shared";
// Import the real modules so we can spy on their exports without
// polluting the global module registry (mock.module contaminates
// other test files when running under --coverage).
import * as historyMod from "../history";
import { trackSpawnConnected, trackSpawnDeleted } from "../shared/lifecycle-telemetry";
import * as telemetryMod from "../shared/telemetry";
/** Arguments of one intercepted `saveMetadata` call. */
type SavedMetadataCall = {
  entries: Record<string, string>;
  spawnId?: string;
};

/** Name and properties of one intercepted `captureEvent` call. */
type CapturedEvent = {
  event: string;
  properties: Record<string, unknown>;
};

// Module-level call logs. The spies installed in the suite append to these,
// and the suite empties them in place (`length = 0`) around every test.
const savedMetadataCalls: SavedMetadataCall[] = [];
const capturedEvents: CapturedEvent[] = [];
// ── Helpers ─────────────────────────────────────────────────────────────
/**
 * Build a SpawnRecord fixture for the tests below.
 *
 * Starts from a fixed set of defaults (a "claude" spawn on "digitalocean"
 * with empty connection metadata) and shallow-merges `overrides` on top, so
 * an overridden `connection` replaces the default one wholesale and an
 * explicit `undefined` clears a field.
 */
function makeRecord(overrides: Partial<SpawnRecord> = {}): SpawnRecord {
  const cloud = "digitalocean";
  // Built fresh on every call so tests never share a metadata object.
  const defaults: SpawnRecord = {
    id: "spawn-abc123",
    agent: "claude",
    cloud,
    timestamp: "2026-04-13T12:00:00.000Z",
    connection: {
      ip: "10.0.0.1",
      user: "root",
      cloud,
      metadata: {},
    },
  };
  return {
    ...defaults,
    ...overrides,
  };
}
// ── Tests ───────────────────────────────────────────────────────────────
/**
 * Suite for the lifecycle helpers. `saveMetadata` and `captureEvent` are
 * replaced with spies that only log their arguments, so the tests can assert
 * on what the helpers tried to persist and emit.
 */
describe("lifecycle-telemetry", () => {
  // Shape produced by Date.prototype.toISOString(): YYYY-MM-DDTHH:MM:SS.sssZ
  const ISO_TIMESTAMP = /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/;
  const HOUR_MS = 60 * 60 * 1000;

  let saveMetadataSpy: ReturnType<typeof spyOn>;
  let captureEventSpy: ReturnType<typeof spyOn>;

  /** Empty both call logs in place so every test starts and ends clean. */
  function resetLogs(): void {
    savedMetadataCalls.length = 0;
    capturedEvents.length = 0;
  }

  /** Fixture whose stored `connect_count` metadata is the given raw string. */
  function recordWithConnectCount(connectCount: string): SpawnRecord {
    return makeRecord({
      connection: {
        ip: "10.0.0.1",
        user: "root",
        cloud: "digitalocean",
        metadata: {
          connect_count: connectCount,
        },
      },
    });
  }

  beforeEach(() => {
    resetLogs();
    saveMetadataSpy = spyOn(historyMod, "saveMetadata").mockImplementation(
      (entries: Record<string, string>, spawnId?: string) => {
        savedMetadataCalls.push({
          entries,
          spawnId,
        });
      },
    );
    captureEventSpy = spyOn(telemetryMod, "captureEvent").mockImplementation(
      (event: string, properties: Record<string, unknown>) => {
        capturedEvents.push({
          event,
          properties,
        });
      },
    );
  });

  afterEach(() => {
    // Restore the real exports before clearing the logs.
    saveMetadataSpy.mockRestore();
    captureEventSpy.mockRestore();
    resetLogs();
  });

  describe("trackSpawnConnected", () => {
    it("starts the connect count at 1 when metadata is empty", () => {
      const result = trackSpawnConnected(makeRecord());
      expect(result).toBe(1);
      expect(savedMetadataCalls).toHaveLength(1);
      const saved = savedMetadataCalls[0];
      expect(saved.entries.connect_count).toBe("1");
      expect(saved.spawnId).toBe("spawn-abc123");
    });

    it("increments an existing connect count", () => {
      const result = trackSpawnConnected(recordWithConnectCount("4"));
      expect(result).toBe(5);
      expect(savedMetadataCalls[0].entries.connect_count).toBe("5");
    });

    it("tolerates malformed connect_count by resetting to 1", () => {
      const result = trackSpawnConnected(recordWithConnectCount("not-a-number"));
      // An unparseable stored value reads as 0, so the new count is 1.
      expect(result).toBe(1);
    });

    it("updates last_connected_at to an ISO timestamp", () => {
      trackSpawnConnected(makeRecord());
      const lastConnectedAt = savedMetadataCalls[0].entries.last_connected_at;
      expect(lastConnectedAt).toBeDefined();
      expect(lastConnectedAt).toMatch(ISO_TIMESTAMP);
    });

    it("emits spawn_connected event with spawn metadata", () => {
      trackSpawnConnected(makeRecord());
      expect(capturedEvents).toHaveLength(1);
      const emitted = capturedEvents[0];
      expect(emitted.event).toBe("spawn_connected");
      expect(emitted.properties.spawn_id).toBe("spawn-abc123");
      expect(emitted.properties.agent).toBe("claude");
      expect(emitted.properties.cloud).toBe("digitalocean");
      expect(emitted.properties.connect_count).toBe(1);
    });

    it("is a no-op for records without an id or connection", () => {
      const withoutId = makeRecord({
        id: undefined,
      });
      expect(trackSpawnConnected(withoutId)).toBe(0);
      expect(savedMetadataCalls).toHaveLength(0);
      expect(capturedEvents).toHaveLength(0);

      const withoutConnection = makeRecord({
        connection: undefined,
      });
      expect(trackSpawnConnected(withoutConnection)).toBe(0);
      expect(savedMetadataCalls).toHaveLength(0);
      expect(capturedEvents).toHaveLength(0);
    });
  });

  describe("trackSpawnDeleted", () => {
    it("emits spawn_deleted with lifetime_hours computed from timestamp", () => {
      // The helper reads the real clock, which is not mocked here, so the
      // record is dated three hours back and the result is checked against a
      // tolerance band (3h +/- roughly a minute) rather than an exact value.
      const createdAt = new Date(Date.now() - 3 * HOUR_MS).toISOString();
      trackSpawnDeleted(
        makeRecord({
          timestamp: createdAt,
        }),
      );
      expect(capturedEvents).toHaveLength(1);
      const emitted = capturedEvents[0];
      expect(emitted.event).toBe("spawn_deleted");
      const rawLifetime = emitted.properties.lifetime_hours;
      const lifetime = isNumber(rawLifetime) ? rawLifetime : 0;
      expect(lifetime).toBeGreaterThanOrEqual(2.98);
      expect(lifetime).toBeLessThanOrEqual(3.02);
    });

    it("reports the final connect count", () => {
      trackSpawnDeleted(recordWithConnectCount("7"));
      expect(capturedEvents[0].properties.connect_count).toBe(7);
    });

    it("clamps negative lifetimes to 0 (corrupt clock / timestamp)", () => {
      // A creation time one hour in the future yields a negative raw lifetime.
      const createdAt = new Date(Date.now() + HOUR_MS).toISOString();
      trackSpawnDeleted(
        makeRecord({
          timestamp: createdAt,
        }),
      );
      expect(capturedEvents[0].properties.lifetime_hours).toBe(0);
    });

    it("is a no-op for records without an id", () => {
      const withoutId = makeRecord({
        id: undefined,
      });
      trackSpawnDeleted(withoutId);
      expect(capturedEvents).toHaveLength(0);
    });

    it("includes spawn_id, agent, cloud, and date on every event", () => {
      trackSpawnDeleted(makeRecord());
      const { properties } = capturedEvents[0];
      expect(properties.spawn_id).toBe("spawn-abc123");
      expect(properties.agent).toBe("claude");
      expect(properties.cloud).toBe("digitalocean");
      expect(isString(properties.date)).toBe(true);
      expect(properties.date).toMatch(ISO_TIMESTAMP);
    });
  });
});

View file

@ -11,6 +11,7 @@
*/
import { afterEach, beforeEach, describe, expect, it, mock } from "bun:test";
import { isString } from "@openrouter/spawn-shared";
import * as v from "valibot";
// ── Schemas for validating PostHog payloads ─────────────────────────────────
@ -408,9 +409,62 @@ describe("telemetry", () => {
mod.captureWarning("should not send");
mod.captureError("test", new Error("should not send"));
mod.captureEvent("should_not_send", {
spawn_id: "abc",
});
await flushAndWait();
expect(fetchMock).not.toHaveBeenCalled();
});
});
describe("captureEvent", () => {
  // Version handed to initTelemetry; the first test expects it back as the
  // `spawn_version` property of the uploaded event.
  const TEST_VERSION = "1.2.3-test";

  /**
   * Shared arrange step: load the telemetry module, initialise it, and run
   * drainStaleEvents() before the test emits its own event.
   */
  async function loadTelemetry() {
    const telemetry = await import("../shared/telemetry.js");
    telemetry.initTelemetry(TEST_VERSION);
    await drainStaleEvents();
    return telemetry;
  }

  it("emits a batched event with the given name and properties", async () => {
    const telemetry = await loadTelemetry();
    telemetry.captureEvent("funnel_started", {
      fast_mode: true,
      elapsed_ms: 0,
    });
    await flushAndWait();

    const payload = getLastBatchBody(fetchMock);
    expect(payload).not.toBeNull();
    const first = payload?.batch[0];
    expect(first?.event).toBe("funnel_started");
    expect(first?.properties.fast_mode).toBe(true);
    expect(first?.properties.elapsed_ms).toBe(0);
    expect(first?.properties.spawn_version).toBe(TEST_VERSION);
  });

  it("scrubs string property values but leaves non-strings alone", async () => {
    const telemetry = await loadTelemetry();
    telemetry.captureEvent("spawn_connected", {
      spawn_id: "abc123",
      note: "contact me at alice@example.com about sk-or-v1-1234567890abcdef",
      connect_count: 5,
      lifetime_hours: 3.5,
    });
    await flushAndWait();

    const uploaded = getLastBatchBody(fetchMock)?.batch[0]?.properties;

    // Values expected to arrive exactly as they were sent.
    expect(uploaded?.spawn_id).toBe("abc123");
    expect(uploaded?.connect_count).toBe(5);
    expect(uploaded?.lifetime_hours).toBe(3.5);

    // The free-text note must come back with the email and key redacted.
    const rawNote = uploaded?.note;
    const note = isString(rawNote) ? rawNote : "";
    expect(note).toContain("[REDACTED_EMAIL]");
    expect(note).not.toContain("alice@example.com");
    expect(note).toContain("[REDACTED_KEY]");
    expect(note).not.toContain("sk-or-v1-1234567890abcdef");
  });
});
});

View file

@ -22,6 +22,7 @@ import {
validateServerIdentifier,
validateUsername,
} from "../security.js";
import { trackSpawnDeleted } from "../shared/lifecycle-telemetry.js";
import { getHistoryPath } from "../shared/paths.js";
import { asyncTryCatch, asyncTryCatchIf, isNetworkError, tryCatch } from "../shared/result.js";
import { ensureSpriteAuthenticated, ensureSpriteCli, destroyServer as spriteDestroyServer } from "../sprite/sprite.js";
@ -259,6 +260,8 @@ export async function confirmAndDelete(
if (success) {
const detail = lastMessage ? `: ${lastMessage}` : "";
p.log.success(`Server "${label}" deleted${detail}`);
// Lifecycle telemetry: lifetime hours + final login count.
trackSpawnDeleted(record);
} else {
const detail = lastMessage ? `: ${lastMessage}` : "";
p.log.error(`Failed to delete "${label}"${detail}`);
@ -448,6 +451,8 @@ export async function cmdDelete(
const ok = await execDeleteServer(record);
if (ok) {
p.log.success(`Server "${label}" deleted`);
// Lifecycle telemetry: headless path also fires the event.
trackSpawnDeleted(record);
}
}
return;

View file

@ -15,6 +15,7 @@ import {
updateRecordIp,
} from "../history.js";
import { agentKeys, cloudKeys, loadManifest } from "../manifest.js";
import { trackSpawnConnected } from "../shared/lifecycle-telemetry.js";
import { asyncTryCatch, tryCatch, unwrapOr } from "../shared/result.js";
import { cmdConnect, cmdEnterAgent, cmdOpenDashboard } from "./connect.js";
import { confirmAndDelete } from "./delete.js";
@ -707,6 +708,10 @@ export async function handleRecordAction(
}
if (action === "reconnect") {
// Lifecycle telemetry: record the login BEFORE we hand off to SSH.
// cmdConnect spawns an interactive session and never returns under normal
// use, so calling trackSpawnConnected after would be unreachable code.
trackSpawnConnected(selected);
const reconnectResult = await asyncTryCatch(() => cmdConnect(conn, selected.agent));
if (!reconnectResult.ok) {
p.log.error(`Connection failed: ${getErrorMessage(reconnectResult.error)}`);

View file

@ -0,0 +1,101 @@
// shared/lifecycle-telemetry.ts — Track spawn-level lifecycle events:
// login count and total lifetime on delete.
//
// Why it's here and not in telemetry.ts:
// telemetry.ts is a low-level primitive (PostHog batching, scrubbing,
// session context). It deliberately has no knowledge of SpawnRecord,
// history, or any product concepts. Lifecycle helpers need both, so
// they live one layer up.
//
// Event shapes (all respect SPAWN_TELEMETRY=0 opt-out via captureEvent):
//
// spawn_connected { spawn_id, agent, cloud, connect_count, date }
// spawn_deleted { spawn_id, agent, cloud, lifetime_hours, connect_count, date }
//
// Persistence model:
// connect_count + last_connected_at are stored inside
// SpawnRecord.connection.metadata as strings (the existing schema is
// Record<string, string>, so we serialize numbers as strings and parse
// on read). saveMetadata merges — no risk of clobbering other keys.
import type { SpawnRecord } from "../history.js";
import { saveMetadata } from "../history.js";
import { captureEvent } from "./telemetry.js";
/**
 * Look up how many connects have been recorded for a spawn.
 *
 * The value lives in `connection.metadata.connect_count` as a base-10 string.
 * A missing connection/metadata/key, an empty string, an unparseable value,
 * or a negative number all read as 0.
 */
function readConnectCount(record: SpawnRecord): number {
  const stored = record.connection?.metadata?.connect_count;
  // Absent or empty values skip parsing and fall through to the 0 default.
  const parsed = stored ? Number.parseInt(stored, 10) : Number.NaN;
  if (Number.isFinite(parsed) && parsed >= 0) {
    return parsed;
  }
  return 0;
}
/**
 * Hours elapsed between the record's `timestamp` and an end instant, rounded
 * to two decimal places.
 *
 * @param record Spawn whose `timestamp` marks the start of its lifetime.
 * @param endIso End instant as a date string; when omitted (or empty) the
 *   current time is used.
 * @returns The lifetime in hours, or 0 when either instant fails to parse or
 *   the end falls before the start.
 */
function computeLifetimeHours(record: SpawnRecord, endIso?: string): number {
  const MS_PER_HOUR = 1000 * 60 * 60;
  const startMs = Date.parse(record.timestamp);
  const endMs = endIso ? Date.parse(endIso) : Date.now();

  const usable = Number.isFinite(startMs) && Number.isFinite(endMs) && endMs >= startMs;
  if (!usable) {
    return 0;
  }

  const hours = (endMs - startMs) / MS_PER_HOUR;
  return Math.round(hours * 100) / 100;
}
/**
 * Note that the user is reconnecting to an existing spawn.
 *
 * Bumps the stored connect count by one, persists it together with a fresh
 * `last_connected_at` timestamp via saveMetadata, then emits a
 * `spawn_connected` event carrying the new count.
 *
 * @param record The spawn being reconnected to.
 * @returns The incremented connect count, or 0 when the record has no id or
 *   no connection (in which case nothing is saved or emitted).
 */
export function trackSpawnConnected(record: SpawnRecord): number {
  const { id: spawnId, connection } = record;
  if (!spawnId || !connection) {
    return 0;
  }

  const connectCount = readConnectCount(record) + 1;
  const connectedAt = new Date().toISOString();

  // Metadata values are stored as strings, so the count is serialized here.
  const metadataPatch = {
    connect_count: String(connectCount),
    last_connected_at: connectedAt,
  };
  saveMetadata(metadataPatch, spawnId);

  captureEvent("spawn_connected", {
    spawn_id: spawnId,
    agent: record.agent,
    cloud: record.cloud,
    connect_count: connectCount,
    date: connectedAt,
  });

  return connectCount;
}
/**
 * Note that the user deleted a spawn.
 *
 * Emits a `spawn_deleted` event with the spawn's total lifetime in hours and
 * its final connect count, which lets aggregate analysis answer "how long
 * does a typical spawn live, and how often is it logged into". Records
 * without an id are ignored.
 *
 * Call this only AFTER the cloud destroy has succeeded — a failed delete
 * should not fire the event.
 *
 * @param record The spawn that was just destroyed.
 */
export function trackSpawnDeleted(record: SpawnRecord): void {
  const spawnId = record.id;
  if (!spawnId) {
    return;
  }

  // One timestamp serves as both the lifetime end point and the event date.
  const deletedAt = new Date().toISOString();
  const lifetimeHours = computeLifetimeHours(record, deletedAt);
  const connectCount = readConnectCount(record);

  captureEvent("spawn_deleted", {
    spawn_id: spawnId,
    agent: record.agent,
    cloud: record.cloud,
    lifetime_hours: lifetimeHours,
    connect_count: connectCount,
    date: deletedAt,
  });
}

View file

@ -28,6 +28,7 @@ import { isWindows } from "./shell.js";
import { injectSpawnSkill } from "./spawn-skill.js";
import { sleep, startSshTunnel } from "./ssh.js";
import { ensureSshKeys, getSshKeyOpts } from "./ssh-keys.js";
import { captureEvent, setTelemetryContext } from "./telemetry.js";
import {
logDebug,
logError,
@ -43,6 +44,25 @@ import {
withRetry,
} from "./ui.js";
// ── Funnel telemetry ────────────────────────────────────────────────────────
//
// Tracks onboarding pipeline drop-off. Events flow through the shared
// PostHog pipeline in shared/telemetry.ts and respect SPAWN_TELEMETRY=0 opt-out.
// No PII — only agent/cloud names and elapsed timing. The goal is to answer
// "where do users bail before reaching a running agent" at the fleet level.
// Epoch milliseconds at which the current funnel began; 0 means "not started".
let _funnelStart = 0;

/** Milliseconds since the funnel started, or 0 when no start has been recorded. */
function funnelElapsedMs(): number {
  if (_funnelStart > 0) {
    return Date.now() - _funnelStart;
  }
  return 0;
}
/**
 * Emit one funnel step as a telemetry event.
 *
 * @param step Event name, e.g. "funnel_started".
 * @param extra Additional properties for this step. They are spread after
 *   `elapsed_ms`, so an `elapsed_ms` key supplied here would take precedence.
 */
function trackFunnel(step: string, extra: Record<string, unknown> = {}): void {
  const properties: Record<string, unknown> = {
    elapsed_ms: funnelElapsedMs(),
    ...extra,
  };
  captureEvent(step, properties);
}
/** Docker container name used by --beta docker deployments. */
export const DOCKER_CONTAINER_NAME = "spawn-agent";
/** Docker registry hosting spawn agent images. */
@ -298,8 +318,16 @@ export async function runOrchestration(
logInfo(`${agent.name} on ${cloud.cloudLabel}`);
process.stderr.write("\n");
// Funnel telemetry: mark the start of the onboarding pipeline and attach
// agent/cloud as context so every event carries them automatically.
_funnelStart = Date.now();
setTelemetryContext("agent", agentName);
setTelemetryContext("cloud", cloud.cloudName);
trackFunnel("funnel_started");
// 1. Authenticate with cloud provider
await cloud.authenticate();
trackFunnel("funnel_cloud_authed");
const betaFeatures = new Set((process.env.SPAWN_BETA ?? "").split(",").filter(Boolean));
const fastMode = process.env.SPAWN_FAST === "1" || betaFeatures.has("parallel");
@ -370,12 +398,14 @@ export async function runOrchestration(
recordSpawn(spawnId, agentName, cloud.cloudName, connection);
await cloud.waitForReady();
}
trackFunnel("funnel_vm_ready");
// API key must succeed
if (apiKeyResult.status === "rejected") {
throw apiKeyResult.reason;
}
const apiKey = apiKeyResult.value;
trackFunnel("funnel_credentials_ready");
// Model ID
const rawModelId = process.env.MODEL_ID || loadPreferredModel(agentName) || agent.modelDefault;
@ -414,6 +444,7 @@ export async function runOrchestration(
}
}
}
trackFunnel("funnel_install_completed");
// Inject env + continue with shared post-install flow
clearInterval(keepAlive);
@ -434,6 +465,7 @@ export async function runOrchestration(
// 2. Get API key
const resolveApiKey = options?.getApiKey ?? getOrPromptApiKey;
const apiKey = await resolveApiKey(agentName, cloud.cloudName);
trackFunnel("funnel_credentials_ready");
// 3. Pre-provision hooks
if (agent.preProvision) {
@ -473,6 +505,7 @@ export async function runOrchestration(
logError(getErrorMessage(r.error));
await retryOrQuit("Server may still be starting. Keep waiting?");
}
trackFunnel("funnel_vm_ready");
// 7. Env config
const envPairs = agent.envVars(apiKey);
@ -504,6 +537,7 @@ export async function runOrchestration(
}
}
}
trackFunnel("funnel_install_completed");
// Inject env + continue with shared post-install flow
await injectEnvVars(cloud, envContent);
@ -595,6 +629,7 @@ async function postInstall(
logWarn("Agent configuration failed (continuing with defaults)");
}
}
trackFunnel("funnel_configure_completed");
// GitHub CLI setup
if (!enabledSteps || enabledSteps.has("github")) {
@ -715,6 +750,7 @@ async function postInstall(
await retryOrQuit("Retry pre-launch setup?");
}
}
trackFunnel("funnel_prelaunch_completed");
// Web dashboard access
let tunnelHandle: SshTunnelHandle | undefined;
@ -809,6 +845,13 @@ async function postInstall(
logInfo(`Agent setup complete — ${agent.name} is ready on ${cloud.cloudLabel}`);
process.stderr.write("\n");
// Final funnel event — pipeline completed all the way to handoff.
// Downstream analysis: (funnel_started count) - (funnel_handoff count) =
// total drop-off. Per-step counts reveal where the drop-off happens.
trackFunnel("funnel_handoff", {
headless: process.env.SPAWN_HEADLESS === "1",
});
const launchCmd = agent.launchCmd();
saveLaunchCmd(launchCmd, spawnId);

View file

@ -1,7 +1,9 @@
// shared/telemetry.ts — PostHog telemetry for errors, warnings, and crashes.
// shared/telemetry.ts — PostHog telemetry for errors, warnings, crashes, and
// low-volume product events (funnel steps, spawn lifecycle).
// Default on. Disable with SPAWN_TELEMETRY=0.
// Strictly errors/warnings/crashes — no command tracking, no session events.
// Never sends command args, file paths, or user prompt content.
import { isString } from "@openrouter/spawn-shared";
import { asyncTryCatch } from "./result.js";
// Same PostHog project as feedback.ts
@ -177,6 +179,31 @@ export function captureWarning(message: string): void {
});
}
/**
 * Queue a generic product telemetry event (funnel steps, lifecycle events).
 *
 * Does nothing while telemetry is disabled (`_enabled` is false) — the file
 * header documents SPAWN_TELEMETRY=0 as the opt-out. Otherwise every
 * top-level string value in `properties` is run through `scrub` before the
 * event is handed to `pushEvent`; values of any other type are forwarded
 * unchanged.
 *
 * Intended for low-volume, high-signal product events such as:
 *   - funnel_* (onboarding pipeline drop-off tracking in orchestrate.ts)
 *   - spawn_connected / spawn_deleted (lifecycle events)
 *
 * NOT intended for command tracking, keystroke tracking, or anything that
 * could incidentally capture user-typed prompts or file paths.
 *
 * @param event Event name.
 * @param properties Event properties; defaults to none.
 */
export function captureEvent(event: string, properties: Record<string, unknown> = {}): void {
  if (!_enabled) {
    return;
  }

  const scrubbed: Record<string, unknown> = {};
  for (const key of Object.keys(properties)) {
    const value = properties[key];
    if (isString(value)) {
      scrubbed[key] = scrub(value);
    } else {
      scrubbed[key] = value;
    }
  }

  pushEvent(event, scrubbed);
}
/** Map our error types to PostHog mechanism types. */
function mechanismType(type: string): string {
switch (type) {