diff --git a/packages/core/src/core/coreToolScheduler.test.ts b/packages/core/src/core/coreToolScheduler.test.ts index 0ac516e2d..b1d9137e6 100644 --- a/packages/core/src/core/coreToolScheduler.test.ts +++ b/packages/core/src/core/coreToolScheduler.test.ts @@ -88,7 +88,9 @@ vi.mock('../telemetry/tracer.js', () => ({ setAttribute: (key: string, value: string | number | boolean) => void; end: () => void; }) => Promise, + options?: { autoOkOnSuccess?: boolean }, ) => { + const autoOkOnSuccess = options?.autoOkOnSuccess ?? true; const record: ToolSpanRecord = { name, attributes, @@ -119,7 +121,7 @@ vi.mock('../telemetry/tracer.js', () => ({ try { const result = await fn(span); - if (!statusSet) { + if (autoOkOnSuccess && !statusSet) { record.statusCalls.push({ code: 1 }); } return result; @@ -3273,7 +3275,7 @@ describe('CoreToolScheduler telemetry spans', () => { ); }); - it('marks cancellation as UNSET with a failure kind and no auto-OK', async () => { + it('leaves cancellation spans with no explicit status (autoOkOnSuccess: false)', async () => { const abortController = new AbortController(); const { spanRecord, completedCalls } = await runSingleTool({ abortController, @@ -3287,12 +3289,14 @@ describe('CoreToolScheduler telemetry spans', () => { }); expect(completedCalls[0].status).toBe('cancelled'); - expect(spanRecord.statusCalls).toEqual([{ code: SpanStatusCode.UNSET }]); + // autoOkOnSuccess: false prevents withSpan from auto-setting OK; + // setToolSpanCancelled only sets the failure_kind attribute, not a status. + expect(spanRecord.statusCalls).toEqual([]); expect(spanRecord.spanAttributes['tool.failure_kind']).toBe('cancelled'); expect(spanRecord.ended).toBe(true); }); - it('sets cancellation status when span attribute recording fails', async () => { + it('sets cancellation attribute even when span attribute recording fails', async () => { const abortController = new AbortController(); const { spanRecord, completedCalls } = await runSingleTool({ abortController, @@ -3307,7 +3311,9 @@ describe('CoreToolScheduler telemetry spans', () => { }); expect(completedCalls[0].status).toBe('cancelled'); - expect(spanRecord.statusCalls).toEqual([{ code: SpanStatusCode.UNSET }]); + // No status set — autoOkOnSuccess: false, and setToolSpanCancelled + // only sets the attribute (which fails here, caught internally). + expect(spanRecord.statusCalls).toEqual([]); expect(spanRecord.spanAttributes).not.toHaveProperty('tool.failure_kind'); expect(spanRecord.ended).toBe(true); }); @@ -3327,11 +3333,25 @@ describe('CoreToolScheduler telemetry spans', () => { }); expect(completedCalls[0].status).toBe('cancelled'); + // setToolSpanCancelled no longer calls setStatus, so throwSpanSetStatus + // only affects the safeSetStatus(span, OK) in the success path (not hit). + // With autoOkOnSuccess: false, withSpan does not attempt setStatus either. expect(spanRecord.statusCalls).toEqual([]); expect(spanRecord.spanAttributes['tool.failure_kind']).toBe('cancelled'); expect(spanRecord.ended).toBe(true); }); + it('does not crash when safeSetStatus throws on the success path', async () => { + const { spanRecord, completedCalls } = await runSingleTool({ + throwSpanSetStatus: true, + }); + + expect(completedCalls[0].status).toBe('success'); + expect(spanRecord.statusCalls).toEqual([]); + expect(spanRecord.spanAttributes).not.toHaveProperty('tool.failure_kind'); + expect(spanRecord.ended).toBe(true); + }); + it('leaves successful tool calls to be marked OK by withSpan', async () => { const { spanRecord, completedCalls } = await runSingleTool(); diff --git a/packages/core/src/core/coreToolScheduler.ts b/packages/core/src/core/coreToolScheduler.ts index 9801bf581..8ac3f2895 100644 --- a/packages/core/src/core/coreToolScheduler.ts +++ b/packages/core/src/core/coreToolScheduler.ts @@ -131,9 +131,9 @@ function setToolSpanCancelled(span: Span): void { } catch { // OTel errors must not block the cancellation status update. } - safeSetStatus(span, { - code: SpanStatusCode.UNSET, - }); + // No explicit span status — cancellation is neither OK nor ERROR. + // The caller uses withSpan({ autoOkOnSuccess: false }) so withSpan + // will not auto-set OK, and the span ends with the default UNSET status. } async function safelyFirePostToolUseFailureHook( @@ -1993,7 +1993,6 @@ export class CoreToolScheduler { 'User cancelled tool execution.', ); } - // Load-bearing: prevents withSpan from auto-setting OK on normal return setToolSpanCancelled(span); return; // Both code paths should return here } @@ -2160,6 +2159,7 @@ export class CoreToolScheduler { : {}), }; this.setStatusInternal(callId, 'success', successResponse); + safeSetStatus(span, { code: SpanStatusCode.OK }); } else { // It is a failure // PostToolUseFailure Hook @@ -2226,7 +2226,6 @@ export class CoreToolScheduler { 'User cancelled tool execution.', ); } - // Load-bearing: prevents withSpan from auto-setting OK on normal return setToolSpanCancelled(span); return; } else { @@ -2267,6 +2266,7 @@ export class CoreToolScheduler { } } }, + { autoOkOnSuccess: false }, ); } diff --git a/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts b/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts index d229e108a..14ffad815 100644 --- a/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts +++ b/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts @@ -403,6 +403,7 @@ export class LoggingContentGenerator implements ContentGenerator { let firstModelVersion = ''; let lastUsageMetadata: GenerateContentResponseUsageMetadata | undefined; let terminalStatusAttempted = false; + let spanEnded = false; // Helper to run code within the span context during iteration. // This ensures debug log lines emitted during stream processing @@ -410,6 +411,34 @@ export class LoggingContentGenerator implements ContentGenerator { const runInSpan = (fn: () => T): T => spanContext ? context.with(spanContext, fn) : fn(); + // Idle timeout: if no chunks arrive for this duration the consumer has + // likely abandoned the generator without calling .return(). Close the + // span so it doesn't leak forever. The timer resets on every chunk, + // so legitimately long-running streams are never affected. + const STREAM_IDLE_TIMEOUT_MS = 5 * 60_000; // 5 minutes + let spanEndTimeout: ReturnType | undefined; + const resetSpanTimeout = span + ? () => { + if (spanEnded) return; + if (spanEndTimeout !== undefined) clearTimeout(spanEndTimeout); + spanEndTimeout = setTimeout(() => { + try { + safeSetStatus(span, { + code: SpanStatusCode.ERROR, + message: 'Stream span timed out (idle)', + }); + spanEnded = true; + span.setAttribute('stream.timed_out', true); + span.end(); + } catch { + // OTel errors must not interrupt the consumer. + } + }, STREAM_IDLE_TIMEOUT_MS); + spanEndTimeout.unref(); + } + : undefined; + resetSpanTimeout?.(); + try { for await (const response of stream) { if (!firstResponseId && response.responseId) { @@ -424,8 +453,13 @@ export class LoggingContentGenerator implements ContentGenerator { if (response.usageMetadata) { lastUsageMetadata = response.usageMetadata; } + resetSpanTimeout?.(); yield response; } + if (spanEndTimeout !== undefined) { + clearTimeout(spanEndTimeout); + spanEndTimeout = undefined; + } // Only log successful API response if no error occurred const durationMs = Date.now() - startTime; const consolidatedResponse = shouldCollectResponses @@ -483,15 +517,20 @@ export class LoggingContentGenerator implements ContentGenerator { } throw error; } finally { - if (!terminalStatusAttempted) { - if (span) { - safeSetStatus(span, { code: SpanStatusCode.OK }); - } + if (spanEndTimeout !== undefined) { + clearTimeout(spanEndTimeout); } - try { - span?.end(); - } catch { - // OTel errors must not mask the original API error + if (!spanEnded) { + if (!terminalStatusAttempted) { + if (span) { + safeSetStatus(span, { code: SpanStatusCode.OK }); + } + } + try { + span?.end(); + } catch { + // OTel errors must not mask the original API error + } } } } diff --git a/packages/core/src/telemetry/log-to-span-processor.test.ts b/packages/core/src/telemetry/log-to-span-processor.test.ts index bbc8dbfe4..26d9f098a 100644 --- a/packages/core/src/telemetry/log-to-span-processor.test.ts +++ b/packages/core/src/telemetry/log-to-span-processor.test.ts @@ -16,6 +16,12 @@ import { LogToSpanProcessor } from './log-to-span-processor.js'; import type { ReadableLogRecord } from '@opentelemetry/sdk-logs'; import type { SpanExporter } from '@opentelemetry/sdk-trace-base'; +let mockCurrentSessionId: string | undefined = undefined; + +vi.mock('./session-context.js', () => ({ + getCurrentSessionId: () => mockCurrentSessionId, +})); + interface ExportedSpan { name: string; kind: number; @@ -34,6 +40,7 @@ describe('LogToSpanProcessor', () => { beforeEach(() => { exportedSpans = []; + mockCurrentSessionId = undefined; mockExporter = { export: vi.fn((spans, cb) => { exportedSpans.push(...spans); @@ -708,4 +715,40 @@ describe('LogToSpanProcessor', () => { expect(mockExporter.shutdown).toHaveBeenCalledTimes(1); }); + + it('falls back to getCurrentSessionId when log record has no session.id', async () => { + mockCurrentSessionId = 'session-from-context'; + const logRecord = { + body: 'event without session attr', + hrTime: [1000, 0] as [number, number], + attributes: {}, + } as unknown as ReadableLogRecord; + + processor.onEmit(logRecord); + await processor.forceFlush(); + + // The traceId should be derived from the fallback session ID, + // not a random one. + const { deriveTraceId } = await import('./trace-id-utils.js'); + expect(exportedSpans[0].spanContext().traceId).toBe( + deriveTraceId('session-from-context'), + ); + }); + + it('prefers log record session.id over getCurrentSessionId', async () => { + mockCurrentSessionId = 'stale-session'; + const logRecord = { + body: 'event with session attr', + hrTime: [1000, 0] as [number, number], + attributes: { 'session.id': 'fresh-session' }, + } as unknown as ReadableLogRecord; + + processor.onEmit(logRecord); + await processor.forceFlush(); + + const { deriveTraceId } = await import('./trace-id-utils.js'); + expect(exportedSpans[0].spanContext().traceId).toBe( + deriveTraceId('fresh-session'), + ); + }); }); diff --git a/packages/core/src/telemetry/log-to-span-processor.ts b/packages/core/src/telemetry/log-to-span-processor.ts index c0e3951ea..fd6ada9d4 100644 --- a/packages/core/src/telemetry/log-to-span-processor.ts +++ b/packages/core/src/telemetry/log-to-span-processor.ts @@ -28,6 +28,7 @@ import { randomHexString, randomSpanId, } from './trace-id-utils.js'; +import { getCurrentSessionId } from './session-context.js'; const EXPORT_TIMEOUT_MS = 30_000; const DEFAULT_MAX_BUFFER_SIZE = 10_000; @@ -156,9 +157,13 @@ export class LogToSpanProcessor implements LogRecordProcessor { // Prefer a real active span context when OTel logs provide one, preserving // direct parentage. Otherwise derive traceId from session.id so all events - // in one session appear under a single trace. + // in one session appear under a single trace. Fall back to + // getCurrentSessionId() when the log record has no session.id attribute + // (e.g. after a session change via /clear or /resume). const parentSpanContext = getValidParentSpanContext(logRecord.spanContext); - const sessionId = logRecord.attributes?.['session.id']; + // || (not ??) so empty-string session.id also falls through to the fallback + const sessionId = + logRecord.attributes?.['session.id'] || getCurrentSessionId(); let traceId: string; if (parentSpanContext) { traceId = parentSpanContext.traceId; diff --git a/packages/core/src/telemetry/sdk.test.ts b/packages/core/src/telemetry/sdk.test.ts index d28eb92b9..970b198d0 100644 --- a/packages/core/src/telemetry/sdk.test.ts +++ b/packages/core/src/telemetry/sdk.test.ts @@ -544,9 +544,10 @@ describe('refreshSessionContext', () => { refreshSessionContext('new-session-id'); expect(createSessionRootContext).toHaveBeenCalledWith('new-session-id'); - expect(setSessionContext).toHaveBeenCalledWith({ - __sessionId: 'new-session-id', - }); + expect(setSessionContext).toHaveBeenCalledWith( + { __sessionId: 'new-session-id' }, + 'new-session-id', + ); }); it('should be a no-op when telemetry is not initialized', () => { diff --git a/packages/core/src/telemetry/sdk.ts b/packages/core/src/telemetry/sdk.ts index 815dd347a..a8c5a361e 100644 --- a/packages/core/src/telemetry/sdk.ts +++ b/packages/core/src/telemetry/sdk.ts @@ -290,7 +290,8 @@ export function initializeTelemetry(config: Config): void { sdk.start(); debugLogger.debug('OpenTelemetry SDK started successfully.'); telemetryInitialized = true; - setSessionContext(createSessionRootContext(config.getSessionId())); + const sessionId = config.getSessionId(); + setSessionContext(createSessionRootContext(sessionId), sessionId); initializeMetrics(config); } catch (error) { debugLogger.error('Error starting OpenTelemetry SDK:', error); @@ -305,7 +306,7 @@ export function initializeTelemetry(config: Config): void { export function refreshSessionContext(sessionId: string): void { if (!telemetryInitialized) return; try { - setSessionContext(createSessionRootContext(sessionId)); + setSessionContext(createSessionRootContext(sessionId), sessionId); } catch (error) { createDebugLogger('OTEL').warn('Failed to refresh session context:', error); } diff --git a/packages/core/src/telemetry/session-context.test.ts b/packages/core/src/telemetry/session-context.test.ts index 1a1b0a8c2..f4f4b23a7 100644 --- a/packages/core/src/telemetry/session-context.test.ts +++ b/packages/core/src/telemetry/session-context.test.ts @@ -6,7 +6,11 @@ import { afterEach, describe, expect, it } from 'vitest'; import { ROOT_CONTEXT, createContextKey } from '@opentelemetry/api'; -import { getSessionContext, setSessionContext } from './session-context.js'; +import { + getSessionContext, + setSessionContext, + getCurrentSessionId, +} from './session-context.js'; describe('session-context', () => { afterEach(() => { @@ -42,3 +46,36 @@ describe('session-context', () => { expect(getSessionContext()).toBeUndefined(); }); }); + +describe('getCurrentSessionId', () => { + afterEach(() => { + setSessionContext(undefined); + }); + + it('returns undefined when no session has been set', () => { + expect(getCurrentSessionId()).toBeUndefined(); + }); + + it('returns the session ID passed to setSessionContext', () => { + const key = createContextKey('sid-test-key'); + setSessionContext(ROOT_CONTEXT.setValue(key, 'ctx'), 'session-abc'); + + expect(getCurrentSessionId()).toBe('session-abc'); + }); + + it('updates when setSessionContext is called with a new session ID', () => { + const key = createContextKey('sid-update-key'); + setSessionContext(ROOT_CONTEXT.setValue(key, 'first'), 'session-1'); + setSessionContext(ROOT_CONTEXT.setValue(key, 'second'), 'session-2'); + + expect(getCurrentSessionId()).toBe('session-2'); + }); + + it('clears when setSessionContext is called without a session ID', () => { + const key = createContextKey('sid-clear-key'); + setSessionContext(ROOT_CONTEXT.setValue(key, 'ctx'), 'session-xyz'); + setSessionContext(undefined); + + expect(getCurrentSessionId()).toBeUndefined(); + }); +}); diff --git a/packages/core/src/telemetry/session-context.ts b/packages/core/src/telemetry/session-context.ts index e6b672790..e4bbf3e2a 100644 --- a/packages/core/src/telemetry/session-context.ts +++ b/packages/core/src/telemetry/session-context.ts @@ -7,11 +7,25 @@ import type { Context } from '@opentelemetry/api'; let sessionRootContext: Context | undefined; +let currentSessionId: string | undefined; -export function setSessionContext(ctx: Context | undefined): void { +export function setSessionContext( + ctx: Context | undefined, + sessionId?: string, +): void { sessionRootContext = ctx; + currentSessionId = sessionId; } export function getSessionContext(): Context | undefined { return sessionRootContext; } + +/** + * Returns the most recent session ID passed to setSessionContext. + * Used by LogToSpanProcessor as a fallback to derive the correct traceId + * when a log record has no session.id attribute (e.g. after /clear or /resume). + */ +export function getCurrentSessionId(): string | undefined { + return currentSessionId; +} diff --git a/packages/core/src/telemetry/tracer.test.ts b/packages/core/src/telemetry/tracer.test.ts index 89ecf0771..93d79f5c8 100644 --- a/packages/core/src/telemetry/tracer.test.ts +++ b/packages/core/src/telemetry/tracer.test.ts @@ -303,6 +303,70 @@ describe('withSpan', () => { expect(spans[0].attributes).toEqual({ tool_name: 'read', call_id: '123' }); }); + + describe('autoOkOnSuccess option', () => { + it('does not auto-set OK when autoOkOnSuccess is false and callback resolves', async () => { + await withSpan('test.no-auto-ok', {}, async () => 42, { + autoOkOnSuccess: false, + }); + + expect(spans).toHaveLength(1); + expect(spans[0].statuses).toEqual([]); + expect(spans[0].ended).toBe(true); + }); + + it('still sets ERROR when callback throws and autoOkOnSuccess is false', async () => { + await expect( + withSpan( + 'test.throw-no-auto', + {}, + async () => { + throw new Error('fail'); + }, + { autoOkOnSuccess: false }, + ), + ).rejects.toThrow('fail'); + + expect(spans).toHaveLength(1); + expect(spans[0].statuses).toEqual([ + { code: SpanStatusCode.ERROR, message: 'Operation failed' }, + ]); + }); + + it('preserves caller-set ERROR with autoOkOnSuccess false', async () => { + await withSpan( + 'test.error-no-auto', + {}, + async (span) => { + span.setStatus({ + code: SpanStatusCode.ERROR, + message: 'hook denied', + }); + }, + { autoOkOnSuccess: false }, + ); + + expect(spans).toHaveLength(1); + expect(spans[0].statuses).toEqual([ + { code: SpanStatusCode.ERROR, message: 'hook denied' }, + ]); + }); + + it('allows caller to set OK explicitly with autoOkOnSuccess false', async () => { + await withSpan( + 'test.explicit-ok', + {}, + async (span) => { + span.setStatus({ code: SpanStatusCode.OK }); + return 'done'; + }, + { autoOkOnSuccess: false }, + ); + + expect(spans).toHaveLength(1); + expect(spans[0].statuses).toEqual([{ code: SpanStatusCode.OK }]); + }); + }); }); describe('startSpanWithContext', () => { @@ -333,11 +397,102 @@ describe('createSessionRootContext', () => { expect(ctx.traceId).toBe(deriveTraceId('session-123')); }); - it('uses TraceFlags.SAMPLED so children are recorded by default', () => { - const ctx = createSessionRootContext('session-123') as unknown as { - traceFlags: number; - }; - expect(ctx.traceFlags).toBe(TraceFlags.SAMPLED); + it('uses TraceFlags.SAMPLED by default (no OTEL_TRACES_SAMPLER)', () => { + const original = process.env['OTEL_TRACES_SAMPLER']; + delete process.env['OTEL_TRACES_SAMPLER']; + try { + const ctx = createSessionRootContext('session-123') as unknown as { + traceFlags: number; + }; + expect(ctx.traceFlags).toBe(TraceFlags.SAMPLED); + } finally { + if (original !== undefined) process.env['OTEL_TRACES_SAMPLER'] = original; + else delete process.env['OTEL_TRACES_SAMPLER']; + } + }); + + it('uses TraceFlags.NONE when a custom sampler is configured', () => { + const original = process.env['OTEL_TRACES_SAMPLER']; + process.env['OTEL_TRACES_SAMPLER'] = 'traceidratio'; + try { + const ctx = createSessionRootContext('session-456') as unknown as { + traceFlags: number; + }; + expect(ctx.traceFlags).toBe(TraceFlags.NONE); + } finally { + if (original !== undefined) process.env['OTEL_TRACES_SAMPLER'] = original; + else delete process.env['OTEL_TRACES_SAMPLER']; + } + }); + + it('uses TraceFlags.SAMPLED when OTEL_TRACES_SAMPLER=always_on', () => { + const original = process.env['OTEL_TRACES_SAMPLER']; + process.env['OTEL_TRACES_SAMPLER'] = 'always_on'; + try { + const ctx = createSessionRootContext('session-ao') as unknown as { + traceFlags: number; + }; + expect(ctx.traceFlags).toBe(TraceFlags.SAMPLED); + } finally { + if (original !== undefined) process.env['OTEL_TRACES_SAMPLER'] = original; + else delete process.env['OTEL_TRACES_SAMPLER']; + } + }); + + it('uses TraceFlags.NONE when OTEL_TRACES_SAMPLER=always_off', () => { + const original = process.env['OTEL_TRACES_SAMPLER']; + process.env['OTEL_TRACES_SAMPLER'] = 'always_off'; + try { + const ctx = createSessionRootContext('session-aoff') as unknown as { + traceFlags: number; + }; + expect(ctx.traceFlags).toBe(TraceFlags.NONE); + } finally { + if (original !== undefined) process.env['OTEL_TRACES_SAMPLER'] = original; + else delete process.env['OTEL_TRACES_SAMPLER']; + } + }); + + it('uses TraceFlags.SAMPLED when OTEL_TRACES_SAMPLER=parentbased_always_on', () => { + const original = process.env['OTEL_TRACES_SAMPLER']; + process.env['OTEL_TRACES_SAMPLER'] = 'parentbased_always_on'; + try { + const ctx = createSessionRootContext('session-789') as unknown as { + traceFlags: number; + }; + expect(ctx.traceFlags).toBe(TraceFlags.SAMPLED); + } finally { + if (original !== undefined) process.env['OTEL_TRACES_SAMPLER'] = original; + else delete process.env['OTEL_TRACES_SAMPLER']; + } + }); + + it('uses TraceFlags.NONE when OTEL_TRACES_SAMPLER=parentbased_always_off', () => { + const original = process.env['OTEL_TRACES_SAMPLER']; + process.env['OTEL_TRACES_SAMPLER'] = 'parentbased_always_off'; + try { + const ctx = createSessionRootContext('session-off') as unknown as { + traceFlags: number; + }; + expect(ctx.traceFlags).toBe(TraceFlags.NONE); + } finally { + if (original !== undefined) process.env['OTEL_TRACES_SAMPLER'] = original; + else delete process.env['OTEL_TRACES_SAMPLER']; + } + }); + + it('uses TraceFlags.SAMPLED for parentbased_traceidratio (parent flag gates children)', () => { + const original = process.env['OTEL_TRACES_SAMPLER']; + process.env['OTEL_TRACES_SAMPLER'] = 'parentbased_traceidratio'; + try { + const ctx = createSessionRootContext('session-pb-ratio') as unknown as { + traceFlags: number; + }; + expect(ctx.traceFlags).toBe(TraceFlags.SAMPLED); + } finally { + if (original !== undefined) process.env['OTEL_TRACES_SAMPLER'] = original; + else delete process.env['OTEL_TRACES_SAMPLER']; + } }); it('generates a valid 16-char hex spanId', () => { diff --git a/packages/core/src/telemetry/tracer.ts b/packages/core/src/telemetry/tracer.ts index 8180be74e..78cfa562c 100644 --- a/packages/core/src/telemetry/tracer.ts +++ b/packages/core/src/telemetry/tracer.ts @@ -108,6 +108,20 @@ function wrapSpanWithStatusTracking(span: Span): { return { wrappedSpan, wasStatusSet: () => statusSet }; } +/** + * Options for {@link withSpan}. + */ +export interface WithSpanOptions { + /** + * When true (default), withSpan automatically sets OK status if the + * callback resolves without having set a status. When false, the caller + * is responsible for setting a terminal status in every code path. + * Use false when the callback handles multiple outcomes (success, error, + * cancellation) and each path sets its own status. + */ + autoOkOnSuccess?: boolean; +} + /** * Run an async function within a new OTel span. * The span inherits the session root traceId when no parent span is active. @@ -115,15 +129,18 @@ function wrapSpanWithStatusTracking(span: Span): { * * If the callback sets a status explicitly (e.g. ERROR on a handled failure), * withSpan will not overwrite it. Only when no status has been set and the - * callback resolves without throwing will the span be marked OK. If the - * callback throws before setting status, the span is marked ERROR with a - * generic message so raw exception text is not exported to OTel backends. + * callback resolves without throwing will the span be marked OK (unless + * autoOkOnSuccess is false). If the callback throws before setting status, + * the span is marked ERROR with a generic message so raw exception text is + * not exported to OTel backends. */ export async function withSpan( name: string, attributes: Record, fn: (span: Span) => Promise, + options?: WithSpanOptions, ): Promise { + const autoOkOnSuccess = options?.autoOkOnSuccess ?? true; const parentCtx = getParentContext(); return tracer.startActiveSpan( name, @@ -133,7 +150,7 @@ export async function withSpan( const { wrappedSpan, wasStatusSet } = wrapSpanWithStatusTracking(span); try { const result = await fn(wrappedSpan); - if (!wasStatusSet()) { + if (autoOkOnSuccess && !wasStatusSet()) { safeSetStatus(span, { code: SpanStatusCode.OK }); } return result; @@ -194,6 +211,43 @@ export function startSpanWithContext( }; } +/** + * Determine whether the synthetic session root should force the SAMPLED flag. + * + * This function reads `OTEL_TRACES_SAMPLER` to infer the sampler type. + * If a sampler is configured programmatically (e.g. via NodeSDK constructor) + * without setting the env var, this heuristic will not detect it. + * + * parentbased_* samplers delegate to localParentNotSampled (default AlwaysOff) + * when the parent carries TraceFlags.NONE. Since our synthetic root is the + * parent of all session spans, it MUST carry SAMPLED for most parentbased_* + * samplers — otherwise zero traces are exported. + * + * Note: `parentbased_traceidratio` users expect probabilistic sampling, but + * because our synthetic root is always present with SAMPLED, the ratio sampler + * (only consulted for parentless root spans) is never invoked — they + * effectively get 100% sampling. This is intentional: the alternative + * (TraceFlags.NONE) would produce zero traces. + * + * Exception: `parentbased_always_off` explicitly wants no sampling. Forcing + * SAMPLED would cause ParentBasedSampler to delegate to localParentSampled + * (default AlwaysOn), sampling everything — the opposite of the user's intent. + * + * For non-parentbased samplers (e.g. `traceidratio`, `always_off`), each span + * is evaluated independently regardless of parent flags, so we use NONE to + * let the sampler decide. `always_on` is the exception — it ignores parent + * flags, so SAMPLED is harmless and keeps the decision matrix explicit. + */ +function shouldForceSampled(): boolean { + const sampler = + process.env['OTEL_TRACES_SAMPLER']?.trim().toLowerCase() ?? ''; + if (!sampler || sampler.startsWith('parentbased_')) { + if (sampler.includes('always_off')) return false; + return true; + } + return sampler === 'always_on'; +} + /** * Create a root context with a deterministic traceId derived from sessionId. * All spans created within this context will share the same traceId, @@ -205,7 +259,7 @@ export function createSessionRootContext(sessionId: string): Context { const rootSpan = trace.wrapSpanContext({ traceId, spanId, - traceFlags: TraceFlags.SAMPLED, + traceFlags: shouldForceSampled() ? TraceFlags.SAMPLED : TraceFlags.NONE, isRemote: false, }); return trace.setSpan(ROOT_CONTEXT, rootSpan); diff --git a/packages/core/src/utils/debugLogger.test.ts b/packages/core/src/utils/debugLogger.test.ts index ea82719dd..813d84a76 100644 --- a/packages/core/src/utils/debugLogger.test.ts +++ b/packages/core/src/utils/debugLogger.test.ts @@ -207,7 +207,7 @@ describe('debugLogger', () => { it('uses the session root span context for fallback trace context', async () => { const sessionRootContext = { root: true } as unknown as Context; - setSessionContext(sessionRootContext); + setSessionContext(sessionRootContext, 'test-session'); vi.mocked(trace.getSpan).mockImplementation((ctx) => ctx === sessionRootContext ? ({