mirror of
https://github.com/QwenLM/qwen-code.git
synced 2026-05-19 07:54:38 +00:00
* fix(telemetry): address PR #3847 review follow-ups for trace correlation Addresses unresolved review feedback from PR #3847: - Respect OTEL_TRACES_SAMPLER env var when setting TraceFlags on the synthetic session root, so custom samplers (e.g. traceidratio) are not bypassed by forced SAMPLED flag - Store current session ID in session-context.ts and use it as a fallback in LogToSpanProcessor when the OTel Resource session.id attribute is stale after /clear or /resume - Wrap for-await loop body in runInSpan() so debug logs emitted during stream iteration see the stream span as active - Add autoOkOnSuccess option to withSpan, eliminating the need for the load-bearing setStatus(UNSET) hack in cancellation paths - Add defensive 5-minute timeout for stream spans to prevent leaks from abandoned generators 🤖 Generated with [Qwen Code](https://github.com/QwenLM/qwen-code) * fix(telemetry): address review issues in PR #4058 - Fix sdk.test.ts assertion to match new two-arg setSessionContext call - Add spanEnded guard to prevent double span.end() when timeout fires - Add .trim() to OTEL_TRACES_SAMPLER env var for robustness - Pass sessionId in debugLogger.test.ts for signature completeness - Clarify log-to-span-processor comment: fallback covers "missing" not "stale" Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix(telemetry): adopt review feedback for sampler and idle timeout - Fix shouldForceSampled to force SAMPLED for all parentbased_* samplers, not just parentbased_always_on — parentbased samplers delegate to localParentNotSampled (default AlwaysOff) when parent has NONE, which silently drops all traces - Convert stream span timeout from fixed wall-clock to idle timeout: reset timer on each chunk so legitimately long streams are never affected; timeout only fires when no chunks arrive for 5 minutes - Add test for parentbased_traceidratio → TraceFlags.SAMPLED * fix(telemetry): guard resetSpanTimeout against already-ended span Prevent zombie timer accumulation when chunks arrive after idle timeout has already ended the span. * fix(telemetry): handle parentbased_always_off sampler and fill test gaps - shouldForceSampled() now returns false for parentbased_always_off, preventing silent over-sampling that contradicts user intent. - Updated JSDoc to document the always_on exception for non-parentbased samplers. - Added test for parentbased_always_off → TraceFlags.NONE. - Added test for success path resilience when safeSetStatus throws in coreToolScheduler. * fix(telemetry): harden span timeout ordering and session ID fallback - Swap spanEnded/span.end() order so finally block can retry if end() throws. - Clear idle timeout immediately after for-await loop exits, before post-loop processing. - Use || instead of ?? for session.id fallback to handle empty strings. * fix(telemetry): address review round 4 — docs, cleanup, and test gaps - Expand shouldForceSampled JSDoc: document env-var assumption, parentbased_traceidratio 100% sampling semantics. - Remove unnecessary runInSpan wrapper around chunk field assignments. - Add stream.timed_out span attribute when idle timeout fires. - Deduplicate config.getSessionId() call in initializeTelemetry. - Add tests for always_on and always_off sampler values. * fix(telemetry): harden timeout callback ordering and add || comment - Move safeSetStatus before setAttribute in timeout callback so ERROR status is set even if setAttribute throws. - Set spanEnded=true before span.end() so finally block never overwrites ERROR with OK if end() throws. - Add comment explaining || vs ?? choice for session.id fallback. --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
97ac766405
commit
4bb8dc894a
12 changed files with 407 additions and 38 deletions
|
|
@ -88,7 +88,9 @@ vi.mock('../telemetry/tracer.js', () => ({
|
|||
setAttribute: (key: string, value: string | number | boolean) => void;
|
||||
end: () => void;
|
||||
}) => Promise<unknown>,
|
||||
options?: { autoOkOnSuccess?: boolean },
|
||||
) => {
|
||||
const autoOkOnSuccess = options?.autoOkOnSuccess ?? true;
|
||||
const record: ToolSpanRecord = {
|
||||
name,
|
||||
attributes,
|
||||
|
|
@ -119,7 +121,7 @@ vi.mock('../telemetry/tracer.js', () => ({
|
|||
|
||||
try {
|
||||
const result = await fn(span);
|
||||
if (!statusSet) {
|
||||
if (autoOkOnSuccess && !statusSet) {
|
||||
record.statusCalls.push({ code: 1 });
|
||||
}
|
||||
return result;
|
||||
|
|
@ -3273,7 +3275,7 @@ describe('CoreToolScheduler telemetry spans', () => {
|
|||
);
|
||||
});
|
||||
|
||||
it('marks cancellation as UNSET with a failure kind and no auto-OK', async () => {
|
||||
it('leaves cancellation spans with no explicit status (autoOkOnSuccess: false)', async () => {
|
||||
const abortController = new AbortController();
|
||||
const { spanRecord, completedCalls } = await runSingleTool({
|
||||
abortController,
|
||||
|
|
@ -3287,12 +3289,14 @@ describe('CoreToolScheduler telemetry spans', () => {
|
|||
});
|
||||
|
||||
expect(completedCalls[0].status).toBe('cancelled');
|
||||
expect(spanRecord.statusCalls).toEqual([{ code: SpanStatusCode.UNSET }]);
|
||||
// autoOkOnSuccess: false prevents withSpan from auto-setting OK;
|
||||
// setToolSpanCancelled only sets the failure_kind attribute, not a status.
|
||||
expect(spanRecord.statusCalls).toEqual([]);
|
||||
expect(spanRecord.spanAttributes['tool.failure_kind']).toBe('cancelled');
|
||||
expect(spanRecord.ended).toBe(true);
|
||||
});
|
||||
|
||||
it('sets cancellation status when span attribute recording fails', async () => {
|
||||
it('sets cancellation attribute even when span attribute recording fails', async () => {
|
||||
const abortController = new AbortController();
|
||||
const { spanRecord, completedCalls } = await runSingleTool({
|
||||
abortController,
|
||||
|
|
@ -3307,7 +3311,9 @@ describe('CoreToolScheduler telemetry spans', () => {
|
|||
});
|
||||
|
||||
expect(completedCalls[0].status).toBe('cancelled');
|
||||
expect(spanRecord.statusCalls).toEqual([{ code: SpanStatusCode.UNSET }]);
|
||||
// No status set — autoOkOnSuccess: false, and setToolSpanCancelled
|
||||
// only sets the attribute (which fails here, caught internally).
|
||||
expect(spanRecord.statusCalls).toEqual([]);
|
||||
expect(spanRecord.spanAttributes).not.toHaveProperty('tool.failure_kind');
|
||||
expect(spanRecord.ended).toBe(true);
|
||||
});
|
||||
|
|
@ -3327,11 +3333,25 @@ describe('CoreToolScheduler telemetry spans', () => {
|
|||
});
|
||||
|
||||
expect(completedCalls[0].status).toBe('cancelled');
|
||||
// setToolSpanCancelled no longer calls setStatus, so throwSpanSetStatus
|
||||
// only affects the safeSetStatus(span, OK) in the success path (not hit).
|
||||
// With autoOkOnSuccess: false, withSpan does not attempt setStatus either.
|
||||
expect(spanRecord.statusCalls).toEqual([]);
|
||||
expect(spanRecord.spanAttributes['tool.failure_kind']).toBe('cancelled');
|
||||
expect(spanRecord.ended).toBe(true);
|
||||
});
|
||||
|
||||
it('does not crash when safeSetStatus throws on the success path', async () => {
|
||||
const { spanRecord, completedCalls } = await runSingleTool({
|
||||
throwSpanSetStatus: true,
|
||||
});
|
||||
|
||||
expect(completedCalls[0].status).toBe('success');
|
||||
expect(spanRecord.statusCalls).toEqual([]);
|
||||
expect(spanRecord.spanAttributes).not.toHaveProperty('tool.failure_kind');
|
||||
expect(spanRecord.ended).toBe(true);
|
||||
});
|
||||
|
||||
it('leaves successful tool calls to be marked OK by withSpan', async () => {
|
||||
const { spanRecord, completedCalls } = await runSingleTool();
|
||||
|
||||
|
|
|
|||
|
|
@ -131,9 +131,9 @@ function setToolSpanCancelled(span: Span): void {
|
|||
} catch {
|
||||
// OTel errors must not block the cancellation status update.
|
||||
}
|
||||
safeSetStatus(span, {
|
||||
code: SpanStatusCode.UNSET,
|
||||
});
|
||||
// No explicit span status — cancellation is neither OK nor ERROR.
|
||||
// The caller uses withSpan({ autoOkOnSuccess: false }) so withSpan
|
||||
// will not auto-set OK, and the span ends with the default UNSET status.
|
||||
}
|
||||
|
||||
async function safelyFirePostToolUseFailureHook(
|
||||
|
|
@ -1993,7 +1993,6 @@ export class CoreToolScheduler {
|
|||
'User cancelled tool execution.',
|
||||
);
|
||||
}
|
||||
// Load-bearing: prevents withSpan from auto-setting OK on normal return
|
||||
setToolSpanCancelled(span);
|
||||
return; // Both code paths should return here
|
||||
}
|
||||
|
|
@ -2160,6 +2159,7 @@ export class CoreToolScheduler {
|
|||
: {}),
|
||||
};
|
||||
this.setStatusInternal(callId, 'success', successResponse);
|
||||
safeSetStatus(span, { code: SpanStatusCode.OK });
|
||||
} else {
|
||||
// It is a failure
|
||||
// PostToolUseFailure Hook
|
||||
|
|
@ -2226,7 +2226,6 @@ export class CoreToolScheduler {
|
|||
'User cancelled tool execution.',
|
||||
);
|
||||
}
|
||||
// Load-bearing: prevents withSpan from auto-setting OK on normal return
|
||||
setToolSpanCancelled(span);
|
||||
return;
|
||||
} else {
|
||||
|
|
@ -2267,6 +2266,7 @@ export class CoreToolScheduler {
|
|||
}
|
||||
}
|
||||
},
|
||||
{ autoOkOnSuccess: false },
|
||||
);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -403,6 +403,7 @@ export class LoggingContentGenerator implements ContentGenerator {
|
|||
let firstModelVersion = '';
|
||||
let lastUsageMetadata: GenerateContentResponseUsageMetadata | undefined;
|
||||
let terminalStatusAttempted = false;
|
||||
let spanEnded = false;
|
||||
|
||||
// Helper to run code within the span context during iteration.
|
||||
// This ensures debug log lines emitted during stream processing
|
||||
|
|
@ -410,6 +411,34 @@ export class LoggingContentGenerator implements ContentGenerator {
|
|||
const runInSpan = <T>(fn: () => T): T =>
|
||||
spanContext ? context.with(spanContext, fn) : fn();
|
||||
|
||||
// Idle timeout: if no chunks arrive for this duration the consumer has
|
||||
// likely abandoned the generator without calling .return(). Close the
|
||||
// span so it doesn't leak forever. The timer resets on every chunk,
|
||||
// so legitimately long-running streams are never affected.
|
||||
const STREAM_IDLE_TIMEOUT_MS = 5 * 60_000; // 5 minutes
|
||||
let spanEndTimeout: ReturnType<typeof setTimeout> | undefined;
|
||||
const resetSpanTimeout = span
|
||||
? () => {
|
||||
if (spanEnded) return;
|
||||
if (spanEndTimeout !== undefined) clearTimeout(spanEndTimeout);
|
||||
spanEndTimeout = setTimeout(() => {
|
||||
try {
|
||||
safeSetStatus(span, {
|
||||
code: SpanStatusCode.ERROR,
|
||||
message: 'Stream span timed out (idle)',
|
||||
});
|
||||
spanEnded = true;
|
||||
span.setAttribute('stream.timed_out', true);
|
||||
span.end();
|
||||
} catch {
|
||||
// OTel errors must not interrupt the consumer.
|
||||
}
|
||||
}, STREAM_IDLE_TIMEOUT_MS);
|
||||
spanEndTimeout.unref();
|
||||
}
|
||||
: undefined;
|
||||
resetSpanTimeout?.();
|
||||
|
||||
try {
|
||||
for await (const response of stream) {
|
||||
if (!firstResponseId && response.responseId) {
|
||||
|
|
@ -424,8 +453,13 @@ export class LoggingContentGenerator implements ContentGenerator {
|
|||
if (response.usageMetadata) {
|
||||
lastUsageMetadata = response.usageMetadata;
|
||||
}
|
||||
resetSpanTimeout?.();
|
||||
yield response;
|
||||
}
|
||||
if (spanEndTimeout !== undefined) {
|
||||
clearTimeout(spanEndTimeout);
|
||||
spanEndTimeout = undefined;
|
||||
}
|
||||
// Only log successful API response if no error occurred
|
||||
const durationMs = Date.now() - startTime;
|
||||
const consolidatedResponse = shouldCollectResponses
|
||||
|
|
@ -483,15 +517,20 @@ export class LoggingContentGenerator implements ContentGenerator {
|
|||
}
|
||||
throw error;
|
||||
} finally {
|
||||
if (!terminalStatusAttempted) {
|
||||
if (span) {
|
||||
safeSetStatus(span, { code: SpanStatusCode.OK });
|
||||
}
|
||||
if (spanEndTimeout !== undefined) {
|
||||
clearTimeout(spanEndTimeout);
|
||||
}
|
||||
try {
|
||||
span?.end();
|
||||
} catch {
|
||||
// OTel errors must not mask the original API error
|
||||
if (!spanEnded) {
|
||||
if (!terminalStatusAttempted) {
|
||||
if (span) {
|
||||
safeSetStatus(span, { code: SpanStatusCode.OK });
|
||||
}
|
||||
}
|
||||
try {
|
||||
span?.end();
|
||||
} catch {
|
||||
// OTel errors must not mask the original API error
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -16,6 +16,12 @@ import { LogToSpanProcessor } from './log-to-span-processor.js';
|
|||
import type { ReadableLogRecord } from '@opentelemetry/sdk-logs';
|
||||
import type { SpanExporter } from '@opentelemetry/sdk-trace-base';
|
||||
|
||||
let mockCurrentSessionId: string | undefined = undefined;
|
||||
|
||||
vi.mock('./session-context.js', () => ({
|
||||
getCurrentSessionId: () => mockCurrentSessionId,
|
||||
}));
|
||||
|
||||
interface ExportedSpan {
|
||||
name: string;
|
||||
kind: number;
|
||||
|
|
@ -34,6 +40,7 @@ describe('LogToSpanProcessor', () => {
|
|||
|
||||
beforeEach(() => {
|
||||
exportedSpans = [];
|
||||
mockCurrentSessionId = undefined;
|
||||
mockExporter = {
|
||||
export: vi.fn((spans, cb) => {
|
||||
exportedSpans.push(...spans);
|
||||
|
|
@ -708,4 +715,40 @@ describe('LogToSpanProcessor', () => {
|
|||
|
||||
expect(mockExporter.shutdown).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it('falls back to getCurrentSessionId when log record has no session.id', async () => {
|
||||
mockCurrentSessionId = 'session-from-context';
|
||||
const logRecord = {
|
||||
body: 'event without session attr',
|
||||
hrTime: [1000, 0] as [number, number],
|
||||
attributes: {},
|
||||
} as unknown as ReadableLogRecord;
|
||||
|
||||
processor.onEmit(logRecord);
|
||||
await processor.forceFlush();
|
||||
|
||||
// The traceId should be derived from the fallback session ID,
|
||||
// not a random one.
|
||||
const { deriveTraceId } = await import('./trace-id-utils.js');
|
||||
expect(exportedSpans[0].spanContext().traceId).toBe(
|
||||
deriveTraceId('session-from-context'),
|
||||
);
|
||||
});
|
||||
|
||||
it('prefers log record session.id over getCurrentSessionId', async () => {
|
||||
mockCurrentSessionId = 'stale-session';
|
||||
const logRecord = {
|
||||
body: 'event with session attr',
|
||||
hrTime: [1000, 0] as [number, number],
|
||||
attributes: { 'session.id': 'fresh-session' },
|
||||
} as unknown as ReadableLogRecord;
|
||||
|
||||
processor.onEmit(logRecord);
|
||||
await processor.forceFlush();
|
||||
|
||||
const { deriveTraceId } = await import('./trace-id-utils.js');
|
||||
expect(exportedSpans[0].spanContext().traceId).toBe(
|
||||
deriveTraceId('fresh-session'),
|
||||
);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ import {
|
|||
randomHexString,
|
||||
randomSpanId,
|
||||
} from './trace-id-utils.js';
|
||||
import { getCurrentSessionId } from './session-context.js';
|
||||
|
||||
const EXPORT_TIMEOUT_MS = 30_000;
|
||||
const DEFAULT_MAX_BUFFER_SIZE = 10_000;
|
||||
|
|
@ -156,9 +157,13 @@ export class LogToSpanProcessor implements LogRecordProcessor {
|
|||
|
||||
// Prefer a real active span context when OTel logs provide one, preserving
|
||||
// direct parentage. Otherwise derive traceId from session.id so all events
|
||||
// in one session appear under a single trace.
|
||||
// in one session appear under a single trace. Fall back to
|
||||
// getCurrentSessionId() when the log record has no session.id attribute
|
||||
// (e.g. after a session change via /clear or /resume).
|
||||
const parentSpanContext = getValidParentSpanContext(logRecord.spanContext);
|
||||
const sessionId = logRecord.attributes?.['session.id'];
|
||||
// || (not ??) so empty-string session.id also falls through to the fallback
|
||||
const sessionId =
|
||||
logRecord.attributes?.['session.id'] || getCurrentSessionId();
|
||||
let traceId: string;
|
||||
if (parentSpanContext) {
|
||||
traceId = parentSpanContext.traceId;
|
||||
|
|
|
|||
|
|
@ -544,9 +544,10 @@ describe('refreshSessionContext', () => {
|
|||
refreshSessionContext('new-session-id');
|
||||
|
||||
expect(createSessionRootContext).toHaveBeenCalledWith('new-session-id');
|
||||
expect(setSessionContext).toHaveBeenCalledWith({
|
||||
__sessionId: 'new-session-id',
|
||||
});
|
||||
expect(setSessionContext).toHaveBeenCalledWith(
|
||||
{ __sessionId: 'new-session-id' },
|
||||
'new-session-id',
|
||||
);
|
||||
});
|
||||
|
||||
it('should be a no-op when telemetry is not initialized', () => {
|
||||
|
|
|
|||
|
|
@ -290,7 +290,8 @@ export function initializeTelemetry(config: Config): void {
|
|||
sdk.start();
|
||||
debugLogger.debug('OpenTelemetry SDK started successfully.');
|
||||
telemetryInitialized = true;
|
||||
setSessionContext(createSessionRootContext(config.getSessionId()));
|
||||
const sessionId = config.getSessionId();
|
||||
setSessionContext(createSessionRootContext(sessionId), sessionId);
|
||||
initializeMetrics(config);
|
||||
} catch (error) {
|
||||
debugLogger.error('Error starting OpenTelemetry SDK:', error);
|
||||
|
|
@ -305,7 +306,7 @@ export function initializeTelemetry(config: Config): void {
|
|||
export function refreshSessionContext(sessionId: string): void {
|
||||
if (!telemetryInitialized) return;
|
||||
try {
|
||||
setSessionContext(createSessionRootContext(sessionId));
|
||||
setSessionContext(createSessionRootContext(sessionId), sessionId);
|
||||
} catch (error) {
|
||||
createDebugLogger('OTEL').warn('Failed to refresh session context:', error);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,7 +6,11 @@
|
|||
|
||||
import { afterEach, describe, expect, it } from 'vitest';
|
||||
import { ROOT_CONTEXT, createContextKey } from '@opentelemetry/api';
|
||||
import { getSessionContext, setSessionContext } from './session-context.js';
|
||||
import {
|
||||
getSessionContext,
|
||||
setSessionContext,
|
||||
getCurrentSessionId,
|
||||
} from './session-context.js';
|
||||
|
||||
describe('session-context', () => {
|
||||
afterEach(() => {
|
||||
|
|
@ -42,3 +46,36 @@ describe('session-context', () => {
|
|||
expect(getSessionContext()).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
||||
describe('getCurrentSessionId', () => {
|
||||
afterEach(() => {
|
||||
setSessionContext(undefined);
|
||||
});
|
||||
|
||||
it('returns undefined when no session has been set', () => {
|
||||
expect(getCurrentSessionId()).toBeUndefined();
|
||||
});
|
||||
|
||||
it('returns the session ID passed to setSessionContext', () => {
|
||||
const key = createContextKey('sid-test-key');
|
||||
setSessionContext(ROOT_CONTEXT.setValue(key, 'ctx'), 'session-abc');
|
||||
|
||||
expect(getCurrentSessionId()).toBe('session-abc');
|
||||
});
|
||||
|
||||
it('updates when setSessionContext is called with a new session ID', () => {
|
||||
const key = createContextKey('sid-update-key');
|
||||
setSessionContext(ROOT_CONTEXT.setValue(key, 'first'), 'session-1');
|
||||
setSessionContext(ROOT_CONTEXT.setValue(key, 'second'), 'session-2');
|
||||
|
||||
expect(getCurrentSessionId()).toBe('session-2');
|
||||
});
|
||||
|
||||
it('clears when setSessionContext is called without a session ID', () => {
|
||||
const key = createContextKey('sid-clear-key');
|
||||
setSessionContext(ROOT_CONTEXT.setValue(key, 'ctx'), 'session-xyz');
|
||||
setSessionContext(undefined);
|
||||
|
||||
expect(getCurrentSessionId()).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -7,11 +7,25 @@
|
|||
import type { Context } from '@opentelemetry/api';
|
||||
|
||||
let sessionRootContext: Context | undefined;
|
||||
let currentSessionId: string | undefined;
|
||||
|
||||
export function setSessionContext(ctx: Context | undefined): void {
|
||||
export function setSessionContext(
|
||||
ctx: Context | undefined,
|
||||
sessionId?: string,
|
||||
): void {
|
||||
sessionRootContext = ctx;
|
||||
currentSessionId = sessionId;
|
||||
}
|
||||
|
||||
export function getSessionContext(): Context | undefined {
|
||||
return sessionRootContext;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the most recent session ID passed to setSessionContext.
|
||||
* Used by LogToSpanProcessor as a fallback to derive the correct traceId
|
||||
* when a log record has no session.id attribute (e.g. after /clear or /resume).
|
||||
*/
|
||||
export function getCurrentSessionId(): string | undefined {
|
||||
return currentSessionId;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -303,6 +303,70 @@ describe('withSpan', () => {
|
|||
|
||||
expect(spans[0].attributes).toEqual({ tool_name: 'read', call_id: '123' });
|
||||
});
|
||||
|
||||
describe('autoOkOnSuccess option', () => {
|
||||
it('does not auto-set OK when autoOkOnSuccess is false and callback resolves', async () => {
|
||||
await withSpan('test.no-auto-ok', {}, async () => 42, {
|
||||
autoOkOnSuccess: false,
|
||||
});
|
||||
|
||||
expect(spans).toHaveLength(1);
|
||||
expect(spans[0].statuses).toEqual([]);
|
||||
expect(spans[0].ended).toBe(true);
|
||||
});
|
||||
|
||||
it('still sets ERROR when callback throws and autoOkOnSuccess is false', async () => {
|
||||
await expect(
|
||||
withSpan(
|
||||
'test.throw-no-auto',
|
||||
{},
|
||||
async () => {
|
||||
throw new Error('fail');
|
||||
},
|
||||
{ autoOkOnSuccess: false },
|
||||
),
|
||||
).rejects.toThrow('fail');
|
||||
|
||||
expect(spans).toHaveLength(1);
|
||||
expect(spans[0].statuses).toEqual([
|
||||
{ code: SpanStatusCode.ERROR, message: 'Operation failed' },
|
||||
]);
|
||||
});
|
||||
|
||||
it('preserves caller-set ERROR with autoOkOnSuccess false', async () => {
|
||||
await withSpan(
|
||||
'test.error-no-auto',
|
||||
{},
|
||||
async (span) => {
|
||||
span.setStatus({
|
||||
code: SpanStatusCode.ERROR,
|
||||
message: 'hook denied',
|
||||
});
|
||||
},
|
||||
{ autoOkOnSuccess: false },
|
||||
);
|
||||
|
||||
expect(spans).toHaveLength(1);
|
||||
expect(spans[0].statuses).toEqual([
|
||||
{ code: SpanStatusCode.ERROR, message: 'hook denied' },
|
||||
]);
|
||||
});
|
||||
|
||||
it('allows caller to set OK explicitly with autoOkOnSuccess false', async () => {
|
||||
await withSpan(
|
||||
'test.explicit-ok',
|
||||
{},
|
||||
async (span) => {
|
||||
span.setStatus({ code: SpanStatusCode.OK });
|
||||
return 'done';
|
||||
},
|
||||
{ autoOkOnSuccess: false },
|
||||
);
|
||||
|
||||
expect(spans).toHaveLength(1);
|
||||
expect(spans[0].statuses).toEqual([{ code: SpanStatusCode.OK }]);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('startSpanWithContext', () => {
|
||||
|
|
@ -333,11 +397,102 @@ describe('createSessionRootContext', () => {
|
|||
expect(ctx.traceId).toBe(deriveTraceId('session-123'));
|
||||
});
|
||||
|
||||
it('uses TraceFlags.SAMPLED so children are recorded by default', () => {
|
||||
const ctx = createSessionRootContext('session-123') as unknown as {
|
||||
traceFlags: number;
|
||||
};
|
||||
expect(ctx.traceFlags).toBe(TraceFlags.SAMPLED);
|
||||
it('uses TraceFlags.SAMPLED by default (no OTEL_TRACES_SAMPLER)', () => {
|
||||
const original = process.env['OTEL_TRACES_SAMPLER'];
|
||||
delete process.env['OTEL_TRACES_SAMPLER'];
|
||||
try {
|
||||
const ctx = createSessionRootContext('session-123') as unknown as {
|
||||
traceFlags: number;
|
||||
};
|
||||
expect(ctx.traceFlags).toBe(TraceFlags.SAMPLED);
|
||||
} finally {
|
||||
if (original !== undefined) process.env['OTEL_TRACES_SAMPLER'] = original;
|
||||
else delete process.env['OTEL_TRACES_SAMPLER'];
|
||||
}
|
||||
});
|
||||
|
||||
it('uses TraceFlags.NONE when a custom sampler is configured', () => {
|
||||
const original = process.env['OTEL_TRACES_SAMPLER'];
|
||||
process.env['OTEL_TRACES_SAMPLER'] = 'traceidratio';
|
||||
try {
|
||||
const ctx = createSessionRootContext('session-456') as unknown as {
|
||||
traceFlags: number;
|
||||
};
|
||||
expect(ctx.traceFlags).toBe(TraceFlags.NONE);
|
||||
} finally {
|
||||
if (original !== undefined) process.env['OTEL_TRACES_SAMPLER'] = original;
|
||||
else delete process.env['OTEL_TRACES_SAMPLER'];
|
||||
}
|
||||
});
|
||||
|
||||
it('uses TraceFlags.SAMPLED when OTEL_TRACES_SAMPLER=always_on', () => {
|
||||
const original = process.env['OTEL_TRACES_SAMPLER'];
|
||||
process.env['OTEL_TRACES_SAMPLER'] = 'always_on';
|
||||
try {
|
||||
const ctx = createSessionRootContext('session-ao') as unknown as {
|
||||
traceFlags: number;
|
||||
};
|
||||
expect(ctx.traceFlags).toBe(TraceFlags.SAMPLED);
|
||||
} finally {
|
||||
if (original !== undefined) process.env['OTEL_TRACES_SAMPLER'] = original;
|
||||
else delete process.env['OTEL_TRACES_SAMPLER'];
|
||||
}
|
||||
});
|
||||
|
||||
it('uses TraceFlags.NONE when OTEL_TRACES_SAMPLER=always_off', () => {
|
||||
const original = process.env['OTEL_TRACES_SAMPLER'];
|
||||
process.env['OTEL_TRACES_SAMPLER'] = 'always_off';
|
||||
try {
|
||||
const ctx = createSessionRootContext('session-aoff') as unknown as {
|
||||
traceFlags: number;
|
||||
};
|
||||
expect(ctx.traceFlags).toBe(TraceFlags.NONE);
|
||||
} finally {
|
||||
if (original !== undefined) process.env['OTEL_TRACES_SAMPLER'] = original;
|
||||
else delete process.env['OTEL_TRACES_SAMPLER'];
|
||||
}
|
||||
});
|
||||
|
||||
it('uses TraceFlags.SAMPLED when OTEL_TRACES_SAMPLER=parentbased_always_on', () => {
|
||||
const original = process.env['OTEL_TRACES_SAMPLER'];
|
||||
process.env['OTEL_TRACES_SAMPLER'] = 'parentbased_always_on';
|
||||
try {
|
||||
const ctx = createSessionRootContext('session-789') as unknown as {
|
||||
traceFlags: number;
|
||||
};
|
||||
expect(ctx.traceFlags).toBe(TraceFlags.SAMPLED);
|
||||
} finally {
|
||||
if (original !== undefined) process.env['OTEL_TRACES_SAMPLER'] = original;
|
||||
else delete process.env['OTEL_TRACES_SAMPLER'];
|
||||
}
|
||||
});
|
||||
|
||||
it('uses TraceFlags.NONE when OTEL_TRACES_SAMPLER=parentbased_always_off', () => {
|
||||
const original = process.env['OTEL_TRACES_SAMPLER'];
|
||||
process.env['OTEL_TRACES_SAMPLER'] = 'parentbased_always_off';
|
||||
try {
|
||||
const ctx = createSessionRootContext('session-off') as unknown as {
|
||||
traceFlags: number;
|
||||
};
|
||||
expect(ctx.traceFlags).toBe(TraceFlags.NONE);
|
||||
} finally {
|
||||
if (original !== undefined) process.env['OTEL_TRACES_SAMPLER'] = original;
|
||||
else delete process.env['OTEL_TRACES_SAMPLER'];
|
||||
}
|
||||
});
|
||||
|
||||
it('uses TraceFlags.SAMPLED for parentbased_traceidratio (parent flag gates children)', () => {
|
||||
const original = process.env['OTEL_TRACES_SAMPLER'];
|
||||
process.env['OTEL_TRACES_SAMPLER'] = 'parentbased_traceidratio';
|
||||
try {
|
||||
const ctx = createSessionRootContext('session-pb-ratio') as unknown as {
|
||||
traceFlags: number;
|
||||
};
|
||||
expect(ctx.traceFlags).toBe(TraceFlags.SAMPLED);
|
||||
} finally {
|
||||
if (original !== undefined) process.env['OTEL_TRACES_SAMPLER'] = original;
|
||||
else delete process.env['OTEL_TRACES_SAMPLER'];
|
||||
}
|
||||
});
|
||||
|
||||
it('generates a valid 16-char hex spanId', () => {
|
||||
|
|
|
|||
|
|
@ -108,6 +108,20 @@ function wrapSpanWithStatusTracking(span: Span): {
|
|||
return { wrappedSpan, wasStatusSet: () => statusSet };
|
||||
}
|
||||
|
||||
/**
|
||||
* Options for {@link withSpan}.
|
||||
*/
|
||||
export interface WithSpanOptions {
|
||||
/**
|
||||
* When true (default), withSpan automatically sets OK status if the
|
||||
* callback resolves without having set a status. When false, the caller
|
||||
* is responsible for setting a terminal status in every code path.
|
||||
* Use false when the callback handles multiple outcomes (success, error,
|
||||
* cancellation) and each path sets its own status.
|
||||
*/
|
||||
autoOkOnSuccess?: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* Run an async function within a new OTel span.
|
||||
* The span inherits the session root traceId when no parent span is active.
|
||||
|
|
@ -115,15 +129,18 @@ function wrapSpanWithStatusTracking(span: Span): {
|
|||
*
|
||||
* If the callback sets a status explicitly (e.g. ERROR on a handled failure),
|
||||
* withSpan will not overwrite it. Only when no status has been set and the
|
||||
* callback resolves without throwing will the span be marked OK. If the
|
||||
* callback throws before setting status, the span is marked ERROR with a
|
||||
* generic message so raw exception text is not exported to OTel backends.
|
||||
* callback resolves without throwing will the span be marked OK (unless
|
||||
* autoOkOnSuccess is false). If the callback throws before setting status,
|
||||
* the span is marked ERROR with a generic message so raw exception text is
|
||||
* not exported to OTel backends.
|
||||
*/
|
||||
export async function withSpan<T>(
|
||||
name: string,
|
||||
attributes: Record<string, string | number | boolean>,
|
||||
fn: (span: Span) => Promise<T>,
|
||||
options?: WithSpanOptions,
|
||||
): Promise<T> {
|
||||
const autoOkOnSuccess = options?.autoOkOnSuccess ?? true;
|
||||
const parentCtx = getParentContext();
|
||||
return tracer.startActiveSpan(
|
||||
name,
|
||||
|
|
@ -133,7 +150,7 @@ export async function withSpan<T>(
|
|||
const { wrappedSpan, wasStatusSet } = wrapSpanWithStatusTracking(span);
|
||||
try {
|
||||
const result = await fn(wrappedSpan);
|
||||
if (!wasStatusSet()) {
|
||||
if (autoOkOnSuccess && !wasStatusSet()) {
|
||||
safeSetStatus(span, { code: SpanStatusCode.OK });
|
||||
}
|
||||
return result;
|
||||
|
|
@ -194,6 +211,43 @@ export function startSpanWithContext(
|
|||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine whether the synthetic session root should force the SAMPLED flag.
|
||||
*
|
||||
* This function reads `OTEL_TRACES_SAMPLER` to infer the sampler type.
|
||||
* If a sampler is configured programmatically (e.g. via NodeSDK constructor)
|
||||
* without setting the env var, this heuristic will not detect it.
|
||||
*
|
||||
* parentbased_* samplers delegate to localParentNotSampled (default AlwaysOff)
|
||||
* when the parent carries TraceFlags.NONE. Since our synthetic root is the
|
||||
* parent of all session spans, it MUST carry SAMPLED for most parentbased_*
|
||||
* samplers — otherwise zero traces are exported.
|
||||
*
|
||||
* Note: `parentbased_traceidratio` users expect probabilistic sampling, but
|
||||
* because our synthetic root is always present with SAMPLED, the ratio sampler
|
||||
* (only consulted for parentless root spans) is never invoked — they
|
||||
* effectively get 100% sampling. This is intentional: the alternative
|
||||
* (TraceFlags.NONE) would produce zero traces.
|
||||
*
|
||||
* Exception: `parentbased_always_off` explicitly wants no sampling. Forcing
|
||||
* SAMPLED would cause ParentBasedSampler to delegate to localParentSampled
|
||||
* (default AlwaysOn), sampling everything — the opposite of the user's intent.
|
||||
*
|
||||
* For non-parentbased samplers (e.g. `traceidratio`, `always_off`), each span
|
||||
* is evaluated independently regardless of parent flags, so we use NONE to
|
||||
* let the sampler decide. `always_on` is the exception — it ignores parent
|
||||
* flags, so SAMPLED is harmless and keeps the decision matrix explicit.
|
||||
*/
|
||||
function shouldForceSampled(): boolean {
|
||||
const sampler =
|
||||
process.env['OTEL_TRACES_SAMPLER']?.trim().toLowerCase() ?? '';
|
||||
if (!sampler || sampler.startsWith('parentbased_')) {
|
||||
if (sampler.includes('always_off')) return false;
|
||||
return true;
|
||||
}
|
||||
return sampler === 'always_on';
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a root context with a deterministic traceId derived from sessionId.
|
||||
* All spans created within this context will share the same traceId,
|
||||
|
|
@ -205,7 +259,7 @@ export function createSessionRootContext(sessionId: string): Context {
|
|||
const rootSpan = trace.wrapSpanContext({
|
||||
traceId,
|
||||
spanId,
|
||||
traceFlags: TraceFlags.SAMPLED,
|
||||
traceFlags: shouldForceSampled() ? TraceFlags.SAMPLED : TraceFlags.NONE,
|
||||
isRemote: false,
|
||||
});
|
||||
return trace.setSpan(ROOT_CONTEXT, rootSpan);
|
||||
|
|
|
|||
|
|
@ -207,7 +207,7 @@ describe('debugLogger', () => {
|
|||
|
||||
it('uses the session root span context for fallback trace context', async () => {
|
||||
const sessionRootContext = { root: true } as unknown as Context;
|
||||
setSessionContext(sessionRootContext);
|
||||
setSessionContext(sessionRootContext, 'test-session');
|
||||
vi.mocked(trace.getSpan).mockImplementation((ctx) =>
|
||||
ctx === sessionRootContext
|
||||
? ({
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue