airi/apps/server/instrumentation.ts
RainbowBird 272cdae03b
feat(server/otel): restructure observability metrics and add active sessions gauge
- Moved RateLimitMetrics import path to a more centralized location.
- Introduced a new file for active sessions gauge to track user sessions in the database.
- Updated index.ts to include new metrics and ensure proper initialization of observability metrics.
- Modified various routes and services to utilize the new observability structure.
- Added smoke tests for HTTP and WebSocket metrics to ensure proper metric registration and functionality.
- Enhanced error handling for metrics reading failures to improve observability.
2026-05-12 23:10:13 +08:00

176 lines
7.8 KiB
TypeScript

/**
* OpenTelemetry preload — single entry point for SDK setup.
*
* Loaded via `tsx --import ./instrumentation.ts`, runs BEFORE any application
* module is evaluated. By starting NodeSDK here:
* - require-in-the-middle hooks for http / pg / ioredis install before app
* code does `require('pg')` etc. (fixes the original commit-9451cd7c race).
* - The MeterProvider is real from the moment instrumentations construct, so
* `this._meter` is never NoopMeter — no setMeterProvider rebind dance.
*
* Trade-offs accepted:
* - Env vars are read directly from `process.env` (no valibot). The full
* business `Env` schema is parsed later in libs/env.ts; this preload only
* needs OTEL_* — and a config error here should crash early anyway.
* - `dotenvx run` injects .env.local before tsx, so process.env is fully
* populated by the time this file runs.
*
* Sources / why this shape:
* - https://opentelemetry.io/docs/languages/js/getting-started/nodejs/
* - https://github.com/open-telemetry/opentelemetry-js-contrib/blob/main/packages/auto-instrumentations-node/src/register.ts
* - https://github.com/open-telemetry/opentelemetry-js/issues/3146 (NodeSDK
* registers instrumentations early in start(), making single-file safe)
*/
import process, { env, exit } from 'node:process'
import { randomUUID } from 'node:crypto'
import { diag, DiagConsoleLogger, DiagLogLevel } from '@opentelemetry/api'
import { OTLPLogExporter } from '@opentelemetry/exporter-logs-otlp-proto'
import { OTLPMetricExporter } from '@opentelemetry/exporter-metrics-otlp-proto'
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-proto'
import { HttpInstrumentation } from '@opentelemetry/instrumentation-http'
import { IORedisInstrumentation } from '@opentelemetry/instrumentation-ioredis'
import { PgInstrumentation } from '@opentelemetry/instrumentation-pg'
import { RuntimeNodeInstrumentation } from '@opentelemetry/instrumentation-runtime-node'
import { resourceFromAttributes } from '@opentelemetry/resources'
import { BatchLogRecordProcessor } from '@opentelemetry/sdk-logs'
import { PeriodicExportingMetricReader } from '@opentelemetry/sdk-metrics'
import { NodeSDK } from '@opentelemetry/sdk-node'
import { BatchSpanProcessor, ParentBasedSampler, TraceIdRatioBasedSampler } from '@opentelemetry/sdk-trace-node'
import { ATTR_SERVICE_NAME, ATTR_SERVICE_VERSION } from '@opentelemetry/semantic-conventions'
// NOTICE:
// instrumentation-http >=0.215 emits the OLD HTTP semconv by default
// (http.server.duration, milliseconds), while our Grafana dashboards / alerts
// query only the STABLE metric (http.server.request.duration, seconds).
// The opt-in MUST be in place BEFORE the HttpInstrumentation constructor
// runs, because that constructor reads the env var once and caches it.
// `||=` (not `??=`) on purpose: an empty string from Railway must fall back
// to STABLE exactly like a missing variable does.
env.OTEL_SEMCONV_STABILITY_OPT_IN ||= 'http'
// Log the effective value right away so ops can grep Railway logs for
// `[otel-preload]` and confirm both that the preload actually ran and which
// semconv mode is active. A misloaded preload (wrong --import path, missing
// flag, build cache) would otherwise be invisible.
const semconvOptIn = env.OTEL_SEMCONV_STABILITY_OPT_IN
console.info(`[otel-preload] OTEL_SEMCONV_STABILITY_OPT_IN=${semconvOptIn}`)
/**
 * Normalizes the raw OTEL_EXPORTER_OTLP_ENDPOINT value.
 *
 * The per-signal exporter URLs in this file are built by appending
 * `/v1/<signal>` to the endpoint, so a trailing slash in the configured value
 * would produce `//v1/...`. Surrounding whitespace and trailing slashes are
 * stripped here; a value that ends up empty is reported as `undefined` so it
 * is treated the same as an unset variable (OpenTelemetry disabled).
 *
 * @param raw - Value read from the environment, possibly undefined or blank.
 * @returns The cleaned base URL, or `undefined` when nothing usable remains.
 */
const normalizeOtlpEndpoint = (raw: string | undefined): string | undefined => {
  const cleaned = raw?.trim().replace(/\/+$/, '')
  return cleaned || undefined
}
const otlpEndpoint = normalizeOtlpEndpoint(env.OTEL_EXPORTER_OTLP_ENDPOINT)
if (!otlpEndpoint) {
  console.info('[otel-preload] OpenTelemetry disabled (set OTEL_EXPORTER_OTLP_ENDPOINT to enable)')
}
else {
// Verbose SDK diagnostics are opt-in: only the exact string 'true' turns
// them on, in which case OpenTelemetry's diag output goes to the console at
// DEBUG level.
const debugRequested = env.OTEL_DEBUG === 'true'
if (debugRequested) {
  diag.setLogger(new DiagConsoleLogger(), DiagLogLevel.DEBUG)
}
// OTEL_EXPORTER_OTLP_HEADERS format: "key=value,key2=value2"
// Each comma-separated entry is split at its FIRST '=' so values may contain
// '=' themselves. Entries with no '=' or with nothing before it are skipped;
// a later duplicate key overwrites an earlier one.
const headers: Record<string, string> = {}
const rawHeaderList = env.OTEL_EXPORTER_OTLP_HEADERS ?? ''
rawHeaderList.split(',').forEach((entry) => {
  const separatorAt = entry.indexOf('=')
  if (separatorAt <= 0)
    return
  const headerName = entry.slice(0, separatorAt).trim()
  headers[headerName] = entry.slice(separatorAt + 1).trim()
})
// Empty strings fall back to the defaults, same as unset variables.
const serviceName = env.OTEL_SERVICE_NAME || 'server'
const serviceNamespace = env.OTEL_SERVICE_NAMESPACE || 'airi'
/**
 * Resolves the head-based trace sampling ratio from its raw env value.
 *
 * A missing, empty or whitespace-only value means "not configured" and
 * yields 1. This has to be checked before the numeric conversion because
 * `Number('')` is 0, which would otherwise silently turn an empty variable
 * into "sample nothing". Values that are not finite numbers within [0, 1]
 * also fall back to 1.
 *
 * @param raw - Value of OTEL_TRACES_SAMPLING_RATIO, possibly undefined.
 * @returns A ratio in the closed range [0, 1].
 */
const resolveSamplingRatio = (raw: string | undefined): number => {
  const text = raw?.trim()
  if (!text)
    return 1
  const ratio = Number(text)
  return Number.isFinite(ratio) && ratio >= 0 && ratio <= 1 ? ratio : 1
}
// Head-based sampling. Metrics are always 100% accurate regardless.
const samplingRatio = resolveSamplingRatio(env.OTEL_TRACES_SAMPLING_RATIO)
// service.instance.id MUST differ between replicas. If two replicas share
// the same (service_name, deployment_environment) label tuple, an OTel
// collector / Prometheus receiving both can drop one sample as a
// "duplicate timestamp" or collapse the series outright, so per-replica
// `sum()` aggregates undercount.
//
// Sources, strongest → weakest:
//   1. RAILWAY_REPLICA_ID — Railway-managed, guaranteed unique per replica.
//   2. SERVER_INSTANCE_ID — operator-supplied override.
//   3. randomUUID()       — per-process fallback. A warning is logged so ops
//      know the value does not survive restarts (i.e. metric series
//      cardinality climbs every deploy until staleness evicts old ids).
//
// HOSTNAME used to be a fallback, but Railway's HOSTNAME semantics aren't
// documented as per-replica unique, so it is no longer trusted. To pin the
// instance id to something stable across restarts, set SERVER_INSTANCE_ID
// explicitly.
const instanceId = env.RAILWAY_REPLICA_ID || env.SERVER_INSTANCE_ID || (() => {
  const generatedId = randomUUID()
  console.warn(`[otel-preload] No RAILWAY_REPLICA_ID or SERVER_INSTANCE_ID set — falling back to randomUUID() ${generatedId}. Multi-replica metric series will churn on every restart.`)
  return generatedId
})()
// Resource attributes handed to the SDK. Version and environment fall back
// to placeholders when the corresponding env vars are missing or empty.
const packageVersion = env.npm_package_version || '0.0.0'
const deploymentEnvironment = env.NODE_ENV || 'development'
const resourceAttributes = {
  [ATTR_SERVICE_NAME]: serviceName,
  [ATTR_SERVICE_VERSION]: packageVersion,
  'service.namespace': serviceNamespace,
  'service.instance.id': instanceId,
  'deployment.environment': deploymentEnvironment,
}
const resource = resourceFromAttributes(resourceAttributes)
// Build each pipeline piece as a named value (in the same order the SDK
// options list them), then assemble and start the SDK.

// Traces: parent-based sampling, with the configured ratio applied to roots.
const rootSampler = new TraceIdRatioBasedSampler(samplingRatio)
const sampler = new ParentBasedSampler({ root: rootSampler })
const traceExporter = new OTLPTraceExporter({
  url: `${otlpEndpoint}/v1/traces`,
  headers,
})
const spanProcessor = new BatchSpanProcessor(traceExporter)

// Metrics: periodic export every 15 s, each export capped at 10 s.
const metricExporter = new OTLPMetricExporter({
  url: `${otlpEndpoint}/v1/metrics`,
  headers,
})
const metricReader = new PeriodicExportingMetricReader({
  exporter: metricExporter,
  exportIntervalMillis: 15_000,
  exportTimeoutMillis: 10_000,
})

// Logs: batched export.
const logExporter = new OTLPLogExporter({
  url: `${otlpEndpoint}/v1/logs`,
  headers,
})
const logProcessor = new BatchLogRecordProcessor(logExporter)

const instrumentations = [
  // Inbound HTTP is instrumented by @hono/otel inside the Hono pipeline
  // (it sees Hono's matched route pattern; auto-instrumentation can't).
  // HttpInstrumentation stays for OUTBOUND traces only — LLM gateway,
  // Stripe, Resend, OIDC discovery — so egress still produces spans.
  new HttpInstrumentation({
    ignoreIncomingRequestHook: () => true,
  }),
  new PgInstrumentation({
    enhancedDatabaseReporting: true,
  }),
  new IORedisInstrumentation(),
  new RuntimeNodeInstrumentation(),
]

const sdk = new NodeSDK({
  resource,
  sampler,
  spanProcessors: [spanProcessor],
  metricReaders: [metricReader],
  logRecordProcessors: [logProcessor],
  instrumentations,
})
sdk.start()
console.info(`[otel-preload] OpenTelemetry initialized, exporting to ${otlpEndpoint}, sampling ratio: ${samplingRatio}`)
// Graceful shutdown — flush pending exports before exit. Idempotent: the
// first call starts `sdk.shutdown()` and every later call gets that same
// in-flight promise back. (A plain boolean guard would resolve repeat calls
// immediately, letting a second signal reach `exit(0)` while the first flush
// is still running and cutting the export short.)
let pendingShutdown: Promise<void> | undefined
/**
 * Shuts the OpenTelemetry SDK down at most once.
 *
 * @returns A promise that settles when the single underlying
 *   `sdk.shutdown()` attempt has finished. It never rejects: a failure is
 *   logged and swallowed so callers can always proceed to exit.
 */
const shutdown = (): Promise<void> => {
  pendingShutdown ??= (async () => {
    try {
      await sdk.shutdown()
      console.info('[otel-preload] OpenTelemetry shut down successfully')
    }
    catch (err) {
      // Best effort: a failed flush must not block process termination.
      console.error('[otel-preload] Error shutting down OpenTelemetry:', err)
    }
  })()
  return pendingShutdown
}
// Signal handler: wait for the (shared) shutdown to finish, then exit.
const shutdownAndExit = () => {
  void shutdown().then(() => exit(0))
}
process.on('SIGTERM', shutdownAndExit)
process.on('SIGINT', shutdownAndExit)
}