qwen-code/scripts/benchmark-api-latency.mjs
jinye 3b0b6c052b
feat(cli): add API preconnect to reduce first-call latency (#3318)
Fire a fire-and-forget HEAD request early in startup to warm the TCP+TLS connection. Subsequent SDK calls share an undici dispatcher with preconnect, reusing the warmed connection to save 100-200ms on the first request.

Skip conditions:
- NODE_EXTRA_CA_CERTS set (enterprise TLS inspection)
- Sandbox mode (process-restart context)
- Non-default baseUrl (mTLS / private deployment)
- Non-Node runtimes (Bun)

Disable via QWEN_CODE_DISABLE_PRECONNECT=1.

Closes #3223
2026-04-27 06:54:55 +08:00

167 lines
5.1 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* API Preconnect Latency Benchmark
*
* Measures the real TCP+TLS connection reuse benefit of preconnect by using
* undici (the same library as apiPreconnect.ts) within a single process.
*
* Unlike the previous curl-based approach, this correctly measures connection
* pool reuse: the same dispatcher instance is shared between the preconnect
* HEAD request and the subsequent measured request, just like in production.
*
* Usage:
* node scripts/benchmark-api-latency.mjs
*
* Environment variables:
* ITERATIONS=3 Number of cold/warm pairs per endpoint (default: 3)
* REQUEST_TIMEOUT_MS=5000 Per-request timeout in ms (default: 5000)
* BENCHMARK_URLS Space-separated extra URLs to benchmark
*/
import { createRequire } from 'module';
import { performance } from 'perf_hooks';
// Resolve undici from the core package (same version used by preconnect)
const require = createRequire(import.meta.url);
const { Agent } = require('../packages/core/node_modules/undici/index.js');
// Benchmark configuration, overridable via environment variables.
const ITERATIONS = Number.parseInt(process.env['ITERATIONS'] ?? '3', 10);
const REQUEST_TIMEOUT_MS = Number.parseInt(
  process.env['REQUEST_TIMEOUT_MS'] ?? '5000',
  10,
);
// Endpoints that are always benchmarked, regardless of BENCHMARK_URLS.
const DEFAULT_ENDPOINTS = [
  { url: 'https://api.openai.com', label: 'OpenAI' },
  { url: 'https://api.anthropic.com', label: 'Anthropic' },
  { url: 'https://dashscope.aliyuncs.com/compatible-mode/v1', label: 'DashScope (openai-compatible)' },
];
// Optional user-supplied endpoints: space-separated URLs, labelled by URL.
const rawExtra = process.env['BENCHMARK_URLS'] ?? '';
const extraUrls = rawExtra
  .split(' ')
  .filter(Boolean)
  .map((url) => ({ url, label: url }));
const ENDPOINTS = [...DEFAULT_ENDPOINTS, ...extraUrls];
// ---------------------------------------------------------------------------
/**
 * Build a fresh undici Agent configured like apiPreconnect.ts:
 * header/body timeouts disabled, 60s keep-alive window.
 *
 * @returns {import('undici').Agent} A brand-new connection pool.
 */
function newDispatcher() {
  const options = {
    headersTimeout: 0,
    bodyTimeout: 0,
    keepAliveTimeout: 60_000,
  };
  return new Agent(options);
}
/**
 * Issue one request on the given dispatcher and return elapsed wall-clock ms.
 *
 * This never rejects: timeouts and HTTP error statuses are expected — we
 * only care about connection-establishment timing. Previously the catch
 * block had a dead branch (the TimeoutError early return was identical to
 * the fall-through) and silently swallowed every other failure, so an
 * unreachable endpoint (DNS failure, refused connection, TLS error) was
 * reported as a plausible latency number. Non-timeout failures are now
 * logged so bad data points are visible in the output.
 *
 * @param {string} url - Target URL.
 * @param {import('undici').Dispatcher} dispatcher - Connection pool to use.
 * @param {string} [method='HEAD'] - HTTP method.
 * @returns {Promise<number>} Elapsed time in milliseconds.
 */
async function fetchOnce(url, dispatcher, method = 'HEAD') {
  const start = performance.now();
  try {
    await fetch(url, {
      method,
      signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
      headers: { 'User-Agent': 'QwenCode-Benchmark/1.0' },
      dispatcher,
    });
  } catch (err) {
    // Timeouts still record the time spent; anything else likely means the
    // endpoint is unreachable and would skew the averages, so surface it.
    if (err?.name !== 'TimeoutError') {
      console.warn(`warn: ${method} ${url} failed: ${err?.message ?? err}`);
    }
  }
  return performance.now() - start;
}
/**
 * Cold-path measurement: the request runs on a brand-new dispatcher and
 * therefore pays the full TCP+TLS setup cost.
 *
 * @param {string} url - Endpoint to measure.
 * @returns {Promise<number>} Elapsed ms of the measured request.
 */
async function measureCold(url) {
  const coldPool = newDispatcher();
  return await fetchOnce(url, coldPool, 'HEAD');
}
/**
 * Warm-path measurement: one dispatcher serves both the preconnect HEAD
 * and the measured request, so the second request can reuse the warmed
 * connection. Only the second request's time is returned.
 *
 * @param {string} url - Endpoint to measure.
 * @returns {Promise<number>} Elapsed ms of the measured request only.
 */
async function measureWarm(url) {
  const pool = newDispatcher();
  // Warm the pool first — mirrors apiPreconnect.ts behaviour; failures
  // are deliberately ignored.
  await fetchOnce(url, pool, 'HEAD').catch(() => {});
  // Measured request rides the connection the warm-up established.
  return await fetchOnce(url, pool, 'HEAD');
}
// ---------------------------------------------------------------------------
/** Render a millisecond value with one decimal place, e.g. "12.3ms". */
const fmt = (ms) => `${ms.toFixed(1)}ms`;
/**
 * Arithmetic mean of a numeric array.
 *
 * Returns 0 for an empty array instead of NaN — reachable in practice when
 * the user sets ITERATIONS=0, which previously propagated NaN through the
 * whole summary table.
 *
 * @param {number[]} arr - Sample values.
 * @returns {number} Mean of the samples, or 0 when there are none.
 */
function avg(arr) {
  if (arr.length === 0) return 0;
  return arr.reduce((a, b) => a + b, 0) / arr.length;
}
/**
 * Run ITERATIONS cold/warm measurement pairs against one endpoint,
 * printing per-run numbers as they arrive.
 *
 * @param {{url: string, label: string}} endpoint - Endpoint under test.
 * @returns {Promise<{label: string, url: string, avgCold: number,
 *   avgWarm: number, saved: number, pct: number}>} Aggregated stats.
 */
async function benchmarkEndpoint({ url, label }) {
  console.log(`\n ${label}`);
  console.log(` ${url}`);
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  const coldTimes = [];
  const warmTimes = [];
  for (let run = 0; run < ITERATIONS; run++) {
    const cold = await measureCold(url);
    coldTimes.push(cold);
    // Give the OS a moment to release the cold connection before warming.
    await pause(500);
    const warm = await measureWarm(url);
    warmTimes.push(warm);
    console.log(` run ${run + 1}: cold=${fmt(cold)} warm=${fmt(warm)}`);
    await pause(500);
  }
  const avgCold = avg(coldTimes);
  const avgWarm = avg(warmTimes);
  const saved = avgCold - avgWarm;
  // Guard against divide-by-zero when every cold request failed instantly.
  const pct = avgCold > 0 ? (saved / avgCold) * 100 : 0;
  return { label, url, avgCold, avgWarm, saved, pct };
}
// ---------------------------------------------------------------------------
// Main: benchmark every endpoint sequentially, then print a summary table.
console.log('=== Qwen Code API Preconnect Latency Benchmark ===');
console.log(`Iterations per endpoint : ${ITERATIONS}`);
console.log(`Request timeout : ${REQUEST_TIMEOUT_MS}ms`);
console.log('\nRunning...');
const results = [];
for (const endpoint of ENDPOINTS) {
  // Sequential on purpose: concurrent runs would share bandwidth and
  // contend for sockets, skewing the timings.
  const result = await benchmarkEndpoint(endpoint);
  results.push(result);
}
// Summary table
console.log('\n\n=== Results ===\n');
console.log(
  'Endpoint'.padEnd(36) +
    'Cold (avg)'.padStart(12) +
    'Warm (avg)'.padStart(12) +
    'Saved'.padStart(10) +
    'Improvement'.padStart(13),
);
console.log('─'.repeat(83));
for (const r of results) {
  // ✓ clear win, ~ marginal, ✗ negligible improvement.
  const status = r.pct >= 30 ? '✓' : r.pct >= 10 ? '~' : '✗';
  console.log(
    r.label.slice(0, 35).padEnd(36) +
      fmt(r.avgCold).padStart(12) +
      fmt(r.avgWarm).padStart(12) +
      fmt(r.saved).padStart(10) +
      `${r.pct.toFixed(1)}% ${status}`.padStart(13),
  );
}
// Fix: the legend previously printed "1030%" — the en dash in the middle
// bucket was lost in encoding; it must read 10–30% to match the ✓/~/✗
// thresholds above (>=30, >=10, <10).
console.log('\nLegend: ✓ ≥30% improvement ~ 10–30% ✗ <10%');