Mirror of https://github.com/QwenLM/qwen-code.git, synced 2026-05-02 13:40:46 +00:00
feat(cli): add API preconnect to reduce first-call latency (#3318)
Send a fire-and-forget HEAD request early in startup to warm the TCP+TLS connection. Subsequent SDK calls share an undici dispatcher with the preconnect, reusing the warmed connection to save 100-200ms on the first request.

Skip conditions:
- NODE_EXTRA_CA_CERTS set (enterprise TLS inspection)
- Sandbox mode (process-restart context)
- Non-default baseUrl (mTLS / private deployment)
- Non-Node runtimes (Bun)

Disable via QWEN_CODE_DISABLE_PRECONNECT=1.

Closes #3223
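A minimal sketch of the warm-up pattern this describes, using the same undici Agent + fetch dispatcher mechanism the benchmark script below relies on. The function name warmApiConnection and the single env-var guard are illustrative, not the actual apiPreconnect.ts implementation:

import { Agent } from 'undici';

// One shared dispatcher with keep-alive: later calls that pass the same
// dispatcher reuse the connection warmed by the HEAD request below.
const dispatcher = new Agent({ keepAliveTimeout: 60_000 });

export function warmApiConnection(baseUrl) {
  // Illustrative guard; the commit also skips NODE_EXTRA_CA_CERTS,
  // sandbox mode, non-default baseUrl, and non-Node runtimes.
  if (process.env['QWEN_CODE_DISABLE_PRECONNECT'] === '1') return;
  // Fire-and-forget HEAD: the response is irrelevant; the goal is to
  // complete the TCP+TLS handshake before the first SDK request.
  fetch(baseUrl, { method: 'HEAD', dispatcher }).catch(() => {});
}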
parent 70127b5cd8
commit 3b0b6c052b
8 changed files with 731 additions and 13 deletions
scripts/benchmark-api-latency.mjs (new file, 167 additions)

@@ -0,0 +1,167 @@
#!/usr/bin/env node
/**
 * API Preconnect Latency Benchmark
 *
 * Measures the real TCP+TLS connection reuse benefit of preconnect by using
 * undici (the same library as apiPreconnect.ts) within a single process.
 *
 * Unlike the previous curl-based approach, this correctly measures connection
 * pool reuse: the same dispatcher instance is shared between the preconnect
 * HEAD request and the subsequent measured request, just like in production.
 *
 * Usage:
 *   node scripts/benchmark-api-latency.mjs
 *
 * Environment variables:
 *   ITERATIONS=3             Number of cold/warm pairs per endpoint (default: 3)
 *   REQUEST_TIMEOUT_MS=5000  Per-request timeout in ms (default: 5000)
 *   BENCHMARK_URLS           Space-separated extra URLs to benchmark
 */

import { createRequire } from 'module';
import { performance } from 'perf_hooks';

// Resolve undici from the core package (same version used by preconnect)
const require = createRequire(import.meta.url);
const { Agent } = require('../packages/core/node_modules/undici/index.js');

const ITERATIONS = parseInt(process.env['ITERATIONS'] ?? '3', 10);
const REQUEST_TIMEOUT_MS = parseInt(process.env['REQUEST_TIMEOUT_MS'] ?? '5000', 10);

const DEFAULT_ENDPOINTS = [
  { url: 'https://api.openai.com', label: 'OpenAI' },
  { url: 'https://api.anthropic.com', label: 'Anthropic' },
  { url: 'https://dashscope.aliyuncs.com/compatible-mode/v1', label: 'DashScope (openai-compatible)' },
];

const extraUrls = process.env['BENCHMARK_URLS']
  ? process.env['BENCHMARK_URLS'].split(' ').filter(Boolean).map((url) => ({ url, label: url }))
  : [];

const ENDPOINTS = [...DEFAULT_ENDPOINTS, ...extraUrls];

// ---------------------------------------------------------------------------

function newDispatcher() {
  return new Agent({
    headersTimeout: 0,
    bodyTimeout: 0,
    keepAliveTimeout: 60_000,
  });
}

async function fetchOnce(url, dispatcher, method = 'HEAD') {
  const start = performance.now();
  try {
    await fetch(url, {
      method,
      signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
      headers: { 'User-Agent': 'QwenCode-Benchmark/1.0' },
      dispatcher,
    });
  } catch {
    // Network errors and timeouts are fine; we only care about connection
    // timing. Note that fetch resolves (does not throw) on non-2xx status,
    // so those responses never reach this handler.
  }
  return performance.now() - start;
}

/**
 * Cold measurement: brand-new dispatcher, no preconnect.
 * Returns elapsed ms of the measured request.
 */
async function measureCold(url) {
  const dispatcher = newDispatcher();
  return fetchOnce(url, dispatcher, 'HEAD');
}

/**
 * Warm measurement: same dispatcher for preconnect HEAD + measured request.
 * Returns elapsed ms of the measured request only (not the preconnect time).
 */
async function measureWarm(url) {
  const dispatcher = newDispatcher();
  // Preconnect: mirrors apiPreconnect.ts behaviour
  await fetchOnce(url, dispatcher, 'HEAD').catch(() => {});
  // Measured request reuses the warmed connection from the same pool
  return fetchOnce(url, dispatcher, 'HEAD');
}

// ---------------------------------------------------------------------------

function fmt(ms) {
  return `${ms.toFixed(1)}ms`;
}

function avg(arr) {
  return arr.reduce((a, b) => a + b, 0) / arr.length;
}

async function benchmarkEndpoint({ url, label }) {
  console.log(`\n  ${label}`);
  console.log(`  ${url}`);

  const coldTimes = [];
  const warmTimes = [];

  for (let i = 0; i < ITERATIONS; i++) {
    const cold = await measureCold(url);
    coldTimes.push(cold);

    // Brief pause so the OS can release the cold connection
    await new Promise((r) => setTimeout(r, 500));

    const warm = await measureWarm(url);
    warmTimes.push(warm);

    console.log(`    run ${i + 1}: cold=${fmt(cold)} warm=${fmt(warm)}`);

    await new Promise((r) => setTimeout(r, 500));
  }

  const avgCold = avg(coldTimes);
  const avgWarm = avg(warmTimes);
  const saved = avgCold - avgWarm;
  const pct = avgCold > 0 ? (saved / avgCold) * 100 : 0;

  return { label, url, avgCold, avgWarm, saved, pct };
}

// ---------------------------------------------------------------------------

console.log('=== Qwen Code API Preconnect Latency Benchmark ===');
console.log(`Iterations per endpoint : ${ITERATIONS}`);
console.log(`Request timeout         : ${REQUEST_TIMEOUT_MS}ms`);
console.log('\nRunning...');

const results = [];
for (const endpoint of ENDPOINTS) {
  const result = await benchmarkEndpoint(endpoint);
  results.push(result);
}

// Summary table
console.log('\n\n=== Results ===\n');
console.log(
  'Endpoint'.padEnd(36) +
    'Cold (avg)'.padStart(12) +
    'Warm (avg)'.padStart(12) +
    'Saved'.padStart(10) +
    'Improvement'.padStart(13),
);
console.log('─'.repeat(83));

for (const r of results) {
  const status = r.pct >= 30 ? '✓' : r.pct >= 10 ? '~' : '✗';
  console.log(
    r.label.slice(0, 35).padEnd(36) +
      fmt(r.avgCold).padStart(12) +
      fmt(r.avgWarm).padStart(12) +
      fmt(r.saved).padStart(10) +
      `${r.pct.toFixed(1)}% ${status}`.padStart(13),
  );
}

console.log('\nLegend: ✓ ≥30% improvement  ~ 10–30%  ✗ <10%');
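A typical invocation exercising the documented environment variables might look like the following (the extra URL is illustrative):

BENCHMARK_URLS="https://api.example.com" ITERATIONS=5 node scripts/benchmark-api-latency.mjs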