fix(review): resolve PR #2678 multi-agent review findings (#2743)

Addresses 3 critical + 4 high + 4 medium findings from the cross-agent review of the v3.8.4 release branch. CRITICAL - combo: honour skipProviderBreaker in combo.ts:2452 so embedded service supervisor outages signalled via X-Omni-Fallback-Hint=connection_cooldown no longer trip the whole-provider circuit breaker. The G-02 contract was added to accountFallback but never honoured by its consumer. - combo: per-model timeout now creates an AbortController, propagates its signal via target.modelAbortSignal, and aborts the inner request when the timeout wins the race. Chat.ts wraps the request via AbortSignal.any so downstream cooldown/breaker/usage mutations stop instead of running behind the routing decision's back. - apiKey: getOrCreateApiKey now throws ServiceApiKeyDecryptError on decrypt failure instead of silently regenerating. Mutating embedded service auth without operator awareness made every subsequent request 401 with no log trail. HIGH - base.ts proactive refresh: classify isUnrecoverableRefreshError before spreading the result so the executor doesn't send an unrecoverable_refresh_error sentinel object as the access token. Mark the connection expired via onCredentialsRefreshed and elevate the catch log from warn to error per the documented onPersist contract. - kimi-coding: persist deviceId/deviceName/deviceModel/osVersion in providerSpecificData at login. tokenRefresh's fallback pbkdf2(refresh_token) rotates per refresh since Kimi rotates refresh tokens, contradicting the "stable deviceId" comment and tripping anti-bot detection mid-session. - inner-ai: resolveModels throws InnerAiModelsError on non-OK (with 401/403 invalidating the credential cache) instead of silently returning []. collectContent now propagates missing_credits / reached_limit / rate_limit_reached events via InnerAiStreamError so non-streaming callers get a 429 instead of HTTP 200 with an empty body. MEDIUM - chatCore.ts retry-after-refresh: capture and log the error at error level with sanitizeErrorMessage instead of a bare catch{}. - gemini-cli.ts refreshCredentials: capture body on !response.ok and map invalid_grant to unrecoverable_refresh_error for parity with refreshGoogleToken in tokenRefresh.ts. - usage.ts antigravity: introduce fractionReported sentinel so an upstream schema drift (Antigravity not reporting remainingFraction) no longer masquerades as "every model is exhausted". - proxyFetch.ts vercel relay: sanitize the missing-relayAuth throw message (no internal [ProxyFetch] label) and pass host through proxyUrlForLogs for consistent redaction. Backlog for follow-up: Inner.ai behavioural tests, tokenRefresh.ts @ts-nocheck removal + RefreshResult discriminated union, tokenHealthCheck tests, structural-vs-behavioural tests in token-refresh-race-comprehensive. Tracked in #2743.
2026-05-27 17:23:52 +00:00 · 2026-05-26 12:44:22 -03:00 · 2026-05-26 12:44:22 -03:00 · 0c94c397db
commit 0c94c397db
parent f234cd6c1c
11 changed files with 305 additions and 53 deletions
--- a/open-sse/executors/base.ts
+++ b/open-sse/executors/base.ts
@ -8,7 +8,11 @@ import {
 } from "../services/apiKeyRotator.ts";
 import type { KeyHealth } from "../services/apiKeyRotator.ts";
 import { getOpenAICompatibleType, isClaudeCodeCompatible } from "../services/provider.ts";
-import { runWithOnPersist, getRefreshLeadMs } from "../services/tokenRefresh.ts";
+import {
+  runWithOnPersist,
+  getRefreshLeadMs,
+  isUnrecoverableRefreshError,
+} from "../services/tokenRefresh.ts";
 import type { ProviderRequestDefaults } from "../services/providerRequestDefaults.ts";
 import { signRequestBody } from "../services/claudeCodeCCH.ts";
 import {
@ -108,8 +112,13 @@ export type ExecuteInput = {
  upstreamExtraHeaders?: Record<string, string> | null;
  /** Original client request headers (read-only). Executors may forward select headers upstream. */
  clientHeaders?: Record<string, string> | null;
-  /** Callback to persist tokens that are proactively refreshed during execution. */
-  onCredentialsRefreshed?: (newCredentials: ProviderCredentials) => Promise<void> | void;
+  /** Callback to persist tokens that are proactively refreshed during execution.
+   * Accepts a partial credentials patch (e.g. `{ accessToken, refreshToken }` or
+   * `{ testStatus: "expired", isActive: false }`); the caller merges into the
+   * stored connection row. */
+  onCredentialsRefreshed?: (
+    newCredentials: Partial<ProviderCredentials> & Record<string, unknown>
+  ) => Promise<void> | void;
  /** When true, skip the intra-URL 429 retry in execute() so the caller handles fallback. */
  skipUpstreamRetry?: boolean;
 };
@ -536,18 +545,20 @@ export class BaseExecutor {
    }
  }

-  async execute({
-    model,
-    body,
-    stream,
-    credentials,
-    signal,
-    log,
-    extendedContext,
-    upstreamExtraHeaders,
-    clientHeaders,
-    skipUpstreamRetry = false,
-  }: ExecuteInput) {
+  async execute(input: ExecuteInput) {
+    const {
+      model,
+      body,
+      stream,
+      credentials,
+      signal,
+      log,
+      extendedContext,
+      upstreamExtraHeaders,
+      clientHeaders,
+      skipUpstreamRetry = false,
+      onCredentialsRefreshed,
+    } = input;
    const fallbackCount = this.getFallbackCount();
    let lastError: unknown = null;
    let lastStatus = 0;
@ -563,16 +574,14 @@ export class BaseExecutor {
        // to detect whether the persist callback actually fired and fall back to
        // post-refresh mutation when it didn't.
        let proactivePersistRan = false;
-        const proactiveOnPersist = arguments[0].onCredentialsRefreshed
+        const proactiveOnPersist = onCredentialsRefreshed
          ? async (refreshResult: Record<string, unknown>) => {
              proactivePersistRan = true;
              activeCredentials = {
                ...credentials,
                ...(refreshResult as Partial<ProviderCredentials>),
              };
-              await arguments[0].onCredentialsRefreshed(
-                refreshResult as Partial<ProviderCredentials>
-              );
+              await onCredentialsRefreshed(refreshResult as Partial<ProviderCredentials>);
            }
          : null;

@ -581,16 +590,42 @@ export class BaseExecutor {
        );

        if (refreshed && !proactivePersistRan) {
-          activeCredentials = {
-            ...credentials,
-            ...refreshed,
-          };
-          if (arguments[0].onCredentialsRefreshed) {
-            await arguments[0].onCredentialsRefreshed(refreshed);
+          // Classify unrecoverable sentinels before spreading. Without this guard
+          // an { error: "unrecoverable_refresh_error", code } object passes the
+          // truthy check and pollutes activeCredentials with garbage, causing the
+          // executor to send a non-token to upstream. Mirrors the reactive path
+          // in chatCore.ts and keeps the proactive path's behaviour consistent.
+          if (isUnrecoverableRefreshError(refreshed)) {
+            const refreshCode = (refreshed as Record<string, unknown>).code;
+            log?.error?.(
+              "TOKEN",
+              `${this.provider.toUpperCase()} | unrecoverable refresh — marking account expired (code=${String(refreshCode ?? "unknown")})`
+            );
+            if (onCredentialsRefreshed) {
+              await onCredentialsRefreshed({
+                testStatus: "expired",
+                isActive: false,
+              });
+            }
+            // Don't spread the sentinel — keep stale credentials so the next
+            // upstream call surfaces the real auth error to the client.
+          } else {
+            activeCredentials = {
+              ...credentials,
+              ...refreshed,
+            };
+            if (onCredentialsRefreshed) {
+              await onCredentialsRefreshed(refreshed);
+            }
          }
        }
      } catch (error) {
-        log?.warn?.(
+        // tokenRefresh.ts:1352 documents that onPersist throws are re-thrown so
+        // the caller is aware of the persistence failure. Honor that contract:
+        // log at error level (not warn), with sanitized message — and let the
+        // request continue with stale credentials so the user-visible error
+        // surfaces upstream rather than being silently absorbed here.
+        log?.error?.(
          "TOKEN",
          `Credential refresh failed for ${this.provider}: ${error instanceof Error ? error.message : String(error)}`
        );
--- a/open-sse/executors/gemini-cli.ts
+++ b/open-sse/executors/gemini-cli.ts
@ -1,6 +1,6 @@
-import { BaseExecutor, mergeUpstreamExtraHeaders } from "./base.ts";
+import { BaseExecutor, mergeUpstreamExtraHeaders, mergeAbortSignals } from "./base.ts";
 import { randomUUID } from "crypto";
-import { PROVIDERS, OAUTH_ENDPOINTS } from "../config/constants.ts";
+import { PROVIDERS, OAUTH_ENDPOINTS, FETCH_TIMEOUT_MS } from "../config/constants.ts";
 import { getGeminiCliHeaders } from "../services/geminiCliHeaders.ts";
 import { scrubProxyAndFingerprintHeaders } from "../services/antigravityHeaderScrub.ts";
 import { obfuscateSensitiveWords } from "../services/antigravityObfuscation.ts";
@ -408,11 +408,13 @@ export class GeminiCLIExecutor extends BaseExecutor {
          `[Gemini CLI] Execute - URL: ${url}, Model: ${model}, Retry: ${retryAttemptsByUrl[urlIndex]}`
        );

+        const timeoutSignal = AbortSignal.timeout(FETCH_TIMEOUT_MS);
+        const mergedSignal = signal ? mergeAbortSignals(signal, timeoutSignal) : timeoutSignal;
        const response = await fetch(url, {
          method: "POST",
          headers,
          body: JSON.stringify(transformedBody),
-          signal,
+          signal: mergedSignal,
        });

        if (!response.ok) {
@ -479,7 +481,25 @@ export class GeminiCLIExecutor extends BaseExecutor {
        }),
      });

-      if (!response.ok) return null;
+      if (!response.ok) {
+        const errorText = await response.text().catch(() => "");
+        log?.error?.("TOKEN", "Gemini CLI refresh failed", {
+          status: response.status,
+          error: errorText.slice(0, 200),
+        });
+        // Match refreshGoogleToken's pattern: invalid_grant means the refresh
+        // token was revoked / replaced — surface as unrecoverable so the caller
+        // marks the account expired instead of retrying forever with a dead token.
+        try {
+          const errorBody = JSON.parse(errorText);
+          if (errorBody?.error === "invalid_grant") {
+            return { error: "unrecoverable_refresh_error", code: "invalid_grant" } as never;
+          }
+        } catch {
+          // not JSON — fall through
+        }
+        return null;
+      }

      const tokens = await response.json();
      log?.info?.("TOKEN", "Gemini CLI refreshed");
--- a/open-sse/executors/inner-ai.ts
+++ b/open-sse/executors/inner-ai.ts
@ -1,3 +1,4 @@
+import { createHash } from "node:crypto";
 import { BaseExecutor, type ExecuteInput } from "./base.ts";
 import { sanitizeErrorMessage } from "../utils/error.ts";

@ -30,12 +31,15 @@ interface CredentialCache {

 // ── In-memory caches ──────────────────────────────────────────────────────────

-// Keyed by first 32 chars of the token JWT
+// Keyed by sha256(token). Using a prefix slice of the JWT collides across
+// tokens that share the same algorithm header (the first ~36 chars of any
+// HS256/RS256 token are identical), which previously caused cross-tenant
+// credential cache hits.
 const credentialCache = new Map<string, CredentialCache>();
 const modelsCache = new Map<string, { models: InnerAiModel[]; expiresAt: number }>();

 function tokenCacheKey(token: string): string {
-  return token.slice(0, 32);
+  return createHash("sha256").update(token).digest("hex");
 }

 // ── Helpers ───────────────────────────────────────────────────────────────────
@ -178,6 +182,16 @@ async function resolveCredentials(

 // ── Model resolution (dynamic fetch + cache) ──────────────────────────────────

+class InnerAiModelsError extends Error {
+  constructor(
+    public readonly status: number,
+    public readonly responsePreview: string
+  ) {
+    super(`Inner.ai /ai-models returned HTTP ${status}`);
+    this.name = "InnerAiModelsError";
+  }
+}
+
 async function resolveModels(
  token: string,
  deviceId: string,
@ -193,7 +207,20 @@ async function resolveModels(
    signal: signal ?? undefined,
  });

-  if (!resp.ok) return [];
+  if (!resp.ok) {
+    // Don't silently fall through to an empty list — the synthetic model entry
+    // built downstream sends ai_model.id: undefined to chat, which Inner.ai
+    // responds to with a confusing "invalid model id" error keyed on a
+    // different message than the real root cause (auth or upstream outage).
+    const bodyPreview = await resp.text().catch(() => "");
+    const err = new InnerAiModelsError(resp.status, bodyPreview.slice(0, 200));
+    if (resp.status === 401 || resp.status === 403) {
+      // Auth failed on the models endpoint — drop the credential cache so the
+      // next request re-resolves the email/deviceId from /profile.
+      credentialCache.delete(tokenCacheKey(token));
+    }
+    throw err;
+  }

  const body = await resp.json().catch(() => null);
  let raw: InnerAiModel[] = [];
@ -396,7 +423,23 @@ function transformInnerAiSSE(upstream: ReadableStream, model: string): ReadableS
  });
 }

-/** Collect Inner.ai SSE stream into a single content string (non-streaming path). */
+class InnerAiStreamError extends Error {
+  constructor(
+    public readonly status: number,
+    public readonly code: string,
+    message: string
+  ) {
+    super(message);
+    this.name = "InnerAiStreamError";
+  }
+}
+
+/** Collect Inner.ai SSE stream into a single content string (non-streaming path).
+ *  Mirrors the event taxonomy in transformInnerAiSSE so credits/rate-limit
+ *  events become a thrown error instead of being silently discarded (which
+ *  produced HTTP 200 + empty body and tricked clients into retrying against
+ *  an exhausted account).
+ */
 async function collectContent(upstream: ReadableStream): Promise<string> {
  const decoder = new TextDecoder();
  const reader = upstream.getReader();
@ -423,8 +466,24 @@ async function collectContent(upstream: ReadableStream): Promise<string> {
        continue;
      }

-      if (data.type === "text" && typeof data.item === "string") {
+      const type = data.type;
+      if (type === "text" && typeof data.item === "string") {
        content += data.item;
+        continue;
+      }
+      if (
+        type === "missing_credits" ||
+        type === "reached_limit" ||
+        type === "rate_limit_reached" ||
+        type === "rate_limit_longer_reached"
+      ) {
+        const errorMsg =
+          type === "missing_credits"
+            ? "Inner.ai: not enough credits"
+            : type === "reached_limit"
+              ? "Inner.ai: usage limit reached"
+              : "Inner.ai: rate limit reached — try again later";
+        throw new InnerAiStreamError(429, String(type), errorMsg);
      }
    }
  }
@ -464,8 +523,25 @@ export class InnerAiExecutor extends BaseExecutor {
    let models: InnerAiModel[] = [];
    try {
      models = await resolveModels(token, deviceId, email, signal);
-    } catch {
-      // Non-fatal: proceed with empty list; synthetic model entry will be used
+    } catch (err) {
+      // Auth failures on /ai-models are surfaced explicitly so operators don't
+      // chase a "Inner.ai invalid model" downstream symptom when the real cause
+      // is the user's token expiring on the models endpoint.
+      if (err instanceof InnerAiModelsError && (err.status === 401 || err.status === 403)) {
+        return makeErrorResult(
+          err.status,
+          "Inner.ai /ai-models authentication failed — re-paste your token cookie",
+          body
+        );
+      }
+      // Non-auth failures (5xx, network): proceed with empty list and let the
+      // synthetic-model fallback try. Log so the operator sees the upstream blip.
+      // No `log` accessor in this executor scope — propagate via a runtime warning.
+      console.warn(
+        `[InnerAI] /ai-models fetch failed (status=${
+          err instanceof InnerAiModelsError ? err.status : "n/a"
+        }) — falling back to synthetic model entry`
+      );
    }

    const modelEntry: InnerAiModel = findModel(models, requestedModel) ?? {
@ -556,7 +632,18 @@ export class InnerAiExecutor extends BaseExecutor {
    }

    // Non-streaming: collect content and return as JSON
-    const content = await collectContent(upstream.body);
+    let content: string;
+    try {
+      content = await collectContent(upstream.body);
+    } catch (err) {
+      // Inner.ai SSE error events (missing_credits, rate_limit_reached, …)
+      // surface here as thrown errors. Translate into a proper HTTP error so
+      // the client sees the failure instead of an empty 200 body.
+      if (err instanceof InnerAiStreamError) {
+        return makeErrorResult(err.status, err.message, body);
+      }
+      throw err;
+    }
    const completionId = `chatcmpl-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
    return {
      response: new Response(
--- a/open-sse/handlers/chatCore.ts
+++ b/open-sse/handlers/chatCore.ts
@ -34,6 +34,7 @@ import {
  createErrorResult,
  parseUpstreamError,
  formatProviderError,
+  sanitizeErrorMessage,
 } from "../utils/error.ts";
 import {
  COOLDOWN_MS,
@ -3896,8 +3897,15 @@ export async function handleChatCore({
          providerResponse = retryResult.response;
          upstreamErrorParsed = false; // Let it be parsed downstream
        }
-      } catch {
-        log?.warn?.("TOKEN", `${provider.toUpperCase()} | retry after refresh failed`);
+      } catch (retryErr) {
+        // Refresh succeeded but the retry leg failed (network blip, AbortError,
+        // executor throw). Don't swallow — the operator-visible signal "the user
+        // saw 401 even though auth was actually fixed" is much more confusing
+        // than the original 401 alone. Surface at error level with sanitization.
+        log?.error?.(
+          "TOKEN",
+          `${provider.toUpperCase()} | retry after refresh failed: ${sanitizeErrorMessage(retryErr)}`
+        );
      }
    } else {
      log?.warn?.("TOKEN", `${provider.toUpperCase()} | refresh failed`);
--- a/open-sse/services/combo.ts
+++ b/open-sse/services/combo.ts
@ -1649,14 +1649,27 @@ export async function handleComboChat({
  // ── Per-model timeout wrapper ────────────────────────────────────────────
  // Default FETCH_TIMEOUT_MS is 600s per model. For combos, we use a shorter
  // per-model timeout so slow/hanging models don't block fallback.
+  //
+  // The timeoutController is forwarded to the inner caller via target.modelAbortSignal.
+  // When the timeout fires we (a) resolve the race with a synthetic 524 and
+  // (b) abort the inner request so its upstream fetch is cancelled and downstream
+  // cooldown/breaker/usage mutations stop — preventing "ghost" state mutations
+  // that diverge from the routing decision the operator sees.
  const handleSingleModelWithTimeout = async (b, modelStr, target?) => {
+    const timeoutController = new AbortController();
    let timeoutId;
+    let timedOut = false;
    const timeoutPromise = new Promise((resolve) => {
      timeoutId = setTimeout(() => {
+        timedOut = true;
        log.warn(
          "COMBO",
          `Model ${modelStr} exceeded ${COMBO_MODEL_TIMEOUT_MS}ms timeout — falling back`
        );
+        // Abort the inner request so its upstream fetch is cancelled and
+        // downstream cooldown/breaker/usage mutations don't continue mutating
+        // state behind the routing decision's back.
+        timeoutController.abort(new Error("combo-per-model-timeout"));
        resolve(
          new Response(JSON.stringify({ error: { message: `Model ${modelStr} timed out` } }), {
            status: 524,
@ -1665,9 +1678,19 @@ export async function handleComboChat({
        );
      }, COMBO_MODEL_TIMEOUT_MS);
    });
+    const targetWithSignal = {
+      ...(target ?? {}),
+      modelAbortSignal: timeoutController.signal,
+    };
    try {
      return await Promise.race([
-        handleSingleModelWrapped(b, modelStr, target).catch((err) => {
+        handleSingleModelWrapped(b, modelStr, targetWithSignal).catch((err) => {
+          if (timedOut) {
+            // Inner call rejected because we aborted it. The synthetic 524 from
+            // timeoutPromise already wins the race; return an empty response so
+            // the loser branch resolves cleanly without leaking err.message.
+            return new Response(null, { status: 599 });
+          }
          return errorResponse(502, err?.message ?? "Upstream model error");
        }),
        timeoutPromise,
@ -2441,13 +2464,17 @@ export async function handleComboChat({
        // Trigger shared provider circuit breaker for 5xx errors and connection failures.
        // If the next target in the combo is on the same provider, don't mark the provider
        // as failed — different models on the same provider may still succeed.
+        // G-02: when fallbackResult.skipProviderBreaker is set (embedded service supervisor
+        // outage signalled via X-Omni-Fallback-Hint: connection_cooldown) apply connection
+        // cooldown only — do NOT trip the whole-provider breaker.
        const nextTarget = orderedTargets[i + 1];
        const sameProviderNext =
          typeof nextTarget?.provider === "string" && nextTarget.provider === provider;
        if (
          !isStreamReadinessFailure &&
          isProviderFailureCode(result.status) &&
-          !sameProviderNext
+          !sameProviderNext &&
+          !fallbackResult.skipProviderBreaker
        ) {
          recordProviderFailure(provider, log, target.connectionId, profile);
        }
--- a/open-sse/services/tokenRefresh.ts
+++ b/open-sse/services/tokenRefresh.ts
@ -478,7 +478,7 @@ export async function refreshKimiCodingToken(
          Accept: "application/json",
          "X-Msh-Platform": platform,
          "X-Msh-Version": version,
-          "X-Msh-Device-Model": deviceModel,
+          "X-Msh-Device-Model": (providerSpecificData?.deviceModel as string) || deviceModel,
          "X-Msh-Device-Id": stableDeviceId,
          // These headers match getKimiOAuthHeaders() in providers/kimi-coding.ts.
          // They're derived at runtime from os module calls; use safe fallbacks here
--- a/open-sse/services/usage.ts
+++ b/open-sse/services/usage.ts
@ -118,6 +118,12 @@ type UsageQuota = {
  remainingPercentage?: number;
  resetAt: string | null;
  unlimited: boolean;
+  /**
+   * True when the upstream provider reported the remaining fraction. False
+   * means the API didn't include the field and the 0 value here is a sentinel,
+   * NOT a confirmed-exhausted state. Antigravity-specific.
+   */
+  fractionReported?: boolean;
  displayName?: string;
  details?: Array<{
    name: string;
@ -1940,11 +1946,19 @@ async function getAntigravityUsage(

      const rawFraction = toNumber(quotaInfo.remainingFraction, -1);
      const resetAt = parseResetTime(quotaInfo.resetTime);
-      // When remainingFraction is undefined/NaN (exhausted quota), default to 0%. Clamp to valid range [0, 1]
-      // Unlimited models have remainingFraction=1 AND no resetTime
-      const remainingFraction = rawFraction < 0 ? 0 : Math.max(0, Math.min(1, rawFraction));
-      // Models with no resetTime and full remaining are unlimited (e.g. tab-completion models)
-      const isUnlimited = !resetAt && remainingFraction >= 1;
+      // Distinguish "upstream did not report remainingFraction" from "remaining is 0%".
+      // A schema drift in Antigravity's quota API (very plausible — internal Google product)
+      // would otherwise silently mark every model as exhausted across the dashboard.
+      const fractionReported = rawFraction >= 0;
+      if (!fractionReported) {
+        console.warn(
+          `[Antigravity] model ${modelKey} returned no remainingFraction — quota unknown`
+        );
+      }
+      const remainingFraction = fractionReported ? Math.max(0, Math.min(1, rawFraction)) : 0;
+      // Models with no resetTime AND a reported full fraction are unlimited
+      // (e.g. tab-completion models). Unreported fraction is NEVER unlimited.
+      const isUnlimited = fractionReported && !resetAt && remainingFraction >= 1;
      const remainingPercentage = remainingFraction * 100;
      const QUOTA_NORMALIZED_BASE = 1000;
      const total = QUOTA_NORMALIZED_BASE;
@ -1957,6 +1971,7 @@ async function getAntigravityUsage(
        resetAt,
        remainingPercentage: isUnlimited ? 100 : remainingPercentage,
        unlimited: isUnlimited,
+        fractionReported,
      };
    }

--- a/open-sse/utils/proxyFetch.ts
+++ b/open-sse/utils/proxyFetch.ts
@ -101,7 +101,11 @@ function noProxyMatch(targetUrl) {
            ok = seg === "" || (hostname.endsWith(seg) && hostname.length - seg.length >= pos);
          } else {
            const idx = seg ? hostname.indexOf(seg, pos) : pos;
-            if (idx === -1) { ok = false; } else { pos = idx + seg.length; }
+            if (idx === -1) {
+              ok = false;
+            } else {
+              pos = idx + seg.length;
+            }
          }
        }
      }
@ -345,13 +349,19 @@ async function patchedFetch(
  ) {
    const vc = contextProxy as { host?: string; relayAuth?: string };
    if (!vc.relayAuth) {
-      throw new Error("[ProxyFetch] Vercel relay missing relayAuth — cannot route request");
+      // Generic message without internal labels — this throw can bubble up to
+      // catch blocks that put error.message in response bodies (combo per-model
+      // timeout, executor catch-all). Don't leak "[ProxyFetch]" diagnostics.
+      throw new Error("Vercel relay configuration error: missing relayAuth");
    }
    const targetUrl = getTargetUrl(input);
    const relayHeaders = buildVercelRelayHeaders(targetUrl, vc.relayAuth);
    const mergedHeaders = new Headers(options?.headers);
    for (const [k, v] of Object.entries(relayHeaders)) mergedHeaders.set(k, v);
-    console.log(`[ProxyFetch] Routing via Vercel relay: ${vc.host}`);
+    // Pass host through proxyUrlForLogs so the same redaction policy applies
+    // to relay routing logs (the rest of this module already follows that rule).
+    const hostForLogs = proxyUrlForLogs(vc.host ? `https://${vc.host}` : "");
+    console.log(`[ProxyFetch] Routing via Vercel relay: ${hostForLogs}`);
    return await originalFetch(`https://${vc.host}`, {
      ...options,
      headers: mergedHeaders,
--- a/src/lib/oauth/providers/kimi-coding.ts
+++ b/src/lib/oauth/providers/kimi-coding.ts
@ -120,5 +120,16 @@ export const kimiCoding = {
    expiresIn: tokens.expires_in,
    tokenType: tokens.token_type,
    scope: tokens.scope,
+    // Persist the device identity at login so refreshes use the SAME deviceId
+    // the device-code grant was issued against. Without this, tokenRefresh.ts
+    // falls back to pbkdf2(refresh_token, ...) — and Kimi rotates refresh_tokens
+    // per refresh, so the derived id changes every cycle and the anti-bot
+    // pipeline treats each refresh as a new device.
+    providerSpecificData: {
+      deviceId: getKimiDeviceId(),
+      deviceName: sanitizeHeaderValue(hostname()),
+      deviceModel: sanitizeHeaderValue(getDeviceModel()),
+      osVersion: sanitizeHeaderValue(osVersion()),
+    },
  }),
 };
--- a/src/lib/services/apiKey.ts
+++ b/src/lib/services/apiKey.ts
@ -6,11 +6,27 @@ export function generateServiceApiKey(prefix = "nr"): string {
  return `${prefix}_${randomBytes(32).toString("base64url")}`;
 }

+export class ServiceApiKeyDecryptError extends Error {
+  constructor(tool: string) {
+    super(
+      `Stored API key for service '${tool}' could not be decrypted. ` +
+        `STORAGE_ENCRYPTION_KEY may have changed, the row may be corrupted, ` +
+        `or it was written on a different machine. Reinstall the service ` +
+        `or rotate the key via POST /api/services/${tool}/rotate-key.`
+    );
+    this.name = "ServiceApiKeyDecryptError";
+  }
+}
+
 export async function getOrCreateApiKey(tool: string): Promise<string> {
  const row = await getServiceRow(tool);
  if (row?.apiKey) {
    const decrypted = decrypt(row.apiKey);
    if (decrypted) return decrypted;
+    // Fail loud: silently regenerating would mint a new key the embedded
+    // service has never been told, causing every request to 401 with no
+    // operator-facing signal.
+    throw new ServiceApiKeyDecryptError(tool);
  }
  const key = generateServiceApiKey(tool === "9router" ? "nr" : "cp");
  await updateServiceField(tool, "apiKey", encrypt(key) ?? key);
--- a/src/sse/handlers/chat.ts
+++ b/src/sse/handlers/chat.ts
@ -153,6 +153,27 @@ function intersectAllowedConnectionIds(primary: unknown, secondary: unknown): st

 const PROVIDER_BREAKER_FAILURE_STATUSES = new Set([408, 500, 502, 503, 504]);

+/**
+ * Wrap a Request-like object so callers reading `.signal` see a merged signal
+ * combining the original request signal with an extra per-call signal (e.g. a
+ * combo per-model timeout). Returns the original request unchanged when no
+ * extra signal is provided, so the hot path stays a no-op.
+ */
+function wrapRequestWithExtraSignal(request: any, extraSignal: AbortSignal | null) {
+  if (!extraSignal || !request) return request;
+  const baseSignal: AbortSignal | null | undefined = request?.signal ?? null;
+  const mergedSignal: AbortSignal = baseSignal
+    ? AbortSignal.any([baseSignal, extraSignal])
+    : extraSignal;
+  return new Proxy(request, {
+    get(target, prop, receiver) {
+      if (prop === "signal") return mergedSignal;
+      const value = Reflect.get(target, prop, receiver);
+      return typeof value === "function" ? value.bind(target) : value;
+    },
+  });
+}
+
 /**
 * Handle chat completion request
 * Supports: OpenAI, Claude, Gemini, OpenAI Responses API formats
@ -504,13 +525,14 @@ export async function handleChat(request: any, clientRawRequest: any = null) {
          stepId?: string | null;
          allowedConnectionIds?: string[] | null;
          failoverBeforeRetry?: boolean;
+          modelAbortSignal?: AbortSignal | null;
        }
      ) =>
        handleSingleModelChat(
          b,
          m,
          clientRawRequest,
-          request,
+          wrapRequestWithExtraSignal(request, target?.modelAbortSignal ?? null),
          combo.name,
          apiKeyInfo,
          telemetry,
@ -687,13 +709,14 @@ async function handleSingleModelChat(
          executionKey?: string | null;
          stepId?: string | null;
          failoverBeforeRetry?: boolean;
+          modelAbortSignal?: AbortSignal | null;
        }
      ) =>
        handleSingleModelChat(
          b,
          m,
          clientRawRequest,
-          request,
+          wrapRequestWithExtraSignal(request, target?.modelAbortSignal ?? null),
          redirectCombo.name ?? modelStr,
          apiKeyInfo,
          telemetry,