diff --git a/.env.example b/.env.example index 9979052a..004989d8 100644 --- a/.env.example +++ b/.env.example @@ -195,6 +195,24 @@ CLOUD_URL= # Default: http://localhost:20128 NEXT_PUBLIC_BASE_URL=http://localhost:20128 +# Browser-facing OmniRoute origin for generated assets in API responses. +# Used by: chatgpt-web image generation cache URLs (/v1/chatgpt-web/image/). +# Set this when OpenWebUI or another relay reaches OmniRoute by an internal URL +# but the user's browser must fetch images from a LAN, tunnel, or public origin. +# Do not include /v1; if included accidentally it will be normalized away. +# OMNIROUTE_PUBLIC_BASE_URL=http://192.168.0.15:20128 + +# Max wait time for an async chatgpt-web image to land via the celsius +# WebSocket, in milliseconds. Default 180000 (3 minutes). Increase during +# upstream queue-deep windows ("Lots of people are creating images right now"). +# OMNIROUTE_CGPT_WEB_IMAGE_TIMEOUT_MS=180000 + +# Total in-memory byte budget for the chatgpt-web image cache (used to serve +# /v1/chatgpt-web/image/), in megabytes. Default 256. Lower this if you +# run OmniRoute on a memory-constrained host; raise it if image generation +# is heavy and clients are racing the 30-minute TTL. +# OMNIROUTE_CGPT_WEB_IMAGE_CACHE_MAX_MB=256 + # Public cloud URL — client-side mirror of CLOUD_URL. NEXT_PUBLIC_CLOUD_URL= @@ -261,6 +279,7 @@ NEXT_PUBLIC_ENABLE_SOCKS5_PROXY=true # Used by MCP server, A2A skills, and CLI sidecars to call the running instance. # Explicit base URL for MCP/A2A tools to reach OmniRoute (overrides localhost auto-detect). +# For browser-visible generated image URLs, prefer OMNIROUTE_PUBLIC_BASE_URL above. 
# Used by: open-sse/mcp-server/server.ts, src/lib/a2a/ # OMNIROUTE_BASE_URL=http://localhost:20128 diff --git a/next.config.mjs b/next.config.mjs index 23277f01..2973459f 100644 --- a/next.config.mjs +++ b/next.config.mjs @@ -18,6 +18,20 @@ const nextConfig = { }, }, output: "standalone", + // OmniRoute is a proxy for AI APIs — request bodies routinely include + // multi-MB payloads (vision models, image edits, base64-encoded files, + // long chat histories with embedded images). Next.js's Server Action + // handler intercepts POSTs with multipart/form-data or + // x-www-form-urlencoded content-types and enforces a 1 MB cap that + // surfaces as a 413 with a confusing "Server Actions" hint, even on + // pure route handlers. 50 MB matches what most upstream LLM providers + // accept for image-bearing requests; tune via env if a deployment needs + // more. + experimental: { + serverActions: { + bodySizeLimit: process.env.OMNIROUTE_SERVER_ACTIONS_BODY_LIMIT || "50mb", + }, + }, outputFileTracingRoot: projectRoot, outputFileTracingExcludes: { // Planning/task docs are not runtime assets and can break standalone copies diff --git a/open-sse/config/imageRegistry.ts b/open-sse/config/imageRegistry.ts index 576c8d1a..2c39ebb2 100644 --- a/open-sse/config/imageRegistry.ts +++ b/open-sse/config/imageRegistry.ts @@ -136,6 +136,17 @@ export const IMAGE_PROVIDERS: Record = { supportedSizes: ["512x512", "1024x1024", "1024x1536", "1536x1024"], }, + "chatgpt-web": { + id: "chatgpt-web", + alias: "cgpt-web", + baseUrl: "https://chatgpt.com/backend-api/f/conversation", + authType: "apikey", + authHeader: "cookie", + format: "chatgpt-web", + models: [{ id: "gpt-5.3-instant", name: "GPT-5.3 Instant (ChatGPT Web Image)" }], + supportedSizes: ["1024x1024", "1024x1536", "1536x1024"], + }, + xai: { id: "xai", baseUrl: "https://api.x.ai/v1/images/generations", diff --git a/open-sse/executors/chatgpt-web.ts b/open-sse/executors/chatgpt-web.ts index f8df8ebb..4ea91179 100644 --- 
a/open-sse/executors/chatgpt-web.ts +++ b/open-sse/executors/chatgpt-web.ts @@ -22,6 +22,12 @@ import { TlsClientUnavailableError, type TlsFetchResult, } from "../services/chatgptTlsClient.ts"; +import { + storeChatGptImage, + getChatGptImageConversationContext, + __resetChatGptImageCacheForTesting, + type ChatGptImageConversationContext, +} from "../services/chatgptImageCache.ts"; // ─── Constants ────────────────────────────────────────────────────────────── @@ -129,20 +135,23 @@ function tokenLookup(cookie: string): TokenEntry | null { return entry; } +const TOKEN_CACHE_MAX = 200; + function tokenStore(cookie: string, entry: TokenEntry): void { - tokenCache.set(cookieKey(cookie), entry); - // Trim to 200 entries (matches Perplexity executor's session cache) - if (tokenCache.size > 200) { + // Bound the cache to TOKEN_CACHE_MAX entries (FIFO). Same shape as the + // image cache and warmup cache — drop the oldest before inserting. + if (tokenCache.size >= TOKEN_CACHE_MAX && !tokenCache.has(cookieKey(cookie))) { const firstKey = tokenCache.keys().next().value; if (firstKey) tokenCache.delete(firstKey); } + tokenCache.set(cookieKey(cookie), entry); } -// Conversation continuity is intentionally not cached. The conversation body -// sets `history_and_training_disabled: true` (Temporary Chat mode), and -// chatgpt.com expires those conversation_ids quickly — re-using them returns -// 404. Open WebUI and most OpenAI-API-style clients send the full history -// each turn anyway, so each request just starts a fresh conversation. +// Conversation continuity is intentionally not cached. Open WebUI and most +// OpenAI-API-style clients re-send the full history each turn, so each +// request just starts a fresh conversation. Temporary Chat mode is the +// default; it gets disabled per-request only for image-gen prompts, since +// that mode rejects the image_gen tool. 
// ─── /api/auth/session — exchange cookie for JWT ──────────────────────────── @@ -393,10 +402,11 @@ async function prepareChatRequirements( deviceId: string, cookie: string, dplInfo: { dpl: string; scriptSrc: string }, - signal: AbortSignal | null | undefined + signal: AbortSignal | null | undefined, + log?: { warn?: (tag: string, msg: string) => void } | null ): Promise { const config = buildPrekeyConfig(CHATGPT_USER_AGENT, dplInfo.dpl, dplInfo.scriptSrc); - const prekey = await buildPrepareToken(config); + const prekey = await buildPrepareToken(config, log); const headers: Record = { ...browserHeaders(), @@ -596,9 +606,15 @@ function buildPrekeyConfig(userAgent: string, dpl: string, scriptSrc: string): u /** * Build the `p` (prekey) value sent in the chat-requirements POST body. * - * Format: "gAAAAAC" + base64(JSON(config)), with a brief PoW solver loop - * (difficulty "0fffff") mutating config[3] to find a hash whose hex prefix - * is ≤ the difficulty. Mirrors chat2api / openai-sentinel. + * Format: "<prefix>" + base64(JSON(config)), with a PoW solver loop mutating + * config[3] to find a hash whose hex prefix is ≤ the target difficulty. + * Mirrors chat2api / openai-sentinel. + * - prepare: prefix="gAAAAAC", seed="" (target "0fffff") + * - chat-requirements: prefix="gAAAAAB", seed=<seed> (target=difficulty) + * + * Submitting an unsolved token still works on low-friction accounts, so we + * fall back to that after exhausting the iteration budget — but emit a warn + * log so production can see when it happens. + */ // PoW solvers run up to 100k–500k SHA3-512 hashes. 
To avoid blocking the // Node event loop on a busy server, we yield with `setImmediate` every @@ -611,50 +627,68 @@ function yieldToEventLoop(): Promise { return new Promise((resolve) => setImmediate(resolve)); } -async function buildPrepareToken(config: unknown[]): Promise { - const target = "0fffff"; - const cfg = [...config]; - for (let i = 0; i < 100_000; i++) { - if (i > 0 && i % POW_YIELD_EVERY === 0) await yieldToEventLoop(); - cfg[3] = i; - const json = JSON.stringify(cfg); - const b64 = Buffer.from(json).toString("base64"); - const hash = createHash("sha3-512").update(b64).digest("hex"); - if (hash.slice(0, target.length) <= target) { - return `gAAAAAC${b64}`; - } - } - // Fallback — submit unsolved; some clients do this and it still works. - const b64 = Buffer.from(JSON.stringify(cfg)).toString("base64"); - return `gAAAAAC${b64}`; +interface PowOptions { + config: unknown[]; + seed: string; + target: string; + prefix: string; + maxIter: number; + label: string; + log?: { warn?: (tag: string, msg: string) => void } | null; } -async function solveProofOfWork( - seed: string, - difficulty: string, - config: unknown[] -): Promise { - const target = (difficulty || "").toLowerCase(); - const cfg = [...config]; - const maxIter = 500_000; - - for (let i = 0; i < maxIter; i++) { +async function solvePow(opts: PowOptions): Promise { + const cfg = [...opts.config]; + for (let i = 0; i < opts.maxIter; i++) { if (i > 0 && i % POW_YIELD_EVERY === 0) await yieldToEventLoop(); cfg[3] = i; const json = JSON.stringify(cfg); const b64 = Buffer.from(json).toString("base64"); const hash = createHash("sha3-512") - .update(seed + b64) + .update(opts.seed + b64) .digest("hex"); - if (target && hash.slice(0, target.length) <= target) { - return `gAAAAAB${b64}`; + if (opts.target && hash.slice(0, opts.target.length) <= opts.target) { + return `${opts.prefix}${b64}`; } } - - // Fallback: submit unsolved with the gAAAAAB prefix; some clients do this - // and the request still goes 
through on legacy/low-friction prompts. + opts.log?.warn?.( + "CGPT-WEB", + `PoW (${opts.label}) exhausted ${opts.maxIter} iterations against target=${opts.target || "<none>"}; submitting unsolved token (Sentinel may reject)` + ); const b64 = Buffer.from(JSON.stringify(cfg)).toString("base64"); - return `gAAAAAB${b64}`; + return `${opts.prefix}${b64}`; +} + +async function buildPrepareToken( + config: unknown[], + log?: { warn?: (tag: string, msg: string) => void } | null +): Promise<string> { + return solvePow({ + config, + seed: "", + target: "0fffff", + prefix: "gAAAAAC", + maxIter: 100_000, + label: "prepare", + log, + }); +} + +async function solveProofOfWork( + seed: string, + difficulty: string, + config: unknown[], + log?: { warn?: (tag: string, msg: string) => void } | null +): Promise<string> { + return solvePow({ + config, + seed, + target: (difficulty || "").toLowerCase(), + prefix: "gAAAAAB", + maxIter: 500_000, + label: "conversation", + log, + }); +} // ─── OpenAI → ChatGPT message translation ─────────────────────────────────── @@ -663,11 +697,48 @@ interface ParsedMessages { systemMsg: string; history: Array<{ role: string; content: string }>; currentMsg: string; + latestImageContext: ChatGptImageConversationContext | null; +} + +/** + * Strip embedded `data:image/...` URIs out of message content so prior + * generated images don't get fed back into chatgpt.com on the next turn. + * + * Why: when image generation succeeds we emit `![image](data:image/png;base64,...)` + * — frequently 2–4 MB. Chat clients (Open WebUI, OpenAI-style apps) replay + * the full conversation history on the next request, so without this strip + * we'd send megabytes of base64 back upstream. chatgpt.com responds with an + * empty body when that happens (verified: 502 "ChatGPT returned empty + * response body" on the very next turn after an image gen succeeds), and + * even if it didn't, a single inlined image is well past the model's context + * limit. 
Replacing with a short placeholder keeps semantic continuity + * without the bytes. + */ +const DATA_URI_IMAGE_RE = /!\[([^\]]*)\]\(data:image\/[^)]+\)/g; +const CACHED_IMAGE_URL_RE = /\/v1\/chatgpt-web\/image\/([a-f0-9]{16,64})(?=[)\s"'<>]|$)/gi; + +function stripInlinedImages(content: string): string { + return content.replace(DATA_URI_IMAGE_RE, (_, alt) => + alt ? `[${alt}: generated image]` : "[generated image]" + ); +} + +function findCachedImageContext(content: string): ChatGptImageConversationContext | null { + let latest: ChatGptImageConversationContext | null = null; + // String.prototype.matchAll consumes a fresh iterator and ignores the + // regex's lastIndex, so no manual reset is required. + for (const match of content.matchAll(CACHED_IMAGE_URL_RE)) { + const id = match[1]; + const context = getChatGptImageConversationContext(id); + if (context) latest = context; + } + return latest; } function parseOpenAIMessages(messages: Array>): ParsedMessages { let systemMsg = ""; const history: Array<{ role: string; content: string }> = []; + let latestImageContext: ChatGptImageConversationContext | null = null; for (const msg of messages) { let role = String(msg.role || "user"); @@ -682,6 +753,9 @@ function parseOpenAIMessages(messages: Array>): ParsedMe .map((c) => String(c.text || "")) .join(" "); } + content = stripInlinedImages(content); + const imageContext = findCachedImageContext(content); + if (imageContext) latestImageContext = imageContext; if (!content.trim()) continue; if (role === "system") { @@ -696,7 +770,7 @@ function parseOpenAIMessages(messages: Array>): ParsedMe currentMsg = history.pop()!.content; } - return { systemMsg, history, currentMsg }; + return { systemMsg, history, currentMsg, latestImageContext }; } interface ChatGptMessage { @@ -705,10 +779,99 @@ interface ChatGptMessage { content: { content_type: "text"; parts: string[] }; } +/** + * Cheap heuristic: does the last user turn look like an image-generation + * request? 
Used to decide whether to disable Temporary Chat mode. + * + * Why a heuristic instead of always disabling Temporary Chat: when + * `history_and_training_disabled: false`, every conversation gets saved to + * the user's chatgpt.com history. For text-only chats that's noise — a + * dozen "OmniRoute" entries clutter the sidebar and can interact with + * ChatGPT's memory. We pay that cost only when the user actually wants an + * image, since Temporary Chat refuses image_gen with the message + * "I cannot generate images in this chat". + * + * False positives (text chat misclassified as image) → unnecessary history + * entry. False negatives (image request misclassified as text) → ChatGPT + * refuses image_gen and the user retries. Tuning leans toward false + * positives (we'd rather pollute history than refuse image generation). + */ +const IMAGE_GEN_REGEXES: RegExp[] = [ + // verb + (anything within 40 chars) + image-noun + /\b(?:generate|create|make|draw|paint|render|produce|design|sketch|illustrate|show me)\b[\s\S]{0,40}\b(?:image|picture|photo|photograph|drawing|illustration|sketch|painting|portrait|logo|icon|art|artwork|wallpaper|render|graphic)\b/i, + // image-noun + "of" — "image of a kitten", "picture of mountains" + /\b(?:image|picture|photo|photograph|illustration|drawing|painting|render)\s+of\b/i, + // direct verb + a/an article — "draw a kitten", "paint an apple" + /\b(?:draw|paint|sketch|render|illustrate)\s+(?:me\s+)?(?:a|an|some|the)\s+\w+/i, + // explicit slash command users sometimes type — "/imagine ..." + /^\s*\/(?:image|imagine|img|draw|paint)\b/im, +]; + +/** + * Markers Open WebUI uses for its background tool prompts (follow-up + * suggestions, title generation, tag categorization). These prompts embed + * the prior conversation in `` blocks and frequently quote + * the user's earlier "generate an image of..." request — which would + * trip the image-gen regex below. 
Skip them so we don't unnecessarily + * disable Temporary Chat and trigger image_gen on background tasks. + * + * Catching just one of these markers is enough; tool prompts always + * include several together. + */ +const OPENWEBUI_TOOL_PROMPT_MARKERS = [ + /<chat_history>/i, + /^### Task:/im, + /\bJSON format:\s*\{/i, + /\bfollow_?ups\b.*\barray of strings\b/i, +]; + +const OPENWEBUI_IMAGE_CONTEXT_MARKERS = [ + /\s*The requested image has been (?:created|edited and created) by the system successfully/i, + /\s*The requested image has been edited and created and is now being shown to the user/i, + /\s*Image generation was attempted but failed/i, +]; + +function hasOpenWebUIImageContext(parsed: ParsedMessages): boolean { + return OPENWEBUI_IMAGE_CONTEXT_MARKERS.some((re) => re.test(parsed.systemMsg)); +} + +function looksLikeImageGenRequest(parsed: ParsedMessages): boolean { + // Inspect only the latest user turn — historical turns are irrelevant + // (and could trigger false positives if the user mentioned an image + // generated previously). 
+ const text = parsed.currentMsg.trim(); + if (!text) return false; + if (OPENWEBUI_TOOL_PROMPT_MARKERS.some((re) => re.test(text))) return false; + if (hasOpenWebUIImageContext(parsed)) return false; + return IMAGE_GEN_REGEXES.some((re) => re.test(text)); +} + +const IMAGE_EDIT_REGEXES: RegExp[] = [ + /\b(?:edit|adjust|modify|change|update|alter|revise|retouch|fix)\b[\s\S]{0,120}\b(?:it|image|picture|photo|lighting|background|style|color|colour|composition|scene|time of day)\b/i, + /\b(?:make|turn|set|switch)\s+(?:it|the\s+(?:image|picture|photo|scene))\b[\s\S]{0,120}\b/i, + /\b(?:add|remove|replace)\b[\s\S]{0,120}\b(?:it|image|picture|photo|background|sky|person|object|text|logo)\b/i, + /\b(?:brighter|darker|night|daytime|time of day|sunset|sunrise|morning|evening|lighting|relight|background|style)\b/i, + /^\s*(?:now|then|also)\b[\s\S]{0,120}\b(?:make|turn|change|adjust|add|remove|replace|edit)\b/i, +]; + +function looksLikeImageEditRequest(parsed: ParsedMessages): boolean { + if (!parsed.latestImageContext) return false; + const text = parsed.currentMsg.trim(); + if (!text) return false; + if (OPENWEBUI_TOOL_PROMPT_MARKERS.some((re) => re.test(text))) return false; + if (hasOpenWebUIImageContext(parsed)) return false; + return IMAGE_EDIT_REGEXES.some((re) => re.test(text)); +} + function buildConversationBody( parsed: ParsedMessages, modelSlug: string, - parentMessageId: string + parentMessageId: string, + // When true, send as a regular (non-temporary) chat so the image_gen tool + // is available. When false (default), use Temporary Chat to keep chats + // out of the user's chatgpt.com history. + forImageGen: boolean, + continuation: ChatGptImageConversationContext | null = null ): Record { // Critical: do NOT send prior turns as separate `assistant` and `user` // messages in the `messages` array. 
ChatGPT's web API ("action: next") @@ -723,7 +886,7 @@ function buildConversationBody( if (parsed.systemMsg.trim()) { systemParts.push(parsed.systemMsg.trim()); } - if (parsed.history.length > 0) { + if (!continuation && parsed.history.length > 0) { const formatted = parsed.history .map((h) => `${h.role === "assistant" ? "Assistant" : "User"}: ${h.content}`) .join("\n\n"); @@ -741,22 +904,32 @@ function buildConversationBody( }); } + const currentUserContent = hasOpenWebUIImageContext(parsed) + ? "Briefly acknowledge the image result described in the system context. Do not generate, edit, or request another image." + : parsed.currentMsg || ""; + messages.push({ id: randomUUID(), author: { role: "user" }, - content: { content_type: "text", parts: [parsed.currentMsg || ""] }, + content: { content_type: "text", parts: [currentUserContent] }, }); return { action: "next", messages, model: modelSlug, - // Conversation continuity intentionally disabled — Temporary Chat mode - // expires conversation_ids quickly upstream and 404s on reuse. - conversation_id: null, - parent_message_id: parentMessageId, + // Text-only API-style requests start fresh because clients replay full + // history. Generated-image edits are the exception: ChatGPT needs the + // original conversation node to adjust the actual image, not just a + // markdown URL echoed back in a synthetic history block. + conversation_id: continuation?.conversationId ?? null, + parent_message_id: continuation?.parentMessageId ?? parentMessageId, timezone_offset_min: -new Date().getTimezoneOffset(), - history_and_training_disabled: true, + // Temporary Chat is the default. Disable it ONLY when the user is asking + // for an image — that lets ChatGPT use its image_gen tool, at the cost of + // saving the chat to the user's history. For text-only requests we keep + // Temporary Chat on so the user's history stays clean. 
+ history_and_training_disabled: !(forImageGen || continuation), suggestions: [], websocket_request_id: randomUUID(), }; @@ -778,6 +951,20 @@ interface ChatGptStreamEvent { v?: unknown; } +/** + * A part inside `content.parts` for a `multimodal_text` content_type. + * ChatGPT puts image references in a part with content_type "image_asset_pointer" + * and an asset_pointer like "file-service://file-XXXX" (final) or + * "sediment://..." (in-progress preview). + */ +interface ImageAssetPart { + content_type?: string; + asset_pointer?: string; + width?: number; + height?: number; + metadata?: Record; +} + async function* readChatGptSseEvents( body: ReadableStream, signal?: AbortSignal | null @@ -849,6 +1036,44 @@ interface ContentChunk { messageId?: string; error?: string; done?: boolean; + /** Image asset pointers seen on the current message (e.g. file-service://file-abc). */ + imagePointers?: ImagePointerRef[]; + /** + * True if the assistant invoked the async image_gen tool (we saw a task id + * in metadata or `turn_use_case: "image gen"` in server_ste_metadata). + * Set on the final `done: true` chunk so the caller can decide to poll the + * conversation endpoint for the actual image. + */ + imageGenAsync?: boolean; +} + +interface ImagePointerRef { + pointer: string; + messageId?: string; +} + +/** + * Pull image asset pointers out of a multimodal_text parts array. + * + * For text-only messages parts is `["text..."]` and this returns `[]`. For + * `image_gen` tool output, parts looks like: + * [ + * { content_type: "image_asset_pointer", + * asset_pointer: "file-service://file-abc..." or "sediment://..." } + * ] + * We collect every asset_pointer seen so the caller can resolve them once + * the stream terminates. 
+ */ +function extractImagePointers(parts: unknown[]): string[] { + const out: string[] = []; + for (const p of parts) { + if (!p || typeof p !== "object") continue; + const obj = p as ImageAssetPart; + if (obj.content_type === "image_asset_pointer" && typeof obj.asset_pointer === "string") { + out.push(obj.asset_pointer); + } + } + return out; } async function* extractContent( @@ -870,6 +1095,12 @@ async function* extractContent( let currentParts = ""; let emittedLen = 0; let isLive = false; + // Dedupe pointers across echoes / repeated events. Order-preserving Set. + const imagePointers = new Map(); + // True if we observed signals the assistant kicked off the async image_gen + // tool (see ContentChunk.imageGenAsync). The actual image arrives later via + // WebSocket / polling — caller handles that. + let imageGenAsync = false; for await (const event of readChatGptSseEvents(eventStream, signal)) { if (event.error) { @@ -883,8 +1114,37 @@ async function* extractContent( if (event.conversation_id) conversationId = event.conversation_id; + // Detect image_gen on top-level "server_ste_metadata" events. These don't + // have a `message` field so the post-message guard would skip them, but + // they're the most reliable signal — `turn_use_case: "image gen"`. + // + // Originally we also accepted `meta.tool_invoked === true`, but ChatGPT + // sets that flag for ANY internal tool the assistant uses (reasoning + // chains, web search, calc, file_search, etc.). That made plain text + // turns spuriously emit the "Generating image…" placeholder + 30s + // WebSocket wait. Image gen has a more specific signal we can rely on: + // either `turn_use_case === "image gen"` here, or an `image_gen_task_id` + // on a tool-role message (handled below). 
+ if (event.type === "server_ste_metadata") { + const meta = (event as Record).metadata as + | Record + | undefined; + if (meta && meta.turn_use_case === "image gen") { + imageGenAsync = true; + } + } + const m = event.message; if (!m) continue; + + // Tool messages with `image_gen_task_id` in metadata (the "Processing + // image..." card) confirm the async image_gen flow. We don't surface the + // tool message itself as text — it's just a placeholder — but we mark + // imageGenAsync so the executor knows to poll for the final image. + if (m.metadata && typeof m.metadata.image_gen_task_id === "string") { + imageGenAsync = true; + } + if (m.author?.role !== "assistant") continue; const id = m.id ?? null; @@ -903,6 +1163,25 @@ async function* extractContent( const parts = m.content?.parts ?? []; if (parts.length === 0) continue; + + // Image asset pointers: only collect once the message is finalized + // (status === "finished_successfully"). The same pointer may also appear + // on echoed prior turns at the head of the stream; that's fine — the Set + // dedupes, and the resolver in the executor produces the same URL either + // way. We could restrict to isLive-only to avoid resolving echoes, but + // that makes single-event instant responses (no in_progress phase) lose + // their image. Letting echoes through is harmless for correctness; the + // executor resolves each unique pointer at most once. + if (status === "finished_successfully" || status === "" || isLive) { + for (const ptr of extractImagePointers(parts)) { + const existing = imagePointers.get(ptr); + imagePointers.set( + ptr, + existing?.messageId ? existing : { pointer: ptr, ...(id ? { messageId: id } : {}) } + ); + } + } + const cumulative = parts.map((p) => (typeof p === "string" ? p : "")).join(""); if (cumulative.length > currentParts.length) { currentParts = cumulative; @@ -937,6 +1216,8 @@ async function* extractContent( answer: currentParts, conversationId: conversationId ?? 
undefined, messageId: currentId ?? undefined, + imagePointers: imagePointers.size > 0 ? Array.from(imagePointers.values()) : undefined, + imageGenAsync, done: true, }; } @@ -947,11 +1228,64 @@ function sseChunk(data: unknown): string { return `data: ${JSON.stringify(data)}\n\n`; } +/** + * Resolves a ChatGPT asset_pointer to a downloadable URL, given the live + * conversation_id (needed for sediment:// pointers). Returns null on failure + * so the caller can decide whether to surface a placeholder or skip silently. + */ +type ImageResolver = ( + assetPointer: string, + conversationId: string | null, + parentMessageId?: string | null +) => Promise; + +/** Build the final markdown block for a list of resolved image URLs. */ +function imageMarkdown(urls: string[]): string { + if (urls.length === 0) return ""; + // Two leading newlines → ensure separation from any prior text the model + // produced ("Here is your kitten:\n\n![image](...)"). One image per line. + return "\n\n" + urls.map((u) => `![image](${u})`).join("\n\n"); +} + +async function resolveImagePointers( + pointers: ImagePointerRef[] | undefined, + conversationId: string | null, + resolver: ImageResolver | null, + log?: { warn?: (tag: string, msg: string) => void } | null, + fallbackParentMessageId?: string | null +): Promise { + if (!pointers || pointers.length === 0 || !resolver) return []; + const urls: string[] = []; + for (const ref of pointers) { + try { + const url = await resolver( + ref.pointer, + conversationId, + ref.messageId ?? fallbackParentMessageId + ); + if (url) urls.push(url); + } catch (err) { + log?.warn?.( + "CGPT-WEB", + `Image resolve failed (${ref.pointer}): ${err instanceof Error ? 
err.message : String(err)}` + ); + } + } + return urls; +} + function buildStreamingResponse( eventStream: ReadableStream, model: string, cid: string, created: number, + resolver: ImageResolver | null, + // Optional poller for async image_gen — when ChatGPT processes the request + // out-of-band ("Lots of people are creating images right now"), the SSE + // stream finishes without an image_asset_pointer. The executor passes a + // closure here that knows how to poll the conversation endpoint. + pollAsyncImage: ((conversationId: string) => Promise) | null, + log: { warn?: (tag: string, msg: string) => void } | null, signal?: AbortSignal | null ): ReadableStream { const encoder = new TextEncoder(); @@ -974,7 +1308,14 @@ function buildStreamingResponse( ) ); + let conversationId: string | null = null; + let imagePointers: ImagePointerRef[] | undefined; + let imageGenAsync = false; + let parentCandidateMessageId: string | null = null; + for await (const chunk of extractContent(eventStream, signal)) { + if (chunk.conversationId) conversationId = chunk.conversationId; + if (chunk.messageId) parentCandidateMessageId = chunk.messageId; if (chunk.error) { controller.enqueue( encoder.encode( @@ -999,6 +1340,9 @@ function buildStreamingResponse( } if (chunk.done) { + imagePointers = chunk.imagePointers; + imageGenAsync = chunk.imageGenAsync ?? false; + if (chunk.messageId) parentCandidateMessageId = chunk.messageId; break; } @@ -1028,19 +1372,167 @@ function buildStreamingResponse( } } - controller.enqueue( - encoder.encode( - sseChunk({ - id: cid, - object: "chat.completion.chunk", - created, - model, - system_fingerprint: null, - choices: [{ index: 0, delta: {}, finish_reason: "stop", logprobs: null }], - }) + // If the assistant kicked off the async image_gen tool, the SSE + // stream ends with a "Processing image..." placeholder. Poll the + // conversation endpoint in the background for the final pointer. 
+ // We only kick polling off if the in-stream pointers are empty — + // sometimes the synchronous path also fires and we already have one. + // Heartbeat helper: while we wait on long-running async work + // (WebSocket for image-gen, /files/download → 2-3 MB image fetch), + // the SSE stream goes quiet and Open WebUI's HTTP client times out + // at ~30s. We saw this in production: `disconnect: ResponseAborted` + // followed by "Controller is already closed". + // + // Layered traps to avoid: + // - SSE comments (`: ...`) are silently ignored by aiohttp's + // read-activity tracker. + // - Empty `delta:{}` chunks ARE emitted by us but get filtered + // out upstream by `hasValuableContent` in + // `open-sse/utils/streamHelpers.ts` (it requires content, + // role, or finish_reason on OpenAI chunks). + // + // So heartbeats are zero-width-space content deltas (`"​"`): + // they pass the valuable-content filter (non-empty content), reach + // the client as data events, and render as nothing visible. + const startHeartbeat = (intervalMs = 5_000): (() => void) => { + const heartbeatChunk = sseChunk({ + id: cid, + object: "chat.completion.chunk", + created, + model, + system_fingerprint: null, + choices: [{ index: 0, delta: { content: "​" }, finish_reason: null, logprobs: null }], + }); + const timer = setInterval(() => { + try { + controller.enqueue(encoder.encode(heartbeatChunk)); + } catch { + // Controller may already be closed if the client disconnected + // — just stop firing. + clearInterval(timer); + } + }, intervalMs); + return () => clearInterval(timer); + }; + + if ( + imageGenAsync && + conversationId && + (!imagePointers || imagePointers.length === 0) && + pollAsyncImage + ) { + // Tell the user something is happening — long polls otherwise + // look like a hang on the client side. The "..." plus a typing + // cue renders nicely in Open WebUI. 
+ controller.enqueue( + encoder.encode( + sseChunk({ + id: cid, + object: "chat.completion.chunk", + created, + model, + system_fingerprint: null, + choices: [ + { + index: 0, + delta: { content: "_Generating image…_\n\n" }, + finish_reason: null, + logprobs: null, + }, + ], + }) + ) + ); + const stopHb = startHeartbeat(); + try { + const polled = await pollAsyncImage(conversationId); + if (polled.length > 0) imagePointers = polled; + } catch (err) { + log?.warn?.( + "CGPT-WEB", + `Async image poll failed: ${err instanceof Error ? err.message : String(err)}` + ); + } finally { + stopHb(); + } + } + + // Resolve and append any image markdown after the text deltas finish + // streaming. Downloading and caching the image bytes can take 1-3 + // seconds for big images, so keep the heartbeat running here too. + const stopHb2 = startHeartbeat(); + let urls: string[] = []; + try { + urls = await resolveImagePointers( + imagePointers, + conversationId, + resolver, + log, + parentCandidateMessageId + ); + } finally { + stopHb2(); + } + // Bail out cleanly if the client disconnected during the wait — + // any further enqueue throws "Invalid state: Controller is + // already closed". Better to no-op than to surface that as a + // server error. + if (signal?.aborted) return; + const mdBlock = imageMarkdown(urls); + const safeEnqueue = (bytes: Uint8Array): boolean => { + try { + controller.enqueue(bytes); + return true; + } catch { + return false; + } + }; + // The image markdown is now a small URL (we cache the bytes in + // memory and serve them at /v1/chatgpt-web/image/), so a + // single SSE chunk is fine — no aiohttp LineTooLong concerns + // and the markdown renderer in Open WebUI sees the URL whole + // and renders an `` immediately. 
+ if (mdBlock) { + if ( + !safeEnqueue( + encoder.encode( + sseChunk({ + id: cid, + object: "chat.completion.chunk", + created, + model, + system_fingerprint: null, + choices: [ + { + index: 0, + delta: { content: mdBlock }, + finish_reason: null, + logprobs: null, + }, + ], + }) + ) + ) ) - ); - controller.enqueue(encoder.encode("data: [DONE]\n\n")); + return; + } + + if ( + !safeEnqueue( + encoder.encode( + sseChunk({ + id: cid, + object: "chat.completion.chunk", + created, + model, + system_fingerprint: null, + choices: [{ index: 0, delta: {}, finish_reason: "stop", logprobs: null }], + }) + ) + ) + ) + return; + safeEnqueue(encoder.encode("data: [DONE]\n\n")); } catch (err) { controller.enqueue( encoder.encode( @@ -1077,11 +1569,20 @@ async function buildNonStreamingResponse( cid: string, created: number, currentMsg: string, + resolver: ImageResolver | null, + pollAsyncImage: ((conversationId: string) => Promise) | null, + log: { warn?: (tag: string, msg: string) => void } | null, signal?: AbortSignal | null ): Promise { let fullAnswer = ""; + let conversationId: string | null = null; + let imagePointers: ImagePointerRef[] | undefined; + let imageGenAsync = false; + let parentCandidateMessageId: string | null = null; for await (const chunk of extractContent(eventStream, signal)) { + if (chunk.conversationId) conversationId = chunk.conversationId; + if (chunk.messageId) parentCandidateMessageId = chunk.messageId; if (chunk.error) { return new Response( JSON.stringify({ @@ -1092,12 +1593,43 @@ async function buildNonStreamingResponse( } if (chunk.done) { fullAnswer = chunk.answer || fullAnswer; + imagePointers = chunk.imagePointers; + imageGenAsync = chunk.imageGenAsync ?? false; + if (chunk.messageId) parentCandidateMessageId = chunk.messageId; break; } if (chunk.answer) fullAnswer = chunk.answer; } fullAnswer = cleanChatGptText(fullAnswer); + + // Async image gen: SSE ended with "Processing image..." 
— poll for the + // final pointer the same way the streaming path does. + if ( + imageGenAsync && + conversationId && + (!imagePointers || imagePointers.length === 0) && + pollAsyncImage + ) { + try { + const polled = await pollAsyncImage(conversationId); + if (polled.length > 0) imagePointers = polled; + } catch (err) { + log?.warn?.( + "CGPT-WEB", + `Async image poll failed: ${err instanceof Error ? err.message : String(err)}` + ); + } + } + + const urls = await resolveImagePointers( + imagePointers, + conversationId, + resolver, + log, + parentCandidateMessageId + ); + fullAnswer += imageMarkdown(urls); const promptTokens = Math.ceil(currentMsg.length / 4); const completionTokens = Math.ceil(fullAnswer.length / 4); @@ -1135,6 +1667,578 @@ function errorResponse(status: number, message: string, code?: string): Response ); } +function normalizePublicBaseUrl(value?: string | null): string | null { + const trimmed = value?.trim(); + if (!trimmed) return null; + return trimmed.replace(/\/+$/, "").replace(/\/v1$/i, ""); +} + +function firstForwardedValue(value?: string | null): string | null { + const first = value?.split(",")[0]?.trim(); + return first || null; +} + +function isLocalBaseUrl(baseUrl: string): boolean { + try { + const host = new URL(baseUrl).hostname.toLowerCase(); + return host === "localhost" || host === "127.0.0.1" || host === "::1" || host === "0.0.0.0"; + } catch { + return /\b(?:localhost|127\.0\.0\.1|0\.0\.0\.0)\b/i.test(baseUrl); + } +} + +function deriveHeaderBaseUrl(clientHeaders?: Record | null): string | null { + const headers = clientHeaders ?? 
{}; + const lower: Record = {}; + for (const [k, v] of Object.entries(headers)) lower[k.toLowerCase()] = v; + + const forwardedHost = firstForwardedValue(lower["x-forwarded-host"]); + const forwardedProto = firstForwardedValue(lower["x-forwarded-proto"]); + const host = forwardedHost || firstForwardedValue(lower["host"]); + if (!host) return null; + + // Default to http for IPs, localhost, and explicit host:port values where + // TLS is not a safe assumption. Reverse proxies can override via + // x-forwarded-proto, and deployments can force the exact value with + // OMNIROUTE_PUBLIC_BASE_URL. + const isPlain = + host.includes("localhost") || + /^\d+\.\d+\.\d+\.\d+(:\d+)?$/.test(host) || + host.endsWith(".local") || + host.includes(":"); + const proto = forwardedProto || (isPlain ? "http" : "https"); + return `${proto}://${host}`; +} + +/** + * Build the absolute base URL the client should use to fetch our cached + * images at /v1/chatgpt-web/image/. The most reliable value is an + * explicit browser-facing origin because relay clients such as Open WebUI + * often reach OmniRoute from a container while the user's browser needs a + * LAN, tunnel, or reverse-proxy URL. + */ +function derivePublicBaseUrl( + clientHeaders?: Record | null, + log?: { debug?: (tag: string, msg: string) => void } +): string { + const explicitPublicBase = normalizePublicBaseUrl(process.env.OMNIROUTE_PUBLIC_BASE_URL); + if (explicitPublicBase) { + log?.debug?.("CGPT-WEB", `derivePublicBaseUrl: using OMNIROUTE_PUBLIC_BASE_URL`); + return explicitPublicBase; + } + + const headerBase = deriveHeaderBaseUrl(clientHeaders); + const configuredBase = + normalizePublicBaseUrl(process.env.OMNIROUTE_BASE_URL) || + normalizePublicBaseUrl(process.env.NEXT_PUBLIC_BASE_URL); + + log?.debug?.( + "CGPT-WEB", + `derivePublicBaseUrl: configured=${configuredBase ?? "-"} header=${headerBase ?? 
"-"}` + ); + + if (configuredBase && (!headerBase || !isLocalBaseUrl(configuredBase))) return configuredBase; + if (headerBase) return headerBase; + if (configuredBase) return configuredBase; + + return `http://localhost:${process.env.PORT || 20128}`; +} + +// ─── Image asset resolution ──────────────────────────────────────────────── +// ChatGPT's image_gen tool emits `image_asset_pointer` parts whose +// `asset_pointer` is one of: +// +// file-service://file-XXXX → resolved via /backend-api/files/{id}/download +// sediment://file-XXXX → resolved via /backend-api/conversation/{conv_id}/attachment/{id}/download +// +// Both endpoints return JSON `{ download_url: "", ... }`. +// The signed URL has a limited lifetime (typically a few hours), but that's +// usually sufficient for the user to view the image in their UI right after +// generation. Persistent storage can be layered on later if needed. + +const FILE_SERVICE_PREFIX = "file-service://"; +const SEDIMENT_PREFIX = "sediment://"; + +interface ResolverContext { + accessToken: string; + accountId: string | null; + sessionId: string; + deviceId: string; + cookie: string; + signal?: AbortSignal | null; + log?: { debug?: (tag: string, msg: string) => void; warn?: (tag: string, msg: string) => void }; + /** + * Absolute base URL that downstream clients should use to fetch cached + * images served by /v1/chatgpt-web/image/. Derived from the inbound + * request host so the URL is reachable from whatever network the client + * came in on (localhost, Tailscale, cloudflared tunnel, etc.). 
+ */ + publicBaseUrl: string; +} + +async function fetchDownloadUrl(endpoint: string, ctx: ResolverContext): Promise { + const headers: Record = { + ...browserHeaders(), + ...oaiHeaders(ctx.sessionId, ctx.deviceId), + Accept: "application/json", + Authorization: `Bearer ${ctx.accessToken}`, + Cookie: buildSessionCookieHeader(ctx.cookie), + }; + if (ctx.accountId) headers["chatgpt-account-id"] = ctx.accountId; + + const response = await tlsFetchChatGpt(endpoint, { + method: "GET", + headers, + timeoutMs: 30_000, + signal: ctx.signal, + }); + if (response.status !== 200) { + ctx.log?.warn?.( + "CGPT-WEB", + `Image download URL fetch failed (${response.status}) for ${endpoint}` + ); + return null; + } + let parsed: { download_url?: string } = {}; + try { + parsed = JSON.parse(response.text || "{}"); + } catch { + return null; + } + return parsed.download_url ?? null; +} + +/** + * Download a chatgpt.com signed image URL and re-serve it from OmniRoute's + * short-lived image cache. The URLs returned by /files//download and + * /conversation//attachment//download point at chatgpt.com's + * estuary endpoint, which 403s for any request without the user's session + * cookie. Downstream clients (Open WebUI, OpenAI-compatible apps) won't + * have those cookies, so we download once via the authenticated TLS client + * and return a browser-fetchable OmniRoute URL. 
+ */ +const IMAGE_DOWNLOAD_MAX_BYTES = 8 * 1024 * 1024; + +async function imageUrlToCachedImageUrl( + signedUrl: string, + ctx: ResolverContext, + imageContext?: ChatGptImageConversationContext +): Promise { + const headers: Record = { + ...browserHeaders(), + Accept: "image/*,*/*;q=0.8", + Authorization: `Bearer ${ctx.accessToken}`, + Cookie: buildSessionCookieHeader(ctx.cookie), + }; + if (ctx.accountId) headers["chatgpt-account-id"] = ctx.accountId; + + let response: TlsFetchResult; + try { + response = await tlsFetchChatGpt(signedUrl, { + method: "GET", + headers, + timeoutMs: 60_000, + signal: ctx.signal, + // Required for binary payloads — the underlying tls-client returns + // bytes as a `data:;base64,...` string when this is true. + // Without it, raw image bytes get mangled by UTF-8 decoding. + byteResponse: true, + }); + } catch (err) { + ctx.log?.warn?.( + "CGPT-WEB", + `Image fetch failed: ${err instanceof Error ? err.message : String(err)}` + ); + return null; + } + + if (response.status !== 200) { + ctx.log?.warn?.( + "CGPT-WEB", + `Image fetch returned HTTP ${response.status} (${(response.text || "").slice(0, 120)})` + ); + return null; + } + + if (response.text == null || response.text.length === 0) return null; + + // tls-client-node already returns binary bodies as a "data:;base64,..." + // string (see node_modules/tls-client-node/dist/response.js — its bytes() + // method splits on the comma to extract base64). Decode back into bytes + // so we can hand them to the cache. + let bytes: Buffer; + let mime: string; + if (/^data:[^;]{1,256};base64,/.test(response.text)) { + const commaIdx = response.text.indexOf(","); + const header = response.text.slice(5, commaIdx); // strip "data:" + mime = header.split(";")[0] || "image/png"; + bytes = Buffer.from(response.text.slice(commaIdx + 1), "base64"); + } else { + // Plain-text body (shouldn't happen for binary downloads with + // byteResponse:true, but handle defensively). 
+ bytes = Buffer.from(response.text, "binary"); + mime = response.headers.get("content-type")?.split(";")[0]?.trim() || "image/png"; + } + if (bytes.length === 0 || bytes.length > IMAGE_DOWNLOAD_MAX_BYTES) { + if (bytes.length > IMAGE_DOWNLOAD_MAX_BYTES) { + ctx.log?.warn?.( + "CGPT-WEB", + `Image too large to cache (${bytes.length} bytes > ${IMAGE_DOWNLOAD_MAX_BYTES}); skipping` + ); + } + return null; + } + // Cache the image and return a stable HTTP URL pointing at our own + // /v1/chatgpt-web/image/ route. Streaming the raw base64 back via + // SSE deltas works but Open WebUI's progressive markdown renderer shows + // each chunk as plain text mid-stream — the user sees megabytes of + // base64 scroll past before the image renders. URL-based delivery + // produces a small markdown delta and renders instantly when the + // browser fetches the URL. + const id = storeChatGptImage(bytes, mime, undefined, imageContext); + return `${ctx.publicBaseUrl}/v1/chatgpt-web/image/${id}`; +} + +/** + * Resolve the async image_gen result by registering a WebSocket with + * chatgpt.com and listening for the image_asset_pointer. + * + * Background: when chatgpt.com is busy ("Lots of people are creating images + * right now") the image_gen tool defers — the initial SSE finishes with a + * "Processing image..." placeholder and the real image arrives over a + * WebSocket pubsub. (We checked: the conversation tree at + * `/backend-api/conversation/{id}` is NOT updated when the image lands, so + * polling that endpoint does nothing.) + * + * Flow: + * 1. POST /backend-api/register-websocket → { wss_url, expires_at, ... } + * 2. Open the wss_url with the standard WebSocket client. + * Auth lives in the URL (signed access token), so we don't need the + * TLS-impersonation transport here. + * 3. Each WS message is JSON like { type: "wss-message", data: { ... + * conversation event ... } }. The conversation event has the same + * shape as the SSE events from /backend-api/f/conversation. 
+ * 4. Watch for assistant messages with multimodal_text + image_asset_pointer + * OR a `message_stream_complete` for the conversation. Resolve when + * either pointer arrives or the timeout fires. + */ +async function registerWebSocket(ctx: ResolverContext): Promise { + // chatgpt.com migrated from POST /backend-api/register-websocket to a + // GET-only endpoint under /backend-api/celsius/ws/user. The response shape + // also changed from `{ wss_url }` → `{ websocket_url }`. Newer codebases + // (g4f, etc.) all hit the celsius path; the legacy path now 404s. + // Keep the legacy path as a fallback for older deployments. + const candidates = [ + { url: `${CHATGPT_BASE}/backend-api/celsius/ws/user`, method: "GET" as const }, + { url: `${CHATGPT_BASE}/backend-api/register-websocket`, method: "POST" as const }, + ]; + const headers: Record = { + ...browserHeaders(), + ...oaiHeaders(ctx.sessionId, ctx.deviceId), + Accept: "application/json", + Authorization: `Bearer ${ctx.accessToken}`, + Cookie: buildSessionCookieHeader(ctx.cookie), + }; + if (ctx.accountId) headers["chatgpt-account-id"] = ctx.accountId; + + for (const { url, method } of candidates) { + let r: TlsFetchResult; + try { + r = await tlsFetchChatGpt(url, { + method, + headers, + body: method === "POST" ? "" : undefined, + timeoutMs: 30_000, + signal: ctx.signal, + }); + } catch (err) { + ctx.log?.warn?.( + "CGPT-WEB", + `register-websocket fetch failed for ${url}: ${err instanceof Error ? err.message : String(err)}` + ); + continue; + } + if (r.status === 200) { + try { + const data = JSON.parse(r.text || "{}") as { + websocket_url?: string; + wss_url?: string; + }; + const ws = data.websocket_url ?? 
data.wss_url; + if (ws) { + ctx.log?.debug?.("CGPT-WEB", `Got WebSocket URL via ${url}`); + return ws; + } + } catch { + /* fall through */ + } + } + ctx.log?.warn?.( + "CGPT-WEB", + `register-websocket via ${url} → ${r.status}: ${(r.text || "").slice(0, 200)}` + ); + } + return null; +} + +interface WsWaitOutcome { + pointers: ImagePointerRef[]; + /** True if the connection emitted an error event. Used by the retry layer + * to decide whether a transport blip is worth a second attempt. */ + errored: boolean; + /** True if any frame (message or open) was actually received from the + * server. A retry is most valuable when the connection died before + * exchanging any data. */ + gotAnyMessage: boolean; +} + +async function waitForImageViaWebSocket( + wssUrl: string, + conversationId: string, + timeoutMs: number, + ctx: ResolverContext +): Promise { + return new Promise((resolve) => { + const found = new Map(); + let resolved = false; + let errored = false; + let gotAnyMessage = false; + const finish = () => { + if (resolved) return; + resolved = true; + try { + ws.close(); + } catch { + /* ignore */ + } + resolve({ + pointers: Array.from(found.values()), + errored, + gotAnyMessage, + }); + }; + const ws = new WebSocket(wssUrl); + const timer = setTimeout(() => { + ctx.log?.warn?.("CGPT-WEB", `WebSocket image wait timed out after ${timeoutMs}ms`); + finish(); + }, timeoutMs); + const onAbort = () => { + ctx.log?.debug?.("CGPT-WEB", "WebSocket aborted by client"); + finish(); + }; + ctx.signal?.addEventListener?.("abort", onAbort); + ws.onopen = () => { + gotAnyMessage = true; + ctx.log?.debug?.("CGPT-WEB", "WebSocket open — waiting for image events"); + }; + ws.onerror = (e) => { + errored = true; + ctx.log?.warn?.("CGPT-WEB", `WebSocket error: ${(e as ErrorEvent).message ?? 
"unknown"}`); + }; + ws.onclose = () => { + clearTimeout(timer); + ctx.signal?.removeEventListener?.("abort", onAbort); + finish(); + }; + ws.onmessage = (event) => { + gotAnyMessage = true; + let payload: unknown; + const raw = typeof event.data === "string" ? event.data : event.data.toString(); + try { + payload = JSON.parse(raw); + } catch { + return; + } + // chatgpt.com's celsius WS frames look like: + // { type: "conversation-update", + // payload: { conversation_id: "...", + // update_content: { message: { ... }, ... } } } + // Older deployments wrapped the conversation event directly as { data }. + const obj = payload as Record; + const candidates: ChatGptStreamEvent[] = []; + const innerPayload = obj.payload as Record | undefined; + const updateContent = innerPayload?.update_content as Record | undefined; + if (updateContent?.message) { + candidates.push({ + message: updateContent.message as ChatGptStreamEvent["message"], + conversation_id: innerPayload?.conversation_id as string | undefined, + }); + } + if (innerPayload?.message) { + candidates.push({ + message: innerPayload.message as ChatGptStreamEvent["message"], + conversation_id: innerPayload.conversation_id as string | undefined, + }); + } + if ((obj.data as { message?: unknown } | undefined)?.message) { + candidates.push(obj.data as ChatGptStreamEvent); + } + + for (const data of candidates) { + if (data?.conversation_id && data.conversation_id !== conversationId) continue; + const m = data?.message; + // The async image_gen result arrives as a TOOL-role message + // ({"author":{"role":"tool","name":"t2uay3k.sj1i4kz"}}), so we + // accept tool messages here too — extractImagePointers does the + // actual content_type filtering. + if (Array.isArray(m?.content?.parts)) { + for (const ptr of extractImagePointers(m.content?.parts ?? [])) { + const existing = found.get(ptr); + found.set( + ptr, + existing?.messageId + ? existing + : { pointer: ptr, ...(m?.id ? 
{ messageId: m.id } : {}) } + ); + } + } + if (m?.metadata && typeof m.metadata === "object") { + const md = m.metadata as Record; + const ptr = (md.asset_pointer ?? md.image_asset_pointer) as string | undefined; + if (typeof ptr === "string") { + const existing = found.get(ptr); + found.set( + ptr, + existing?.messageId + ? existing + : { pointer: ptr, ...(m?.id ? { messageId: m.id } : {}) } + ); + } + } + } + if (found.size > 0) finish(); + }; + }); +} + +// Default 3-minute wait for the async image_gen tool to produce an image +// pointer over the celsius WebSocket. Tunable so deployments can stretch +// during chatgpt.com queue-deep windows ("Lots of people are creating +// images right now") without code changes. +const DEFAULT_ASYNC_IMAGE_TIMEOUT_MS = 180_000; + +function configuredAsyncImageTimeoutMs(): number { + const raw = Number(process.env.OMNIROUTE_CGPT_WEB_IMAGE_TIMEOUT_MS); + if (!Number.isFinite(raw) || raw <= 0) return DEFAULT_ASYNC_IMAGE_TIMEOUT_MS; + return Math.floor(raw); +} + +async function pollForAsyncImage( + conversationId: string, + ctx: ResolverContext, + opts: { timeoutMs?: number } = {} +): Promise { + const totalTimeoutMs = opts.timeoutMs ?? configuredAsyncImageTimeoutMs(); + const deadline = Date.now() + totalTimeoutMs; + + // One reconnect attempt on transport error: the WS endpoint is signed and + // short-lived, and a network blip during the long wait would otherwise + // lose the image entirely. The deadline is shared across attempts so we + // never exceed the caller's budget. + for (let attempt = 0; attempt < 2; attempt++) { + const remaining = deadline - Date.now(); + if (remaining <= 0) break; + const wssUrl = await registerWebSocket(ctx); + if (!wssUrl) { + ctx.log?.warn?.( + "CGPT-WEB", + attempt === 0 + ? 
"Could not register WebSocket — async image gen not retrievable" + : `WebSocket re-registration failed on retry attempt ${attempt + 1}` + ); + if (attempt === 0) continue; // try again — registration can be flaky + return []; + } + ctx.log?.debug?.( + "CGPT-WEB", + `Registered WebSocket for async image (attempt ${attempt + 1}, ${remaining}ms remaining)` + ); + const outcome = await waitForImageViaWebSocket(wssUrl, conversationId, remaining, ctx); + if (outcome.pointers.length > 0) return outcome.pointers; + if (ctx.signal?.aborted) return []; + // Only retry when the connection died before producing anything useful. + // A clean close with no pointers (e.g., upstream cancellation) shouldn't + // burn a second attempt — the result would be the same. + if (!outcome.errored || outcome.gotAnyMessage) return []; + ctx.log?.warn?.( + "CGPT-WEB", + `WebSocket attempt ${attempt + 1} ended in transport error before any frame; retrying` + ); + } + return []; +} + +function makeImageResolver(ctx: ResolverContext): ImageResolver { + // Cache resolutions across the same request — the same pointer can show up + // on multiple SSE events (in-progress + finished_successfully). One HTTP + // round-trip per unique pointer is enough. + const cache = new Map(); + + return async (assetPointer, conversationId, parentMessageId) => { + if (cache.has(assetPointer)) return cache.get(assetPointer) ?? 
null; + + let fileId: string | null = null; + if (assetPointer.startsWith(FILE_SERVICE_PREFIX)) { + fileId = assetPointer.slice(FILE_SERVICE_PREFIX.length); + } else if (assetPointer.startsWith(SEDIMENT_PREFIX)) { + fileId = assetPointer.slice(SEDIMENT_PREFIX.length); + } else { + ctx.log?.warn?.("CGPT-WEB", `Unknown asset_pointer scheme: ${assetPointer}`); + } + + let signedUrl: string | null = null; + if (fileId) { + // Both endpoints return a chatgpt.com estuary URL signed for the + // user's current session — that URL 403s without the cookie, so + // downstream clients can't fetch it directly. We download once via + // the authenticated TLS client and expose the bytes through + // OmniRoute's short-lived image cache. + // + // /files/{id}/download is the historical path. It works for + // chat-uploaded files and the older image_gen output format + // (`file-XXXX`). Newer image-edit results from continued + // conversations land with a `file_00000000XXXX` shape that 422s on + // /files/{id}/download — they're conversation-scoped attachments + // and only resolve through /conversation/{cid}/attachment/{fid}/ + // download. We try /files first because it's cheaper and works for + // the common case, then fall through. + signedUrl = await fetchDownloadUrl( + `${CHATGPT_BASE}/backend-api/files/${encodeURIComponent(fileId)}/download`, + ctx + ); + if (!signedUrl && conversationId) { + signedUrl = await fetchDownloadUrl( + `${CHATGPT_BASE}/backend-api/conversation/${encodeURIComponent(conversationId)}/attachment/${encodeURIComponent(fileId)}/download`, + ctx + ); + } + } + + let finalUrl: string | null = null; + if (signedUrl) { + // chatgpt.com signed URLs require the user's session cookie to fetch, + // so we materialize the bytes into our own cache and emit an OmniRoute + // URL. If that fails (oversize, network error, etc.) we return null — + // never the signed URL — because handing it back would emit broken + // markdown that 403s for the client. 
Better to drop the image silently + // than render a broken link. + finalUrl = await imageUrlToCachedImageUrl( + signedUrl, + ctx, + conversationId && parentMessageId ? { conversationId, parentMessageId } : undefined + ); + } + cache.set(assetPointer, finalUrl); + if (finalUrl) { + const preview = finalUrl.startsWith("data:") + ? `data:... (${finalUrl.length} chars)` + : finalUrl.slice(0, 80) + "..."; + ctx.log?.debug?.("CGPT-WEB", `Resolved ${assetPointer} → ${preview}`); + } + return finalUrl; + }; +} + // ─── Executor ─────────────────────────────────────────────────────────────── export class ChatGptWebExecutor extends BaseExecutor { @@ -1150,6 +2254,7 @@ export class ChatGptWebExecutor extends BaseExecutor { signal, log, onCredentialsRefreshed, + clientHeaders, }: ExecuteInput) { const messages = (body as Record | null)?.messages as | Array> @@ -1267,7 +2372,8 @@ export class ChatGptWebExecutor extends BaseExecutor { deviceId, cookie, dplInfo, - signal + signal, + log ); } catch (err) { if (err instanceof SentinelBlockedError) { @@ -1320,7 +2426,8 @@ export class ChatGptWebExecutor extends BaseExecutor { proofToken = await solveProofOfWork( reqs.proofofwork.seed, reqs.proofofwork.difficulty, - powConfig + powConfig, + log ); } @@ -1335,15 +2442,31 @@ export class ChatGptWebExecutor extends BaseExecutor { }; } - // Conversation continuity is intentionally disabled. The body sets - // `history_and_training_disabled: true` (Temporary Chat mode) and - // chatgpt.com expires those conversation_ids quickly — re-using them - // returns 404. Each request starts a fresh conversation; clients (Open - // WebUI, OpenAI-API-style) send the full history each turn anyway. - const parentMessageId = randomUUID(); + // Toggle Temporary Chat off only for image-generation requests, since + // Temporary Chat disables the image_gen tool. For plain text turns we + // keep Temporary Chat on so the user's chatgpt.com history isn't + // polluted with router traffic. 
+ const imageEdit = looksLikeImageEditRequest(parsed); + const continuation = imageEdit ? parsed.latestImageContext : null; + const forImageGen = looksLikeImageGenRequest(parsed) || imageEdit; + if (forImageGen) { + log?.debug?.( + "CGPT-WEB", + continuation + ? "Image edit intent detected — continuing saved image conversation" + : "Image-gen intent detected — disabling Temporary Chat for this turn" + ); + } + const parentMessageId = continuation?.parentMessageId ?? randomUUID(); const modelSlug = MODEL_MAP[model] ?? model; - const cgptBody = buildConversationBody(parsed, modelSlug, parentMessageId); + const cgptBody = buildConversationBody( + parsed, + modelSlug, + parentMessageId, + forImageGen, + continuation + ); const headers: Record = { ...browserHeaders(), @@ -1438,9 +2561,32 @@ export class ChatGptWebExecutor extends BaseExecutor { const cid = `chatcmpl-cgpt-${crypto.randomUUID().slice(0, 12)}`; const created = Math.floor(Date.now() / 1000); + const resolverCtx: ResolverContext = { + accessToken: tokenEntry.accessToken, + accountId: tokenEntry.accountId, + sessionId, + deviceId, + cookie, + signal, + log, + publicBaseUrl: derivePublicBaseUrl(clientHeaders, log), + }; + const imageResolver = makeImageResolver(resolverCtx); + const pollAsyncImage = (conversationId: string) => + pollForAsyncImage(conversationId, resolverCtx); + let finalResponse: Response; if (stream) { - const sseStream = buildStreamingResponse(bodyStream, model, cid, created, signal); + const sseStream = buildStreamingResponse( + bodyStream, + model, + cid, + created, + imageResolver, + pollAsyncImage, + log, + signal + ); finalResponse = new Response(sseStream, { status: 200, headers: { @@ -1456,6 +2602,9 @@ export class ChatGptWebExecutor extends BaseExecutor { cid, created, parsed.currentMsg, + imageResolver, + pollAsyncImage, + log, signal ); } @@ -1490,5 +2639,8 @@ export function __resetChatGptWebCachesForTesting(): void { tokenCache.clear(); warmupCache.clear(); deviceIdCache.clear(); 
+ __resetChatGptImageCacheForTesting(); dplCache = null; } + +export const __derivePublicBaseUrlForTesting = derivePublicBaseUrl; diff --git a/open-sse/handlers/imageGeneration.ts b/open-sse/handlers/imageGeneration.ts index a9c3b566..5661f9b4 100644 --- a/open-sse/handlers/imageGeneration.ts +++ b/open-sse/handlers/imageGeneration.ts @@ -19,6 +19,12 @@ import { randomUUID } from "crypto"; import { getImageProvider, parseImageModel } from "../config/imageRegistry.ts"; import { mapImageSize } from "../translator/image/sizeMapper.ts"; import { getCodexClientVersion, getCodexUserAgent } from "../config/codexClient.ts"; +import { ChatGptWebExecutor } from "../executors/chatgpt-web.ts"; +import { + getChatGptImage, + findChatGptImageBySha256, +} from "../services/chatgptImageCache.ts"; +import { createHash } from "node:crypto"; import { saveCallLog } from "@/lib/usageDb"; import { submitComfyWorkflow, @@ -113,7 +119,14 @@ const FAL_PRESET_SIZES = { * @param {object} options.log - Logger * @param {string} [options.resolvedProvider] - Pre-resolved provider ID (from route layer custom model resolution) */ -export async function handleImageGeneration({ body, credentials, log, resolvedProvider = null }) { +export async function handleImageGeneration({ + body, + credentials, + log, + resolvedProvider = null, + signal = null, + clientHeaders = null, +}) { let provider, model; if (resolvedProvider) { @@ -257,6 +270,18 @@ export async function handleImageGeneration({ body, credentials, log, resolvedPr }); } + if (providerConfig.format === "chatgpt-web") { + return handleChatGptWebImageGeneration({ + model, + provider, + body, + credentials, + log, + signal, + clientHeaders, + }); + } + if (providerConfig.format === "nanobanana") { return handleNanoBananaImageGeneration({ model, @@ -537,6 +562,370 @@ async function handleOpenAIImageGeneration({ return result; } +const CHATGPT_WEB_IMAGE_MARKDOWN_RE = /!\[[^\]]*\]\(([^)\s]+)\)/g; +const CHATGPT_WEB_IMAGE_ID_RE = 
/\/v1\/chatgpt-web\/image\/([a-f0-9]{16,64})(?=[?\s"'<>)]|$)/i; + +function extractMarkdownImageUrls(text: string): string[] { + const urls: string[] = []; + // String.prototype.matchAll consumes a fresh iterator and ignores the + // regex's lastIndex, so no manual reset is required. + for (const match of text.matchAll(CHATGPT_WEB_IMAGE_MARKDOWN_RE)) { + if (match[1]) urls.push(match[1]); + } + return urls; +} + +function buildChatGptWebImagePrompt(body): string { + const prompt = String(body.prompt || "").trim(); + const details: string[] = [`Create an image for this prompt: ${prompt}`]; + if (typeof body.size === "string" && body.size.trim()) { + details.push(`Requested size: ${body.size.trim()}.`); + } + if (typeof body.quality === "string" && body.quality.trim()) { + details.push(`Requested quality: ${body.quality.trim()}.`); + } + if (typeof body.style === "string" && body.style.trim()) { + details.push(`Requested style: ${body.style.trim()}.`); + } + return details.join("\n"); +} + +async function handleChatGptWebImageGeneration({ + model, + provider, + body, + credentials, + log, + signal, + clientHeaders, +}) { + const startTime = Date.now(); + const prompt = typeof body.prompt === "string" ? body.prompt.trim() : ""; + if (!prompt) { + return saveImageErrorResult({ + provider, + model, + status: 400, + startTime, + error: "Prompt is required for ChatGPT Web image generation", + }); + } + + if (!credentials?.apiKey) { + return saveImageErrorResult({ + provider, + model, + status: 401, + startTime, + error: "ChatGPT Web credentials missing session cookie", + }); + } + + // Each image is one chatgpt.com chat turn (~30s). Cap at 4 (matches OpenAI's + // own limit for image-1 / dall-e-3) so a stray n=1000 doesn't pin the + // executor for hours before the upstream HTTP timeout fires. + const CHATGPT_WEB_IMAGE_N_MAX = 4; + const rawCount = + Number.isInteger(body.n) && (body.n as number) > 0 ? 
(body.n as number) : 1; + if (rawCount > CHATGPT_WEB_IMAGE_N_MAX) { + return saveImageErrorResult({ + provider, + model, + status: 400, + startTime, + error: `ChatGPT Web image generation supports n=1..${CHATGPT_WEB_IMAGE_N_MAX} (got ${rawCount}); each n is a separate ~30s chat turn.`, + }); + } + const requestedCount = rawCount; + if (log && requestedCount > 1) { + log.warn( + "IMAGE", + `ChatGPT Web returns one image per chat turn; requested n=${requestedCount} will run sequentially` + ); + } + + const wantsBase64 = body.response_format === "b64_json"; + const images: Array<{ url?: string; b64_json?: string }> = []; + const requestBody = { + model, + prompt: prompt.slice(0, 500), + size: body.size || undefined, + quality: body.quality || undefined, + }; + + for (let i = 0; i < requestedCount; i++) { + const executor = new ChatGptWebExecutor(); + const result = await executor.execute({ + model, + body: { + messages: [{ role: "user", content: buildChatGptWebImagePrompt(body) }], + }, + stream: false, + credentials, + signal, + log, + clientHeaders, + }); + + const responseText = await result.response.text(); + if (result.response.status >= 400) { + return saveImageErrorResult({ + provider, + model, + status: result.response.status, + startTime, + error: responseText, + requestBody, + }); + } + + let content = ""; + try { + const json = JSON.parse(responseText); + content = String(json?.choices?.[0]?.message?.content || ""); + } catch { + content = responseText; + } + + const urls = extractMarkdownImageUrls(content); + if (urls.length === 0) { + return saveImageErrorResult({ + provider, + model, + status: 502, + startTime, + error: `ChatGPT Web completed without returning image markdown: ${content.slice(0, 300)}`, + requestBody, + }); + } + + for (const url of urls) { + if (!wantsBase64) { + images.push({ url }); + continue; + } + const id = url.match(CHATGPT_WEB_IMAGE_ID_RE)?.[1]; + const cached = id ? 
getChatGptImage(id) : null; + if (!cached) { + return saveImageErrorResult({ + provider, + model, + status: 502, + startTime, + error: "ChatGPT Web image bytes expired before b64_json conversion", + requestBody, + }); + } + images.push({ b64_json: cached.bytes.toString("base64") }); + } + } + + return saveImageSuccessResult({ + provider, + model, + startTime, + requestBody, + responseBody: { images_count: images.length }, + images, + }); +} + +/** + * Handle a multipart /v1/images/edits request for chatgpt-web. Open WebUI + * uploads the prior image's bytes; we hash them and look up our cache. + * + * The hash match is reliable because Open WebUI's image-gen pipeline + * downloads our /v1/chatgpt-web/image/ URL byte-for-byte and re-serves + * those exact bytes through its own file store. When the user asks to edit + * the image, OWUI uploads the same bytes back to us via multipart — same + * hash, we find the conversation context, and drive the executor with a + * synthetic chat thread that triggers continuation mode. + * + * No-match cases (cache evicted by TTL, or the user uploaded a foreign + * image) get a clear 400. We can't actually edit an image we don't have a + * conversation context for — chatgpt.com's image_gen tool needs the + * original conversation node, and we don't have a path to upload bytes + * directly. + */ +export async function handleImageEdit({ + provider, + model, + body, + imageBytes, + credentials, + log, + signal = null, + clientHeaders = null, +}: { + provider: string; + model: string; + body: Record; + imageBytes: Buffer; + imageMime?: string; // accepted for symmetry with route layer; not used + credentials: any; + log: any; + signal?: AbortSignal | null; + clientHeaders?: Record | null; +}) { + const startTime = Date.now(); + const prompt = typeof body.prompt === "string" ? 
body.prompt.trim() : ""; + if (!prompt) { + return saveImageErrorResult({ + provider, + model, + status: 400, + startTime, + error: "Prompt is required for image edit", + }); + } + + if (!credentials?.apiKey) { + return saveImageErrorResult({ + provider, + model, + status: 401, + startTime, + error: "ChatGPT Web credentials missing session cookie", + }); + } + + const imageHash = createHash("sha256").update(imageBytes).digest("hex"); + const cached = findChatGptImageBySha256(imageHash); + + const wantsBase64 = body.response_format === "b64_json"; + const requestBody = { + model, + prompt: prompt.slice(0, 500), + size: body.size || undefined, + image_hash: imageHash.slice(0, 16), + image_bytes: imageBytes.length, + cached_match: Boolean(cached?.entry.context), + }; + + if (!cached?.entry.context) { + // chatgpt-web's image_gen tool can only edit an image when we continue + // the original conversation node. If we never generated this image (or + // its 30-minute TTL elapsed), there's no node to continue. Return a + // clear, actionable error — much better than silently spawning an + // unrelated image and confusing the user. + log?.warn?.( + "IMAGE", + `chatgpt-web edit: no cached match for sha256=${imageHash.slice(0, 16)} (bytes=${imageBytes.length}); returning 400` + ); + return saveImageErrorResult({ + provider, + model, + status: 400, + startTime, + error: + "chatgpt-web image edit only works for images recently generated through this OmniRoute instance " + + "(cache window: 30 minutes). Re-generate the image and try the edit immediately, or disable image-edit " + + "in your client to use plain chat-completion edit prompts instead.", + requestBody, + }); + } + + // Build a synthetic chat thread that surfaces the cached image URL on + // the assistant turn. 
The executor's parseOpenAIMessages picks up the + // URL, findCachedImageContext resolves it to {conversationId, + // parentMessageId}, and looksLikeImageEditRequest fires on the user + // prompt — together producing a continuation request that actually + // edits the saved image. + // + // The synthetic user prompt is anchored with both an edit verb AND an + // image-gen verb so the executor's heuristics fire regardless of what + // wording the caller used ("now make it brighter", "tweak this", ...): + // - looksLikeImageEditRequest: matches "edit" + "image" within 120 chars + // - looksLikeImageGenRequest: matches "generate" + "image" within 40 chars + // Either match alone would set forImageGen, but covering both is cheap + // insurance for prompts that don't fit common phrasings. + const messages: Array<{ role: string; content: string }> = [ + { + role: "assistant", + // The base URL is irrelevant — only the path is parsed by + // CACHED_IMAGE_URL_RE in the executor's findCachedImageContext. 
+ content: `![image](http://internal/v1/chatgpt-web/image/${cached.id})`, + }, + { + role: "user", + content: `Edit the image and generate the new image: ${prompt}`, + }, + ]; + + const executor = new ChatGptWebExecutor(); + const result = await executor.execute({ + model, + body: { messages }, + stream: false, + credentials, + signal, + log, + clientHeaders, + }); + + const responseText = await result.response.text(); + if (result.response.status >= 400) { + return saveImageErrorResult({ + provider, + model, + status: result.response.status, + startTime, + error: responseText, + requestBody, + }); + } + + let content = ""; + try { + const json = JSON.parse(responseText); + content = String(json?.choices?.[0]?.message?.content || ""); + } catch { + content = responseText; + } + + const urls = extractMarkdownImageUrls(content); + if (urls.length === 0) { + return saveImageErrorResult({ + provider, + model, + status: 502, + startTime, + error: `ChatGPT Web edit completed without returning image markdown: ${content.slice(0, 300)}`, + requestBody, + }); + } + + const images: Array<{ url?: string; b64_json?: string }> = []; + for (const url of urls) { + if (!wantsBase64) { + images.push({ url }); + continue; + } + const id = url.match(CHATGPT_WEB_IMAGE_ID_RE)?.[1]; + const cachedNew = id ? 
getChatGptImage(id) : null; + if (!cachedNew) { + return saveImageErrorResult({ + provider, + model, + status: 502, + startTime, + error: "ChatGPT Web image bytes expired before b64_json conversion", + requestBody, + }); + } + images.push({ b64_json: cachedNew.bytes.toString("base64") }); + } + + return saveImageSuccessResult({ + provider, + model, + startTime, + requestBody, + responseBody: { images_count: images.length, edit_match: Boolean(cached?.entry.context) }, + images, + }); +} + async function handleFalAIImageGeneration({ model, provider, diff --git a/open-sse/services/chatgptImageCache.ts b/open-sse/services/chatgptImageCache.ts new file mode 100644 index 00000000..77f67591 --- /dev/null +++ b/open-sse/services/chatgptImageCache.ts @@ -0,0 +1,148 @@ +/** + * In-memory cache for ChatGPT-generated images so we can serve them via a + * regular HTTP URL instead of inlining megabytes of base64 into SSE deltas. + * + * Why: chatgpt.com's `image_asset_pointer` resolves to a session-signed + * `estuary/content` URL that 403s for any anonymous client. We have to + * download the bytes server-side (with the user's session) and re-serve + * them. Streaming the raw base64 back through SSE works but Open WebUI's + * progressive markdown renderer displays each chunk as text mid-stream — + * the user sees ~3 MB of base64 scroll past before the final `)` arrives + * and the renderer recognizes it as an image. Hosting the image on a + * regular URL avoids that entirely: we emit a tiny `![image](http://...)` + * markdown delta and the browser fetches the image normally. + * + * The cache is in-memory only, with a short TTL — these URLs are single-use + * artifacts of one chat turn, not persistent assets. If the user reloads + * the conversation in a few hours the URLs will 404; that's expected. 
+ */
+
+import { createHash, randomUUID } from "node:crypto";
+
+interface CachedImage {
+  bytes: Buffer;
+  mime: string;
+  expiresAt: number;
+  context?: ChatGptImageConversationContext;
+  /** sha256(bytes) — used by /v1/images/edits to correlate an uploaded
+   * image (Open WebUI re-uploads the bytes via multipart) back to the
+   * conversation context we cached when the image was first generated. */
+  bytesSha256: string;
+}
+
+const cache = new Map();
+let cacheBytes = 0;
+const DEFAULT_TTL_MS = 30 * 60 * 1000;
+const MAX_ENTRIES = 200;
+// Per-entry images cap at 8 MB (enforced upstream in the executor), so the
+// 256 MB default budget covers ~32 large images. The byte cap matters more
+// than entry count: a hot loop of 8 MB images would otherwise pin 1.6 GB of
+// RSS before the 200-entry count eviction kicked in. Tune via
+// OMNIROUTE_CGPT_WEB_IMAGE_CACHE_MAX_MB.
+const DEFAULT_MAX_BYTES = 256 * 1024 * 1024;
+
+function configuredMaxBytes(): number {
+  const raw = Number(process.env.OMNIROUTE_CGPT_WEB_IMAGE_CACHE_MAX_MB);
+  if (!Number.isFinite(raw) || raw <= 0) return DEFAULT_MAX_BYTES;
+  return Math.floor(raw * 1024 * 1024);
+}
+
+export interface ChatGptImageConversationContext {
+  conversationId: string;
+  parentMessageId: string;
+}
+
+function deleteEntry(id: string): void {
+  const entry = cache.get(id);
+  if (!entry) return;
+  cacheBytes -= entry.bytes.length;
+  cache.delete(id);
+}
+
+function evictExpired(now = Date.now()): void {
+  for (const [id, entry] of cache) {
+    if (now >= entry.expiresAt) deleteEntry(id);
+  }
+}
+
+function evictUntilWithinLimits(maxBytes: number, incomingBytes: number): void {
+  // Drop oldest until both the entry-count and total-byte caps are satisfied.
+  // Map iteration is insertion-ordered so the first key is the oldest entry. 
+ while ( + (cache.size >= MAX_ENTRIES || cacheBytes + incomingBytes > maxBytes) && + cache.size > 0 + ) { + const firstKey = cache.keys().next().value; + if (!firstKey) break; + deleteEntry(firstKey); + } +} + +export function storeChatGptImage( + bytes: Buffer, + mime: string, + ttlMs = DEFAULT_TTL_MS, + context?: ChatGptImageConversationContext +): string { + evictExpired(); + evictUntilWithinLimits(configuredMaxBytes(), bytes.length); + const id = randomUUID().replace(/-/g, ""); + const bytesSha256 = createHash("sha256").update(bytes).digest("hex"); + cache.set(id, { + bytes, + mime, + expiresAt: Date.now() + ttlMs, + context, + bytesSha256, + }); + cacheBytes += bytes.length; + return id; +} + +export function getChatGptImage(id: string): CachedImage | null { + evictExpired(); + const entry = cache.get(id); + if (!entry) return null; + if (Date.now() >= entry.expiresAt) { + deleteEntry(id); + return null; + } + return entry; +} + +export function getChatGptImageConversationContext( + id: string +): ChatGptImageConversationContext | null { + return getChatGptImage(id)?.context ?? null; +} + +/** + * Look up a cached entry by sha256(bytes). Used by /v1/images/edits to + * correlate Open WebUI's re-uploaded image back to the conversation + * context we cached at generation time, so the executor can continue the + * saved chatgpt.com conversation node and actually edit the image instead + * of generating an unrelated one from scratch. + */ +export function findChatGptImageBySha256( + hash: string +): { id: string; entry: CachedImage } | null { + evictExpired(); + const target = hash.toLowerCase(); + for (const [id, entry] of cache.entries()) { + if (entry.bytesSha256 === target) { + if (Date.now() < entry.expiresAt) return { id, entry }; + deleteEntry(id); + } + } + return null; +} + +/** Test-only: clear the cache between tests. 
*/ +export function __resetChatGptImageCacheForTesting(): void { + cache.clear(); + cacheBytes = 0; +} + +/** Test-only: peek at current resident-byte total. */ +export function __getChatGptImageCacheBytesForTesting(): number { + return cacheBytes; +} diff --git a/open-sse/services/chatgptTlsClient.ts b/open-sse/services/chatgptTlsClient.ts index a7e5c1c6..ed505802 100644 --- a/open-sse/services/chatgptTlsClient.ts +++ b/open-sse/services/chatgptTlsClient.ts @@ -110,6 +110,14 @@ export interface TlsFetchOptions { stream?: boolean; /** EOF marker the upstream sends to signal end of stream (default: "[DONE]"). */ streamEofSymbol?: string; + /** + * If true, instructs the underlying tls-client to return the response body + * as a base64 `data:;base64,...` string (so binary payloads survive + * the JSON marshalling step). Required for image / binary downloads — + * without it, raw bytes get UTF-8-decoded and any non-ASCII byte is + * mangled. Default false (text mode). + */ + byteResponse?: boolean; } export interface TlsFetchResult { @@ -161,6 +169,7 @@ export async function tlsFetchChatGpt( timeoutMilliseconds: options.timeoutMs ?? DEFAULT_TIMEOUT_MS, followRedirects: true, withRandomTLSExtensionOrder: true, + isByteResponse: options.byteResponse === true, }; if (options.stream) { diff --git a/src/app/api/v1/chat/completions/route.ts b/src/app/api/v1/chat/completions/route.ts index 65bc9cf0..3508d97f 100644 --- a/src/app/api/v1/chat/completions/route.ts +++ b/src/app/api/v1/chat/completions/route.ts @@ -31,6 +31,19 @@ export async function OPTIONS() { export async function POST(request) { await ensureInitialized(); + // One-line marker for diagnosing 413 / Server-Action interceptions. + // Logs only when Content-Length is present so debug noise stays low for + // typical chat payloads. Toggle off via OMNIROUTE_LOG_REQUEST_SHAPE=0. + if (process.env.OMNIROUTE_LOG_REQUEST_SHAPE !== "0") { + const ct = request.headers.get("content-type") ?? 
""; + const cl = request.headers.get("content-length"); + if (cl && Number(cl) > 256 * 1024) { + console.error( + `[CHAT-ROUTE] large body content-type="${ct}" content-length=${cl}` + ); + } + } + // Prompt injection guard — inspect body before forwarding try { const cloned = request.clone(); diff --git a/src/app/api/v1/chatgpt-web/image/[id]/route.ts b/src/app/api/v1/chatgpt-web/image/[id]/route.ts new file mode 100644 index 00000000..a23e7daf --- /dev/null +++ b/src/app/api/v1/chatgpt-web/image/[id]/route.ts @@ -0,0 +1,42 @@ +import { CORS_HEADERS, handleCorsOptions } from "@/shared/utils/cors"; +import { getChatGptImage } from "@omniroute/open-sse/services/chatgptImageCache.ts"; + +export async function OPTIONS() { + return handleCorsOptions(); +} + +/** + * Serve a cached ChatGPT-generated image by its opaque cache id. + * + * Auth: intentionally unauthenticated. The id is a 128-bit random UUID and + * the entry has a short TTL, so the URL is unguessable for the lifetime of + * the chat turn. We need it open because it's loaded by the user's BROWSER + * (via an `` tag rendered from markdown) — that fetch doesn't carry + * the OmniRoute API key. Rate limiting / abuse protection sit at the + * network layer the same way they do for any other static asset. + */ +export async function GET(_request: Request, { params }: { params: Promise<{ id: string }> }) { + const { id } = await params; + const entry = getChatGptImage(id); + if (!entry) { + return new Response(JSON.stringify({ error: "Image not found or expired" }), { + status: 404, + headers: { "Content-Type": "application/json", ...CORS_HEADERS }, + }); + } + // entry.bytes is a Buffer (subclass of Uint8Array); pass it directly. + // Wrapping in `new Uint8Array(...)` would copy the entire payload — up to + // 8 MB per image — for no benefit. 
+ return new Response(entry.bytes, { + status: 200, + headers: { + "Content-Type": entry.mime, + // Allow short browser caching — the id is unique-per-image, so a + // cache hit is fine and saves a round-trip if the user re-renders + // the chat. Beyond the in-memory TTL the URL 404s anyway. + "Cache-Control": "private, max-age=1800", + "Content-Length": String(entry.bytes.length), + ...CORS_HEADERS, + }, + }); +} diff --git a/src/app/api/v1/images/edits/route.ts b/src/app/api/v1/images/edits/route.ts new file mode 100644 index 00000000..05f70e3c --- /dev/null +++ b/src/app/api/v1/images/edits/route.ts @@ -0,0 +1,174 @@ +import { CORS_ORIGIN } from "@/shared/utils/cors"; +import { handleImageEdit } from "@omniroute/open-sse/handlers/imageGeneration.ts"; +import { + getProviderCredentials, + clearRecoveredProviderState, + extractApiKey, + isValidApiKey, +} from "@/sse/services/auth"; +import { parseImageModel, getImageProvider } from "@omniroute/open-sse/config/imageRegistry.ts"; +import { errorResponse, unavailableResponse } from "@omniroute/open-sse/utils/error.ts"; +import { HTTP_STATUS } from "@omniroute/open-sse/config/constants.ts"; +import * as log from "@/sse/utils/logger"; +import { toJsonErrorPayload } from "@/shared/utils/upstreamError"; +import { enforceApiKeyPolicy } from "@/shared/utils/apiKeyPolicy"; + +/** + * /v1/images/edits — multipart edit endpoint matching OpenAI's images-edit API. + * + * Open WebUI's "Image Edit" toggle (images.edit.engine = "openai") posts here + * with `prompt` + `image` (file). For chatgpt-web, an "edit" only makes sense + * if the uploaded image was originally generated through OmniRoute — we then + * have its `{conversationId, parentMessageId}` cached and can continue the + * saved chatgpt.com conversation node, which is the only way to actually edit + * the image instead of generating an unrelated one from scratch. 
+ * + * Without this route, multipart bodies trip Next.js's Server Action handler + * (which intercepts ALL POSTs with multipart/form-data content-type) and the + * client gets a confusing "Failed to find Server Action" 500. + */ + +export async function OPTIONS() { + return new Response(null, { + headers: { + "Access-Control-Allow-Origin": CORS_ORIGIN, + "Access-Control-Allow-Methods": "POST, OPTIONS", + "Access-Control-Allow-Headers": "*", + }, + }); +} + +const PUBLIC_BASE_URL_HEADER_KEYS = ["host", "x-forwarded-host", "x-forwarded-proto"] as const; + +function publicBaseUrlHeaders(headers: Headers): Record { + const out: Record = {}; + for (const key of PUBLIC_BASE_URL_HEADER_KEYS) { + const value = headers.get(key); + if (value !== null) out[key] = value; + } + return out; +} + +async function readMultipartImage(formData: FormData): Promise<{ + prompt: string; + model: string | null; + size: string | null; + responseFormat: string | null; + imageBytes: Buffer | null; + imageMime: string | null; +}> { + const promptRaw = formData.get("prompt"); + const prompt = typeof promptRaw === "string" ? promptRaw.trim() : ""; + const modelRaw = formData.get("model"); + const model = typeof modelRaw === "string" ? modelRaw.trim() : null; + const sizeRaw = formData.get("size"); + const size = typeof sizeRaw === "string" ? sizeRaw.trim() : null; + const respRaw = formData.get("response_format"); + const responseFormat = typeof respRaw === "string" ? respRaw.trim() : null; + + // OpenAI's API and Open WebUI both accept either a single `image` field or + // an `image[]` array. We use the first image when multiple are sent — the + // chatgpt-web edit tool can only edit one image per conversation node. + const imageEntry = formData.get("image") ?? 
formData.get("image[]"); + if (!imageEntry || typeof imageEntry === "string") { + return { prompt, model, size, responseFormat, imageBytes: null, imageMime: null }; + } + const file = imageEntry as File; + const imageBytes = Buffer.from(await file.arrayBuffer()); + const imageMime = file.type || "image/png"; + return { prompt, model, size, responseFormat, imageBytes, imageMime }; +} + +export async function POST(request: Request) { + let formData: FormData; + try { + formData = await request.formData(); + } catch (err) { + log.warn("IMAGE", `Invalid multipart body: ${err instanceof Error ? err.message : String(err)}`); + return errorResponse(HTTP_STATUS.BAD_REQUEST, "Invalid multipart body"); + } + + const { prompt, model, size, responseFormat, imageBytes, imageMime } = + await readMultipartImage(formData); + + if (!prompt) { + return errorResponse(HTTP_STATUS.BAD_REQUEST, "Missing required field: prompt"); + } + if (!imageBytes || imageBytes.length === 0) { + return errorResponse(HTTP_STATUS.BAD_REQUEST, "Missing required field: image"); + } + + const apiKey = extractApiKey(request); + if (!isValidApiKey(apiKey)) { + const policyError = enforceApiKeyPolicy(apiKey); + if (policyError) { + return new Response(JSON.stringify(policyError.body), { + status: policyError.status, + headers: { "Content-Type": "application/json" }, + }); + } + } + + const fullModel = model || "cgpt-web/gpt-5.3-instant"; + const parsed = parseImageModel(fullModel); + const providerConfig = getImageProvider(parsed.provider); + if (!providerConfig) { + return errorResponse(HTTP_STATUS.BAD_REQUEST, `Unknown image provider: ${parsed.provider}`); + } + if (providerConfig.format !== "chatgpt-web") { + // We only implement edit for chatgpt-web today; everything else routes + // through generations which doesn't accept image inputs. Surface a + // useful error rather than silently dropping the image. 
+ return errorResponse( + HTTP_STATUS.BAD_REQUEST, + `Image edit is only supported for chatgpt-web models (got ${parsed.provider})` + ); + } + + const credentials = await getProviderCredentials(parsed.provider, apiKey); + if (!credentials) { + return errorResponse(HTTP_STATUS.UNAUTHORIZED, `No credentials for provider: ${parsed.provider}`); + } + if (credentials.allRateLimited) { + return unavailableResponse( + HTTP_STATUS.RATE_LIMITED, + `[${parsed.provider}] All accounts rate limited`, + credentials.retryAfter, + credentials.retryAfterHuman + ); + } + + const result = await handleImageEdit({ + provider: parsed.provider, + model: parsed.model, + body: { + prompt, + size: size ?? undefined, + response_format: responseFormat ?? undefined, + n: 1, + }, + imageBytes, + imageMime, + credentials, + log, + signal: request.signal, + clientHeaders: publicBaseUrlHeaders(request.headers), + }); + + if (result.success) { + await clearRecoveredProviderState(credentials); + return new Response(JSON.stringify((result as any).data), { + status: 200, + headers: { "Content-Type": "application/json" }, + }); + } + + const errorPayload = toJsonErrorPayload( + (result as any).error, + "Image edit provider error" + ); + return new Response(JSON.stringify(errorPayload), { + status: (result as any).status, + headers: { "Content-Type": "application/json" }, + }); +} diff --git a/src/app/api/v1/images/generations/route.ts b/src/app/api/v1/images/generations/route.ts index 3a37b9e5..be7aa2b3 100644 --- a/src/app/api/v1/images/generations/route.ts +++ b/src/app/api/v1/images/generations/route.ts @@ -101,6 +101,23 @@ function hasImageGenerationInput(body: Record) { return false; } +// Forward only the host-shaped headers the chatgpt-web image handler needs +// to derive the browser-facing public base URL. Avoid copying the full +// request header set: it's wider than the handler needs (auth tokens, +// content-type, etc.) 
and `Headers.forEach` collapses repeated values, which +// would silently drop entries if a wider helper were reused for headers +// that can legitimately repeat (e.g., set-cookie). +const PUBLIC_BASE_URL_HEADER_KEYS = ["host", "x-forwarded-host", "x-forwarded-proto"] as const; + +function publicBaseUrlHeaders(headers: Headers): Record { + const out: Record = {}; + for (const key of PUBLIC_BASE_URL_HEADER_KEYS) { + const value = headers.get(key); + if (value !== null) out[key] = value; + } + return out; +} + export async function POST(request) { let rawBody; try { @@ -228,6 +245,8 @@ export async function POST(request) { credentials, log, ...(isCustomModel && { resolvedProvider: provider }), + signal: request.signal, + clientHeaders: publicBaseUrlHeaders(request.headers), }); if (result.success) { diff --git a/src/lib/env/runtimeEnv.ts b/src/lib/env/runtimeEnv.ts index 7d3c9505..c836ddd8 100644 --- a/src/lib/env/runtimeEnv.ts +++ b/src/lib/env/runtimeEnv.ts @@ -66,6 +66,7 @@ export const webRuntimeEnvSchema = z.object({ OMNIROUTE_DISABLE_BACKGROUND_SERVICES: optionalBooleanEnv, CLOUD_URL: optionalHttpUrl, NEXT_PUBLIC_CLOUD_URL: optionalHttpUrl, + OMNIROUTE_PUBLIC_BASE_URL: optionalHttpUrl, OMNIROUTE_BASE_URL: optionalHttpUrl, BASE_URL: optionalHttpUrl, NEXT_PUBLIC_BASE_URL: optionalHttpUrl, diff --git a/src/types/global.d.ts b/src/types/global.d.ts index 75226fe9..0e36d867 100644 --- a/src/types/global.d.ts +++ b/src/types/global.d.ts @@ -21,6 +21,9 @@ declare namespace NodeJS { PORT?: string; API_HOST?: string; DASHBOARD_PORT?: string; + OMNIROUTE_PUBLIC_BASE_URL?: string; + OMNIROUTE_CGPT_WEB_IMAGE_TIMEOUT_MS?: string; + OMNIROUTE_CGPT_WEB_IMAGE_CACHE_MAX_MB?: string; OMNIROUTE_BASE_URL?: string; OMNIROUTE_DISABLE_BACKGROUND_SERVICES?: string; OMNIROUTE_PORT?: string; diff --git a/tests/unit/chatgpt-web.test.ts b/tests/unit/chatgpt-web.test.ts index 5a489a89..76a18e74 100644 --- a/tests/unit/chatgpt-web.test.ts +++ b/tests/unit/chatgpt-web.test.ts @@ -1,7 +1,7 @@ 
import test from "node:test"; import assert from "node:assert/strict"; -const { ChatGptWebExecutor, __resetChatGptWebCachesForTesting } = +const { ChatGptWebExecutor, __derivePublicBaseUrlForTesting, __resetChatGptWebCachesForTesting } = await import("../../open-sse/executors/chatgpt-web.ts"); const { getExecutor, hasSpecializedExecutor } = await import("../../open-sse/executors/index.ts"); const { __setTlsFetchOverrideForTesting, looksLikeSse, TlsClientUnavailableError } = @@ -24,14 +24,60 @@ function makeHeaders(map = {}) { return h; } +async function withEnv(overrides, fn) { + const keys = [ + "OMNIROUTE_PUBLIC_BASE_URL", + "OMNIROUTE_BASE_URL", + "NEXT_PUBLIC_BASE_URL", + "BASE_URL", + "PORT", + ]; + const previous = new Map(keys.map((key) => [key, process.env[key]])); + + for (const key of keys) { + if (Object.prototype.hasOwnProperty.call(overrides, key)) { + const value = overrides[key]; + if (value == null) delete process.env[key]; + else process.env[key] = String(value); + } else { + delete process.env[key]; + } + } + + try { + return await fn(); + } finally { + for (const [key, value] of previous) { + if (value == null) delete process.env[key]; + else process.env[key] = value; + } + } +} + /** Dispatch the TLS-impersonating fetch by URL pathname. * Default: session 200 with accessToken, sentinel 200 no PoW, conv 200 empty stream. 
*/ -function installMockFetch({ session, sentinel, conv, dpl, onSession, onSentinel, onConv } = {}) { +function installMockFetch({ + session, + sentinel, + conv, + dpl, + fileDownload, + attachmentDownload, + signedDownload, + onSession, + onSentinel, + onConv, + onFileDownload, + onAttachmentDownload, +} = {}) { const calls = { session: 0, dpl: 0, sentinel: 0, conv: 0, + fileDownload: 0, + attachmentDownload: 0, + signedDownload: 0, urls: [], headers: [], bodies: [], @@ -97,6 +143,75 @@ function installMockFetch({ session, sentinel, conv, dpl, onSession, onSentinel, }; } + // /backend-api/conversation//attachment//download + // Must match BEFORE the conversation-endpoint regex below since the + // conv-prefix regex is broad. + { + const m1 = u.match(/\/backend-api\/conversation\/[^/]+\/attachment\/([^/]+)\/download/); + if (m1) { + calls.attachmentDownload++; + if (onAttachmentDownload) onAttachmentDownload(opts, m1[1]); + const cfg = attachmentDownload ?? { + status: 200, + body: { download_url: `https://files.oaiusercontent.com/${m1[1]}?sig=mock` }, + }; + return { + status: cfg.status, + headers: makeHeaders({ "Content-Type": "application/json" }), + text: typeof cfg.body === "string" ? cfg.body : JSON.stringify(cfg.body || {}), + body: null, + }; + } + } + + // /backend-api/files//download + { + const m1 = u.match(/\/backend-api\/files\/([^/]+)\/download/); + if (m1) { + calls.fileDownload++; + if (onFileDownload) onFileDownload(opts, m1[1]); + const cfg = fileDownload ?? { + status: 200, + body: { download_url: `https://files.oaiusercontent.com/${m1[1]}?sig=mock` }, + }; + return { + status: cfg.status, + headers: makeHeaders({ "Content-Type": "application/json" }), + text: typeof cfg.body === "string" ? cfg.body : JSON.stringify(cfg.body || {}), + body: null, + }; + } + } + + // The signed estuary URL the executor follows after fetching either + // download endpoint. 
Mock returns a tiny PNG header so the executor's + // `imageUrlToCachedImageUrl` decoder produces a valid Buffer and the + // image makes it into the cache, surfaced as /v1/chatgpt-web/image/. + if (/^https:\/\/files\.oaiusercontent\.com\//.test(u)) { + calls.signedDownload++; + const cfg = signedDownload ?? { status: 200 }; + if (cfg.status >= 400) { + return { + status: cfg.status, + headers: makeHeaders({ "Content-Type": "text/plain" }), + text: cfg.body || "", + body: null, + }; + } + const tinyPng = Buffer.from([ + 0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, + 0x00, 0x00, 0x00, 0x0d, 0x49, 0x48, 0x44, 0x52, + ]); + return { + status: cfg.status, + headers: makeHeaders({ "Content-Type": "image/png" }), + // tls-client-node packages binary bodies as a data:;base64,... + // string when isByteResponse is set; the mock mirrors that contract. + text: `data:image/png;base64,${tinyPng.toString("base64")}`, + body: null, + }; + } + // Match only the exact conversation endpoint, not /conversations (plural — warmup). 
if ( u.endsWith("/backend-api/f/conversation") || @@ -185,6 +300,63 @@ test("ChatGptWebExecutor sets correct provider name", () => { assert.equal(executor.getProvider(), "chatgpt-web"); }); +// ─── Public image URL derivation ──────────────────────────────────────────── + +test("Image URL base: OMNIROUTE_PUBLIC_BASE_URL wins and strips accidental /v1", async () => { + await withEnv( + { + OMNIROUTE_PUBLIC_BASE_URL: " http://192.168.107.55:20128/v1/ ", + NEXT_PUBLIC_BASE_URL: "http://localhost:20128", + }, + async () => { + assert.equal( + __derivePublicBaseUrlForTesting({ host: "localhost:20128" }), + "http://192.168.107.55:20128" + ); + } + ); +}); + +test("Image URL base: local NEXT_PUBLIC_BASE_URL does not mask LAN Host header", async () => { + await withEnv( + { + NEXT_PUBLIC_BASE_URL: "http://localhost:20128", + BASE_URL: "http://localhost:20128", + }, + async () => { + assert.equal( + __derivePublicBaseUrlForTesting({ host: "192.168.107.55:20128" }), + "http://192.168.107.55:20128" + ); + } + ); +}); + +test("Image URL base: forwarded headers override raw Host", async () => { + await withEnv({}, async () => { + assert.equal( + __derivePublicBaseUrlForTesting({ + host: "localhost:20128", + "x-forwarded-host": "omni.example.com", + "x-forwarded-proto": "https", + }), + "https://omni.example.com" + ); + }); +}); + +test("Image URL base: non-local OMNIROUTE_BASE_URL remains a compatibility fallback", async () => { + await withEnv({ OMNIROUTE_BASE_URL: "https://omni.example.com/v1" }, async () => { + assert.equal(__derivePublicBaseUrlForTesting(null), "https://omni.example.com"); + }); +}); + +test("Image URL base: falls back to localhost with PORT", async () => { + await withEnv({ PORT: "20129" }, async () => { + assert.equal(__derivePublicBaseUrlForTesting(null), "http://localhost:20129"); + }); +}); + // ─── Token exchange path ──────────────────────────────────────────────────── test("Token exchange: cookie sent to /api/auth/session, accessToken used as Bearer 
on later calls", async () => { @@ -869,6 +1041,8 @@ test("Request: payload has correct ChatGPT shape", async () => { const body = JSON.parse(m.calls.bodies[convIdx]); assert.equal(body.action, "next"); assert.equal(body.model, "gpt-5-3"); + // Plain text request → Temporary Chat stays ON. We disable it only for + // image-gen prompts (see "Image gen: image-intent prompts" tests below). assert.equal(body.history_and_training_disabled, true); // System message preserves the user-supplied system prompt; the user // message is the latest query. @@ -893,6 +1067,17 @@ test("Provider registry: chatgpt-web is registered with gpt-5.3-instant model", assert.ok(entry.models.find((m) => m.id === "gpt-5.3-instant")); }); +test("Image registry: cgpt-web/gpt-5.3-instant routes to ChatGPT Web image handler", async () => { + const { parseImageModel, getImageProvider } = + await import("../../open-sse/config/imageRegistry.ts"); + const parsed = parseImageModel("cgpt-web/gpt-5.3-instant"); + assert.equal(parsed.provider, "chatgpt-web"); + assert.equal(parsed.model, "gpt-5.3-instant"); + const provider = getImageProvider(parsed.provider); + assert.equal(provider.format, "chatgpt-web"); + assert.equal(provider.authHeader, "cookie"); +}); + // ─── Cookie rotation preserves Cloudflare cookies ─────────────────────────── test("Cookie rotation: full DevTools blob keeps cf_clearance/__cf_bm/_cfuvid", async () => { @@ -1332,3 +1517,931 @@ test("looksLikeSse: rejects non-SSE bodies that previously passed as 200", () => assert.equal(looksLikeSse(" \n\n"), false, "whitespace only"); assert.equal(looksLikeSse("error: rate limit"), false, "non-SSE field name"); }); + +// ─── Image generation ────────────────────────────────────────────────────── + +/** Build a SSE event stream that mimics ChatGPT's image-generation reply. + * Text turn first, then a finalized multimodal_text with one image_asset_pointer. 
*/ +function imageGenEvents({ pointer, text = "Here's your kitten:" }) { + return [ + { + conversation_id: "conv-img-1", + message: { + id: "msg-1", + author: { role: "assistant" }, + content: { content_type: "text", parts: [text] }, + status: "in_progress", + }, + }, + { + conversation_id: "conv-img-1", + message: { + id: "msg-1", + author: { role: "assistant" }, + content: { + content_type: "multimodal_text", + parts: [ + { + content_type: "image_asset_pointer", + asset_pointer: pointer, + width: 1024, + height: 1024, + }, + ], + }, + status: "finished_successfully", + }, + }, + ]; +} + +test("Image gen: file-service:// pointer resolves to download URL and is appended as markdown (non-streaming)", async () => { + reset(); + const m = installMockFetch({ + conv: { status: 200, events: imageGenEvents({ pointer: "file-service://file-kitten1" }) }, + }); + try { + const executor = new ChatGptWebExecutor(); + const result = await executor.execute({ + model: "gpt-5.3-instant", + body: { messages: [{ role: "user", content: "generate an image of a kitten" }] }, + stream: false, + credentials: { apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + assert.equal(result.response.status, 200); + const json = await result.response.json(); + const content = json.choices[0].message.content; + assert.match(content, /Here's your kitten:/); + // The signed chatgpt.com URL is downloaded server-side and re-served + // from the OmniRoute cache; clients see a stable /v1/chatgpt-web/image + // path, never the session-signed estuary URL (which 403s anonymously). 
+ assert.match(content, /!\[image\]\([^)]*\/v1\/chatgpt-web\/image\/[a-f0-9]+\)/); + assert.doesNotMatch(content, /files\.oaiusercontent\.com/); + assert.equal(m.calls.fileDownload, 1, "fetched download URL once"); + assert.equal(m.calls.signedDownload, 1, "fetched signed bytes once"); + } finally { + m.restore(); + } +}); + +test("Image gen: file-service:// pointer is appended in streaming SSE", async () => { + reset(); + const m = installMockFetch({ + conv: { + status: 200, + events: imageGenEvents({ pointer: "file-service://file-kitten2", text: "ok:" }), + }, + }); + try { + const executor = new ChatGptWebExecutor(); + const result = await executor.execute({ + model: "gpt-5.3-instant", + body: { messages: [{ role: "user", content: "draw a kitten" }] }, + stream: true, + credentials: { apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + assert.equal(result.response.status, 200); + const reader = result.response.body.getReader(); + const decoder = new TextDecoder(); + let body = ""; + while (true) { + const { value, done } = await reader.read(); + if (done) break; + body += decoder.decode(value); + } + assert.match(body, /!\[image\]\([^)]*\/v1\/chatgpt-web\/image\/[a-f0-9]+\)/); + assert.doesNotMatch(body, /files\.oaiusercontent\.com/); + assert.match(body, /data: \[DONE\]/); + assert.equal(m.calls.fileDownload, 1); + assert.equal(m.calls.signedDownload, 1); + } finally { + m.restore(); + } +}); + +test("Image gen: sediment:// pointer prefers /files//download over /attachment", async () => { + reset(); + const m = installMockFetch({ + conv: { status: 200, events: imageGenEvents({ pointer: "sediment://file-sed1" }) }, + }); + try { + const executor = new ChatGptWebExecutor(); + const result = await executor.execute({ + model: "gpt-5.3-instant", + body: { messages: [{ role: "user", content: "make a kitten" }] }, + stream: false, + credentials: { apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + 
assert.equal(result.response.status, 200);
+ const json = await result.response.json();
+ const content = json.choices[0].message.content;
+ // The mock returns the signed URL via /files/{file_id}/download. We try
+ // that path first (it's the same kind of estuary URL chatgpt.com
+ // returns either way, and we care more about not hitting an extra
+ // round-trip); the /attachment endpoint is a fallback for when the
+ // primary 404s. The mock /files/ response also doubles as the image
+ // bytes that are cached behind the emitted OmniRoute image URL.
+ assert.match(content, /!\[image\]\([^)]*\/v1\/chatgpt-web\/image\/[a-f0-9]+\)/, "image rendered");
+ assert.doesNotMatch(content, /files\.oaiusercontent\.com/);
+ assert.equal(m.calls.fileDownload, 1, "tried /files/ endpoint first");
+ assert.equal(m.calls.attachmentDownload, 0, "did not need /attachment fallback");
+ assert.equal(m.calls.signedDownload, 1, "fetched signed bytes once");
+ } finally {
+ m.restore();
+ }
+});
+
+test("Image gen: failed download URL is dropped silently — no broken markdown", async () => {
+ reset();
+ // Both /files/{file_id}/download AND the /conversation/{conv_id}/attachment/{file_id}/
+ // download fallback have to fail for the resolver to give up. The fallback
+ // is the path that recovers `file_00000000XXX` shaped IDs returned by
+ // chatgpt.com for image-edit results, so the failure assertion has to
+ // close BOTH doors.
+ const m = installMockFetch({ + conv: { status: 200, events: imageGenEvents({ pointer: "file-service://file-broken" }) }, + fileDownload: { status: 500, body: { error: "boom" } }, + attachmentDownload: { status: 500, body: { error: "boom" } }, + }); + try { + const executor = new ChatGptWebExecutor(); + const result = await executor.execute({ + model: "gpt-5.3-instant", + body: { messages: [{ role: "user", content: "kitten" }] }, + stream: false, + credentials: { apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + assert.equal(result.response.status, 200); + const json = await result.response.json(); + const content = json.choices[0].message.content; + // Text retained, markdown placeholder NOT emitted (no broken ![image]() link). + assert.match(content, /Here's your kitten:/); + assert.doesNotMatch(content, /!\[image\]\(/); + } finally { + m.restore(); + } +}); + +test("Image gen: image-intent prompt disables Temporary Chat", async () => { + reset(); + const m = installMockFetch(); + try { + const executor = new ChatGptWebExecutor(); + await executor.execute({ + model: "gpt-5.3-instant", + body: { messages: [{ role: "user", content: "generate an image of a kitten" }] }, + stream: false, + credentials: { apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + const convIdx = m.calls.urls.findIndex((u) => u.endsWith("/backend-api/f/conversation")); + const body = JSON.parse(m.calls.bodies[convIdx]); + assert.equal(body.history_and_training_disabled, false, "Temporary Chat OFF for image gen"); + } finally { + m.restore(); + } +}); + +test("Image gen: text-only prompt keeps Temporary Chat ON", async () => { + reset(); + const m = installMockFetch(); + try { + const executor = new ChatGptWebExecutor(); + await executor.execute({ + model: "gpt-5.3-instant", + body: { messages: [{ role: "user", content: "what is the capital of France?" 
}] }, + stream: false, + credentials: { apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + const convIdx = m.calls.urls.findIndex((u) => u.endsWith("/backend-api/f/conversation")); + const body = JSON.parse(m.calls.bodies[convIdx]); + assert.equal(body.history_and_training_disabled, true, "Temporary Chat ON for text request"); + } finally { + m.restore(); + } +}); + +test("Image gen: Open WebUI follow-up/title/tag tool prompts do NOT trigger image gen", async () => { + reset(); + const toolPrompts = [ + `### Task:\nSuggest 3-5 relevant follow-up questions...\n### Output:\nJSON format: { "follow_ups": ["Q1?", "Q2?", "Q3?"] }\n### Chat History:\n\nUSER: generate an image of a football game\nASSISTANT: _Generating image…_\n`, + `### Task:\nGenerate a concise, 3-5 word title with an emoji summarizing the chat history.\n### Output:\nJSON format: { "title": "your concise title here" }\n### Chat History:\n\nUSER: draw an image of a kitten\n`, + `### Task:\nGenerate 1-3 broad tags categorizing the main themes of the chat history\n### Output:\nJSON format: { "tags": ["tag1"] }\n### Chat History:\n\nUSER: render a logo for my startup\n`, + ]; + for (const prompt of toolPrompts) { + const m = installMockFetch(); + try { + const executor = new ChatGptWebExecutor(); + await executor.execute({ + model: "gpt-5.3-instant", + body: { messages: [{ role: "user", content: prompt }] }, + stream: false, + credentials: { apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + const convIdx = m.calls.urls.findIndex((u) => u.endsWith("/backend-api/f/conversation")); + const body = JSON.parse(m.calls.bodies[convIdx]); + assert.equal( + body.history_and_training_disabled, + true, + `tool prompt should keep Temporary Chat ON: ${prompt.slice(0, 50)}...` + ); + } finally { + m.restore(); + } + } +}); + +test("Image gen: Open WebUI image-generation context suppresses duplicate chat image gen", async () => { + reset(); + const contexts = [ + "The 
requested image has been created by the system successfully and is now being shown to the user. Let the user know that the image they requested has been generated and is now shown in the chat.", + "The requested image has been edited and created and is now being shown to the user. Let them know that it has been generated.", + "Image generation was attempted but failed because of an error. The system is currently unable to generate the image.", + ]; + + for (const context of contexts) { + const m = installMockFetch(); + try { + const executor = new ChatGptWebExecutor(); + await executor.execute({ + model: "gpt-5.3-instant", + body: { + messages: [ + { role: "system", content: context }, + { role: "user", content: "draw an image of a tennis match at night" }, + ], + }, + stream: false, + credentials: { apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + const convIdx = m.calls.urls.findIndex((u) => u.endsWith("/backend-api/f/conversation")); + const body = JSON.parse(m.calls.bodies[convIdx]); + assert.equal( + body.history_and_training_disabled, + true, + "Open WebUI already handled image generation, so chat path should stay temporary" + ); + assert.equal(body.conversation_id, null); + assert.match( + body.messages[body.messages.length - 1].content.parts[0], + /Briefly acknowledge the image result/ + ); + assert.doesNotMatch(body.messages[body.messages.length - 1].content.parts[0], /tennis match/); + } finally { + m.restore(); + } + } +}); + +test("Image gen: heuristic catches common phrasings", async () => { + reset(); + const phrases = [ + "draw me a kitten", + "create an image of a sunset", + "make a picture of mountains", + "render a logo for my startup", + "show me an illustration of a dragon", + "/imagine a futuristic city", + "paint a portrait of Einstein", + "produce a photo of a beach", + ]; + for (const phrase of phrases) { + const m = installMockFetch(); + try { + const executor = new ChatGptWebExecutor(); + await 
executor.execute({ + model: "gpt-5.3-instant", + body: { messages: [{ role: "user", content: phrase }] }, + stream: false, + credentials: { apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + const convIdx = m.calls.urls.findIndex((u) => u.endsWith("/backend-api/f/conversation")); + const body = JSON.parse(m.calls.bodies[convIdx]); + assert.equal( + body.history_and_training_disabled, + false, + `Phrase ${JSON.stringify(phrase)} should classify as image-gen` + ); + } finally { + m.restore(); + } + } +}); + +test("Image gen: signed URL bytes are cached and exposed via /v1/chatgpt-web/image URL", async () => { + reset(); + // Real-world flow: /files//download returns a chatgpt.com estuary URL + // signed for the user's session — that URL 403s for any anonymous client, + // so we fetch the bytes, cache them locally, and emit an OmniRoute image URL. + const pngBytes = Buffer.from([ + 0x89, + 0x50, + 0x4e, + 0x47, + 0x0d, + 0x0a, + 0x1a, + 0x0a, // PNG magic + 0x00, + 0x00, + 0x00, + 0x0d, + 0x49, + 0x48, + 0x44, + 0x52, // IHDR chunk + ]); + const dataUriFromTlsClient = `data:image/png;base64,${pngBytes.toString("base64")}`; + + const tls = await import("../../open-sse/services/chatgptTlsClient.ts"); + __resetChatGptWebCachesForTesting(); + const downloadUrl = "https://chatgpt.com/backend-api/estuary/content?id=file-data1&sig=abc"; + const calls = { signed: 0, urls: [] }; + + tls.__setTlsFetchOverrideForTesting(async (url, opts = {}) => { + const u = String(url); + calls.urls.push(u); + if (u === "https://chatgpt.com/" && (opts.method || "GET") === "GET") { + return { + status: 200, + headers: makeHeaders({ "Content-Type": "text/html" }), + text: '', + body: null, + }; + } + if (u.includes("/api/auth/session")) { + return { + status: 200, + headers: makeHeaders({ "Content-Type": "application/json" }), + text: JSON.stringify({ + accessToken: "jwt-x", + expires: new Date(Date.now() + 3600_000).toISOString(), + user: { id: "u1" }, + }), + body: 
null, + }; + } + if (u.includes("/sentinel/chat-requirements")) { + return { + status: 200, + headers: makeHeaders({ "Content-Type": "application/json" }), + text: JSON.stringify({ token: "t", proofofwork: { required: false } }), + body: null, + }; + } + if (u.match(/\/backend-api\/files\/[^/]+\/download/)) { + return { + status: 200, + headers: makeHeaders({ "Content-Type": "application/json" }), + text: JSON.stringify({ download_url: downloadUrl }), + body: null, + }; + } + if (u.startsWith(downloadUrl)) { + calls.signed++; + // tls-client-node returns binary bodies as a "data:;base64,..." + // string (see its response.js bytes() impl); the executor decodes it + // back into bytes before putting the image in OmniRoute's cache. + return { + status: 200, + headers: makeHeaders({ "Content-Type": "image/png" }), + text: dataUriFromTlsClient, + body: null, + }; + } + if (u.endsWith("/backend-api/f/conversation") || u.endsWith("/backend-api/conversation")) { + return { + status: 200, + headers: makeHeaders({ "Content-Type": "text/event-stream" }), + text: mockChatGptStreamText(imageGenEvents({ pointer: "file-service://file-data1" })), + body: null, + }; + } + return { status: 404, headers: makeHeaders(), text: "not mocked", body: null }; + }); + + await withEnv( + { + OMNIROUTE_PUBLIC_BASE_URL: "http://192.168.107.55:20128/v1", + NEXT_PUBLIC_BASE_URL: "http://localhost:20128", + }, + async () => { + try { + const executor = new ChatGptWebExecutor(); + const result = await executor.execute({ + model: "gpt-5.3-instant", + body: { messages: [{ role: "user", content: "draw kitten" }] }, + stream: false, + credentials: { apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + assert.equal(result.response.status, 200); + const json = await result.response.json(); + const content = json.choices[0].message.content; + // The executor caches the bytes in memory and emits a URL pointing + // at /v1/chatgpt-web/image/ instead of embedding a data URI — + // 
see open-sse/services/chatgptImageCache.ts and the matching route + // in src/app/api/v1/chatgpt-web/image/[id]/route.ts. + const m = content.match( + /!\[image\]\((http:\/\/192\.168\.107\.55:20128\/v1\/chatgpt-web\/image\/([a-f0-9]+))\)/ + ); + assert.ok(m, `expected URL-style markdown, got: ${content.slice(0, 200)}`); + assert.equal(calls.signed, 1, "fetched signed URL once"); + + // Verify the cached bytes match the PNG we fed in by going through + // the cache module directly. + const cacheMod = await import("../../open-sse/services/chatgptImageCache.ts"); + const entry = cacheMod.getChatGptImage(m[2]); + assert.ok(entry, "cache entry exists for the emitted id"); + assert.equal(entry.mime, "image/png"); + assert.deepEqual(Array.from(entry.bytes), Array.from(pngBytes)); + assert.deepEqual(entry.context, { + conversationId: "conv-img-1", + parentMessageId: "msg-1", + }); + } finally { + tls.__setTlsFetchOverrideForTesting(null); + } + } + ); +}); + +test("Image gen: prior data: image URIs are stripped from history before upstream", async () => { + // Open WebUI replays the full conversation each turn. After we generate an + // image and emit ![image](data:image/png;base64,...), that 2-3MB string + // comes back as the assistant message on the next turn. Sending it back + // upstream blows past chatgpt.com's body limits → "empty response body" 502. 
+ reset(); + const m = installMockFetch(); + try { + const huge = "iVBORw0KGgo" + "A".repeat(2_000_000); // ~2MB base64 + const assistantMsg = `Sure, here you go:\n\n![image](data:image/png;base64,${huge})\n`; + const executor = new ChatGptWebExecutor(); + await executor.execute({ + model: "gpt-5.3-instant", + body: { + messages: [ + { role: "user", content: "draw a kitten" }, + { role: "assistant", content: assistantMsg }, + { role: "user", content: "now make it a puppy" }, + ], + }, + stream: false, + credentials: { apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + const convIdx = m.calls.urls.findIndex((u) => u.endsWith("/backend-api/f/conversation")); + assert.ok(convIdx >= 0, "conversation request was sent"); + const body = m.calls.bodies[convIdx]; + assert.ok(body.length < 50_000, `body should be small, got ${body.length}`); + const parsed = JSON.parse(body); + const allParts = JSON.stringify(parsed.messages); + assert.doesNotMatch(allParts, /data:image/, "no data: URI in upstream body"); + assert.match(allParts, /generated image/, "placeholder is present"); + } finally { + m.restore(); + } +}); + +test("Image edit: cached OmniRoute image URL continues the saved ChatGPT conversation", async () => { + reset(); + const { storeChatGptImage } = await import("../../open-sse/services/chatgptImageCache.ts"); + const imageId = storeChatGptImage(Buffer.from([1, 2, 3]), "image/png", 30_000, { + conversationId: "conv-image-1", + parentMessageId: "msg-image-1", + }); + const imageUrl = `http://192.168.107.55:20128/v1/chatgpt-web/image/${imageId}`; + const m = installMockFetch(); + try { + const executor = new ChatGptWebExecutor(); + await executor.execute({ + model: "gpt-5.3-instant", + body: { + messages: [ + { role: "user", content: "draw a kitten" }, + { role: "assistant", content: `Here it is:\n\n![image](${imageUrl})` }, + { role: "user", content: "make it nighttime with softer lighting" }, + ], + }, + stream: false, + credentials: { 
apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + + const convIdx = m.calls.urls.findIndex((u) => u.endsWith("/backend-api/f/conversation")); + assert.ok(convIdx >= 0, "conversation request was sent"); + const body = JSON.parse(m.calls.bodies[convIdx]); + assert.equal(body.conversation_id, "conv-image-1"); + assert.equal(body.parent_message_id, "msg-image-1"); + assert.equal(body.history_and_training_disabled, false); + assert.equal(body.messages.length, 1, "saved ChatGPT conversation carries prior image state"); + assert.equal(body.messages[0].author.role, "user"); + assert.match(body.messages[0].content.parts[0], /nighttime/); + } finally { + m.restore(); + } +}); + +test("Image edit: Open WebUI image context suppresses duplicate edit continuation", async () => { + reset(); + const { storeChatGptImage } = await import("../../open-sse/services/chatgptImageCache.ts"); + const imageId = storeChatGptImage(Buffer.from([1, 2, 3]), "image/png", 30_000, { + conversationId: "conv-image-2", + parentMessageId: "msg-image-2", + }); + const imageUrl = `http://192.168.107.55:20128/v1/chatgpt-web/image/${imageId}`; + const m = installMockFetch(); + try { + const executor = new ChatGptWebExecutor(); + await executor.execute({ + model: "gpt-5.3-instant", + body: { + messages: [ + { + role: "system", + content: + "The requested image has been edited and created and is now being shown to the user. 
Let them know that it has been generated.", + }, + { role: "user", content: "draw a kitten" }, + { role: "assistant", content: `Here it is:\n\n![image](${imageUrl})` }, + { role: "user", content: "make it nighttime with softer lighting" }, + ], + }, + stream: false, + credentials: { apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + + const convIdx = m.calls.urls.findIndex((u) => u.endsWith("/backend-api/f/conversation")); + const body = JSON.parse(m.calls.bodies[convIdx]); + assert.equal(body.conversation_id, null); + assert.notEqual(body.parent_message_id, "msg-image-2"); + assert.equal(body.history_and_training_disabled, true); + assert.match( + body.messages[body.messages.length - 1].content.parts[0], + /Briefly acknowledge the image result/ + ); + assert.doesNotMatch(body.messages[body.messages.length - 1].content.parts[0], /nighttime/); + } finally { + m.restore(); + } +}); + +test("Image gen: dedupes the same pointer across in-progress + finished events", async () => { + reset(); + // Repeat the same pointer in BOTH the in_progress event and the + // finished_successfully event. The resolver should fetch the URL once. 
+ const events = [ + { + conversation_id: "conv-d", + message: { + id: "msg-1", + author: { role: "assistant" }, + content: { + content_type: "multimodal_text", + parts: [ + { content_type: "image_asset_pointer", asset_pointer: "file-service://file-dedupe" }, + ], + }, + status: "in_progress", + }, + }, + { + conversation_id: "conv-d", + message: { + id: "msg-1", + author: { role: "assistant" }, + content: { + content_type: "multimodal_text", + parts: [ + { content_type: "image_asset_pointer", asset_pointer: "file-service://file-dedupe" }, + ], + }, + status: "finished_successfully", + }, + }, + ]; + const m = installMockFetch({ conv: { status: 200, events } }); + try { + const executor = new ChatGptWebExecutor(); + const result = await executor.execute({ + model: "gpt-5.3-instant", + body: { messages: [{ role: "user", content: "kitten" }] }, + stream: false, + credentials: { apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + assert.equal(result.response.status, 200); + const json = await result.response.json(); + // Markdown emitted exactly once (single image, not duplicated). + const matches = json.choices[0].message.content.match(/!\[image\]\(/g) ?? []; + assert.equal(matches.length, 1, "markdown emitted once"); + assert.equal(m.calls.fileDownload, 1, "download URL fetched once"); + } finally { + m.restore(); + } +}); + +test("Image gen: bytes-fetch failure drops markdown (no signed-URL fallback)", async () => { + // Memory principle #3: never hand back the chatgpt.com signed estuary URL. + // It 403s for any anonymous client, so emitting it as markdown produces + // broken images. The resolver returns null when the bytes fetch fails; + // imageMarkdown skips empty URL lists, so no `![image](…)` appears. 
+ reset(); + const m = installMockFetch({ + conv: { status: 200, events: imageGenEvents({ pointer: "file-service://file-broken-bytes" }) }, + signedDownload: { status: 500, body: "boom" }, + }); + try { + const executor = new ChatGptWebExecutor(); + const result = await executor.execute({ + model: "gpt-5.3-instant", + body: { messages: [{ role: "user", content: "draw a kitten" }] }, + stream: false, + credentials: { apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + assert.equal(result.response.status, 200); + const json = await result.response.json(); + const content = json.choices[0].message.content; + assert.doesNotMatch(content, /!\[image\]/, "no markdown for failed bytes fetch"); + assert.doesNotMatch(content, /files\.oaiusercontent\.com/, "signed URL is never leaked to client"); + assert.equal(m.calls.fileDownload, 1, "download URL was attempted"); + assert.equal(m.calls.signedDownload, 1, "signed-bytes fetch was attempted and failed"); + } finally { + m.restore(); + } +}); + +test("Image cache: byte cap evicts oldest before count cap kicks in", async () => { + reset(); + const cacheMod = await import("../../open-sse/services/chatgptImageCache.ts"); + // 4 MB images × 3 stores against a 10 MB byte cap: third store should + // evict the first to fit. Verifies the byte budget bites BEFORE the + // 200-entry count cap, which is the whole point of the byte budget. 
+ const original = process.env.OMNIROUTE_CGPT_WEB_IMAGE_CACHE_MAX_MB;
+ process.env.OMNIROUTE_CGPT_WEB_IMAGE_CACHE_MAX_MB = "10";
+ try {
+ cacheMod.__resetChatGptImageCacheForTesting();
+ const big = Buffer.alloc(4 * 1024 * 1024, 1);
+ const id1 = cacheMod.storeChatGptImage(big, "image/png");
+ const id2 = cacheMod.storeChatGptImage(big, "image/png");
+ assert.ok(cacheMod.getChatGptImage(id1), "id1 still resident after id2");
+ assert.ok(cacheMod.getChatGptImage(id2), "id2 resident");
+ const id3 = cacheMod.storeChatGptImage(big, "image/png");
+ assert.equal(cacheMod.getChatGptImage(id1), null, "id1 evicted to make room for id3");
+ assert.ok(cacheMod.getChatGptImage(id2), "id2 still resident");
+ assert.ok(cacheMod.getChatGptImage(id3), "id3 resident");
+ // Total bytes never exceeds the cap once we've evicted.
+ const bytes = cacheMod.__getChatGptImageCacheBytesForTesting();
+ assert.ok(
+ bytes <= 10 * 1024 * 1024,
+ `cache bytes (${bytes}) should be within the configured 10 MB cap`
+ );
+ } finally {
+ if (original == null) delete process.env.OMNIROUTE_CGPT_WEB_IMAGE_CACHE_MAX_MB;
+ else process.env.OMNIROUTE_CGPT_WEB_IMAGE_CACHE_MAX_MB = original;
+ cacheMod.__resetChatGptImageCacheForTesting();
+ }
+});
+
+test("Image edit: file_0000XXXX (chatgpt-web edit result) falls back to /conversation/.../attachment/.../download", async () => {
+ // The /files/{file_id}/download endpoint rejects the new `file_00000000XXX`
+ // pointer shape that chatgpt.com returns for image-EDIT results — it
+ // 422s. The pointer is conversation-scoped and only resolves through
+ // /conversation/{conv_id}/attachment/{file_id}/download. Without the fallback
+ // we'd render a broken image link or no markdown at all.
+ reset(); + const m = installMockFetch({ + conv: { + status: 200, + events: imageGenEvents({ + pointer: "file-service://file_00000000f1fc7246af3a3934a8c55b9c", + }), + }, + fileDownload: { status: 422, body: { detail: "edit-shape pointer not directly fetchable" } }, + }); + try { + const executor = new ChatGptWebExecutor(); + const result = await executor.execute({ + model: "gpt-5.3-instant", + body: { messages: [{ role: "user", content: "now make it nighttime" }] }, + stream: false, + credentials: { apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + assert.equal(result.response.status, 200); + const json = await result.response.json(); + const content = json.choices[0].message.content; + assert.match(content, /!\[image\]\([^)]*\/v1\/chatgpt-web\/image\/[a-f0-9]+\)/, "image rendered via fallback"); + assert.equal(m.calls.fileDownload, 1, "tried /files/ first"); + assert.equal(m.calls.attachmentDownload, 1, "fell back to /conversation/.../attachment/.../download"); + assert.equal(m.calls.signedDownload, 1, "fetched signed bytes once"); + } finally { + m.restore(); + } +}); + +test("Image gen: ChatGPT-internal tool_invoked metadata does NOT spuriously trigger image gen heartbeats", async () => { + // Regression for: the executor used to set imageGenAsync = true on any + // server_ste_metadata event with `tool_invoked: true`, but ChatGPT marks + // *all* internal tool usage (reasoning, web search, calc, file_search) + // with that flag. Plain text turns ended up emitting "Generating image…" + // text and a 30s WebSocket wait. Specific image-gen signals only. 
+ reset(); + const events = [ + { + type: "server_ste_metadata", + metadata: { tool_invoked: true, turn_use_case: "default" }, + }, + { + conversation_id: "conv-x", + message: { + id: "msg-x", + author: { role: "assistant" }, + content: { content_type: "text", parts: ["GPT-4o-mini has weaker reasoning."] }, + status: "in_progress", + }, + }, + { + conversation_id: "conv-x", + message: { + id: "msg-x", + author: { role: "assistant" }, + content: { content_type: "text", parts: ["GPT-4o-mini has weaker reasoning."] }, + status: "finished_successfully", + }, + }, + ]; + const m = installMockFetch({ conv: { status: 200, events } }); + try { + const executor = new ChatGptWebExecutor(); + const result = await executor.execute({ + model: "gpt-5.3-instant", + body: { messages: [{ role: "user", content: "limitations of gpt-4o-mini?" }] }, + stream: true, + credentials: { apiKey: "test" }, + signal: AbortSignal.timeout(10_000), + log: null, + }); + const reader = result.response.body.getReader(); + const decoder = new TextDecoder(); + let body = ""; + while (true) { + const { value, done } = await reader.read(); + if (done) break; + body += decoder.decode(value); + } + assert.doesNotMatch(body, /Generating image/, "no spurious image-gen placeholder text"); + assert.match(body, /weaker reasoning/, "actual answer streamed"); + } finally { + m.restore(); + } +}); + +test("Image edit handler: bytes-hash match drives executor with cached conversation context", async () => { + // The /v1/images/edits flow exists because Open WebUI's image-edit toggle + // posts multipart bodies (prompt + uploaded image bytes) and would + // otherwise trip Next.js's Server Action handler. We hash the uploaded + // bytes, find the cached entry, and synthesize a chat thread that drives + // the executor through its continuation path — same code paths as the + // chat-message edit flow. 
+ reset(); + const cacheMod = await import("../../open-sse/services/chatgptImageCache.ts"); + cacheMod.__resetChatGptImageCacheForTesting(); + const sourceBytes = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 1, 2, 3]); + cacheMod.storeChatGptImage(sourceBytes, "image/png", 60_000, { + conversationId: "conv-edit-handler", + parentMessageId: "msg-edit-handler", + }); + + const m = installMockFetch({ + // Conv response must include an image pointer so the handler sees + // markdown in the assistant message and treats the edit as successful. + conv: { status: 200, events: imageGenEvents({ pointer: "file-service://file-edited-day", text: "Done:" }) }, + }); + try { + const { handleImageEdit } = await import( + "../../open-sse/handlers/imageGeneration.ts" + ); + const result = await handleImageEdit({ + provider: "chatgpt-web", + model: "gpt-5.3-instant", + body: { prompt: "turn it to day time" }, + imageBytes: sourceBytes, + credentials: { apiKey: "test" }, + log: null, + }); + assert.equal(result.success, true, `expected success, got error: ${(result as any).error}`); + const convIdx = m.calls.urls.findIndex((u) => u.endsWith("/backend-api/f/conversation")); + assert.ok(convIdx >= 0, "conversation request was sent"); + const sentBody = JSON.parse(m.calls.bodies[convIdx]); + assert.equal(sentBody.conversation_id, "conv-edit-handler"); + assert.equal(sentBody.parent_message_id, "msg-edit-handler"); + assert.equal(sentBody.history_and_training_disabled, false); + assert.match( + sentBody.messages[sentBody.messages.length - 1].content.parts[0], + /day time/ + ); + } finally { + m.restore(); + } +}); + +test("Image edit handler: no cached match returns 400 (does not silently generate unrelated image)", async () => { + // If the user uploads a foreign image (or the cache TTL elapsed), there's + // no chatgpt.com conversation node to continue and chatgpt-web's image_gen + // tool can't actually edit. 
Surface that with a clear, actionable error + // instead of generating an unrelated image and confusing the user. + reset(); + const cacheMod = await import("../../open-sse/services/chatgptImageCache.ts"); + cacheMod.__resetChatGptImageCacheForTesting(); + + const m = installMockFetch(); + try { + const { handleImageEdit } = await import( + "../../open-sse/handlers/imageGeneration.ts" + ); + const foreignBytes = Buffer.from([0xff, 0xd8, 0xff, 0xe0, 0xde, 0xad, 0xbe, 0xef]); + const result = await handleImageEdit({ + provider: "chatgpt-web", + model: "gpt-5.3-instant", + body: { prompt: "turn it to day time" }, + imageBytes: foreignBytes, + credentials: { apiKey: "test" }, + log: null, + }); + assert.equal(result.success, false); + assert.equal(result.status, 400); + assert.match(String(result.error), /generated through this OmniRoute instance/); + assert.equal(m.calls.session, 0, "no upstream calls were attempted"); + assert.equal(m.calls.conv, 0, "no chat-completion was attempted"); + } finally { + m.restore(); + } +}); + +test("Image gen handler: n>4 is rejected before any upstream call", async () => { + // Each chatgpt-web image is a separate ~30s chat turn. Without a clamp, + // body.n=1000 would pin the executor for hours before HTTP timeout. + // Verify the cap rejects at the boundary without burning a single upstream + // request — important so a rogue client can't trivially DoS the worker. 
+ reset(); + const m = installMockFetch(); + try { + const { handleImageGeneration } = await import( + "../../open-sse/handlers/imageGeneration.ts" + ); + const result = await handleImageGeneration({ + body: { prompt: "draw a kitten", n: 5, model: "cgpt-web/gpt-5.3-instant" }, + credentials: { apiKey: "test" }, + log: null, + }); + assert.equal(result.success, false); + assert.equal(result.status, 400); + assert.match(String(result.error), /n=1\.\.4/); + assert.equal(m.calls.session, 0, "no session exchange was attempted"); + assert.equal(m.calls.conv, 0, "no conversation request was attempted"); + } finally { + m.restore(); + } +}); + +test("Image cache: deleting an entry decrements the byte counter", async () => { + // Regression guard: an earlier draft tracked entry count but not bytes, + // and TTL eviction removed the entry without crediting back its size — + // the counter would only ever grow. + reset(); + const cacheMod = await import("../../open-sse/services/chatgptImageCache.ts"); + cacheMod.__resetChatGptImageCacheForTesting(); + const id = cacheMod.storeChatGptImage(Buffer.alloc(1024, 7), "image/png", 10); + assert.equal(cacheMod.__getChatGptImageCacheBytesForTesting(), 1024); + // Wait past the 10 ms TTL, then trigger eviction by reading. + await new Promise((r) => setTimeout(r, 25)); + assert.equal(cacheMod.getChatGptImage(id), null, "entry expired"); + assert.equal(cacheMod.__getChatGptImageCacheBytesForTesting(), 0, "bytes credited back on TTL evict"); +});