feat(chatgpt-web): image generation + edit (Open WebUI compatible) (#1607)

This commit is contained in: release/v3.7.0
Payne 2026-04-26 05:51:37 +03:00 committed by GitHub
parent 13495d4d13
commit 8a8fcc77a8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 3187 additions and 80 deletions

View file

@@ -195,6 +195,24 @@ CLOUD_URL=
# Default: http://localhost:20128
NEXT_PUBLIC_BASE_URL=http://localhost:20128
# Browser-facing OmniRoute origin for generated assets in API responses.
# Used by: chatgpt-web image generation cache URLs (/v1/chatgpt-web/image/<id>).
# Set this when Open WebUI or another relay reaches OmniRoute by an internal URL
# but the user's browser must fetch images from a LAN, tunnel, or public origin.
# Do not include /v1; if included accidentally it will be normalized away.
# OMNIROUTE_PUBLIC_BASE_URL=http://192.168.0.15:20128
# Max wait time for an async chatgpt-web image to land via the celsius
# WebSocket, in milliseconds. Default 180000 (3 minutes). Increase it when
# the upstream queue is deep ("Lots of people are creating images right now").
# OMNIROUTE_CGPT_WEB_IMAGE_TIMEOUT_MS=180000
# Total in-memory byte budget for the chatgpt-web image cache (used to serve
# /v1/chatgpt-web/image/<id>), in megabytes. Default 256. Lower this if you
# run OmniRoute on a memory-constrained host; raise it if image generation
# is heavy and clients are racing the 30-minute TTL.
# OMNIROUTE_CGPT_WEB_IMAGE_CACHE_MAX_MB=256
# Public cloud URL — client-side mirror of CLOUD_URL.
NEXT_PUBLIC_CLOUD_URL=
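
As an illustration of the /v1 normalization noted above, a minimal sketch (the helper name is hypothetical; the actual logic lives in the image handler):

// Hypothetical sketch: strip trailing slashes and a trailing /v1 segment so
// generated asset URLs never gain a doubled /v1.
function normalizePublicBaseUrl(raw: string): string {
  return raw.replace(/\/+$/, "").replace(/\/v1$/i, "");
}
// normalizePublicBaseUrl("http://192.168.0.15:20128/v1") -> "http://192.168.0.15:20128"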
@@ -261,6 +279,7 @@ NEXT_PUBLIC_ENABLE_SOCKS5_PROXY=true
# Used by MCP server, A2A skills, and CLI sidecars to call the running instance.
# Explicit base URL for MCP/A2A tools to reach OmniRoute (overrides localhost auto-detect).
# For browser-visible generated image URLs, prefer OMNIROUTE_PUBLIC_BASE_URL above.
# Used by: open-sse/mcp-server/server.ts, src/lib/a2a/
# OMNIROUTE_BASE_URL=http://localhost:20128

View file

@@ -18,6 +18,20 @@ const nextConfig = {
},
},
output: "standalone",
// OmniRoute is a proxy for AI APIs — request bodies routinely include
// multi-MB payloads (vision models, image edits, base64-encoded files,
// long chat histories with embedded images). Next.js's Server Action
// handler intercepts POSTs with multipart/form-data or
// x-www-form-urlencoded content-types and enforces a 1 MB cap that
// surfaces as a 413 with a confusing "Server Actions" hint, even on
// pure route handlers. 50 MB matches what most upstream LLM providers
// accept for image-bearing requests; tune via env if a deployment needs
// more.
experimental: {
serverActions: {
bodySizeLimit: process.env.OMNIROUTE_SERVER_ACTIONS_BODY_LIMIT || "50mb",
},
},
outputFileTracingRoot: projectRoot,
outputFileTracingExcludes: {
// Planning/task docs are not runtime assets and can break standalone copies

View file

@@ -136,6 +136,17 @@ export const IMAGE_PROVIDERS: Record<string, ImageProviderConfig> = {
supportedSizes: ["512x512", "1024x1024", "1024x1536", "1536x1024"],
},
"chatgpt-web": {
id: "chatgpt-web",
alias: "cgpt-web",
baseUrl: "https://chatgpt.com/backend-api/f/conversation",
authType: "apikey",
authHeader: "cookie",
format: "chatgpt-web",
models: [{ id: "gpt-5.3-instant", name: "GPT-5.3 Instant (ChatGPT Web Image)" }],
supportedSizes: ["1024x1024", "1024x1536", "1536x1024"],
},
xai: {
id: "xai",
baseUrl: "https://api.x.ai/v1/images/generations",

File diff suppressed because it is too large

View file

@@ -19,6 +19,12 @@ import { randomUUID } from "crypto";
import { getImageProvider, parseImageModel } from "../config/imageRegistry.ts";
import { mapImageSize } from "../translator/image/sizeMapper.ts";
import { getCodexClientVersion, getCodexUserAgent } from "../config/codexClient.ts";
import { ChatGptWebExecutor } from "../executors/chatgpt-web.ts";
import {
getChatGptImage,
findChatGptImageBySha256,
} from "../services/chatgptImageCache.ts";
import { createHash } from "node:crypto";
import { saveCallLog } from "@/lib/usageDb";
import {
submitComfyWorkflow,
@@ -113,7 +119,14 @@ const FAL_PRESET_SIZES = {
* @param {object} options.log - Logger
* @param {string} [options.resolvedProvider] - Pre-resolved provider ID (from route layer custom model resolution)
*/
export async function handleImageGeneration({ body, credentials, log, resolvedProvider = null }) {
export async function handleImageGeneration({
body,
credentials,
log,
resolvedProvider = null,
signal = null,
clientHeaders = null,
}) {
let provider, model;
if (resolvedProvider) {
@@ -257,6 +270,18 @@ export async function handleImageGeneration({ body, credentials, log, resolvedProvider = null }) {
});
}
if (providerConfig.format === "chatgpt-web") {
return handleChatGptWebImageGeneration({
model,
provider,
body,
credentials,
log,
signal,
clientHeaders,
});
}
if (providerConfig.format === "nanobanana") {
return handleNanoBananaImageGeneration({
model,
@@ -537,6 +562,370 @@ async function handleOpenAIImageGeneration({
return result;
}
const CHATGPT_WEB_IMAGE_MARKDOWN_RE = /!\[[^\]]*\]\(([^)\s]+)\)/g;
const CHATGPT_WEB_IMAGE_ID_RE = /\/v1\/chatgpt-web\/image\/([a-f0-9]{16,64})(?=[?\s"'<>)]|$)/i;
function extractMarkdownImageUrls(text: string): string[] {
const urls: string[] = [];
// String.prototype.matchAll operates on a fresh clone of the regex, so the
// shared global regex's lastIndex is never advanced and no manual reset is
// needed between calls.
for (const match of text.matchAll(CHATGPT_WEB_IMAGE_MARKDOWN_RE)) {
if (match[1]) urls.push(match[1]);
}
return urls;
}
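
For intuition, here is what the two regexes above pull out of a completed chat turn (the id is made up):

// Illustration only: content mimics the markdown delta the executor emits.
const content =
  "Here you go:\n![image](http://internal/v1/chatgpt-web/image/0123456789abcdef0123456789abcdef)";
const urls = extractMarkdownImageUrls(content);
// urls[0] === "http://internal/v1/chatgpt-web/image/0123456789abcdef0123456789abcdef"
const id = urls[0]?.match(CHATGPT_WEB_IMAGE_ID_RE)?.[1];
// id === "0123456789abcdef0123456789abcdef", the key getChatGptImage() expects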
function buildChatGptWebImagePrompt(body): string {
const prompt = String(body.prompt || "").trim();
const details: string[] = [`Create an image for this prompt: ${prompt}`];
if (typeof body.size === "string" && body.size.trim()) {
details.push(`Requested size: ${body.size.trim()}.`);
}
if (typeof body.quality === "string" && body.quality.trim()) {
details.push(`Requested quality: ${body.quality.trim()}.`);
}
if (typeof body.style === "string" && body.style.trim()) {
details.push(`Requested style: ${body.style.trim()}.`);
}
return details.join("\n");
}
async function handleChatGptWebImageGeneration({
model,
provider,
body,
credentials,
log,
signal,
clientHeaders,
}) {
const startTime = Date.now();
const prompt = typeof body.prompt === "string" ? body.prompt.trim() : "";
if (!prompt) {
return saveImageErrorResult({
provider,
model,
status: 400,
startTime,
error: "Prompt is required for ChatGPT Web image generation",
});
}
if (!credentials?.apiKey) {
return saveImageErrorResult({
provider,
model,
status: 401,
startTime,
error: "ChatGPT Web credentials missing session cookie",
});
}
// Each image is one chatgpt.com chat turn (~30s). Cap at 4 (in line with
// OpenAI's own n limits for gpt-image-1 / dall-e-3) so a stray n=1000 doesn't pin the
// executor for hours before the upstream HTTP timeout fires.
const CHATGPT_WEB_IMAGE_N_MAX = 4;
const rawCount =
Number.isInteger(body.n) && (body.n as number) > 0 ? (body.n as number) : 1;
if (rawCount > CHATGPT_WEB_IMAGE_N_MAX) {
return saveImageErrorResult({
provider,
model,
status: 400,
startTime,
error: `ChatGPT Web image generation supports n=1..${CHATGPT_WEB_IMAGE_N_MAX} (got ${rawCount}); each n is a separate ~30s chat turn.`,
});
}
const requestedCount = rawCount;
if (log && requestedCount > 1) {
log.warn(
"IMAGE",
`ChatGPT Web returns one image per chat turn; requested n=${requestedCount} will run sequentially`
);
}
const wantsBase64 = body.response_format === "b64_json";
const images: Array<{ url?: string; b64_json?: string }> = [];
const requestBody = {
model,
prompt: prompt.slice(0, 500),
size: body.size || undefined,
quality: body.quality || undefined,
};
for (let i = 0; i < requestedCount; i++) {
const executor = new ChatGptWebExecutor();
const result = await executor.execute({
model,
body: {
messages: [{ role: "user", content: buildChatGptWebImagePrompt(body) }],
},
stream: false,
credentials,
signal,
log,
clientHeaders,
});
const responseText = await result.response.text();
if (result.response.status >= 400) {
return saveImageErrorResult({
provider,
model,
status: result.response.status,
startTime,
error: responseText,
requestBody,
});
}
let content = "";
try {
const json = JSON.parse(responseText);
content = String(json?.choices?.[0]?.message?.content || "");
} catch {
content = responseText;
}
const urls = extractMarkdownImageUrls(content);
if (urls.length === 0) {
return saveImageErrorResult({
provider,
model,
status: 502,
startTime,
error: `ChatGPT Web completed without returning image markdown: ${content.slice(0, 300)}`,
requestBody,
});
}
for (const url of urls) {
if (!wantsBase64) {
images.push({ url });
continue;
}
const id = url.match(CHATGPT_WEB_IMAGE_ID_RE)?.[1];
const cached = id ? getChatGptImage(id) : null;
if (!cached) {
return saveImageErrorResult({
provider,
model,
status: 502,
startTime,
error: "ChatGPT Web image bytes expired before b64_json conversion",
requestBody,
});
}
images.push({ b64_json: cached.bytes.toString("base64") });
}
}
return saveImageSuccessResult({
provider,
model,
startTime,
requestBody,
responseBody: { images_count: images.length },
images,
});
}
/**
* Handle a multipart /v1/images/edits request for chatgpt-web. Open WebUI
* uploads the prior image's bytes; we hash them and look up our cache.
*
* The hash match is reliable because Open WebUI's image-gen pipeline
* downloads our /v1/chatgpt-web/image/<id> URL byte-for-byte and re-serves
* those exact bytes through its own file store. When the user asks to edit
* the image, OWUI uploads the same bytes back to us via multipart: same
* hash, so we find the conversation context and drive the executor with a
* synthetic chat thread that triggers continuation mode.
*
* No-match cases (cache evicted by TTL, or the user uploaded a foreign
* image) get a clear 400. We can't actually edit an image we don't have a
* conversation context for: chatgpt.com's image_gen tool needs the
* original conversation node, and we don't have a path to upload bytes
* directly.
*/
export async function handleImageEdit({
provider,
model,
body,
imageBytes,
credentials,
log,
signal = null,
clientHeaders = null,
}: {
provider: string;
model: string;
body: Record<string, any>;
imageBytes: Buffer;
imageMime?: string; // accepted for symmetry with route layer; not used
credentials: any;
log: any;
signal?: AbortSignal | null;
clientHeaders?: Record<string, string> | null;
}) {
const startTime = Date.now();
const prompt = typeof body.prompt === "string" ? body.prompt.trim() : "";
if (!prompt) {
return saveImageErrorResult({
provider,
model,
status: 400,
startTime,
error: "Prompt is required for image edit",
});
}
if (!credentials?.apiKey) {
return saveImageErrorResult({
provider,
model,
status: 401,
startTime,
error: "ChatGPT Web credentials missing session cookie",
});
}
const imageHash = createHash("sha256").update(imageBytes).digest("hex");
const cached = findChatGptImageBySha256(imageHash);
const wantsBase64 = body.response_format === "b64_json";
const requestBody = {
model,
prompt: prompt.slice(0, 500),
size: body.size || undefined,
image_hash: imageHash.slice(0, 16),
image_bytes: imageBytes.length,
cached_match: Boolean(cached?.entry.context),
};
if (!cached?.entry.context) {
// chatgpt-web's image_gen tool can only edit an image when we continue
// the original conversation node. If we never generated this image (or
// its 30-minute TTL elapsed), there's no node to continue. Return a
// clear, actionable error — much better than silently spawning an
// unrelated image and confusing the user.
log?.warn?.(
"IMAGE",
`chatgpt-web edit: no cached match for sha256=${imageHash.slice(0, 16)} (bytes=${imageBytes.length}); returning 400`
);
return saveImageErrorResult({
provider,
model,
status: 400,
startTime,
error:
"chatgpt-web image edit only works for images recently generated through this OmniRoute instance " +
"(cache window: 30 minutes). Re-generate the image and try the edit immediately, or disable image-edit " +
"in your client to use plain chat-completion edit prompts instead.",
requestBody,
});
}
// Build a synthetic chat thread that surfaces the cached image URL on
// the assistant turn. The executor's parseOpenAIMessages picks up the
// URL, findCachedImageContext resolves it to {conversationId,
// parentMessageId}, and looksLikeImageEditRequest fires on the user
// prompt — together producing a continuation request that actually
// edits the saved image.
//
// The synthetic user prompt is anchored with both an edit verb AND an
// image-gen verb so the executor's heuristics fire regardless of what
// wording the caller used ("now make it brighter", "tweak this", ...):
// - looksLikeImageEditRequest: matches "edit" + "image" within 120 chars
// - looksLikeImageGenRequest: matches "generate" + "image" within 40 chars
// Either match alone would set forImageGen, but covering both is cheap
// insurance for prompts that don't fit common phrasings.
const messages: Array<{ role: string; content: string }> = [
{
role: "assistant",
// The base URL is irrelevant — only the path is parsed by
// CACHED_IMAGE_URL_RE in the executor's findCachedImageContext.
content: `![image](http://internal/v1/chatgpt-web/image/${cached.id})`,
},
{
role: "user",
content: `Edit the image and generate the new image: ${prompt}`,
},
];
const executor = new ChatGptWebExecutor();
const result = await executor.execute({
model,
body: { messages },
stream: false,
credentials,
signal,
log,
clientHeaders,
});
const responseText = await result.response.text();
if (result.response.status >= 400) {
return saveImageErrorResult({
provider,
model,
status: result.response.status,
startTime,
error: responseText,
requestBody,
});
}
let content = "";
try {
const json = JSON.parse(responseText);
content = String(json?.choices?.[0]?.message?.content || "");
} catch {
content = responseText;
}
const urls = extractMarkdownImageUrls(content);
if (urls.length === 0) {
return saveImageErrorResult({
provider,
model,
status: 502,
startTime,
error: `ChatGPT Web edit completed without returning image markdown: ${content.slice(0, 300)}`,
requestBody,
});
}
const images: Array<{ url?: string; b64_json?: string }> = [];
for (const url of urls) {
if (!wantsBase64) {
images.push({ url });
continue;
}
const id = url.match(CHATGPT_WEB_IMAGE_ID_RE)?.[1];
const cachedNew = id ? getChatGptImage(id) : null;
if (!cachedNew) {
return saveImageErrorResult({
provider,
model,
status: 502,
startTime,
error: "ChatGPT Web image bytes expired before b64_json conversion",
requestBody,
});
}
images.push({ b64_json: cachedNew.bytes.toString("base64") });
}
return saveImageSuccessResult({
provider,
model,
startTime,
requestBody,
responseBody: { images_count: images.length, edit_match: Boolean(cached?.entry.context) },
images,
});
}
async function handleFalAIImageGeneration({
model,
provider,

View file

@@ -0,0 +1,148 @@
/**
* In-memory cache for ChatGPT-generated images so we can serve them via a
* regular HTTP URL instead of inlining megabytes of base64 into SSE deltas.
*
* Why: chatgpt.com's `image_asset_pointer` resolves to a session-signed
* `estuary/content` URL that 403s for any anonymous client. We have to
* download the bytes server-side (with the user's session) and re-serve
* them. Streaming the raw base64 back through SSE works but Open WebUI's
* progressive markdown renderer displays each chunk as text mid-stream:
* the user sees ~3 MB of base64 scroll past before the final `)` arrives
* and the renderer recognizes it as an image. Hosting the image on a
* regular URL avoids that entirely: we emit a tiny `![image](http://...)`
* markdown delta and the browser fetches the image normally.
*
* The cache is in-memory only, with a short TTL: these URLs are single-use
* artifacts of one chat turn, not persistent assets. If the user reloads
* the conversation in a few hours the URLs will 404; that's expected.
*/
import { createHash, randomUUID } from "node:crypto";
interface CachedImage {
bytes: Buffer;
mime: string;
expiresAt: number;
context?: ChatGptImageConversationContext;
/** sha256(bytes) used by /v1/images/edits to correlate an uploaded
* image (Open WebUI re-uploads the bytes via multipart) back to the
* conversation context we cached when the image was first generated. */
bytesSha256: string;
}
const cache = new Map<string, CachedImage>();
let cacheBytes = 0;
const DEFAULT_TTL_MS = 30 * 60 * 1000;
const MAX_ENTRIES = 200;
// Per-entry images cap at 8 MB (enforced upstream in the executor), so the
// 256 MB default covers ~32 large images. The byte cap matters more than the
// entry count: a hot
// loop of 8 MB images would otherwise pin 1.6 GB of RSS before count
// eviction kicked in. Tune via OMNIROUTE_CGPT_WEB_IMAGE_CACHE_MAX_MB.
const DEFAULT_MAX_BYTES = 256 * 1024 * 1024;
function configuredMaxBytes(): number {
const raw = Number(process.env.OMNIROUTE_CGPT_WEB_IMAGE_CACHE_MAX_MB);
if (!Number.isFinite(raw) || raw <= 0) return DEFAULT_MAX_BYTES;
return Math.floor(raw * 1024 * 1024);
}
export interface ChatGptImageConversationContext {
conversationId: string;
parentMessageId: string;
}
function deleteEntry(id: string): void {
const entry = cache.get(id);
if (!entry) return;
cacheBytes -= entry.bytes.length;
cache.delete(id);
}
function evictExpired(now = Date.now()): void {
for (const [id, entry] of cache) {
if (now >= entry.expiresAt) deleteEntry(id);
}
}
function evictUntilWithinLimits(maxBytes: number, incomingBytes: number): void {
// Drop oldest until both the entry-count and total-byte caps are satisfied.
// Map iteration is insertion-ordered so the first key is the oldest entry.
while (
(cache.size >= MAX_ENTRIES || cacheBytes + incomingBytes > maxBytes) &&
cache.size > 0
) {
const firstKey = cache.keys().next().value;
if (!firstKey) break;
deleteEntry(firstKey);
}
}
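For concreteness: with the 256 MB default and 252 MB already resident, storing one more 8 MB image evicts oldest entries until 252 + 8 <= 256 holds (here a single 8 MB eviction suffices), while the 200-entry count cap triggers independently of bytes.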
export function storeChatGptImage(
bytes: Buffer,
mime: string,
ttlMs = DEFAULT_TTL_MS,
context?: ChatGptImageConversationContext
): string {
evictExpired();
evictUntilWithinLimits(configuredMaxBytes(), bytes.length);
const id = randomUUID().replace(/-/g, "");
const bytesSha256 = createHash("sha256").update(bytes).digest("hex");
cache.set(id, {
bytes,
mime,
expiresAt: Date.now() + ttlMs,
context,
bytesSha256,
});
cacheBytes += bytes.length;
return id;
}
export function getChatGptImage(id: string): CachedImage | null {
evictExpired();
const entry = cache.get(id);
if (!entry) return null;
if (Date.now() >= entry.expiresAt) {
deleteEntry(id);
return null;
}
return entry;
}
export function getChatGptImageConversationContext(
id: string
): ChatGptImageConversationContext | null {
return getChatGptImage(id)?.context ?? null;
}
/**
* Look up a cached entry by sha256(bytes). Used by /v1/images/edits to
* correlate Open WebUI's re-uploaded image back to the conversation
* context we cached at generation time, so the executor can continue the
* saved chatgpt.com conversation node and actually edit the image instead
* of generating an unrelated one from scratch.
*/
export function findChatGptImageBySha256(
hash: string
): { id: string; entry: CachedImage } | null {
evictExpired();
const target = hash.toLowerCase();
for (const [id, entry] of cache.entries()) {
if (entry.bytesSha256 === target) {
if (Date.now() < entry.expiresAt) return { id, entry };
deleteEntry(id);
}
}
return null;
}
/** Test-only: clear the cache between tests. */
export function __resetChatGptImageCacheForTesting(): void {
cache.clear();
cacheBytes = 0;
}
/** Test-only: peek at current resident-byte total. */
export function __getChatGptImageCacheBytesForTesting(): number {
return cacheBytes;
}
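
A test-style sketch of the round-trip the handlers rely on, using only the exports above (ids and bytes are made up):

import { createHash } from "node:crypto";

// Generation path: store the bytes plus the conversation context.
const bytes = Buffer.from("fake-png-bytes");
const id = storeChatGptImage(bytes, "image/png", 30 * 60 * 1000, {
  conversationId: "conv-123",   // made-up values
  parentMessageId: "msg-456",
});

// Image route: serve the bytes back by id until the TTL lapses.
const hit = getChatGptImage(id); // { bytes, mime: "image/png", ... } or null

// Edit route: correlate a re-uploaded copy by content hash.
const sha = createHash("sha256").update(bytes).digest("hex");
const match = findChatGptImageBySha256(sha); // { id, entry } while unexpired, else null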

View file

@@ -110,6 +110,14 @@ export interface TlsFetchOptions {
stream?: boolean;
/** EOF marker the upstream sends to signal end of stream (default: "[DONE]"). */
streamEofSymbol?: string;
/**
* If true, instructs the underlying tls-client to return the response body
* as a base64 `data:<mime>;base64,...` string (so binary payloads survive
* the JSON marshalling step). Required for image / binary downloads;
* without it, raw bytes get UTF-8-decoded and any non-ASCII byte is
* mangled. Default false (text mode).
*/
byteResponse?: boolean;
}
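
Given that contract, turning the marshalled body back into raw bytes is mechanical; a hedged sketch (the helper is ours, not this diff's):

// Decode the `data:<mime>;base64,<payload>` string byteResponse promises
// back into its MIME type and raw bytes.
function decodeDataUrl(dataUrl: string): { mime: string; bytes: Buffer } {
  const match = dataUrl.match(/^data:([^;,]+);base64,(.*)$/s);
  if (!match) throw new Error("not a base64 data: URL");
  return { mime: match[1], bytes: Buffer.from(match[2], "base64") };
}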
export interface TlsFetchResult {
@@ -161,6 +169,7 @@ export async function tlsFetchChatGpt(
timeoutMilliseconds: options.timeoutMs ?? DEFAULT_TIMEOUT_MS,
followRedirects: true,
withRandomTLSExtensionOrder: true,
isByteResponse: options.byteResponse === true,
};
if (options.stream) {

View file

@@ -31,6 +31,19 @@ export async function OPTIONS() {
export async function POST(request) {
await ensureInitialized();
// One-line marker for diagnosing 413 / Server-Action interceptions.
// Logs only when Content-Length is present and exceeds 256 KB, so debug
// noise stays low for typical chat payloads. Toggle off via
// OMNIROUTE_LOG_REQUEST_SHAPE=0.
if (process.env.OMNIROUTE_LOG_REQUEST_SHAPE !== "0") {
const ct = request.headers.get("content-type") ?? "";
const cl = request.headers.get("content-length");
if (cl && Number(cl) > 256 * 1024) {
console.error(
`[CHAT-ROUTE] large body content-type="${ct}" content-length=${cl}`
);
}
}
// Prompt injection guard — inspect body before forwarding
try {
const cloned = request.clone();

View file

@@ -0,0 +1,42 @@
import { CORS_HEADERS, handleCorsOptions } from "@/shared/utils/cors";
import { getChatGptImage } from "@omniroute/open-sse/services/chatgptImageCache.ts";
export async function OPTIONS() {
return handleCorsOptions();
}
/**
* Serve a cached ChatGPT-generated image by its opaque cache id.
*
* Auth: intentionally unauthenticated. The id is a 128-bit random UUID and
* the entry has a short TTL, so the URL is unguessable for the lifetime of
* the chat turn. We need it open because it's loaded by the user's BROWSER
* (via an `<img>` tag rendered from markdown); that fetch doesn't carry
* the OmniRoute API key. Rate limiting / abuse protection sit at the
* network layer, the same way they do for any other static asset.
*/
export async function GET(_request: Request, { params }: { params: Promise<{ id: string }> }) {
const { id } = await params;
const entry = getChatGptImage(id);
if (!entry) {
return new Response(JSON.stringify({ error: "Image not found or expired" }), {
status: 404,
headers: { "Content-Type": "application/json", ...CORS_HEADERS },
});
}
// entry.bytes is a Buffer (subclass of Uint8Array); pass it directly.
// Wrapping in `new Uint8Array(...)` would copy the entire payload — up to
// 8 MB per image — for no benefit.
return new Response(entry.bytes, {
status: 200,
headers: {
"Content-Type": entry.mime,
// Allow short browser caching — the id is unique-per-image, so a
// cache hit is fine and saves a round-trip if the user re-renders
// the chat. Beyond the in-memory TTL the URL 404s anyway.
"Cache-Control": "private, max-age=1800",
"Content-Length": String(entry.bytes.length),
...CORS_HEADERS,
},
});
}
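
Putting it together (values illustrative): the chat stream emits a tiny delta such as `![image](http://192.168.0.15:20128/v1/chatgpt-web/image/<id>)`; the browser then GETs that URL and receives 200 with Content-Type: image/png and the cached bytes, or, after the 30-minute TTL, a 404 with {"error":"Image not found or expired"}.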

View file

@@ -0,0 +1,174 @@
import { CORS_ORIGIN } from "@/shared/utils/cors";
import { handleImageEdit } from "@omniroute/open-sse/handlers/imageGeneration.ts";
import {
getProviderCredentials,
clearRecoveredProviderState,
extractApiKey,
isValidApiKey,
} from "@/sse/services/auth";
import { parseImageModel, getImageProvider } from "@omniroute/open-sse/config/imageRegistry.ts";
import { errorResponse, unavailableResponse } from "@omniroute/open-sse/utils/error.ts";
import { HTTP_STATUS } from "@omniroute/open-sse/config/constants.ts";
import * as log from "@/sse/utils/logger";
import { toJsonErrorPayload } from "@/shared/utils/upstreamError";
import { enforceApiKeyPolicy } from "@/shared/utils/apiKeyPolicy";
/**
* /v1/images/edits multipart edit endpoint matching OpenAI's images-edit API.
*
* Open WebUI's "Image Edit" toggle (images.edit.engine = "openai") posts here
* with `prompt` + `image` (file). For chatgpt-web, an "edit" only makes sense
* if the uploaded image was originally generated through OmniRoute; we then
* have its `{conversationId, parentMessageId}` cached and can continue the
* saved chatgpt.com conversation node, which is the only way to actually edit
* the image instead of generating an unrelated one from scratch.
*
* Without this route, multipart bodies trip Next.js's Server Action handler
* (which intercepts ALL POSTs with multipart/form-data content-type) and the
* client gets a confusing "Failed to find Server Action" 500.
*/
export async function OPTIONS() {
return new Response(null, {
headers: {
"Access-Control-Allow-Origin": CORS_ORIGIN,
"Access-Control-Allow-Methods": "POST, OPTIONS",
"Access-Control-Allow-Headers": "*",
},
});
}
const PUBLIC_BASE_URL_HEADER_KEYS = ["host", "x-forwarded-host", "x-forwarded-proto"] as const;
function publicBaseUrlHeaders(headers: Headers): Record<string, string> {
const out: Record<string, string> = {};
for (const key of PUBLIC_BASE_URL_HEADER_KEYS) {
const value = headers.get(key);
if (value !== null) out[key] = value;
}
return out;
}
async function readMultipartImage(formData: FormData): Promise<{
prompt: string;
model: string | null;
size: string | null;
responseFormat: string | null;
imageBytes: Buffer | null;
imageMime: string | null;
}> {
const promptRaw = formData.get("prompt");
const prompt = typeof promptRaw === "string" ? promptRaw.trim() : "";
const modelRaw = formData.get("model");
const model = typeof modelRaw === "string" ? modelRaw.trim() : null;
const sizeRaw = formData.get("size");
const size = typeof sizeRaw === "string" ? sizeRaw.trim() : null;
const respRaw = formData.get("response_format");
const responseFormat = typeof respRaw === "string" ? respRaw.trim() : null;
// OpenAI's API and Open WebUI both accept either a single `image` field or
// an `image[]` array. We use the first image when multiple are sent — the
// chatgpt-web edit tool can only edit one image per conversation node.
const imageEntry = formData.get("image") ?? formData.get("image[]");
if (!imageEntry || typeof imageEntry === "string") {
return { prompt, model, size, responseFormat, imageBytes: null, imageMime: null };
}
const file = imageEntry as File;
const imageBytes = Buffer.from(await file.arrayBuffer());
const imageMime = file.type || "image/png";
return { prompt, model, size, responseFormat, imageBytes, imageMime };
}
export async function POST(request: Request) {
let formData: FormData;
try {
formData = await request.formData();
} catch (err) {
log.warn("IMAGE", `Invalid multipart body: ${err instanceof Error ? err.message : String(err)}`);
return errorResponse(HTTP_STATUS.BAD_REQUEST, "Invalid multipart body");
}
const { prompt, model, size, responseFormat, imageBytes, imageMime } =
await readMultipartImage(formData);
if (!prompt) {
return errorResponse(HTTP_STATUS.BAD_REQUEST, "Missing required field: prompt");
}
if (!imageBytes || imageBytes.length === 0) {
return errorResponse(HTTP_STATUS.BAD_REQUEST, "Missing required field: image");
}
const apiKey = extractApiKey(request);
if (!isValidApiKey(apiKey)) {
const policyError = enforceApiKeyPolicy(apiKey);
if (policyError) {
return new Response(JSON.stringify(policyError.body), {
status: policyError.status,
headers: { "Content-Type": "application/json" },
});
}
}
const fullModel = model || "cgpt-web/gpt-5.3-instant";
const parsed = parseImageModel(fullModel);
const providerConfig = getImageProvider(parsed.provider);
if (!providerConfig) {
return errorResponse(HTTP_STATUS.BAD_REQUEST, `Unknown image provider: ${parsed.provider}`);
}
if (providerConfig.format !== "chatgpt-web") {
// We only implement edit for chatgpt-web today; everything else routes
// through generations which doesn't accept image inputs. Surface a
// useful error rather than silently dropping the image.
return errorResponse(
HTTP_STATUS.BAD_REQUEST,
`Image edit is only supported for chatgpt-web models (got ${parsed.provider})`
);
}
const credentials = await getProviderCredentials(parsed.provider, apiKey);
if (!credentials) {
return errorResponse(HTTP_STATUS.UNAUTHORIZED, `No credentials for provider: ${parsed.provider}`);
}
if (credentials.allRateLimited) {
return unavailableResponse(
HTTP_STATUS.RATE_LIMITED,
`[${parsed.provider}] All accounts rate limited`,
credentials.retryAfter,
credentials.retryAfterHuman
);
}
const result = await handleImageEdit({
provider: parsed.provider,
model: parsed.model,
body: {
prompt,
size: size ?? undefined,
response_format: responseFormat ?? undefined,
n: 1,
},
imageBytes,
imageMime,
credentials,
log,
signal: request.signal,
clientHeaders: publicBaseUrlHeaders(request.headers),
});
if (result.success) {
await clearRecoveredProviderState(credentials);
return new Response(JSON.stringify((result as any).data), {
status: 200,
headers: { "Content-Type": "application/json" },
});
}
const errorPayload = toJsonErrorPayload(
(result as any).error,
"Image edit provider error"
);
return new Response(JSON.stringify(errorPayload), {
status: (result as any).status,
headers: { "Content-Type": "application/json" },
});
}
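
For reference, the client-side call this route expects, as a rough sketch (Node 18+ globals; the file path and key are placeholders):

import { readFile } from "node:fs/promises";

// Field names match readMultipartImage above.
const form = new FormData();
form.append("prompt", "make the sky stormy");
form.append("model", "cgpt-web/gpt-5.3-instant");
form.append("size", "1024x1024");
form.append("response_format", "url");
form.append(
  "image",
  new Blob([await readFile("./generated.png")], { type: "image/png" }),
  "generated.png"
);

const res = await fetch("http://localhost:20128/v1/images/edits", {
  method: "POST",
  headers: { Authorization: "Bearer <omniroute-api-key>" },
  body: form, // fetch sets the multipart boundary itself
});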

View file

@@ -101,6 +101,23 @@ function hasImageGenerationInput(body: Record<string, unknown>) {
return false;
}
// Forward only the host-shaped headers the chatgpt-web image handler needs
// to derive the browser-facing public base URL. Avoid copying the full
// request header set: it's wider than the handler needs (auth tokens,
// content-type, etc.) and `Headers.forEach` collapses repeated values, which
// would silently drop entries if a wider helper were reused for headers
// that can legitimately repeat (e.g., set-cookie).
const PUBLIC_BASE_URL_HEADER_KEYS = ["host", "x-forwarded-host", "x-forwarded-proto"] as const;
function publicBaseUrlHeaders(headers: Headers): Record<string, string> {
const out: Record<string, string> = {};
for (const key of PUBLIC_BASE_URL_HEADER_KEYS) {
const value = headers.get(key);
if (value !== null) out[key] = value;
}
return out;
}
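
For intuition, the downstream handler presumably resolves the browser-facing origin from these three headers, with OMNIROUTE_PUBLIC_BASE_URL taking precedence; a sketch with a helper name of our own:

// Sketch (not this diff's actual code): derive the public origin that
// generated image URLs should use.
function resolvePublicOrigin(h: Record<string, string>): string | null {
  if (process.env.OMNIROUTE_PUBLIC_BASE_URL) return process.env.OMNIROUTE_PUBLIC_BASE_URL;
  const host = h["x-forwarded-host"] ?? h["host"];
  if (!host) return null;
  const proto = h["x-forwarded-proto"] ?? "http";
  return `${proto}://${host}`;
}
// { "x-forwarded-proto": "https", "x-forwarded-host": "ai.example.com" }
//   resolves to "https://ai.example.com"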
export async function POST(request) {
let rawBody;
try {
@@ -228,6 +245,8 @@ export async function POST(request) {
credentials,
log,
...(isCustomModel && { resolvedProvider: provider }),
signal: request.signal,
clientHeaders: publicBaseUrlHeaders(request.headers),
});
if (result.success) {

View file

@@ -66,6 +66,7 @@ export const webRuntimeEnvSchema = z.object({
OMNIROUTE_DISABLE_BACKGROUND_SERVICES: optionalBooleanEnv,
CLOUD_URL: optionalHttpUrl,
NEXT_PUBLIC_CLOUD_URL: optionalHttpUrl,
OMNIROUTE_PUBLIC_BASE_URL: optionalHttpUrl,
OMNIROUTE_BASE_URL: optionalHttpUrl,
BASE_URL: optionalHttpUrl,
NEXT_PUBLIC_BASE_URL: optionalHttpUrl,

View file

@@ -21,6 +21,9 @@ declare namespace NodeJS {
PORT?: string;
API_HOST?: string;
DASHBOARD_PORT?: string;
OMNIROUTE_PUBLIC_BASE_URL?: string;
OMNIROUTE_CGPT_WEB_IMAGE_TIMEOUT_MS?: string;
OMNIROUTE_CGPT_WEB_IMAGE_CACHE_MAX_MB?: string;
OMNIROUTE_BASE_URL?: string;
OMNIROUTE_DISABLE_BACKGROUND_SERVICES?: string;
OMNIROUTE_PORT?: string;

File diff suppressed because it is too large