mirror of
https://github.com/diegosouzapw/OmniRoute.git
synced 2026-04-28 06:19:46 +00:00
feat(chatgpt-web): image generation + edit (Open WebUI compatible) (#1607)
Integrated into release/v3.7.0
This commit is contained in:
parent
13495d4d13
commit
8a8fcc77a8
14 changed files with 3187 additions and 80 deletions
19
.env.example
19
.env.example
|
|
@ -195,6 +195,24 @@ CLOUD_URL=
|
|||
# Default: http://localhost:20128
|
||||
NEXT_PUBLIC_BASE_URL=http://localhost:20128
|
||||
|
||||
# Browser-facing OmniRoute origin for generated assets in API responses.
|
||||
# Used by: chatgpt-web image generation cache URLs (/v1/chatgpt-web/image/<id>).
|
||||
# Set this when OpenWebUI or another relay reaches OmniRoute by an internal URL
|
||||
# but the user's browser must fetch images from a LAN, tunnel, or public origin.
|
||||
# Do not include /v1; if included accidentally it will be normalized away.
|
||||
# OMNIROUTE_PUBLIC_BASE_URL=http://192.168.0.15:20128
|
||||
|
||||
# Max wait time for an async chatgpt-web image to land via the celsius
|
||||
# WebSocket, in milliseconds. Default 180000 (3 minutes). Increase during
|
||||
# upstream queue-deep windows ("Lots of people are creating images right now").
|
||||
# OMNIROUTE_CGPT_WEB_IMAGE_TIMEOUT_MS=180000
|
||||
|
||||
# Total in-memory byte budget for the chatgpt-web image cache (used to serve
|
||||
# /v1/chatgpt-web/image/<id>), in megabytes. Default 256. Lower this if you
|
||||
# run OmniRoute on a memory-constrained host; raise it if image generation
|
||||
# is heavy and clients are racing the 30-minute TTL.
|
||||
# OMNIROUTE_CGPT_WEB_IMAGE_CACHE_MAX_MB=256
|
||||
|
||||
# Public cloud URL — client-side mirror of CLOUD_URL.
|
||||
NEXT_PUBLIC_CLOUD_URL=
|
||||
|
||||
|
|
@ -261,6 +279,7 @@ NEXT_PUBLIC_ENABLE_SOCKS5_PROXY=true
|
|||
# Used by MCP server, A2A skills, and CLI sidecars to call the running instance.
|
||||
|
||||
# Explicit base URL for MCP/A2A tools to reach OmniRoute (overrides localhost auto-detect).
|
||||
# For browser-visible generated image URLs, prefer OMNIROUTE_PUBLIC_BASE_URL above.
|
||||
# Used by: open-sse/mcp-server/server.ts, src/lib/a2a/
|
||||
# OMNIROUTE_BASE_URL=http://localhost:20128
|
||||
|
||||
|
|
|
|||
|
|
@ -18,6 +18,20 @@ const nextConfig = {
|
|||
},
|
||||
},
|
||||
output: "standalone",
|
||||
// OmniRoute is a proxy for AI APIs — request bodies routinely include
|
||||
// multi-MB payloads (vision models, image edits, base64-encoded files,
|
||||
// long chat histories with embedded images). Next.js's Server Action
|
||||
// handler intercepts POSTs with multipart/form-data or
|
||||
// x-www-form-urlencoded content-types and enforces a 1 MB cap that
|
||||
// surfaces as a 413 with a confusing "Server Actions" hint, even on
|
||||
// pure route handlers. 50 MB matches what most upstream LLM providers
|
||||
// accept for image-bearing requests; tune via env if a deployment needs
|
||||
// more.
|
||||
experimental: {
|
||||
serverActions: {
|
||||
bodySizeLimit: process.env.OMNIROUTE_SERVER_ACTIONS_BODY_LIMIT || "50mb",
|
||||
},
|
||||
},
|
||||
outputFileTracingRoot: projectRoot,
|
||||
outputFileTracingExcludes: {
|
||||
// Planning/task docs are not runtime assets and can break standalone copies
|
||||
|
|
|
|||
|
|
@ -136,6 +136,17 @@ export const IMAGE_PROVIDERS: Record<string, ImageProviderConfig> = {
|
|||
supportedSizes: ["512x512", "1024x1024", "1024x1536", "1536x1024"],
|
||||
},
|
||||
|
||||
"chatgpt-web": {
|
||||
id: "chatgpt-web",
|
||||
alias: "cgpt-web",
|
||||
baseUrl: "https://chatgpt.com/backend-api/f/conversation",
|
||||
authType: "apikey",
|
||||
authHeader: "cookie",
|
||||
format: "chatgpt-web",
|
||||
models: [{ id: "gpt-5.3-instant", name: "GPT-5.3 Instant (ChatGPT Web Image)" }],
|
||||
supportedSizes: ["1024x1024", "1024x1536", "1536x1024"],
|
||||
},
|
||||
|
||||
xai: {
|
||||
id: "xai",
|
||||
baseUrl: "https://api.x.ai/v1/images/generations",
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -19,6 +19,12 @@ import { randomUUID } from "crypto";
|
|||
import { getImageProvider, parseImageModel } from "../config/imageRegistry.ts";
|
||||
import { mapImageSize } from "../translator/image/sizeMapper.ts";
|
||||
import { getCodexClientVersion, getCodexUserAgent } from "../config/codexClient.ts";
|
||||
import { ChatGptWebExecutor } from "../executors/chatgpt-web.ts";
|
||||
import {
|
||||
getChatGptImage,
|
||||
findChatGptImageBySha256,
|
||||
} from "../services/chatgptImageCache.ts";
|
||||
import { createHash } from "node:crypto";
|
||||
import { saveCallLog } from "@/lib/usageDb";
|
||||
import {
|
||||
submitComfyWorkflow,
|
||||
|
|
@ -113,7 +119,14 @@ const FAL_PRESET_SIZES = {
|
|||
* @param {object} options.log - Logger
|
||||
* @param {string} [options.resolvedProvider] - Pre-resolved provider ID (from route layer custom model resolution)
|
||||
*/
|
||||
export async function handleImageGeneration({ body, credentials, log, resolvedProvider = null }) {
|
||||
export async function handleImageGeneration({
|
||||
body,
|
||||
credentials,
|
||||
log,
|
||||
resolvedProvider = null,
|
||||
signal = null,
|
||||
clientHeaders = null,
|
||||
}) {
|
||||
let provider, model;
|
||||
|
||||
if (resolvedProvider) {
|
||||
|
|
@ -257,6 +270,18 @@ export async function handleImageGeneration({ body, credentials, log, resolvedPr
|
|||
});
|
||||
}
|
||||
|
||||
if (providerConfig.format === "chatgpt-web") {
|
||||
return handleChatGptWebImageGeneration({
|
||||
model,
|
||||
provider,
|
||||
body,
|
||||
credentials,
|
||||
log,
|
||||
signal,
|
||||
clientHeaders,
|
||||
});
|
||||
}
|
||||
|
||||
if (providerConfig.format === "nanobanana") {
|
||||
return handleNanoBananaImageGeneration({
|
||||
model,
|
||||
|
|
@ -537,6 +562,370 @@ async function handleOpenAIImageGeneration({
|
|||
return result;
|
||||
}
|
||||
|
||||
const CHATGPT_WEB_IMAGE_MARKDOWN_RE = /!\[[^\]]*\]\(([^)\s]+)\)/g;
|
||||
const CHATGPT_WEB_IMAGE_ID_RE = /\/v1\/chatgpt-web\/image\/([a-f0-9]{16,64})(?=[?\s"'<>)]|$)/i;
|
||||
|
||||
function extractMarkdownImageUrls(text: string): string[] {
|
||||
const urls: string[] = [];
|
||||
// String.prototype.matchAll consumes a fresh iterator and ignores the
|
||||
// regex's lastIndex, so no manual reset is required.
|
||||
for (const match of text.matchAll(CHATGPT_WEB_IMAGE_MARKDOWN_RE)) {
|
||||
if (match[1]) urls.push(match[1]);
|
||||
}
|
||||
return urls;
|
||||
}
|
||||
|
||||
function buildChatGptWebImagePrompt(body): string {
|
||||
const prompt = String(body.prompt || "").trim();
|
||||
const details: string[] = [`Create an image for this prompt: ${prompt}`];
|
||||
if (typeof body.size === "string" && body.size.trim()) {
|
||||
details.push(`Requested size: ${body.size.trim()}.`);
|
||||
}
|
||||
if (typeof body.quality === "string" && body.quality.trim()) {
|
||||
details.push(`Requested quality: ${body.quality.trim()}.`);
|
||||
}
|
||||
if (typeof body.style === "string" && body.style.trim()) {
|
||||
details.push(`Requested style: ${body.style.trim()}.`);
|
||||
}
|
||||
return details.join("\n");
|
||||
}
|
||||
|
||||
/**
 * Generate image(s) via the chatgpt.com web backend ("chatgpt-web" provider).
 *
 * Each requested image (`body.n`, capped at 4) is produced by one
 * non-streaming ChatGptWebExecutor chat turn whose prompt is assembled by
 * buildChatGptWebImagePrompt from body.prompt/size/quality/style. The
 * executor's reply is expected to contain markdown image links pointing at
 * /v1/chatgpt-web/image/<id>; those URLs are returned as-is, or — when
 * `response_format` is "b64_json" — resolved against the in-memory image
 * cache and inlined as base64.
 *
 * Error paths all route through saveImageErrorResult: 400 for a missing
 * prompt or out-of-range n, 401 for missing session-cookie credentials,
 * the upstream status for executor failures, and 502 when the reply carries
 * no image markdown or the cached bytes expired before b64 conversion.
 */
async function handleChatGptWebImageGeneration({
  model,
  provider,
  body,
  credentials,
  log,
  signal,
  clientHeaders,
}) {
  const startTime = Date.now();
  const prompt = typeof body.prompt === "string" ? body.prompt.trim() : "";
  if (!prompt) {
    return saveImageErrorResult({
      provider,
      model,
      status: 400,
      startTime,
      error: "Prompt is required for ChatGPT Web image generation",
    });
  }

  // For chatgpt-web, "apiKey" carries the session cookie (authHeader: "cookie").
  if (!credentials?.apiKey) {
    return saveImageErrorResult({
      provider,
      model,
      status: 401,
      startTime,
      error: "ChatGPT Web credentials missing session cookie",
    });
  }

  // Each image is one chatgpt.com chat turn (~30s). Cap at 4 (matches OpenAI's
  // own limit for image-1 / dall-e-3) so a stray n=1000 doesn't pin the
  // executor for hours before the upstream HTTP timeout fires.
  const CHATGPT_WEB_IMAGE_N_MAX = 4;
  // Non-integer / non-positive n silently falls back to 1; only n > max is rejected.
  const rawCount =
    Number.isInteger(body.n) && (body.n as number) > 0 ? (body.n as number) : 1;
  if (rawCount > CHATGPT_WEB_IMAGE_N_MAX) {
    return saveImageErrorResult({
      provider,
      model,
      status: 400,
      startTime,
      error: `ChatGPT Web image generation supports n=1..${CHATGPT_WEB_IMAGE_N_MAX} (got ${rawCount}); each n is a separate ~30s chat turn.`,
    });
  }
  const requestedCount = rawCount;
  if (log && requestedCount > 1) {
    log.warn(
      "IMAGE",
      `ChatGPT Web returns one image per chat turn; requested n=${requestedCount} will run sequentially`
    );
  }

  const wantsBase64 = body.response_format === "b64_json";
  const images: Array<{ url?: string; b64_json?: string }> = [];
  // Truncated/normalized request summary recorded with the call log.
  const requestBody = {
    model,
    prompt: prompt.slice(0, 500),
    size: body.size || undefined,
    quality: body.quality || undefined,
  };

  // One sequential executor turn per requested image.
  for (let i = 0; i < requestedCount; i++) {
    const executor = new ChatGptWebExecutor();
    const result = await executor.execute({
      model,
      body: {
        messages: [{ role: "user", content: buildChatGptWebImagePrompt(body) }],
      },
      stream: false,
      credentials,
      signal,
      log,
      clientHeaders,
    });

    const responseText = await result.response.text();
    if (result.response.status >= 400) {
      return saveImageErrorResult({
        provider,
        model,
        status: result.response.status,
        startTime,
        error: responseText,
        requestBody,
      });
    }

    // The executor replies in OpenAI chat-completion shape; fall back to the
    // raw text when the body isn't JSON.
    let content = "";
    try {
      const json = JSON.parse(responseText);
      content = String(json?.choices?.[0]?.message?.content || "");
    } catch {
      content = responseText;
    }

    const urls = extractMarkdownImageUrls(content);
    if (urls.length === 0) {
      return saveImageErrorResult({
        provider,
        model,
        status: 502,
        startTime,
        error: `ChatGPT Web completed without returning image markdown: ${content.slice(0, 300)}`,
        requestBody,
      });
    }

    for (const url of urls) {
      if (!wantsBase64) {
        images.push({ url });
        continue;
      }
      // b64_json: resolve the /v1/chatgpt-web/image/<id> URL back to the
      // cached bytes and inline them.
      const id = url.match(CHATGPT_WEB_IMAGE_ID_RE)?.[1];
      const cached = id ? getChatGptImage(id) : null;
      if (!cached) {
        return saveImageErrorResult({
          provider,
          model,
          status: 502,
          startTime,
          error: "ChatGPT Web image bytes expired before b64_json conversion",
          requestBody,
        });
      }
      images.push({ b64_json: cached.bytes.toString("base64") });
    }
  }

  return saveImageSuccessResult({
    provider,
    model,
    startTime,
    requestBody,
    responseBody: { images_count: images.length },
    images,
  });
}
|
||||
|
||||
/**
|
||||
* Handle a multipart /v1/images/edits request for chatgpt-web. Open WebUI
|
||||
* uploads the prior image's bytes; we hash them and look up our cache.
|
||||
*
|
||||
* The hash match is reliable because Open WebUI's image-gen pipeline
|
||||
* downloads our /v1/chatgpt-web/image/<id> URL byte-for-byte and re-serves
|
||||
* those exact bytes through its own file store. When the user asks to edit
|
||||
* the image, OWUI uploads the same bytes back to us via multipart — same
|
||||
* hash, we find the conversation context, and drive the executor with a
|
||||
* synthetic chat thread that triggers continuation mode.
|
||||
*
|
||||
* No-match cases (cache evicted by TTL, or the user uploaded a foreign
|
||||
* image) get a clear 400. We can't actually edit an image we don't have a
|
||||
* conversation context for — chatgpt.com's image_gen tool needs the
|
||||
* original conversation node, and we don't have a path to upload bytes
|
||||
* directly.
|
||||
*/
|
||||
export async function handleImageEdit({
|
||||
provider,
|
||||
model,
|
||||
body,
|
||||
imageBytes,
|
||||
credentials,
|
||||
log,
|
||||
signal = null,
|
||||
clientHeaders = null,
|
||||
}: {
|
||||
provider: string;
|
||||
model: string;
|
||||
body: Record<string, any>;
|
||||
imageBytes: Buffer;
|
||||
imageMime?: string; // accepted for symmetry with route layer; not used
|
||||
credentials: any;
|
||||
log: any;
|
||||
signal?: AbortSignal | null;
|
||||
clientHeaders?: Record<string, string> | null;
|
||||
}) {
|
||||
const startTime = Date.now();
|
||||
const prompt = typeof body.prompt === "string" ? body.prompt.trim() : "";
|
||||
if (!prompt) {
|
||||
return saveImageErrorResult({
|
||||
provider,
|
||||
model,
|
||||
status: 400,
|
||||
startTime,
|
||||
error: "Prompt is required for image edit",
|
||||
});
|
||||
}
|
||||
|
||||
if (!credentials?.apiKey) {
|
||||
return saveImageErrorResult({
|
||||
provider,
|
||||
model,
|
||||
status: 401,
|
||||
startTime,
|
||||
error: "ChatGPT Web credentials missing session cookie",
|
||||
});
|
||||
}
|
||||
|
||||
const imageHash = createHash("sha256").update(imageBytes).digest("hex");
|
||||
const cached = findChatGptImageBySha256(imageHash);
|
||||
|
||||
const wantsBase64 = body.response_format === "b64_json";
|
||||
const requestBody = {
|
||||
model,
|
||||
prompt: prompt.slice(0, 500),
|
||||
size: body.size || undefined,
|
||||
image_hash: imageHash.slice(0, 16),
|
||||
image_bytes: imageBytes.length,
|
||||
cached_match: Boolean(cached?.entry.context),
|
||||
};
|
||||
|
||||
if (!cached?.entry.context) {
|
||||
// chatgpt-web's image_gen tool can only edit an image when we continue
|
||||
// the original conversation node. If we never generated this image (or
|
||||
// its 30-minute TTL elapsed), there's no node to continue. Return a
|
||||
// clear, actionable error — much better than silently spawning an
|
||||
// unrelated image and confusing the user.
|
||||
log?.warn?.(
|
||||
"IMAGE",
|
||||
`chatgpt-web edit: no cached match for sha256=${imageHash.slice(0, 16)} (bytes=${imageBytes.length}); returning 400`
|
||||
);
|
||||
return saveImageErrorResult({
|
||||
provider,
|
||||
model,
|
||||
status: 400,
|
||||
startTime,
|
||||
error:
|
||||
"chatgpt-web image edit only works for images recently generated through this OmniRoute instance " +
|
||||
"(cache window: 30 minutes). Re-generate the image and try the edit immediately, or disable image-edit " +
|
||||
"in your client to use plain chat-completion edit prompts instead.",
|
||||
requestBody,
|
||||
});
|
||||
}
|
||||
|
||||
// Build a synthetic chat thread that surfaces the cached image URL on
|
||||
// the assistant turn. The executor's parseOpenAIMessages picks up the
|
||||
// URL, findCachedImageContext resolves it to {conversationId,
|
||||
// parentMessageId}, and looksLikeImageEditRequest fires on the user
|
||||
// prompt — together producing a continuation request that actually
|
||||
// edits the saved image.
|
||||
//
|
||||
// The synthetic user prompt is anchored with both an edit verb AND an
|
||||
// image-gen verb so the executor's heuristics fire regardless of what
|
||||
// wording the caller used ("now make it brighter", "tweak this", ...):
|
||||
// - looksLikeImageEditRequest: matches "edit" + "image" within 120 chars
|
||||
// - looksLikeImageGenRequest: matches "generate" + "image" within 40 chars
|
||||
// Either match alone would set forImageGen, but covering both is cheap
|
||||
// insurance for prompts that don't fit common phrasings.
|
||||
const messages: Array<{ role: string; content: string }> = [
|
||||
{
|
||||
role: "assistant",
|
||||
// The base URL is irrelevant — only the path is parsed by
|
||||
// CACHED_IMAGE_URL_RE in the executor's findCachedImageContext.
|
||||
content: ``,
|
||||
},
|
||||
{
|
||||
role: "user",
|
||||
content: `Edit the image and generate the new image: ${prompt}`,
|
||||
},
|
||||
];
|
||||
|
||||
const executor = new ChatGptWebExecutor();
|
||||
const result = await executor.execute({
|
||||
model,
|
||||
body: { messages },
|
||||
stream: false,
|
||||
credentials,
|
||||
signal,
|
||||
log,
|
||||
clientHeaders,
|
||||
});
|
||||
|
||||
const responseText = await result.response.text();
|
||||
if (result.response.status >= 400) {
|
||||
return saveImageErrorResult({
|
||||
provider,
|
||||
model,
|
||||
status: result.response.status,
|
||||
startTime,
|
||||
error: responseText,
|
||||
requestBody,
|
||||
});
|
||||
}
|
||||
|
||||
let content = "";
|
||||
try {
|
||||
const json = JSON.parse(responseText);
|
||||
content = String(json?.choices?.[0]?.message?.content || "");
|
||||
} catch {
|
||||
content = responseText;
|
||||
}
|
||||
|
||||
const urls = extractMarkdownImageUrls(content);
|
||||
if (urls.length === 0) {
|
||||
return saveImageErrorResult({
|
||||
provider,
|
||||
model,
|
||||
status: 502,
|
||||
startTime,
|
||||
error: `ChatGPT Web edit completed without returning image markdown: ${content.slice(0, 300)}`,
|
||||
requestBody,
|
||||
});
|
||||
}
|
||||
|
||||
const images: Array<{ url?: string; b64_json?: string }> = [];
|
||||
for (const url of urls) {
|
||||
if (!wantsBase64) {
|
||||
images.push({ url });
|
||||
continue;
|
||||
}
|
||||
const id = url.match(CHATGPT_WEB_IMAGE_ID_RE)?.[1];
|
||||
const cachedNew = id ? getChatGptImage(id) : null;
|
||||
if (!cachedNew) {
|
||||
return saveImageErrorResult({
|
||||
provider,
|
||||
model,
|
||||
status: 502,
|
||||
startTime,
|
||||
error: "ChatGPT Web image bytes expired before b64_json conversion",
|
||||
requestBody,
|
||||
});
|
||||
}
|
||||
images.push({ b64_json: cachedNew.bytes.toString("base64") });
|
||||
}
|
||||
|
||||
return saveImageSuccessResult({
|
||||
provider,
|
||||
model,
|
||||
startTime,
|
||||
requestBody,
|
||||
responseBody: { images_count: images.length, edit_match: Boolean(cached?.entry.context) },
|
||||
images,
|
||||
});
|
||||
}
|
||||
|
||||
async function handleFalAIImageGeneration({
|
||||
model,
|
||||
provider,
|
||||
|
|
|
|||
148
open-sse/services/chatgptImageCache.ts
Normal file
148
open-sse/services/chatgptImageCache.ts
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
/**
|
||||
* In-memory cache for ChatGPT-generated images so we can serve them via a
|
||||
* regular HTTP URL instead of inlining megabytes of base64 into SSE deltas.
|
||||
*
|
||||
* Why: chatgpt.com's `image_asset_pointer` resolves to a session-signed
|
||||
* `estuary/content` URL that 403s for any anonymous client. We have to
|
||||
* download the bytes server-side (with the user's session) and re-serve
|
||||
* them. Streaming the raw base64 back through SSE works but Open WebUI's
|
||||
* progressive markdown renderer displays each chunk as text mid-stream —
|
||||
* the user sees ~3 MB of base64 scroll past before the final `)` arrives
|
||||
* and the renderer recognizes it as an image. Hosting the image on a
|
||||
 * regular URL avoids that entirely: we emit a tiny `![image](<url>)`
|
||||
* markdown delta and the browser fetches the image normally.
|
||||
*
|
||||
* The cache is in-memory only, with a short TTL — these URLs are single-use
|
||||
* artifacts of one chat turn, not persistent assets. If the user reloads
|
||||
* the conversation in a few hours the URLs will 404; that's expected.
|
||||
*/
|
||||
|
||||
import { createHash, randomUUID } from "node:crypto";
|
||||
|
||||
interface CachedImage {
|
||||
bytes: Buffer;
|
||||
mime: string;
|
||||
expiresAt: number;
|
||||
context?: ChatGptImageConversationContext;
|
||||
/** sha256(bytes) — used by /v1/images/edits to correlate an uploaded
|
||||
* image (Open WebUI re-uploads the bytes via multipart) back to the
|
||||
* conversation context we cached when the image was first generated. */
|
||||
bytesSha256: string;
|
||||
}
|
||||
|
||||
const cache = new Map<string, CachedImage>();
|
||||
let cacheBytes = 0;
|
||||
const DEFAULT_TTL_MS = 30 * 60 * 1000;
|
||||
const MAX_ENTRIES = 200;
|
||||
// Per-entry images cap at 8 MB (enforced upstream in the executor) so 32 MB
|
||||
// covers ~4 large images. The byte cap matters more than entry count: a hot
|
||||
// loop of 8 MB images would otherwise pin 1.6 GB of RSS before count
|
||||
// eviction kicked in. Tune via OMNIROUTE_CGPT_WEB_IMAGE_CACHE_MAX_MB.
|
||||
const DEFAULT_MAX_BYTES = 256 * 1024 * 1024;
|
||||
|
||||
function configuredMaxBytes(): number {
|
||||
const raw = Number(process.env.OMNIROUTE_CGPT_WEB_IMAGE_CACHE_MAX_MB);
|
||||
if (!Number.isFinite(raw) || raw <= 0) return DEFAULT_MAX_BYTES;
|
||||
return Math.floor(raw * 1024 * 1024);
|
||||
}
|
||||
|
||||
export interface ChatGptImageConversationContext {
|
||||
conversationId: string;
|
||||
parentMessageId: string;
|
||||
}
|
||||
|
||||
function deleteEntry(id: string): void {
|
||||
const entry = cache.get(id);
|
||||
if (!entry) return;
|
||||
cacheBytes -= entry.bytes.length;
|
||||
cache.delete(id);
|
||||
}
|
||||
|
||||
function evictExpired(now = Date.now()): void {
|
||||
for (const [id, entry] of cache) {
|
||||
if (now >= entry.expiresAt) deleteEntry(id);
|
||||
}
|
||||
}
|
||||
|
||||
function evictUntilWithinLimits(maxBytes: number, incomingBytes: number): void {
|
||||
// Drop oldest until both the entry-count and total-byte caps are satisfied.
|
||||
// Map iteration is insertion-ordered so the first key is the oldest entry.
|
||||
while (
|
||||
(cache.size >= MAX_ENTRIES || cacheBytes + incomingBytes > maxBytes) &&
|
||||
cache.size > 0
|
||||
) {
|
||||
const firstKey = cache.keys().next().value;
|
||||
if (!firstKey) break;
|
||||
deleteEntry(firstKey);
|
||||
}
|
||||
}
|
||||
|
||||
export function storeChatGptImage(
|
||||
bytes: Buffer,
|
||||
mime: string,
|
||||
ttlMs = DEFAULT_TTL_MS,
|
||||
context?: ChatGptImageConversationContext
|
||||
): string {
|
||||
evictExpired();
|
||||
evictUntilWithinLimits(configuredMaxBytes(), bytes.length);
|
||||
const id = randomUUID().replace(/-/g, "");
|
||||
const bytesSha256 = createHash("sha256").update(bytes).digest("hex");
|
||||
cache.set(id, {
|
||||
bytes,
|
||||
mime,
|
||||
expiresAt: Date.now() + ttlMs,
|
||||
context,
|
||||
bytesSha256,
|
||||
});
|
||||
cacheBytes += bytes.length;
|
||||
return id;
|
||||
}
|
||||
|
||||
export function getChatGptImage(id: string): CachedImage | null {
|
||||
evictExpired();
|
||||
const entry = cache.get(id);
|
||||
if (!entry) return null;
|
||||
if (Date.now() >= entry.expiresAt) {
|
||||
deleteEntry(id);
|
||||
return null;
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
|
||||
export function getChatGptImageConversationContext(
|
||||
id: string
|
||||
): ChatGptImageConversationContext | null {
|
||||
return getChatGptImage(id)?.context ?? null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Look up a cached entry by sha256(bytes). Used by /v1/images/edits to
|
||||
* correlate Open WebUI's re-uploaded image back to the conversation
|
||||
* context we cached at generation time, so the executor can continue the
|
||||
* saved chatgpt.com conversation node and actually edit the image instead
|
||||
* of generating an unrelated one from scratch.
|
||||
*/
|
||||
export function findChatGptImageBySha256(
|
||||
hash: string
|
||||
): { id: string; entry: CachedImage } | null {
|
||||
evictExpired();
|
||||
const target = hash.toLowerCase();
|
||||
for (const [id, entry] of cache.entries()) {
|
||||
if (entry.bytesSha256 === target) {
|
||||
if (Date.now() < entry.expiresAt) return { id, entry };
|
||||
deleteEntry(id);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Test-only: clear the cache between tests. */
|
||||
export function __resetChatGptImageCacheForTesting(): void {
|
||||
cache.clear();
|
||||
cacheBytes = 0;
|
||||
}
|
||||
|
||||
/** Test-only: peek at current resident-byte total. */
|
||||
export function __getChatGptImageCacheBytesForTesting(): number {
|
||||
return cacheBytes;
|
||||
}
|
||||
|
|
@ -110,6 +110,14 @@ export interface TlsFetchOptions {
|
|||
stream?: boolean;
|
||||
/** EOF marker the upstream sends to signal end of stream (default: "[DONE]"). */
|
||||
streamEofSymbol?: string;
|
||||
/**
|
||||
* If true, instructs the underlying tls-client to return the response body
|
||||
* as a base64 `data:<mime>;base64,...` string (so binary payloads survive
|
||||
* the JSON marshalling step). Required for image / binary downloads —
|
||||
* without it, raw bytes get UTF-8-decoded and any non-ASCII byte is
|
||||
* mangled. Default false (text mode).
|
||||
*/
|
||||
byteResponse?: boolean;
|
||||
}
|
||||
|
||||
export interface TlsFetchResult {
|
||||
|
|
@ -161,6 +169,7 @@ export async function tlsFetchChatGpt(
|
|||
timeoutMilliseconds: options.timeoutMs ?? DEFAULT_TIMEOUT_MS,
|
||||
followRedirects: true,
|
||||
withRandomTLSExtensionOrder: true,
|
||||
isByteResponse: options.byteResponse === true,
|
||||
};
|
||||
|
||||
if (options.stream) {
|
||||
|
|
|
|||
|
|
@ -31,6 +31,19 @@ export async function OPTIONS() {
|
|||
export async function POST(request) {
|
||||
await ensureInitialized();
|
||||
|
||||
// One-line marker for diagnosing 413 / Server-Action interceptions.
|
||||
// Logs only when Content-Length is present so debug noise stays low for
|
||||
// typical chat payloads. Toggle off via OMNIROUTE_LOG_REQUEST_SHAPE=0.
|
||||
if (process.env.OMNIROUTE_LOG_REQUEST_SHAPE !== "0") {
|
||||
const ct = request.headers.get("content-type") ?? "";
|
||||
const cl = request.headers.get("content-length");
|
||||
if (cl && Number(cl) > 256 * 1024) {
|
||||
console.error(
|
||||
`[CHAT-ROUTE] large body content-type="${ct}" content-length=${cl}`
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Prompt injection guard — inspect body before forwarding
|
||||
try {
|
||||
const cloned = request.clone();
|
||||
|
|
|
|||
42
src/app/api/v1/chatgpt-web/image/[id]/route.ts
Normal file
42
src/app/api/v1/chatgpt-web/image/[id]/route.ts
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
import { CORS_HEADERS, handleCorsOptions } from "@/shared/utils/cors";
|
||||
import { getChatGptImage } from "@omniroute/open-sse/services/chatgptImageCache.ts";
|
||||
|
||||
// CORS preflight for the image URL; delegates to the shared helper so the
// allowed origins stay consistent with the rest of the API surface.
export async function OPTIONS() {
  return handleCorsOptions();
}
|
||||
|
||||
/**
|
||||
* Serve a cached ChatGPT-generated image by its opaque cache id.
|
||||
*
|
||||
* Auth: intentionally unauthenticated. The id is a 128-bit random UUID and
|
||||
* the entry has a short TTL, so the URL is unguessable for the lifetime of
|
||||
* the chat turn. We need it open because it's loaded by the user's BROWSER
|
||||
* (via an `<img>` tag rendered from markdown) — that fetch doesn't carry
|
||||
* the OmniRoute API key. Rate limiting / abuse protection sit at the
|
||||
* network layer the same way they do for any other static asset.
|
||||
*/
|
||||
export async function GET(_request: Request, { params }: { params: Promise<{ id: string }> }) {
|
||||
const { id } = await params;
|
||||
const entry = getChatGptImage(id);
|
||||
if (!entry) {
|
||||
return new Response(JSON.stringify({ error: "Image not found or expired" }), {
|
||||
status: 404,
|
||||
headers: { "Content-Type": "application/json", ...CORS_HEADERS },
|
||||
});
|
||||
}
|
||||
// entry.bytes is a Buffer (subclass of Uint8Array); pass it directly.
|
||||
// Wrapping in `new Uint8Array(...)` would copy the entire payload — up to
|
||||
// 8 MB per image — for no benefit.
|
||||
return new Response(entry.bytes, {
|
||||
status: 200,
|
||||
headers: {
|
||||
"Content-Type": entry.mime,
|
||||
// Allow short browser caching — the id is unique-per-image, so a
|
||||
// cache hit is fine and saves a round-trip if the user re-renders
|
||||
// the chat. Beyond the in-memory TTL the URL 404s anyway.
|
||||
"Cache-Control": "private, max-age=1800",
|
||||
"Content-Length": String(entry.bytes.length),
|
||||
...CORS_HEADERS,
|
||||
},
|
||||
});
|
||||
}
|
||||
174
src/app/api/v1/images/edits/route.ts
Normal file
174
src/app/api/v1/images/edits/route.ts
Normal file
|
|
@ -0,0 +1,174 @@
|
|||
import { CORS_ORIGIN } from "@/shared/utils/cors";
|
||||
import { handleImageEdit } from "@omniroute/open-sse/handlers/imageGeneration.ts";
|
||||
import {
|
||||
getProviderCredentials,
|
||||
clearRecoveredProviderState,
|
||||
extractApiKey,
|
||||
isValidApiKey,
|
||||
} from "@/sse/services/auth";
|
||||
import { parseImageModel, getImageProvider } from "@omniroute/open-sse/config/imageRegistry.ts";
|
||||
import { errorResponse, unavailableResponse } from "@omniroute/open-sse/utils/error.ts";
|
||||
import { HTTP_STATUS } from "@omniroute/open-sse/config/constants.ts";
|
||||
import * as log from "@/sse/utils/logger";
|
||||
import { toJsonErrorPayload } from "@/shared/utils/upstreamError";
|
||||
import { enforceApiKeyPolicy } from "@/shared/utils/apiKeyPolicy";
|
||||
|
||||
/**
|
||||
* /v1/images/edits — multipart edit endpoint matching OpenAI's images-edit API.
|
||||
*
|
||||
* Open WebUI's "Image Edit" toggle (images.edit.engine = "openai") posts here
|
||||
* with `prompt` + `image` (file). For chatgpt-web, an "edit" only makes sense
|
||||
* if the uploaded image was originally generated through OmniRoute — we then
|
||||
* have its `{conversationId, parentMessageId}` cached and can continue the
|
||||
* saved chatgpt.com conversation node, which is the only way to actually edit
|
||||
* the image instead of generating an unrelated one from scratch.
|
||||
*
|
||||
* Without this route, multipart bodies trip Next.js's Server Action handler
|
||||
* (which intercepts ALL POSTs with multipart/form-data content-type) and the
|
||||
* client gets a confusing "Failed to find Server Action" 500.
|
||||
*/
|
||||
|
||||
export async function OPTIONS() {
|
||||
return new Response(null, {
|
||||
headers: {
|
||||
"Access-Control-Allow-Origin": CORS_ORIGIN,
|
||||
"Access-Control-Allow-Methods": "POST, OPTIONS",
|
||||
"Access-Control-Allow-Headers": "*",
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
const PUBLIC_BASE_URL_HEADER_KEYS = ["host", "x-forwarded-host", "x-forwarded-proto"] as const;
|
||||
|
||||
function publicBaseUrlHeaders(headers: Headers): Record<string, string> {
|
||||
const out: Record<string, string> = {};
|
||||
for (const key of PUBLIC_BASE_URL_HEADER_KEYS) {
|
||||
const value = headers.get(key);
|
||||
if (value !== null) out[key] = value;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
async function readMultipartImage(formData: FormData): Promise<{
|
||||
prompt: string;
|
||||
model: string | null;
|
||||
size: string | null;
|
||||
responseFormat: string | null;
|
||||
imageBytes: Buffer | null;
|
||||
imageMime: string | null;
|
||||
}> {
|
||||
const promptRaw = formData.get("prompt");
|
||||
const prompt = typeof promptRaw === "string" ? promptRaw.trim() : "";
|
||||
const modelRaw = formData.get("model");
|
||||
const model = typeof modelRaw === "string" ? modelRaw.trim() : null;
|
||||
const sizeRaw = formData.get("size");
|
||||
const size = typeof sizeRaw === "string" ? sizeRaw.trim() : null;
|
||||
const respRaw = formData.get("response_format");
|
||||
const responseFormat = typeof respRaw === "string" ? respRaw.trim() : null;
|
||||
|
||||
// OpenAI's API and Open WebUI both accept either a single `image` field or
|
||||
// an `image[]` array. We use the first image when multiple are sent — the
|
||||
// chatgpt-web edit tool can only edit one image per conversation node.
|
||||
const imageEntry = formData.get("image") ?? formData.get("image[]");
|
||||
if (!imageEntry || typeof imageEntry === "string") {
|
||||
return { prompt, model, size, responseFormat, imageBytes: null, imageMime: null };
|
||||
}
|
||||
const file = imageEntry as File;
|
||||
const imageBytes = Buffer.from(await file.arrayBuffer());
|
||||
const imageMime = file.type || "image/png";
|
||||
return { prompt, model, size, responseFormat, imageBytes, imageMime };
|
||||
}
|
||||
|
||||
/**
 * POST /v1/images/edits (OpenAI-compatible image edit endpoint).
 *
 * Pipeline: parse multipart body → validate `prompt` and `image` → check API
 * key policy → resolve provider from the model id → fetch provider
 * credentials → delegate to `handleImageEdit` → relay its JSON result or a
 * JSON error payload.
 */
export async function POST(request: Request) {
  // Reject anything that is not a parseable multipart body up front.
  let formData: FormData;
  try {
    formData = await request.formData();
  } catch (err) {
    log.warn("IMAGE", `Invalid multipart body: ${err instanceof Error ? err.message : String(err)}`);
    return errorResponse(HTTP_STATUS.BAD_REQUEST, "Invalid multipart body");
  }

  const { prompt, model, size, responseFormat, imageBytes, imageMime } =
    await readMultipartImage(formData);

  // Both prompt and an uploaded image are mandatory for an edit request.
  if (!prompt) {
    return errorResponse(HTTP_STATUS.BAD_REQUEST, "Missing required field: prompt");
  }
  if (!imageBytes || imageBytes.length === 0) {
    return errorResponse(HTTP_STATUS.BAD_REQUEST, "Missing required field: image");
  }

  // NOTE(review): an invalid key only fails when enforceApiKeyPolicy returns
  // an error — presumably anonymous access is permitted under some policy;
  // confirm against the key-policy helper.
  const apiKey = extractApiKey(request);
  if (!isValidApiKey(apiKey)) {
    const policyError = enforceApiKeyPolicy(apiKey);
    if (policyError) {
      return new Response(JSON.stringify(policyError.body), {
        status: policyError.status,
        headers: { "Content-Type": "application/json" },
      });
    }
  }

  // Default model when the client omits one, then resolve its provider.
  const fullModel = model || "cgpt-web/gpt-5.3-instant";
  const parsed = parseImageModel(fullModel);
  const providerConfig = getImageProvider(parsed.provider);
  if (!providerConfig) {
    return errorResponse(HTTP_STATUS.BAD_REQUEST, `Unknown image provider: ${parsed.provider}`);
  }
  if (providerConfig.format !== "chatgpt-web") {
    // We only implement edit for chatgpt-web today; everything else routes
    // through generations which doesn't accept image inputs. Surface a
    // useful error rather than silently dropping the image.
    return errorResponse(
      HTTP_STATUS.BAD_REQUEST,
      `Image edit is only supported for chatgpt-web models (got ${parsed.provider})`
    );
  }

  // Resolve provider credentials; bail early when every account is
  // rate-limited, passing the retry hints through to the client.
  const credentials = await getProviderCredentials(parsed.provider, apiKey);
  if (!credentials) {
    return errorResponse(HTTP_STATUS.UNAUTHORIZED, `No credentials for provider: ${parsed.provider}`);
  }
  if (credentials.allRateLimited) {
    return unavailableResponse(
      HTTP_STATUS.RATE_LIMITED,
      `[${parsed.provider}] All accounts rate limited`,
      credentials.retryAfter,
      credentials.retryAfterHuman
    );
  }

  // Delegate the actual edit. clientHeaders carries only the host-shaped
  // headers so the handler can build browser-facing image URLs; the signal
  // lets the upstream call abort when the client disconnects.
  const result = await handleImageEdit({
    provider: parsed.provider,
    model: parsed.model,
    body: {
      prompt,
      size: size ?? undefined,
      response_format: responseFormat ?? undefined,
      n: 1,
    },
    imageBytes,
    imageMime,
    credentials,
    log,
    signal: request.signal,
    clientHeaders: publicBaseUrlHeaders(request.headers),
  });

  if (result.success) {
    // A success after prior failures clears any recorded error state for
    // these credentials before relaying the provider payload verbatim.
    await clearRecoveredProviderState(credentials);
    return new Response(JSON.stringify((result as any).data), {
      status: 200,
      headers: { "Content-Type": "application/json" },
    });
  }

  // Failure path: normalize the provider error into a JSON payload and
  // propagate the provider-reported status code.
  // NOTE(review): the `(result as any)` casts suggest handleImageEdit's
  // result type doesn't discriminate success/failure — worth tightening.
  const errorPayload = toJsonErrorPayload(
    (result as any).error,
    "Image edit provider error"
  );
  return new Response(JSON.stringify(errorPayload), {
    status: (result as any).status,
    headers: { "Content-Type": "application/json" },
  });
}
|
||||
|
|
@ -101,6 +101,23 @@ function hasImageGenerationInput(body: Record<string, unknown>) {
|
|||
return false;
|
||||
}
|
||||
|
||||
// Forward only the host-shaped headers the chatgpt-web image handler needs
|
||||
// to derive the browser-facing public base URL. Avoid copying the full
|
||||
// request header set: it's wider than the handler needs (auth tokens,
|
||||
// content-type, etc.) and `Headers.forEach` collapses repeated values, which
|
||||
// would silently drop entries if a wider helper were reused for headers
|
||||
// that can legitimately repeat (e.g., set-cookie).
|
||||
const PUBLIC_BASE_URL_HEADER_KEYS = ["host", "x-forwarded-host", "x-forwarded-proto"] as const;
|
||||
|
||||
function publicBaseUrlHeaders(headers: Headers): Record<string, string> {
|
||||
const out: Record<string, string> = {};
|
||||
for (const key of PUBLIC_BASE_URL_HEADER_KEYS) {
|
||||
const value = headers.get(key);
|
||||
if (value !== null) out[key] = value;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
export async function POST(request) {
|
||||
let rawBody;
|
||||
try {
|
||||
|
|
@ -228,6 +245,8 @@ export async function POST(request) {
|
|||
credentials,
|
||||
log,
|
||||
...(isCustomModel && { resolvedProvider: provider }),
|
||||
signal: request.signal,
|
||||
clientHeaders: publicBaseUrlHeaders(request.headers),
|
||||
});
|
||||
|
||||
if (result.success) {
|
||||
|
|
|
|||
1
src/lib/env/runtimeEnv.ts
vendored
1
src/lib/env/runtimeEnv.ts
vendored
|
|
@ -66,6 +66,7 @@ export const webRuntimeEnvSchema = z.object({
|
|||
OMNIROUTE_DISABLE_BACKGROUND_SERVICES: optionalBooleanEnv,
|
||||
CLOUD_URL: optionalHttpUrl,
|
||||
NEXT_PUBLIC_CLOUD_URL: optionalHttpUrl,
|
||||
OMNIROUTE_PUBLIC_BASE_URL: optionalHttpUrl,
|
||||
OMNIROUTE_BASE_URL: optionalHttpUrl,
|
||||
BASE_URL: optionalHttpUrl,
|
||||
NEXT_PUBLIC_BASE_URL: optionalHttpUrl,
|
||||
|
|
|
|||
3
src/types/global.d.ts
vendored
3
src/types/global.d.ts
vendored
|
|
@ -21,6 +21,9 @@ declare namespace NodeJS {
|
|||
PORT?: string;
|
||||
API_HOST?: string;
|
||||
DASHBOARD_PORT?: string;
|
||||
OMNIROUTE_PUBLIC_BASE_URL?: string;
|
||||
OMNIROUTE_CGPT_WEB_IMAGE_TIMEOUT_MS?: string;
|
||||
OMNIROUTE_CGPT_WEB_IMAGE_CACHE_MAX_MB?: string;
|
||||
OMNIROUTE_BASE_URL?: string;
|
||||
OMNIROUTE_DISABLE_BACKGROUND_SERVICES?: string;
|
||||
OMNIROUTE_PORT?: string;
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue