fix: sanitize surrogates (#25934)

This commit is contained in:
Aiden Cline 2026-05-05 18:07:23 -05:00 committed by GitHub
parent 837cc92586
commit 6409aceb1a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 164 additions and 5 deletions

View file

@ -1,4 +1,4 @@
import type { ModelMessage } from "ai"
import type { ModelMessage, ToolResultPart } from "ai"
import { mergeDeep, unique } from "remeda"
import type { JSONSchema7 } from "@ai-sdk/provider"
import type { JSONSchema } from "zod/v4/core"
@ -19,6 +19,10 @@ function mimeToModality(mime: string): Modality | undefined {
export const OUTPUT_TOKEN_MAX = Flag.OPENCODE_EXPERIMENTAL_OUTPUT_TOKEN_MAX || 32_000
/**
 * Replaces every lone (unpaired) UTF-16 surrogate code unit with U+FFFD
 * (the Unicode replacement character), leaving well-formed surrogate
 * pairs untouched. With the `u` flag a valid pair decodes to a single
 * astral code point, so only stray surrogates match `\p{Surrogate}`.
 */
export function sanitizeSurrogates(content: string) {
  return content.replace(/\p{Surrogate}/gu, "\uFFFD")
}
// Maps npm package to the key the AI SDK expects for providerOptions
function sdkKey(npm: string): string | undefined {
switch (npm) {
@ -52,11 +56,74 @@ function sdkKey(npm: string): string | undefined {
return undefined
}
// TODO: refactor — this function re-maps every message on every call and does
// redundant per-part type checks; it should be simplified and streamlined.
function normalizeMessages(
msgs: ModelMessage[],
model: Provider.Model,
_options: Record<string, unknown>,
): ModelMessage[] {
// Scrubs lone surrogates from every textual payload of a tool-result part.
// Mutates the part in place and returns it so it slots directly into .map().
const sanitizeToolResultOutput = (content: ToolResultPart) => {
  const output = content.output
  switch (output.type) {
    case "text":
    case "error-text":
      output.value = sanitizeSurrogates(output.value)
      break
    case "content":
      // Re-map the value array, sanitizing text items as we go.
      output.value = output.value.map((item) => {
        if (item.type === "text") item.text = sanitizeSurrogates(item.text)
        return item
      })
      break
  }
  return content
}
msgs = msgs.map((msg) => {
switch (msg.role) {
case "tool":
if (!Array.isArray(msg.content)) return msg
msg.content = msg.content.map((content) => {
if (content.type === "tool-result") {
return sanitizeToolResultOutput(content)
}
return content
})
return msg
case "system":
msg.content = sanitizeSurrogates(msg.content)
return msg
case "user":
if (typeof msg.content === "string") {
msg.content = sanitizeSurrogates(msg.content)
} else {
msg.content = msg.content.map((content) => {
if (content.type === "text") {
content.text = sanitizeSurrogates(content.text)
}
return content
})
}
return msg
case "assistant":
if (typeof msg.content === "string") {
msg.content = sanitizeSurrogates(msg.content)
} else {
msg.content = msg.content.map((content) => {
if (content.type === "text" || content.type === "reasoning") {
content.text = sanitizeSurrogates(content.text)
}
if (content.type === "tool-result") {
return sanitizeToolResultOutput(content)
}
return content
})
}
return msg
}
})
// Anthropic rejects messages with empty content - filter out empty string messages
// and remove empty text/reasoning parts from array content
if (model.api.npm === "@ai-sdk/anthropic") {

View file

@ -1123,6 +1123,98 @@ describe("ProviderTransform.message - DeepSeek reasoning content", () => {
})
})
// Verifies that ProviderTransform.message replaces lone UTF-16 surrogates
// with U+FFFD in every model-visible text field (system/user/assistant text,
// reasoning, and tool-result outputs) while leaving valid pairs and
// non-text parts untouched.
describe("ProviderTransform.message - surrogate sanitization", () => {
  const model = {
    id: "test/test-model",
    providerID: "test",
    api: {
      id: "test-model",
      url: "https://api.test.com",
      npm: "@ai-sdk/openai-compatible",
    },
    name: "Test Model",
    capabilities: {
      temperature: true,
      reasoning: true,
      attachment: true,
      toolcall: true,
      input: { text: true, audio: false, image: true, video: false, pdf: false },
      output: { text: true, audio: false, image: false, video: false, pdf: false },
      interleaved: false,
    },
    cost: { input: 0.001, output: 0.002, cache: { read: 0.0001, write: 0.0002 } },
    limit: { context: 128000, output: 8192 },
    status: "active",
    options: {},
    headers: {},
  } as any

  test("replaces lone surrogates in model-visible text", () => {
    const lone = "\uD83D" // unpaired high surrogate
    const valid = "🚀" // well-formed surrogate pair — must survive
    // U+FFFD is what sanitizeSurrogates substitutes for each lone surrogate.
    const sanitized = "\uFFFD"
    const text = (label: string) => `${label} ${lone} and ${valid}`
    const expected = (label: string) => `${label} ${sanitized} and ${valid}`
    const msgs = [
      { role: "system", content: text("system") },
      { role: "user", content: text("user string") },
      {
        role: "user",
        content: [
          { type: "text", text: text("user text") },
          { type: "image", image: "data:image/png;base64,abcd" },
        ],
      },
      { role: "assistant", content: text("assistant string") },
      {
        role: "assistant",
        content: [
          { type: "text", text: text("assistant text") },
          { type: "reasoning", text: text("assistant reasoning") },
          { type: "tool-call", toolCallId: "call-1", toolName: "Read", input: { filePath: ".opencode/tool/emoji.ts" } },
          { type: "tool-result", toolCallId: "call-2", toolName: "Read", output: { type: "text", value: text("assistant tool text") } },
          { type: "tool-result", toolCallId: "call-3", toolName: "Read", output: { type: "error-text", value: text("assistant tool error") } },
          {
            type: "tool-result",
            toolCallId: "call-4",
            toolName: "Read",
            output: { type: "content", value: [{ type: "text", text: text("assistant tool content") }] },
          },
        ],
      },
      {
        role: "tool",
        content: [
          { type: "tool-result", toolCallId: "call-5", toolName: "Read", output: { type: "text", value: text("tool text") } },
          { type: "tool-result", toolCallId: "call-6", toolName: "Read", output: { type: "error-text", value: text("tool error") } },
          {
            type: "tool-result",
            toolCallId: "call-7",
            toolName: "Read",
            output: { type: "content", value: [{ type: "text", text: text("tool content") }] },
          },
        ],
      },
    ] as any[]

    const result = ProviderTransform.message(msgs, model, {}) as any[]

    expect(result[0].content).toBe(expected("system"))
    expect(result[1].content).toBe(expected("user string"))
    expect(result[2].content[0].text).toBe(expected("user text"))
    expect(result[3].content).toBe(expected("assistant string"))
    expect(result[4].content[0].text).toBe(expected("assistant text"))
    expect(result[4].content[1].text).toBe(expected("assistant reasoning"))
    expect(result[4].content[3].output.value).toBe(expected("assistant tool text"))
    expect(result[4].content[4].output.value).toBe(expected("assistant tool error"))
    expect(result[4].content[5].output.value[0].text).toBe(expected("assistant tool content"))
    expect(result[5].content[0].output.value).toBe(expected("tool text"))
    expect(result[5].content[1].output.value).toBe(expected("tool error"))
    expect(result[5].content[2].output.value[0].text).toBe(expected("tool content"))
    // Non-text parts must pass through untouched.
    expect(result[2].content[1]).toEqual({ type: "image", image: "data:image/png;base64,abcd" })
  })
})
describe("ProviderTransform.message - empty image handling", () => {
const mockModel = {
id: "anthropic/claude-3-5-sonnet",
@ -1993,7 +2085,7 @@ describe("ProviderTransform.message - bedrock caching with non-bedrock providerI
const msgs = [
{
role: "system",
content: [{ type: "text", text: "You are a helpful assistant" }],
content: "You are a helpful assistant",
},
{
role: "user",
@ -2007,7 +2099,7 @@ describe("ProviderTransform.message - bedrock caching with non-bedrock providerI
expect(result[0].providerOptions?.bedrock).toEqual({
cachePoint: { type: "default" },
})
expect(result[0].content[0].providerOptions?.bedrock).toBeUndefined()
expect(result[0].content).toBe("You are a helpful assistant")
})
})
@ -2044,7 +2136,7 @@ describe("ProviderTransform.message - cache control on gateway", () => {
const msgs = [
{
role: "system",
content: [{ type: "text", text: "You are a helpful assistant" }],
content: "You are a helpful assistant",
},
{
role: "user",
@ -2054,7 +2146,7 @@ describe("ProviderTransform.message - cache control on gateway", () => {
const result = ProviderTransform.message(msgs, model, {}) as any[]
expect(result[0].content[0].providerOptions).toBeUndefined()
expect(result[0].content).toBe("You are a helpful assistant")
expect(result[0].providerOptions).toBeUndefined()
})