fix: sanitize surrogates (#25934)

This commit is contained in:
Aiden Cline 2026-05-05 18:07:23 -05:00 committed by GitHub
parent 837cc92586
commit 6409aceb1a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 164 additions and 5 deletions

View file

@ -1,4 +1,4 @@
import type { ModelMessage } from "ai"
import type { ModelMessage, ToolResultPart } from "ai"
import { mergeDeep, unique } from "remeda"
import type { JSONSchema7 } from "@ai-sdk/provider"
import type { JSONSchema } from "zod/v4/core"
@ -19,6 +19,10 @@ function mimeToModality(mime: string): Modality | undefined {
export const OUTPUT_TOKEN_MAX = Flag.OPENCODE_EXPERIMENTAL_OUTPUT_TOKEN_MAX || 32_000
/**
 * Replaces every lone (unpaired) UTF-16 surrogate code unit with U+FFFD
 * (the Unicode replacement character), leaving well-formed surrogate
 * pairs untouched. With the `u` flag a valid pair decodes to a single
 * astral code point, so only stray surrogates match `\p{Surrogate}`.
 */
export function sanitizeSurrogates(content: string) {
  return content.replace(/\p{Surrogate}/gu, "\uFFFD")
}
// Maps npm package to the key the AI SDK expects for providerOptions
function sdkKey(npm: string): string | undefined {
switch (npm) {
@ -52,11 +56,74 @@ function sdkKey(npm: string): string | undefined {
return undefined
}
// TODO: refactor — this function re-maps every message on every call and does
// redundant per-part type checks; it should be simplified and streamlined.
function normalizeMessages(
msgs: ModelMessage[],
model: Provider.Model,
_options: Record<string, unknown>,
): ModelMessage[] {
// Scrubs lone surrogates from every textual payload of a tool-result part.
// Mutates the part in place and returns it so it slots directly into .map().
const sanitizeToolResultOutput = (content: ToolResultPart) => {
  const output = content.output
  switch (output.type) {
    case "text":
    case "error-text":
      output.value = sanitizeSurrogates(output.value)
      break
    case "content":
      // Re-map the value array, sanitizing text items as we go.
      output.value = output.value.map((item) => {
        if (item.type === "text") item.text = sanitizeSurrogates(item.text)
        return item
      })
      break
  }
  return content
}
msgs = msgs.map((msg) => {
switch (msg.role) {
case "tool":
if (!Array.isArray(msg.content)) return msg
msg.content = msg.content.map((content) => {
if (content.type === "tool-result") {
return sanitizeToolResultOutput(content)
}
return content
})
return msg
case "system":
msg.content = sanitizeSurrogates(msg.content)
return msg
case "user":
if (typeof msg.content === "string") {
msg.content = sanitizeSurrogates(msg.content)
} else {
msg.content = msg.content.map((content) => {
if (content.type === "text") {
content.text = sanitizeSurrogates(content.text)
}
return content
})
}
return msg
case "assistant":
if (typeof msg.content === "string") {
msg.content = sanitizeSurrogates(msg.content)
} else {
msg.content = msg.content.map((content) => {
if (content.type === "text" || content.type === "reasoning") {
content.text = sanitizeSurrogates(content.text)
}
if (content.type === "tool-result") {
return sanitizeToolResultOutput(content)
}
return content
})
}
return msg
}
})
// Anthropic rejects messages with empty content - filter out empty string messages
// and remove empty text/reasoning parts from array content
if (model.api.npm === "@ai-sdk/anthropic") {

View file

@ -1123,6 +1123,98 @@ describe("ProviderTransform.message - DeepSeek reasoning content", () => {
})
})
// Verifies that ProviderTransform.message replaces lone UTF-16 surrogates
// with U+FFFD in every model-visible text field (system/user/assistant text,
// reasoning, and tool-result outputs) while leaving valid pairs and
// non-text parts untouched.
describe("ProviderTransform.message - surrogate sanitization", () => {
  const model = {
    id: "test/test-model",
    providerID: "test",
    api: {
      id: "test-model",
      url: "https://api.test.com",
      npm: "@ai-sdk/openai-compatible",
    },
    name: "Test Model",
    capabilities: {
      temperature: true,
      reasoning: true,
      attachment: true,
      toolcall: true,
      input: { text: true, audio: false, image: true, video: false, pdf: false },
      output: { text: true, audio: false, image: false, video: false, pdf: false },
      interleaved: false,
    },
    cost: { input: 0.001, output: 0.002, cache: { read: 0.0001, write: 0.0002 } },
    limit: { context: 128000, output: 8192 },
    status: "active",
    options: {},
    headers: {},
  } as any

  test("replaces lone surrogates in model-visible text", () => {
    const lone = "\uD83D" // unpaired high surrogate
    const valid = "🚀" // well-formed surrogate pair — must survive
    // U+FFFD is what sanitizeSurrogates substitutes for each lone surrogate.
    const sanitized = "\uFFFD"
    const text = (label: string) => `${label} ${lone} and ${valid}`
    const expected = (label: string) => `${label} ${sanitized} and ${valid}`
    const msgs = [
      { role: "system", content: text("system") },
      { role: "user", content: text("user string") },
      {
        role: "user",
        content: [
          { type: "text", text: text("user text") },
          { type: "image", image: "data:image/png;base64,abcd" },
        ],
      },
      { role: "assistant", content: text("assistant string") },
      {
        role: "assistant",
        content: [
          { type: "text", text: text("assistant text") },
          { type: "reasoning", text: text("assistant reasoning") },
          { type: "tool-call", toolCallId: "call-1", toolName: "Read", input: { filePath: ".opencode/tool/emoji.ts" } },
          { type: "tool-result", toolCallId: "call-2", toolName: "Read", output: { type: "text", value: text("assistant tool text") } },
          { type: "tool-result", toolCallId: "call-3", toolName: "Read", output: { type: "error-text", value: text("assistant tool error") } },
          {
            type: "tool-result",
            toolCallId: "call-4",
            toolName: "Read",
            output: { type: "content", value: [{ type: "text", text: text("assistant tool content") }] },
          },
        ],
      },
      {
        role: "tool",
        content: [
          { type: "tool-result", toolCallId: "call-5", toolName: "Read", output: { type: "text", value: text("tool text") } },
          { type: "tool-result", toolCallId: "call-6", toolName: "Read", output: { type: "error-text", value: text("tool error") } },
          {
            type: "tool-result",
            toolCallId: "call-7",
            toolName: "Read",
            output: { type: "content", value: [{ type: "text", text: text("tool content") }] },
          },
        ],
      },
    ] as any[]

    const result = ProviderTransform.message(msgs, model, {}) as any[]

    expect(result[0].content).toBe(expected("system"))
    expect(result[1].content).toBe(expected("user string"))
    expect(result[2].content[0].text).toBe(expected("user text"))
    expect(result[3].content).toBe(expected("assistant string"))
    expect(result[4].content[0].text).toBe(expected("assistant text"))
    expect(result[4].content[1].text).toBe(expected("assistant reasoning"))
    expect(result[4].content[3].output.value).toBe(expected("assistant tool text"))
    expect(result[4].content[4].output.value).toBe(expected("assistant tool error"))
    expect(result[4].content[5].output.value[0].text).toBe(expected("assistant tool content"))
    expect(result[5].content[0].output.value).toBe(expected("tool text"))
    expect(result[5].content[1].output.value).toBe(expected("tool error"))
    expect(result[5].content[2].output.value[0].text).toBe(expected("tool content"))
    // Non-text parts must pass through untouched.
    expect(result[2].content[1]).toEqual({ type: "image", image: "data:image/png;base64,abcd" })
  })
})
describe("ProviderTransform.message - empty image handling", () => {
const mockModel = {
id: "anthropic/claude-3-5-sonnet",
@ -1993,7 +2085,7 @@ describe("ProviderTransform.message - bedrock caching with non-bedrock providerI
const msgs = [
{
role: "system",
content: [{ type: "text", text: "You are a helpful assistant" }],
content: "You are a helpful assistant",
},
{
role: "user",
@ -2007,7 +2099,7 @@ describe("ProviderTransform.message - bedrock caching with non-bedrock providerI
expect(result[0].providerOptions?.bedrock).toEqual({
cachePoint: { type: "default" },
})
expect(result[0].content[0].providerOptions?.bedrock).toBeUndefined()
expect(result[0].content).toBe("You are a helpful assistant")
})
})
@ -2044,7 +2136,7 @@ describe("ProviderTransform.message - cache control on gateway", () => {
const msgs = [
{
role: "system",
content: [{ type: "text", text: "You are a helpful assistant" }],
content: "You are a helpful assistant",
},
{
role: "user",
@ -2054,7 +2146,7 @@ describe("ProviderTransform.message - cache control on gateway", () => {
const result = ProviderTransform.message(msgs, model, {}) as any[]
expect(result[0].content[0].providerOptions).toBeUndefined()
expect(result[0].content).toBe("You are a helpful assistant")
expect(result[0].providerOptions).toBeUndefined()
})