feat(tts): add Inworld speech provider (#55972)

Adds the bundled Inworld speech provider with docs, config surface, SSRF-guarded fetches, directive overrides, native voice-note/telephony output coverage, and live `.profile` verification.

Co-authored-by: cshape <cshape@users.noreply.github.com>
This commit is contained in:
Cale Shapera 2026-04-25 14:33:21 -07:00 committed by GitHub
parent 167588cb4f
commit 0bcb4c95c1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 1295 additions and 16 deletions

View file

@ -82,4 +82,5 @@ OPENCLAW_GATEWAY_TOKEN=
# ELEVENLABS_API_KEY=...
# XI_API_KEY=... # alias for ElevenLabs
# INWORLD_API_KEY=...
# DEEPGRAM_API_KEY=...

5
.github/labeler.yml vendored
View file

@ -307,6 +307,11 @@
- changed-files:
- any-glob-to-any-file:
- "extensions/huggingface/**"
"extensions: inworld":
- changed-files:
- any-glob-to-any-file:
- "extensions/inworld/**"
- "docs/providers/inworld.md"
"extensions: kilocode":
- changed-files:
- any-glob-to-any-file:

View file

@ -54,6 +54,7 @@ Docs: https://docs.openclaw.ai
- Providers/Xiaomi: add MiMo TTS as a bundled speech provider with MP3/WAV output and voice-note Opus transcoding. Fixes #52376. (#55614) Thanks @zoujiejun.
- Providers/ElevenLabs: include `eleven_v3` in the bundled TTS model catalog so model selection surfaces can offer ElevenLabs v3. (#68321) Thanks @itsuzef.
- Providers/Local CLI TTS: add a bundled local command speech provider with file/stdout input, voice-note Opus conversion, and telephony PCM output. (#56239) Thanks @solar2ain.
- Providers/Inworld: add Inworld as a bundled speech provider with streaming TTS synthesis, voice listing, voice-note output, and PCM telephony output. (#55972) Thanks @cshape.
- Android/Talk Mode: expose Talk Mode in the Voice tab with runtime-owned voice capture modes and microphone foreground-service escalation. Thanks @alex-latitude.
- Providers/LiteLLM: register `litellm` as an image-generation provider so `image_generate model=litellm/...` calls and `agents.defaults.imageGenerationModel.fallbacks` entries resolve through the LiteLLM proxy. Thanks @zqchris.
- Codex harness: require Codex app-server `0.125.0` or newer and cover native MCP `PreToolUse`, `PostToolUse`, and `PermissionRequest` payloads through the OpenClaw hook relay.

View file

@ -1,4 +1,4 @@
9ac3d271f9bfa9611557f0b52e4d0a600693bdd1de75cc1bafc320fc4d4f0075 config-baseline.json
0b0d796bceddfb9e2929518ba84af626da7f5d75c392a217041f36e850c4e74f config-baseline.json
271fdf1d6652927e0fc160a6f25276bf6dccb8f1b27fab15e0fc2620e8cacab4 config-baseline.core.json
7cd9c908f066c143eab2a201efbc9640f483ab28bba92ddeca1d18cc2b528bc3 config-baseline.channel.json
7825b56a5b3fcdbe2e09ef8fe5d9f12ac3598435afebe20413051e45b0d1968e config-baseline.plugin.json
17eb3f8887193579ff32e35f9bd520ba2bd6049e52ab18855c5d41fcbf195d83 config-baseline.plugin.json

View file

@ -1317,6 +1317,7 @@
"providers/groq",
"providers/huggingface",
"providers/inferrs",
"providers/inworld",
"providers/kilocode",
"providers/litellm",
"providers/lmstudio",

115
docs/providers/inworld.md Normal file
View file

@ -0,0 +1,115 @@
---
summary: "Inworld streaming text-to-speech for OpenClaw replies"
read_when:
- You want Inworld speech synthesis for outbound replies
- You need PCM telephony or OGG_OPUS voice-note output from Inworld
title: "Inworld"
---
Inworld is a streaming text-to-speech (TTS) provider. In OpenClaw it
synthesizes outbound reply audio (MP3 by default, OGG_OPUS for voice notes)
and PCM audio for telephony channels such as Voice Call.
OpenClaw posts to Inworld's streaming TTS endpoint, concatenates the
returned base64 audio chunks into a single buffer, and hands the result to
the standard reply-audio pipeline.
| Detail | Value |
| ------------- | ----------------------------------------------------------- |
| Website | [inworld.ai](https://inworld.ai) |
| Docs | [docs.inworld.ai/tts/tts](https://docs.inworld.ai/tts/tts) |
| Auth | `INWORLD_API_KEY` (HTTP Basic, Base64 dashboard credential) |
| Default voice | `Sarah` |
| Default model | `inworld-tts-1.5-max` |
## Getting started
<Steps>
<Step title="Set your API key">
Copy the credential from your Inworld dashboard (Workspace > API Keys)
and set it as an env var. The value is sent verbatim as the HTTP Basic
credential, so do not Base64-encode it again or convert it to a bearer
token.
```
INWORLD_API_KEY=<base64-credential-from-dashboard>
```
</Step>
<Step title="Select Inworld in messages.tts">
```json5
{
messages: {
tts: {
auto: "always",
provider: "inworld",
providers: {
inworld: {
voiceId: "Sarah",
modelId: "inworld-tts-1.5-max",
},
},
},
},
}
```
</Step>
<Step title="Send a message">
Send a reply through any connected channel. OpenClaw synthesizes the
audio with Inworld and delivers it as MP3 (or OGG_OPUS when the channel
expects a voice note).
</Step>
</Steps>
## Configuration options
| Option | Path | Description |
| ------------- | -------------------------------------------- | ----------------------------------------------------------------- |
| `apiKey` | `messages.tts.providers.inworld.apiKey` | Base64 dashboard credential. Falls back to `INWORLD_API_KEY`. |
| `baseUrl` | `messages.tts.providers.inworld.baseUrl` | Override Inworld API base URL (default `https://api.inworld.ai`). |
| `voiceId` | `messages.tts.providers.inworld.voiceId` | Voice identifier (default `Sarah`). |
| `modelId` | `messages.tts.providers.inworld.modelId` | TTS model id (default `inworld-tts-1.5-max`). |
| `temperature` | `messages.tts.providers.inworld.temperature` | Sampling temperature `0..2` (optional). |
## Notes
<AccordionGroup>
<Accordion title="Authentication">
Inworld uses HTTP Basic auth with a single Base64-encoded credential
string. Copy it verbatim from the Inworld dashboard. The provider sends
it as `Authorization: Basic <apiKey>` without any further encoding, so
do not Base64-encode it yourself and do not pass a bearer-style token.
See [TTS auth notes](/tools/tts#inworld-primary) for the same callout.
</Accordion>
<Accordion title="Models">
Supported model ids: `inworld-tts-1.5-max` (default),
`inworld-tts-1.5-mini`, `inworld-tts-1-max`, `inworld-tts-1`.
</Accordion>
<Accordion title="Audio outputs">
Replies use MP3 by default. When the channel target is `voice-note`
OpenClaw asks Inworld for `OGG_OPUS` so the audio plays as a native
voice bubble. Telephony synthesis uses raw `PCM` at 22050 Hz to feed
the telephony bridge.
</Accordion>
<Accordion title="Custom endpoints">
Override the API host with `messages.tts.providers.inworld.baseUrl`.
Trailing slashes are stripped before requests are sent.
</Accordion>
</AccordionGroup>
## Related
<CardGroup cols={2}>
<Card title="Text-to-speech" href="/tools/tts" icon="waveform-lines">
TTS overview, providers, and `messages.tts` config.
</Card>
<Card title="Configuration" href="/gateway/configuration" icon="gear">
Full config reference including `messages.tts` settings.
</Card>
<Card title="Providers" href="/providers" icon="grid">
All bundled OpenClaw providers.
</Card>
<Card title="Troubleshooting" href="/help/troubleshooting" icon="wrench">
Common issues and debugging steps.
</Card>
</CardGroup>

View file

@ -7,7 +7,7 @@ read_when:
title: "Text-to-speech"
---
OpenClaw can convert outbound replies into audio using ElevenLabs, Google Gemini, Gradium, Local CLI, Microsoft, MiniMax, OpenAI, Vydra, xAI, or Xiaomi MiMo.
OpenClaw can convert outbound replies into audio using ElevenLabs, Google Gemini, Gradium, Inworld, Local CLI, Microsoft, MiniMax, OpenAI, Vydra, xAI, or Xiaomi MiMo.
It works anywhere OpenClaw can send audio.
## Supported services
@ -15,6 +15,7 @@ It works anywhere OpenClaw can send audio.
- **ElevenLabs** (primary or fallback provider)
- **Google Gemini** (primary or fallback provider; uses Gemini API TTS)
- **Gradium** (primary or fallback provider; supports voice-note and telephony output)
- **Inworld** (primary or fallback provider; uses the Inworld streaming TTS API)
- **Local CLI** (primary or fallback provider; runs a configured local TTS command)
- **Microsoft** (primary or fallback provider; current bundled implementation uses `node-edge-tts`)
- **MiniMax** (primary or fallback provider; uses the T2A v2 API)
@ -38,11 +39,12 @@ or ElevenLabs.
## Optional keys
If you want OpenAI, ElevenLabs, Google Gemini, Gradium, MiniMax, Vydra, xAI, or Xiaomi MiMo:
If you want ElevenLabs, Google Gemini, Gradium, Inworld, MiniMax, OpenAI, Vydra, xAI, or Xiaomi MiMo:
- `ELEVENLABS_API_KEY` (or `XI_API_KEY`)
- `GEMINI_API_KEY` (or `GOOGLE_API_KEY`)
- `GRADIUM_API_KEY`
- `INWORLD_API_KEY`
- `MINIMAX_API_KEY`; MiniMax TTS also accepts Token Plan auth via
`MINIMAX_OAUTH_TOKEN`, `MINIMAX_CODE_PLAN_KEY`, or
`MINIMAX_CODING_API_KEY`
@ -64,6 +66,7 @@ so that provider must also be authenticated if you enable summaries.
- [ElevenLabs Text to Speech](https://elevenlabs.io/docs/api-reference/text-to-speech)
- [ElevenLabs Authentication](https://elevenlabs.io/docs/api-reference/authentication)
- [Gradium](/providers/gradium)
- [Inworld TTS API](https://docs.inworld.ai/tts/tts)
- [MiniMax T2A v2 API](https://platform.minimaxi.com/document/T2A%20V2)
- [Xiaomi MiMo speech synthesis](/providers/xiaomi#text-to-speech)
- [node-edge-tts](https://github.com/SchneeHertz/node-edge-tts)
@ -217,6 +220,35 @@ by the bundled Google image-generation provider. Resolution order is
`messages.tts.providers.google.apiKey` -> `models.providers.google.apiKey` ->
`GEMINI_API_KEY` -> `GOOGLE_API_KEY`.
### Inworld primary
```json5
{
messages: {
tts: {
auto: "always",
provider: "inworld",
providers: {
inworld: {
apiKey: "inworld_api_key",
baseUrl: "https://api.inworld.ai",
voiceId: "Sarah",
modelId: "inworld-tts-1.5-max",
temperature: 0.8,
},
},
},
},
}
```
The `apiKey` value must be the Base64-encoded credential string copied
verbatim from the Inworld dashboard (Workspace > API Keys). The provider
sends it as `Authorization: Basic <apiKey>` without any additional
encoding, so do not pass a raw bearer token and do not Base64-encode it
yourself. The key falls back to the `INWORLD_API_KEY` env var. See
[Inworld provider](/providers/inworld) for full setup.
### xAI primary
```json5
@ -415,7 +447,7 @@ Then run:
- `tagged` only sends audio when the reply includes `[[tts:key=value]]` directives or a `[[tts:text]]...[[/tts:text]]` block.
- `enabled`: legacy toggle (doctor migrates this to `auto`).
- `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
- `provider`: speech provider id such as `"elevenlabs"`, `"google"`, `"gradium"`, `"microsoft"`, `"minimax"`, `"openai"`, `"vydra"`, `"xai"`, or `"xiaomi"` (fallback is automatic).
- `provider`: speech provider id such as `"elevenlabs"`, `"google"`, `"gradium"`, `"inworld"`, `"microsoft"`, `"minimax"`, `"openai"`, `"vydra"`, `"xai"`, or `"xiaomi"` (fallback is automatic).
- If `provider` is **unset**, OpenClaw uses the first configured speech provider in registry auto-select order.
- Legacy `provider: "edge"` config is repaired by `openclaw doctor --fix` and
rewritten to `provider: "microsoft"`.
@ -429,7 +461,7 @@ Then run:
- `maxTextLength`: hard cap for TTS input (chars). `/tts audio` fails if exceeded.
- `timeoutMs`: request timeout (ms).
- `prefsPath`: override the local prefs JSON path (provider/limit/summary).
- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `GEMINI_API_KEY`/`GOOGLE_API_KEY`, `GRADIUM_API_KEY`, `MINIMAX_API_KEY`, `OPENAI_API_KEY`, `VYDRA_API_KEY`, `XAI_API_KEY`, `XIAOMI_API_KEY`).
- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `GEMINI_API_KEY`/`GOOGLE_API_KEY`, `GRADIUM_API_KEY`, `INWORLD_API_KEY`, `MINIMAX_API_KEY`, `OPENAI_API_KEY`, `VYDRA_API_KEY`, `XAI_API_KEY`, `XIAOMI_API_KEY`).
- `providers.elevenlabs.baseUrl`: override ElevenLabs API base URL.
- `providers.openai.baseUrl`: override the OpenAI TTS endpoint.
- Resolution order: `messages.tts.providers.openai.baseUrl` -> `OPENAI_TTS_BASE_URL` -> `https://api.openai.com/v1`
@ -453,6 +485,10 @@ Then run:
- `providers.tts-local-cli.timeoutMs`: command timeout in milliseconds (default `120000`).
- `providers.tts-local-cli.cwd`: optional command working directory.
- `providers.tts-local-cli.env`: optional string environment overrides for the command.
- `providers.inworld.baseUrl`: override Inworld API base URL (default `https://api.inworld.ai`).
- `providers.inworld.voiceId`: Inworld voice identifier (default `Sarah`).
- `providers.inworld.modelId`: Inworld TTS model (default `inworld-tts-1.5-max`; also supports `inworld-tts-1.5-mini`, `inworld-tts-1-max`, `inworld-tts-1`).
- `providers.inworld.temperature`: sampling temperature `0..2` (optional).
- `providers.google.model`: Gemini TTS model (default `gemini-3.1-flash-tts-preview`).
- `providers.google.voiceName`: Gemini prebuilt voice name (default `Kore`; `voice` is also accepted).
- `providers.google.audioProfile`: natural-language style prompt prepended before the spoken text.
@ -586,6 +622,7 @@ These override `messages.tts.*` for that host.
with `ffmpeg`.
- **Google Gemini**: Gemini API TTS returns raw 24kHz PCM. OpenClaw wraps it as WAV for audio attachments, transcodes it to 48kHz Opus for voice-note targets, and returns PCM directly for Talk/telephony.
- **Gradium**: WAV for audio attachments, Opus for voice-note targets, and `ulaw_8000` at 8 kHz for telephony.
- **Inworld**: MP3 for normal audio attachments, native `OGG_OPUS` for voice-note targets, and raw `PCM` at 22050 Hz for Talk/telephony.
- **xAI**: MP3 by default; `responseFormat` may be `mp3`, `wav`, `pcm`, `mulaw`, or `alaw`. OpenClaw uses xAI's batch REST TTS endpoint and returns a complete audio attachment; xAI's streaming TTS WebSocket is not used by this provider path. Native Opus voice-note format is not supported by this path.
- **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`).
- The bundled transport accepts an `outputFormat`, but not all formats are available from the service.

View file

@ -0,0 +1,11 @@
import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
import { buildInworldSpeechProvider } from "./speech-provider.js";
/**
 * Plugin entry for the bundled Inworld speech provider.
 *
 * The host calls `register` once at plugin load; registering the provider
 * makes Inworld selectable via `messages.tts.provider: "inworld"`.
 */
const inworldPluginEntry = definePluginEntry({
  id: "inworld",
  name: "Inworld Speech",
  description: "Bundled Inworld speech provider",
  register: (api) => {
    api.registerSpeechProvider(buildInworldSpeechProvider());
  },
});

export default inworldPluginEntry;

View file

@ -0,0 +1,84 @@
import { describe, expect, it } from "vitest";
import { isLiveTestEnabled } from "../../src/agents/live-test-helpers.js";
import {
registerProviderPlugin,
requireRegisteredProvider,
} from "../../test/helpers/plugins/provider-registration.js";
import plugin from "./index.js";
// Gate live execution: requires the live-test flag AND a real Inworld key.
const INWORLD_API_KEY = process.env.INWORLD_API_KEY?.trim() ?? "";
const LIVE = isLiveTestEnabled() && INWORLD_API_KEY.length > 0;
const describeLive = LIVE ? describe : describe.skip;

// Registers the plugin through the shared provider-registration harness and
// returns the registries it populated.
const registerInworldPlugin = () =>
  registerProviderPlugin({
    plugin,
    id: "inworld",
    name: "Inworld",
  });

describeLive("inworld plugin live", () => {
  it("lists voices through the registered speech provider", async () => {
    const { speechProviders } = await registerInworldPlugin();
    const provider = requireRegisteredProvider(speechProviders, "inworld");
    const voices = await provider.listVoices?.({
      apiKey: INWORLD_API_KEY,
    });
    // The default voice ("Sarah") should always be in the live catalog.
    expect(voices?.length).toBeGreaterThan(0);
    expect(voices).toEqual(expect.arrayContaining([expect.objectContaining({ id: "Sarah" })]));
  }, 120_000);

  it("synthesizes MP3, native voice-note Ogg/Opus, and telephony PCM", async () => {
    const { speechProviders } = await registerInworldPlugin();
    const provider = requireRegisteredProvider(speechProviders, "inworld");
    const providerConfig = {
      apiKey: INWORLD_API_KEY,
      voiceId: "Sarah",
      modelId: "inworld-tts-1.5-max",
    };
    // Default attachment target: MP3, not voice-note compatible.
    const audioFile = await provider.synthesize({
      text: "OpenClaw Inworld text to speech integration test OK.",
      cfg: { plugins: { enabled: true } } as never,
      providerConfig,
      target: "audio-file",
      timeoutMs: 90_000,
    });
    expect(audioFile.outputFormat).toBe("mp3");
    expect(audioFile.fileExtension).toBe(".mp3");
    expect(audioFile.voiceCompatible).toBe(false);
    expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(512);
    // Sanity check: payload must not be a WAV ("RIFF") container.
    expect(audioFile.audioBuffer.subarray(0, 4).toString("ascii")).not.toBe("RIFF");
    // Voice-note target: native Ogg/Opus, identified by the "OggS" magic bytes.
    const voiceNote = await provider.synthesize({
      text: "OpenClaw Inworld voice note integration test OK.",
      cfg: { plugins: { enabled: true } } as never,
      providerConfig,
      target: "voice-note",
      timeoutMs: 90_000,
    });
    expect(voiceNote.outputFormat).toBe("ogg_opus");
    expect(voiceNote.fileExtension).toBe(".ogg");
    expect(voiceNote.voiceCompatible).toBe(true);
    expect(voiceNote.audioBuffer.byteLength).toBeGreaterThan(128);
    expect(voiceNote.audioBuffer.subarray(0, 4).toString("ascii")).toBe("OggS");
    // Telephony path: raw PCM at 22050 Hz (no container header).
    const telephony = await provider.synthesizeTelephony?.({
      text: "OpenClaw Inworld telephony check OK.",
      cfg: { plugins: { enabled: true } } as never,
      providerConfig,
      timeoutMs: 90_000,
    });
    if (!telephony) {
      throw new Error("Inworld telephony synthesis did not return audio");
    }
    expect(telephony.outputFormat).toBe("pcm");
    expect(telephony.sampleRate).toBe(22_050);
    expect(telephony.audioBuffer.byteLength).toBeGreaterThan(512);
    expect(telephony.audioBuffer.subarray(0, 4).toString("ascii")).not.toBe("RIFF");
  }, 180_000);
});

View file

@ -0,0 +1,40 @@
{
"id": "inworld",
"enabledByDefault": true,
"name": "Inworld",
"description": "Inworld streaming text-to-speech (MP3, OGG_OPUS, PCM telephony).",
"providerAuthEnvVars": {
"inworld": ["INWORLD_API_KEY"]
},
"contracts": {
"speechProviders": ["inworld"]
},
"configSchema": {
"type": "object",
"additionalProperties": false,
"properties": {
"apiKey": {
"type": "string",
"description": "Inworld API key. Must be the Base64 credential string from the Inworld dashboard (used as Authorization: Basic <apiKey>). Falls back to INWORLD_API_KEY env var."
},
"baseUrl": {
"type": "string",
"description": "Override Inworld API base URL (default https://api.inworld.ai)."
},
"voiceId": {
"type": "string",
"description": "Voice identifier (default Sarah)."
},
"modelId": {
"type": "string",
"description": "TTS model id (default inworld-tts-1.5-max)."
},
"temperature": {
"type": "number",
"minimum": 0,
"maximum": 2,
"description": "Sampling temperature 0..2."
}
}
}
}

View file

@ -0,0 +1,15 @@
{
"name": "@openclaw/inworld-speech",
"version": "2026.4.16",
"private": true,
"description": "OpenClaw Inworld speech plugin",
"type": "module",
"devDependencies": {
"@openclaw/plugin-sdk": "workspace:*"
},
"openclaw": {
"extensions": [
"./index.ts"
]
}
}

View file

@ -0,0 +1,213 @@
import { afterEach, describe, expect, it, vi } from "vitest";
// Hoisted mock fns so the vi.mock factory below can reference them; vi.mock
// calls are hoisted above imports at transform time.
const { inworldTTSMock, listInworldVoicesMock } = vi.hoisted(() => ({
  inworldTTSMock: vi.fn(),
  listInworldVoicesMock: vi.fn(),
}));
// Replace only the network-facing helpers; keep the module's constants and
// normalizers real so provider defaults stay authentic.
vi.mock("./tts.js", async (importOriginal) => {
  const actual = await importOriginal<typeof import("./tts.js")>();
  return {
    ...actual,
    inworldTTS: inworldTTSMock,
    listInworldVoices: listInworldVoicesMock,
  };
});
import { buildInworldSpeechProvider } from "./speech-provider.js";

describe("buildInworldSpeechProvider", () => {
  // Snapshot the ambient env var so each test can set/delete it freely.
  const originalEnv = process.env.INWORLD_API_KEY;
  afterEach(() => {
    // Restore the env var exactly. Assigning `undefined` to a process.env
    // key stores the literal string "undefined" (truthy!), so the
    // originally-unset case must use `delete` to avoid leaking a fake key
    // into later tests.
    if (originalEnv === undefined) {
      delete process.env.INWORLD_API_KEY;
    } else {
      process.env.INWORLD_API_KEY = originalEnv;
    }
    inworldTTSMock.mockReset();
    listInworldVoicesMock.mockReset();
    vi.restoreAllMocks();
  });

  it("reports configured when INWORLD_API_KEY env var is set", () => {
    process.env.INWORLD_API_KEY = "test-key";
    const provider = buildInworldSpeechProvider();
    expect(
      provider.isConfigured({
        providerConfig: {},
        timeoutMs: 30_000,
      }),
    ).toBe(true);
  });

  it("reports configured when providerConfig apiKey is set", () => {
    delete process.env.INWORLD_API_KEY;
    const provider = buildInworldSpeechProvider();
    expect(
      provider.isConfigured({
        providerConfig: { apiKey: "config-key" },
        timeoutMs: 30_000,
      }),
    ).toBe(true);
  });

  it("reports not configured when no key is available", () => {
    delete process.env.INWORLD_API_KEY;
    const provider = buildInworldSpeechProvider();
    expect(
      provider.isConfigured({
        providerConfig: {},
        timeoutMs: 30_000,
      }),
    ).toBe(false);
  });

  it("has correct provider metadata", () => {
    const provider = buildInworldSpeechProvider();
    expect(provider.id).toBe("inworld");
    expect(provider.label).toBe("Inworld");
    expect(provider.autoSelectOrder).toBe(30);
    expect(provider.models).toContain("inworld-tts-1.5-max");
    expect(provider.models).toContain("inworld-tts-1.5-mini");
  });

  it("normalizes provider-owned speech config from raw provider config", () => {
    const provider = buildInworldSpeechProvider();
    const resolved = provider.resolveConfig?.({
      cfg: {} as never,
      timeoutMs: 30_000,
      rawConfig: {
        providers: {
          inworld: {
            apiKey: "basic-key",
            // Trailing slash must be stripped by baseUrl normalization.
            baseUrl: "https://custom.inworld.example.com/",
            voiceId: "Ashley",
            modelId: "inworld-tts-1.5-mini",
            temperature: 0.8,
          },
        },
      },
    });
    expect(resolved).toEqual({
      apiKey: "basic-key",
      baseUrl: "https://custom.inworld.example.com",
      voiceId: "Ashley",
      modelId: "inworld-tts-1.5-mini",
      temperature: 0.8,
    });
  });

  it("parses Inworld TTS directive overrides", () => {
    const provider = buildInworldSpeechProvider();
    // Fully-permissive policy so every directive kind is accepted.
    const policy = {
      enabled: true,
      allowText: true,
      allowProvider: true,
      allowVoice: true,
      allowModelId: true,
      allowVoiceSettings: true,
      allowNormalization: true,
      allowSeed: true,
    };
    expect(provider.parseDirectiveToken?.({ key: "voice", value: "Ashley", policy })).toEqual({
      handled: true,
      overrides: { voiceId: "Ashley" },
    });
    expect(
      provider.parseDirectiveToken?.({
        key: "model",
        value: "inworld-tts-1.5-mini",
        policy,
      }),
    ).toEqual({
      handled: true,
      overrides: { modelId: "inworld-tts-1.5-mini" },
    });
    expect(provider.parseDirectiveToken?.({ key: "temperature", value: "0.7", policy })).toEqual({
      handled: true,
      overrides: { temperature: 0.7 },
    });
  });

  it("warns on invalid directive temperature", () => {
    const provider = buildInworldSpeechProvider();
    // Temperature is clamped to 0..2; "3" is out of range and must warn
    // instead of producing an override.
    expect(
      provider.parseDirectiveToken?.({
        key: "temperature",
        value: "3",
        policy: {
          enabled: true,
          allowText: true,
          allowProvider: true,
          allowVoice: true,
          allowModelId: true,
          allowVoiceSettings: true,
          allowNormalization: true,
          allowSeed: true,
        },
      }),
    ).toEqual({
      handled: true,
      warnings: ['invalid Inworld temperature "3"'],
    });
  });

  it("synthesizes voice-note targets with native OGG_OPUS output", async () => {
    inworldTTSMock.mockResolvedValueOnce(Buffer.from("opus"));
    const provider = buildInworldSpeechProvider();
    const result = await provider.synthesize?.({
      text: "Hello",
      cfg: {} as never,
      providerConfig: { apiKey: "key", voiceId: "Sarah", modelId: "inworld-tts-1.5-max" },
      // Overrides must win over the configured voice/model/temperature.
      providerOverrides: { voice: "Ashley", model: "inworld-tts-1.5-mini", temperature: 0.6 },
      target: "voice-note",
      timeoutMs: 30_000,
    });
    expect(inworldTTSMock).toHaveBeenCalledWith({
      text: "Hello",
      apiKey: "key",
      baseUrl: "https://api.inworld.ai",
      voiceId: "Ashley",
      modelId: "inworld-tts-1.5-mini",
      audioEncoding: "OGG_OPUS",
      temperature: 0.6,
      timeoutMs: 30_000,
    });
    expect(result).toEqual({
      audioBuffer: Buffer.from("opus"),
      outputFormat: "ogg_opus",
      fileExtension: ".ogg",
      voiceCompatible: true,
    });
  });

  it("synthesizes telephony PCM at 22050 Hz", async () => {
    inworldTTSMock.mockResolvedValueOnce(Buffer.from("pcm"));
    const provider = buildInworldSpeechProvider();
    const result = await provider.synthesizeTelephony?.({
      text: "Hello",
      cfg: {} as never,
      providerConfig: { apiKey: "key", voiceId: "Sarah", modelId: "inworld-tts-1.5-max" },
      timeoutMs: 30_000,
    });
    expect(inworldTTSMock).toHaveBeenCalledWith({
      text: "Hello",
      apiKey: "key",
      baseUrl: "https://api.inworld.ai",
      voiceId: "Sarah",
      modelId: "inworld-tts-1.5-max",
      audioEncoding: "PCM",
      sampleRateHertz: 22_050,
      temperature: undefined,
      timeoutMs: 30_000,
    });
    expect(result).toEqual({
      audioBuffer: Buffer.from("pcm"),
      outputFormat: "pcm",
      sampleRate: 22_050,
    });
  });
});

View file

@ -0,0 +1,221 @@
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import type {
SpeechDirectiveTokenParseContext,
SpeechProviderConfig,
SpeechProviderOverrides,
SpeechProviderPlugin,
} from "openclaw/plugin-sdk/speech-core";
import { asFiniteNumber, asObject, trimToUndefined } from "openclaw/plugin-sdk/speech-core";
import {
DEFAULT_INWORLD_MODEL_ID,
DEFAULT_INWORLD_VOICE_ID,
type InworldAudioEncoding,
INWORLD_TTS_MODELS,
inworldTTS,
listInworldVoices,
normalizeInworldBaseUrl,
} from "./tts.js";
/**
 * Fully-resolved Inworld provider configuration. `baseUrl`, `voiceId`, and
 * `modelId` always carry a value (defaults applied by the normalizers below);
 * `apiKey` may still be absent, in which case callers fall back to the
 * INWORLD_API_KEY env var at request time.
 */
type InworldProviderConfig = {
  apiKey?: string;
  baseUrl: string;
  voiceId: string;
  modelId: string;
  temperature?: number;
};

/** Per-request overrides parsed from TTS directives; every field optional. */
type InworldProviderOverrides = {
  voiceId?: string;
  modelId?: string;
  temperature?: number;
};
/**
 * Resolves a raw `messages.tts` config tree into a complete Inworld config.
 *
 * Looks up `providers.inworld` first and falls back to a top-level `inworld`
 * key. Blank strings are treated as unset, the base URL is normalized
 * (trailing slashes stripped), and missing voice/model ids receive the
 * bundled defaults.
 */
function normalizeInworldProviderConfig(rawConfig: Record<string, unknown>): InworldProviderConfig {
  const providerSection = asObject(rawConfig.providers);
  const inworldSection = asObject(providerSection?.inworld) ?? asObject(rawConfig.inworld);
  const apiKey = normalizeResolvedSecretInputString({
    value: inworldSection?.apiKey,
    path: "messages.tts.providers.inworld.apiKey",
  });
  const baseUrl = normalizeInworldBaseUrl(trimToUndefined(inworldSection?.baseUrl));
  const voiceId = trimToUndefined(inworldSection?.voiceId) ?? DEFAULT_INWORLD_VOICE_ID;
  const modelId = trimToUndefined(inworldSection?.modelId) ?? DEFAULT_INWORLD_MODEL_ID;
  const temperature = asFiniteNumber(inworldSection?.temperature);
  return { apiKey, baseUrl, voiceId, modelId, temperature };
}
/**
 * Completes a (possibly partial) SpeechProviderConfig, filling any missing
 * field from the normalized defaults and re-normalizing the base URL in case
 * the caller supplied one with a trailing slash.
 */
function readInworldProviderConfig(config: SpeechProviderConfig): InworldProviderConfig {
  const fallback = normalizeInworldProviderConfig({});
  const rawBaseUrl = trimToUndefined(config.baseUrl) ?? fallback.baseUrl;
  const resolved: InworldProviderConfig = {
    apiKey: trimToUndefined(config.apiKey) ?? fallback.apiKey,
    baseUrl: normalizeInworldBaseUrl(rawBaseUrl),
    voiceId: trimToUndefined(config.voiceId) ?? fallback.voiceId,
    modelId: trimToUndefined(config.modelId) ?? fallback.modelId,
    temperature: asFiniteNumber(config.temperature) ?? fallback.temperature,
  };
  return resolved;
}
/**
 * Extracts Inworld-specific overrides from generic speech overrides.
 *
 * Accepts both canonical (`voiceId`/`modelId`) and shorthand (`voice`/`model`)
 * keys; blank strings and non-finite temperatures resolve to `undefined`.
 */
function readInworldOverrides(
  overrides: SpeechProviderOverrides | undefined,
): InworldProviderOverrides {
  if (overrides == null) {
    return {};
  }
  const voiceId = trimToUndefined(overrides.voiceId ?? overrides.voice);
  const modelId = trimToUndefined(overrides.modelId ?? overrides.model);
  const temperature = asFiniteNumber(overrides.temperature);
  return { voiceId, modelId, temperature };
}
/**
 * Parses one `[[tts:key=value]]` directive token into Inworld overrides.
 *
 * Recognized keys: voice aliases, model aliases, and `temperature`. A
 * recognized key whose policy flag is disabled is consumed (`handled: true`)
 * with no override so it never leaks into the spoken text. A temperature
 * outside 0..2 (or non-numeric) yields a warning instead of an override.
 * Unrecognized keys return `handled: false` for other parsers to try.
 */
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
  handled: boolean;
  overrides?: SpeechProviderOverrides;
  warnings?: string[];
} {
  const voiceKeys = ["voice", "voiceid", "voice_id", "inworld_voice", "inworldvoice"];
  const modelKeys = ["model", "modelid", "model_id", "inworld_model", "inworldmodel"];
  if (voiceKeys.includes(ctx.key)) {
    return ctx.policy.allowVoice
      ? { handled: true, overrides: { voiceId: ctx.value } }
      : { handled: true };
  }
  if (modelKeys.includes(ctx.key)) {
    return ctx.policy.allowModelId
      ? { handled: true, overrides: { modelId: ctx.value } }
      : { handled: true };
  }
  if (ctx.key === "temperature") {
    if (!ctx.policy.allowVoiceSettings) {
      return { handled: true };
    }
    const parsed = Number(ctx.value);
    const inRange = Number.isFinite(parsed) && parsed >= 0 && parsed <= 2;
    if (!inRange) {
      return { handled: true, warnings: [`invalid Inworld temperature "${ctx.value}"`] };
    }
    return { handled: true, overrides: { temperature: parsed } };
  }
  return { handled: false };
}
/**
 * Builds the bundled Inworld speech provider plugin.
 *
 * Output formats by target:
 * - regular attachments: MP3
 * - voice-note targets: native OGG_OPUS (voiceCompatible)
 * - telephony: raw PCM at 22050 Hz
 *
 * The API key resolves from the provider config first, then the
 * INWORLD_API_KEY env var.
 */
export function buildInworldSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "inworld",
    label: "Inworld",
    // Position among bundled providers when no explicit provider is set.
    autoSelectOrder: 30,
    models: INWORLD_TTS_MODELS,
    // Normalize the raw `messages.tts` config tree into a resolved config.
    resolveConfig: ({ rawConfig }) => normalizeInworldProviderConfig(rawConfig),
    parseDirectiveToken,
    // Talk-mode config: start from the base TTS config, then overlay only
    // the talk-specific fields that are actually present (each spread is a
    // no-op when the field is unset).
    resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
      const base = normalizeInworldProviderConfig(baseTtsConfig);
      // Only run secret normalization when the talk config supplies a key,
      // so an absent key does not clobber the base one.
      const resolvedApiKey =
        talkProviderConfig.apiKey === undefined
          ? undefined
          : normalizeResolvedSecretInputString({
              value: talkProviderConfig.apiKey,
              path: "talk.providers.inworld.apiKey",
            });
      return {
        ...base,
        ...(resolvedApiKey === undefined ? {} : { apiKey: resolvedApiKey }),
        ...(trimToUndefined(talkProviderConfig.baseUrl) == null
          ? {}
          : { baseUrl: normalizeInworldBaseUrl(trimToUndefined(talkProviderConfig.baseUrl)) }),
        ...(trimToUndefined(talkProviderConfig.voiceId) == null
          ? {}
          : { voiceId: trimToUndefined(talkProviderConfig.voiceId) }),
        ...(trimToUndefined(talkProviderConfig.modelId) == null
          ? {}
          : { modelId: trimToUndefined(talkProviderConfig.modelId) }),
        ...(asFiniteNumber(talkProviderConfig.temperature) == null
          ? {}
          : { temperature: asFiniteNumber(talkProviderConfig.temperature) }),
      };
    },
    // Per-call talk overrides, again including only the fields provided.
    resolveTalkOverrides: ({ params }) => ({
      ...(trimToUndefined(params.voiceId) == null
        ? {}
        : { voiceId: trimToUndefined(params.voiceId) }),
      ...(trimToUndefined(params.modelId) == null
        ? {}
        : { modelId: trimToUndefined(params.modelId) }),
      ...(asFiniteNumber(params.temperature) == null
        ? {}
        : { temperature: asFiniteNumber(params.temperature) }),
    }),
    // Voice catalog; key resolution: request > provider config > env var.
    listVoices: async (req) => {
      const config = req.providerConfig ? readInworldProviderConfig(req.providerConfig) : undefined;
      const apiKey = req.apiKey || config?.apiKey || process.env.INWORLD_API_KEY;
      if (!apiKey) {
        throw new Error("Inworld API key missing");
      }
      return listInworldVoices({
        apiKey,
        baseUrl: req.baseUrl ?? config?.baseUrl,
      });
    },
    isConfigured: ({ providerConfig }) =>
      Boolean(readInworldProviderConfig(providerConfig).apiKey || process.env.INWORLD_API_KEY),
    // Reply-audio synthesis: OGG_OPUS for voice notes, MP3 otherwise.
    synthesize: async (req) => {
      const config = readInworldProviderConfig(req.providerConfig);
      const overrides = readInworldOverrides(req.providerOverrides);
      const apiKey = config.apiKey || process.env.INWORLD_API_KEY;
      if (!apiKey) {
        throw new Error("Inworld API key missing");
      }
      const useOpus = req.target === "voice-note";
      const audioEncoding: InworldAudioEncoding = useOpus ? "OGG_OPUS" : "MP3";
      const audioBuffer = await inworldTTS({
        text: req.text,
        apiKey,
        baseUrl: config.baseUrl,
        // Directive overrides win over configured voice/model/temperature.
        voiceId: overrides.voiceId ?? config.voiceId,
        modelId: overrides.modelId ?? config.modelId,
        audioEncoding,
        temperature: overrides.temperature ?? config.temperature,
        timeoutMs: req.timeoutMs,
      });
      return {
        audioBuffer,
        // Lowercased encoding doubles as the output-format id
        // ("MP3" -> "mp3", "OGG_OPUS" -> "ogg_opus").
        outputFormat: audioEncoding.toLowerCase(),
        fileExtension: useOpus ? ".ogg" : ".mp3",
        voiceCompatible: useOpus,
      };
    },
    // Telephony synthesis: raw PCM at 22050 Hz for the telephony bridge.
    synthesizeTelephony: async (req) => {
      const config = readInworldProviderConfig(req.providerConfig);
      const apiKey = config.apiKey || process.env.INWORLD_API_KEY;
      if (!apiKey) {
        throw new Error("Inworld API key missing");
      }
      const sampleRate = 22_050;
      const audioBuffer = await inworldTTS({
        text: req.text,
        apiKey,
        baseUrl: config.baseUrl,
        voiceId: config.voiceId,
        modelId: config.modelId,
        audioEncoding: "PCM",
        sampleRateHertz: sampleRate,
        temperature: config.temperature,
        timeoutMs: req.timeoutMs,
      });
      return { audioBuffer, outputFormat: "pcm", sampleRate };
    },
  };
}

View file

@ -0,0 +1,16 @@
{
"extends": "../tsconfig.package-boundary.base.json",
"compilerOptions": {
"rootDir": "."
},
"include": ["./*.ts", "./src/**/*.ts"],
"exclude": [
"./**/*.test.ts",
"./dist/**",
"./node_modules/**",
"./src/test-support/**",
"./src/**/*test-helpers.ts",
"./src/**/*test-harness.ts",
"./src/**/*test-support.ts"
]
}

View file

@ -0,0 +1,312 @@
import { afterEach, describe, expect, it, vi } from "vitest";
// Hoisted so the vi.mock factory below can reference the mock before
// imports run (vi.mock calls are hoisted above imports at transform time).
const { fetchWithSsrFGuardMock } = vi.hoisted(() => ({
  fetchWithSsrFGuardMock: vi.fn(),
}));
// Stub only the SSRF-guarded fetch; keep the module's other exports real.
vi.mock("openclaw/plugin-sdk/ssrf-runtime", async (importOriginal) => {
  const actual = await importOriginal<typeof import("openclaw/plugin-sdk/ssrf-runtime")>();
  return {
    ...actual,
    fetchWithSsrFGuard: fetchWithSsrFGuardMock,
  };
});
import { inworldTTS, listInworldVoices } from "./tts.js";

// Shape of the single argument the code under test passes to
// fetchWithSsrFGuard, as asserted by these tests.
type GuardRequest = {
  url: string;
  init?: RequestInit;
  auditContext?: string;
  policy?: unknown;
  timeoutMs?: number;
};

// Queues one guarded response and returns the release spy so tests can
// assert that the guard lease is released after use.
function queueGuardedResponse(response: Response): { release: ReturnType<typeof vi.fn> } {
  const release = vi.fn(async () => {});
  fetchWithSsrFGuardMock.mockResolvedValueOnce({ response, release });
  return { release };
}

// Returns the most recent guard request, failing loudly if no call was made.
function lastGuardRequest(): GuardRequest {
  const call = fetchWithSsrFGuardMock.mock.calls.at(-1);
  if (!call) {
    throw new Error("fetchWithSsrFGuard was not called");
  }
  return call[0] as GuardRequest;
}

// Extracts the string body from a captured request, asserting its type.
function readRequestBody(request: GuardRequest): string {
  const body = request.init?.body;
  if (typeof body !== "string") {
    throw new Error("expected request body to be a string");
  }
  return body;
}
describe("listInworldVoices", () => {
afterEach(() => {
fetchWithSsrFGuardMock.mockClear();
vi.restoreAllMocks();
});
it("maps Inworld voice metadata into speech voice options", async () => {
queueGuardedResponse(
new Response(
JSON.stringify({
voices: [
{
voiceId: "Dennis",
displayName: "Dennis",
description: "Middle-aged man with a smooth, calm and friendly voice",
langCode: "EN_US",
tags: ["male", "middle-aged", "smooth", "calm", "friendly"],
source: "SYSTEM",
},
{
voiceId: "Ashley",
displayName: "Ashley",
description: "A warm, natural female voice",
langCode: "EN_US",
tags: ["female", "warm", "natural"],
source: "SYSTEM",
},
],
}),
{ status: 200 },
),
);
const voices = await listInworldVoices({ apiKey: "test-key" });
expect(voices).toEqual([
{
id: "Dennis",
name: "Dennis",
description: "Middle-aged man with a smooth, calm and friendly voice",
locale: "EN_US",
gender: "male",
},
{
id: "Ashley",
name: "Ashley",
description: "A warm, natural female voice",
locale: "EN_US",
gender: "female",
},
]);
const request = lastGuardRequest();
expect(request.url).toBe("https://api.inworld.ai/voices/v1/voices");
expect(request.auditContext).toBe("inworld-voices");
expect(request.policy).toEqual({ hostnameAllowlist: ["api.inworld.ai"] });
const headers = new Headers(request.init?.headers);
expect(headers.get("authorization")).toBe("Basic test-key");
});
it("throws on API errors with response body", async () => {
queueGuardedResponse(new Response("service unavailable", { status: 503 }));
await expect(listInworldVoices({ apiKey: "test-key" })).rejects.toThrow(
"Inworld voices API error (503): service unavailable",
);
});
it("filters out voices with empty voiceId", async () => {
queueGuardedResponse(
new Response(
JSON.stringify({
voices: [
{ voiceId: "", displayName: "Empty" },
{ voiceId: "Dennis", displayName: "Dennis" },
],
}),
{ status: 200 },
),
);
const voices = await listInworldVoices({ apiKey: "test-key" });
expect(voices).toHaveLength(1);
expect(voices[0].id).toBe("Dennis");
});
it("returns empty array when no voices present", async () => {
queueGuardedResponse(new Response(JSON.stringify({}), { status: 200 }));
const voices = await listInworldVoices({ apiKey: "test-key" });
expect(voices).toEqual([]);
});
it("passes language filter as query parameter", async () => {
queueGuardedResponse(new Response(JSON.stringify({ voices: [] }), { status: 200 }));
await listInworldVoices({ apiKey: "test-key", language: "EN_US" });
expect(lastGuardRequest().url).toBe("https://api.inworld.ai/voices/v1/voices?languages=EN_US");
});
it("releases the guarded dispatcher after success", async () => {
const { release } = queueGuardedResponse(
new Response(JSON.stringify({ voices: [] }), { status: 200 }),
);
await listInworldVoices({ apiKey: "test-key" });
expect(release).toHaveBeenCalledTimes(1);
});
});
describe("inworldTTS", () => {
afterEach(() => {
fetchWithSsrFGuardMock.mockClear();
vi.restoreAllMocks();
});
it("concatenates base64 audio chunks from streaming response", async () => {
const chunk1 = Buffer.from("audio-chunk-1").toString("base64");
const chunk2 = Buffer.from("audio-chunk-2").toString("base64");
const body = [
JSON.stringify({ result: { audioContent: chunk1 } }),
JSON.stringify({ result: { audioContent: chunk2 } }),
].join("\n");
queueGuardedResponse(new Response(body, { status: 200 }));
const buffer = await inworldTTS({
text: "Hello world",
apiKey: "test-key",
});
expect(buffer).toEqual(
Buffer.concat([Buffer.from("audio-chunk-1"), Buffer.from("audio-chunk-2")]),
);
});
it("throws on HTTP errors with response body", async () => {
queueGuardedResponse(new Response("bad request body", { status: 400 }));
await expect(inworldTTS({ text: "test", apiKey: "test-key" })).rejects.toThrow(
"Inworld TTS API error (400): bad request body",
);
});
it("throws on in-stream errors", async () => {
const body = JSON.stringify({
error: { code: 3, message: "Invalid voice ID" },
});
queueGuardedResponse(new Response(body, { status: 200 }));
await expect(inworldTTS({ text: "test", apiKey: "test-key" })).rejects.toThrow(
"Inworld TTS stream error (3): Invalid voice ID",
);
});
it("throws on empty audio response", async () => {
const body = JSON.stringify({ result: { audioContent: "" } });
queueGuardedResponse(new Response(body, { status: 200 }));
await expect(inworldTTS({ text: "test", apiKey: "test-key" })).rejects.toThrow(
"Inworld TTS returned no audio data",
);
});
it("throws descriptive error on non-JSON line in stream", async () => {
queueGuardedResponse(new Response("<html>Rate limited</html>", { status: 200 }));
await expect(inworldTTS({ text: "test", apiKey: "test-key" })).rejects.toThrow(
"Inworld TTS stream parse error: unexpected non-JSON line:",
);
});
it("sends correct request body with defaults", async () => {
const chunk = Buffer.from("audio").toString("base64");
queueGuardedResponse(
new Response(JSON.stringify({ result: { audioContent: chunk } }), { status: 200 }),
);
await inworldTTS({ text: "Hello", apiKey: "test-key" });
const request = lastGuardRequest();
expect(request.url).toBe("https://api.inworld.ai/tts/v1/voice:stream");
expect(request.auditContext).toBe("inworld-tts");
expect(request.policy).toEqual({ hostnameAllowlist: ["api.inworld.ai"] });
expect(request.init?.method).toBe("POST");
const headers = new Headers(request.init?.headers);
expect(headers.get("authorization")).toBe("Basic test-key");
expect(headers.get("content-type")).toBe("application/json");
expect(JSON.parse(readRequestBody(request))).toEqual({
text: "Hello",
voiceId: "Sarah",
modelId: "inworld-tts-1.5-max",
audioConfig: { audioEncoding: "MP3" },
});
});
it("includes temperature and sampleRateHertz when provided", async () => {
const chunk = Buffer.from("audio").toString("base64");
queueGuardedResponse(
new Response(JSON.stringify({ result: { audioContent: chunk } }), { status: 200 }),
);
await inworldTTS({
text: "Hello",
apiKey: "test-key",
voiceId: "Ashley",
modelId: "inworld-tts-1.5-mini",
audioEncoding: "PCM",
sampleRateHertz: 22_050,
temperature: 0.8,
});
const callBody = JSON.parse(readRequestBody(lastGuardRequest()));
expect(callBody.voiceId).toBe("Ashley");
expect(callBody.modelId).toBe("inworld-tts-1.5-mini");
expect(callBody.audioConfig.audioEncoding).toBe("PCM");
expect(callBody.audioConfig.sampleRateHertz).toBe(22_050);
expect(callBody.temperature).toBe(0.8);
});
it("uses custom base URL", async () => {
const chunk = Buffer.from("audio").toString("base64");
queueGuardedResponse(
new Response(JSON.stringify({ result: { audioContent: chunk } }), { status: 200 }),
);
await inworldTTS({
text: "Hello",
apiKey: "test-key",
baseUrl: "https://custom.inworld.example.com/",
});
expect(lastGuardRequest().url).toBe("https://custom.inworld.example.com/tts/v1/voice:stream");
expect(lastGuardRequest().policy).toEqual({
hostnameAllowlist: ["custom.inworld.example.com"],
});
});
it("skips empty lines in streaming response", async () => {
const chunk = Buffer.from("audio").toString("base64");
const body = `\n${JSON.stringify({ result: { audioContent: chunk } })}\n\n`;
queueGuardedResponse(new Response(body, { status: 200 }));
const buffer = await inworldTTS({ text: "test", apiKey: "test-key" });
expect(buffer).toEqual(Buffer.from("audio"));
});
it("releases the guarded dispatcher after success", async () => {
const chunk = Buffer.from("audio").toString("base64");
const { release } = queueGuardedResponse(
new Response(JSON.stringify({ result: { audioContent: chunk } }), { status: 200 }),
);
await inworldTTS({ text: "test", apiKey: "test-key" });
expect(release).toHaveBeenCalledTimes(1);
});
it("releases the guarded dispatcher after failure", async () => {
const { release } = queueGuardedResponse(new Response("fail", { status: 500 }));
await expect(inworldTTS({ text: "test", apiKey: "test-key" })).rejects.toThrow();
expect(release).toHaveBeenCalledTimes(1);
});
});

190
extensions/inworld/tts.ts Normal file
View file

@ -0,0 +1,190 @@
import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech-core";
import { fetchWithSsrFGuard, type SsrFPolicy } from "openclaw/plugin-sdk/ssrf-runtime";
// Production Inworld API origin; overridable via provider config `baseUrl`.
export const DEFAULT_INWORLD_BASE_URL = "https://api.inworld.ai";
// Defaults applied when the caller specifies no voice/model.
export const DEFAULT_INWORLD_VOICE_ID = "Sarah";
export const DEFAULT_INWORLD_MODEL_ID = "inworld-tts-1.5-max";
// Bundled TTS model catalog surfaced to config/model-selection UIs.
export const INWORLD_TTS_MODELS = [
  "inworld-tts-1.5-max",
  "inworld-tts-1.5-mini",
  "inworld-tts-1-max",
  "inworld-tts-1",
] as const;
// Encodings accepted by the Inworld `audioConfig.audioEncoding` field.
export type InworldAudioEncoding =
  | "MP3"
  | "OGG_OPUS"
  | "LINEAR16"
  | "PCM"
  | "WAV"
  | "ALAW"
  | "MULAW"
  | "FLAC";
/**
 * Trims the configured base URL and strips any trailing slashes, falling back
 * to the default Inworld origin when the result is empty or undefined.
 */
export function normalizeInworldBaseUrl(baseUrl?: string): string {
  const cleaned = (baseUrl ?? "").trim().replace(/\/+$/, "");
  return cleaned.length > 0 ? cleaned : DEFAULT_INWORLD_BASE_URL;
}
/**
 * Builds an SSRF policy pinning requests to the base URL's hostname.
 * Returns undefined for unparsable or non-HTTP(S) URLs so the guard
 * falls back to its default policy.
 */
function ssrfPolicyFromInworldBaseUrl(baseUrl: string): SsrFPolicy | undefined {
  let parsed: URL;
  try {
    parsed = new URL(baseUrl);
  } catch {
    return undefined;
  }
  const isHttp = parsed.protocol === "http:" || parsed.protocol === "https:";
  return isHttp ? { hostnameAllowlist: [parsed.hostname] } : undefined;
}
/**
 * Calls the Inworld streaming TTS endpoint and concatenates every audio chunk
 * into a single buffer. The stream returns newline-delimited JSON, each line
 * carrying base64 audio in `result.audioContent`.
 *
 * @param params.text Text to synthesize.
 * @param params.apiKey Base64 credential string copied from the Inworld
 *   dashboard; sent verbatim as the HTTP Basic credential.
 * @param params.baseUrl Optional API origin override; normalized, defaults to
 *   https://api.inworld.ai.
 * @param params.voiceId Voice id (defaults to "Sarah").
 * @param params.modelId Model id (defaults to "inworld-tts-1.5-max").
 * @param params.audioEncoding Output encoding (defaults to "MP3").
 * @param params.sampleRateHertz Output sample rate; omitted from the request
 *   when falsy.
 * @param params.temperature Sampling temperature; omitted only when
 *   null/undefined (0 is sent).
 * @param params.timeoutMs Timeout forwarded to the SSRF-guarded fetch.
 * @returns All streamed audio chunks concatenated into one Buffer.
 * @throws Error on non-2xx responses, in-stream error payloads, non-JSON
 *   stream lines, or when the stream yields no audio content.
 */
export async function inworldTTS(params: {
  text: string;
  apiKey: string;
  baseUrl?: string;
  voiceId?: string;
  modelId?: string;
  audioEncoding?: InworldAudioEncoding;
  sampleRateHertz?: number;
  temperature?: number;
  timeoutMs?: number;
}): Promise<Buffer> {
  const baseUrl = normalizeInworldBaseUrl(params.baseUrl);
  const url = `${baseUrl}/tts/v1/voice:stream`;
  const requestBody = JSON.stringify({
    text: params.text,
    voiceId: params.voiceId ?? DEFAULT_INWORLD_VOICE_ID,
    modelId: params.modelId ?? DEFAULT_INWORLD_MODEL_ID,
    audioConfig: {
      audioEncoding: params.audioEncoding ?? "MP3",
      // Spreading a falsy value is a no-op, so undefined/0 omit the field.
      ...(params.sampleRateHertz && { sampleRateHertz: params.sampleRateHertz }),
    },
    // `!= null` keeps a temperature of 0 while dropping null/undefined.
    ...(params.temperature != null && { temperature: params.temperature }),
  });
  const { response, release } = await fetchWithSsrFGuard({
    url,
    init: {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        // apiKey is the Base64-encoded credential string copied from the
        // Inworld dashboard; it is sent verbatim as the HTTP Basic
        // credential. Do not Base64-encode it here, and do not normalize
        // bearer-style tokens.
        Authorization: `Basic ${params.apiKey}`,
      },
      body: requestBody,
    },
    timeoutMs: params.timeoutMs,
    // Pin the guarded fetch to the configured Inworld hostname.
    policy: ssrfPolicyFromInworldBaseUrl(baseUrl),
    auditContext: "inworld-tts",
  });
  try {
    if (!response.ok) {
      // Include the response body for diagnosis; ignore body-read failures.
      const errorBody = await response.text().catch(() => "");
      throw new Error(`Inworld TTS API error (${response.status}): ${errorBody}`);
    }
    // Buffer the whole newline-delimited JSON stream, then parse line by line.
    const body = await response.text();
    const chunks: Buffer[] = [];
    for (const line of body.split("\n")) {
      const trimmed = line.trim();
      if (!trimmed) {
        continue;
      }
      let parsed: {
        result?: { audioContent?: string };
        error?: { code?: number; message?: string };
      };
      try {
        parsed = JSON.parse(trimmed) as typeof parsed;
      } catch {
        // e.g. an HTML rate-limit page delivered with a 200 status.
        throw new Error(
          `Inworld TTS stream parse error: unexpected non-JSON line: ${trimmed.slice(0, 80)}`,
        );
      }
      if (parsed.error) {
        throw new Error(`Inworld TTS stream error (${parsed.error.code}): ${parsed.error.message}`);
      }
      if (parsed.result?.audioContent) {
        chunks.push(Buffer.from(parsed.result.audioContent, "base64"));
      }
    }
    if (chunks.length === 0) {
      throw new Error("Inworld TTS returned no audio data");
    }
    return Buffer.concat(chunks);
  } finally {
    // Release the guarded dispatcher on success and failure alike.
    await release();
  }
}
/**
 * Fetches the Inworld voice catalog via the SSRF-guarded fetch and maps each
 * entry into a SpeechVoiceOption. Voices without a usable id are dropped;
 * gender is derived from the "male"/"female" tags when present.
 */
export async function listInworldVoices(params: {
  apiKey: string;
  baseUrl?: string;
  language?: string;
  timeoutMs?: number;
}): Promise<SpeechVoiceOption[]> {
  const baseUrl = normalizeInworldBaseUrl(params.baseUrl);
  const query = params.language ? `?languages=${encodeURIComponent(params.language)}` : "";
  const { response, release } = await fetchWithSsrFGuard({
    url: `${baseUrl}/voices/v1/voices${query}`,
    init: {
      method: "GET",
      headers: {
        // Dashboard key is already Base64-encoded; send verbatim as Basic.
        Authorization: `Basic ${params.apiKey}`,
      },
    },
    timeoutMs: params.timeoutMs,
    policy: ssrfPolicyFromInworldBaseUrl(baseUrl),
    auditContext: "inworld-voices",
  });
  try {
    if (!response.ok) {
      const detail = await response.text().catch(() => "");
      throw new Error(`Inworld voices API error (${response.status}): ${detail}`);
    }
    const payload = (await response.json()) as {
      voices?: Array<{
        voiceId?: string;
        displayName?: string;
        description?: string;
        langCode?: string;
        tags?: string[];
        source?: string;
      }>;
    };
    if (!Array.isArray(payload.voices)) {
      return [];
    }
    const options: SpeechVoiceOption[] = [];
    for (const voice of payload.voices) {
      const id = voice.voiceId?.trim() ?? "";
      if (id.length === 0) {
        continue;
      }
      options.push({
        id,
        name: voice.displayName?.trim() || undefined,
        description: voice.description?.trim() || undefined,
        locale: voice.langCode || undefined,
        gender: voice.tags?.find((t) => t === "male" || t === "female") || undefined,
      });
    }
    return options;
  } finally {
    await release();
  }
}

6
pnpm-lock.yaml generated
View file

@ -675,6 +675,12 @@ importers:
specifier: workspace:*
version: link:../../packages/plugin-sdk
extensions/inworld:
devDependencies:
'@openclaw/plugin-sdk':
specifier: workspace:*
version: link:../../packages/plugin-sdk
extensions/irc:
devDependencies:
'@openclaw/plugin-sdk':

View file

@ -3,6 +3,7 @@ import os from "node:os";
import path from "node:path";
import { afterEach, describe, expect, it, vi } from "vitest";
import { importFreshModule } from "../../../test/helpers/import-fresh.ts";
import { shouldExpectNativeJitiForJavaScriptTestRuntime } from "../../test-utils/jiti-runtime.js";
import {
isJavaScriptModulePath,
resolveCompiledBundledModulePath,
@ -92,7 +93,7 @@ describe("channel plugin module loader helpers", () => {
expect(createJiti).not.toHaveBeenCalled();
});
it("uses native Jiti import for Windows dist loads", async () => {
it("uses the runtime-supported Jiti boundary for Windows dist loads", async () => {
const createJiti = vi.fn(() => vi.fn(() => ({ ok: true })));
vi.doMock("jiti", () => ({
createJiti,
@ -119,7 +120,7 @@ describe("channel plugin module loader helpers", () => {
expect(createJiti).toHaveBeenCalledWith(
expect.any(String),
expect.objectContaining({
tryNative: true,
tryNative: shouldExpectNativeJitiForJavaScriptTestRuntime(),
}),
);
} finally {

View file

@ -463,8 +463,8 @@ describe("registerPreActionHooks", () => {
});
await runPreAction({
parseArgv: ["agents", "list"],
processArgv: ["node", "openclaw", "agents", "list", "--json"],
parseArgv: ["message", "send"],
processArgv: ["node", "openclaw", "message", "send", "--json"],
});
expect(ensurePluginRegistryLoadedMock).toHaveBeenCalled();

View file

@ -1,6 +1,7 @@
import fs from "node:fs";
import path from "node:path";
import { afterEach, describe, expect, it, vi } from "vitest";
import { shouldExpectNativeJitiForJavaScriptTestRuntime } from "../test-utils/jiti-runtime.js";
import {
listImportedBundledPluginFacadeIds,
loadBundledPluginPublicSurfaceModuleSync,
@ -126,7 +127,7 @@ describe("plugin-sdk facade loader", () => {
expect(listImportedFacadeRuntimeIds()).toEqual(["demo"]);
});
it("uses native Jiti import for Windows dist facade loads", () => {
it("uses the runtime-supported Jiti boundary for Windows dist facade loads", () => {
const dir = createTempDirSync("openclaw-facade-loader-windows-dist-");
const bundledPluginsDir = path.join(dir, "dist");
fs.mkdirSync(path.join(bundledPluginsDir, "demo"), { recursive: true });
@ -158,7 +159,7 @@ describe("plugin-sdk facade loader", () => {
expect(createJitiCalls[0]?.[0]).toEqual(expect.any(String));
expect(createJitiCalls[0]?.[1]).toEqual(
expect.objectContaining({
tryNative: true,
tryNative: shouldExpectNativeJitiForJavaScriptTestRuntime(),
}),
);
} finally {

View file

@ -1,6 +1,7 @@
import fs from "node:fs";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { shouldExpectNativeJitiForJavaScriptTestRuntime } from "../test-utils/jiti-runtime.js";
import { cleanupTrackedTempDirs, makeTrackedTempDir } from "./test-helpers/fs-fixtures.js";
import {
getRegistryJitiMocks,
@ -34,7 +35,7 @@ describe("doctor-contract-registry getJiti", () => {
clearPluginDoctorContractRegistryCache();
});
it("uses native jiti loading on Windows for contract-api modules", () => {
it("uses the runtime-supported Jiti boundary on Windows for contract-api modules", () => {
const pluginRoot = makeTempDir();
fs.writeFileSync(path.join(pluginRoot, "contract-api.js"), "export default {};\n", "utf-8");
mocks.loadPluginManifestRegistry.mockReturnValue({
@ -42,6 +43,7 @@ describe("doctor-contract-registry getJiti", () => {
diagnostics: [],
});
const platformSpy = vi.spyOn(process, "platform", "get").mockReturnValue("win32");
const expectedTryNative = shouldExpectNativeJitiForJavaScriptTestRuntime();
try {
listPluginDoctorLegacyConfigRules({
@ -56,7 +58,7 @@ describe("doctor-contract-registry getJiti", () => {
expect(mocks.createJiti.mock.calls[0]?.[0]).toBe(path.join(pluginRoot, "contract-api.js"));
expect(mocks.createJiti.mock.calls[0]?.[1]).toEqual(
expect.objectContaining({
tryNative: true,
tryNative: expectedTryNative,
}),
);
});

View file

@ -1,6 +1,7 @@
import fs from "node:fs";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { shouldExpectNativeJitiForJavaScriptTestRuntime } from "../test-utils/jiti-runtime.js";
import { cleanupTrackedTempDirs, makeTrackedTempDir } from "./test-helpers/fs-fixtures.js";
import {
getRegistryJitiMocks,
@ -176,7 +177,7 @@ describe("setup-registry getJiti", () => {
clearPluginSetupRegistryCache();
});
it("uses native jiti loading on Windows for setup-api modules", () => {
it("uses the runtime-supported Jiti boundary on Windows for setup-api modules", () => {
const pluginRoot = makeTempDir();
fs.writeFileSync(path.join(pluginRoot, "setup-api.js"), "export default {};\n", "utf-8");
mocks.loadPluginManifestRegistry.mockReturnValue({
@ -185,6 +186,7 @@ describe("setup-registry getJiti", () => {
});
const platformSpy = vi.spyOn(process, "platform", "get").mockReturnValue("win32");
const restoreVersions = forceNodeRuntimeVersionsForTest();
const expectedTryNative = shouldExpectNativeJitiForJavaScriptTestRuntime();
try {
resolvePluginSetupRegistry({
@ -200,7 +202,7 @@ describe("setup-registry getJiti", () => {
expect(mocks.createJiti.mock.calls[0]?.[0]).toBe(path.join(pluginRoot, "setup-api.js"));
expect(mocks.createJiti.mock.calls[0]?.[1]).toEqual(
expect.objectContaining({
tryNative: true,
tryNative: expectedTryNative,
}),
);
});

View file

@ -0,0 +1,5 @@
/**
 * Native Jiti imports are expected only when the test runtime is plain Node
 * (not Bun) on a non-Windows platform.
 */
export function shouldExpectNativeJitiForJavaScriptTestRuntime(): boolean {
  const runningOnBun = typeof (process.versions as { bun?: string }).bun === "string";
  if (runningOnBun) {
    return false;
  }
  return process.platform !== "win32";
}