diff --git a/.env.example b/.env.example
index f7ed308c713..de02b73a595 100644
--- a/.env.example
+++ b/.env.example
@@ -82,4 +82,5 @@ OPENCLAW_GATEWAY_TOKEN=
# ELEVENLABS_API_KEY=...
# XI_API_KEY=... # alias for ElevenLabs
+# INWORLD_API_KEY=...
# DEEPGRAM_API_KEY=...
diff --git a/.github/labeler.yml b/.github/labeler.yml
index ebd77d64925..b55c5fe3b21 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -307,6 +307,11 @@
- changed-files:
- any-glob-to-any-file:
- "extensions/huggingface/**"
+"extensions: inworld":
+ - changed-files:
+ - any-glob-to-any-file:
+ - "extensions/inworld/**"
+ - "docs/providers/inworld.md"
"extensions: kilocode":
- changed-files:
- any-glob-to-any-file:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2a36aca45d1..7afec93e23d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -54,6 +54,7 @@ Docs: https://docs.openclaw.ai
- Providers/Xiaomi: add MiMo TTS as a bundled speech provider with MP3/WAV output and voice-note Opus transcoding. Fixes #52376. (#55614) Thanks @zoujiejun.
- Providers/ElevenLabs: include `eleven_v3` in the bundled TTS model catalog so model selection surfaces can offer ElevenLabs v3. (#68321) Thanks @itsuzef.
- Providers/Local CLI TTS: add a bundled local command speech provider with file/stdout input, voice-note Opus conversion, and telephony PCM output. (#56239) Thanks @solar2ain.
+- Providers/Inworld: add Inworld as a bundled speech provider with streaming TTS synthesis, voice listing, voice-note output, and PCM telephony output. (#55972) Thanks @cshape.
- Android/Talk Mode: expose Talk Mode in the Voice tab with runtime-owned voice capture modes and microphone foreground-service escalation. Thanks @alex-latitude.
- Providers/LiteLLM: register `litellm` as an image-generation provider so `image_generate model=litellm/...` calls and `agents.defaults.imageGenerationModel.fallbacks` entries resolve through the LiteLLM proxy. Thanks @zqchris.
- Codex harness: require Codex app-server `0.125.0` or newer and cover native MCP `PreToolUse`, `PostToolUse`, and `PermissionRequest` payloads through the OpenClaw hook relay.
diff --git a/docs/.generated/config-baseline.sha256 b/docs/.generated/config-baseline.sha256
index 6bc941f4abb..20265cbfe32 100644
--- a/docs/.generated/config-baseline.sha256
+++ b/docs/.generated/config-baseline.sha256
@@ -1,4 +1,4 @@
-9ac3d271f9bfa9611557f0b52e4d0a600693bdd1de75cc1bafc320fc4d4f0075 config-baseline.json
+0b0d796bceddfb9e2929518ba84af626da7f5d75c392a217041f36e850c4e74f config-baseline.json
271fdf1d6652927e0fc160a6f25276bf6dccb8f1b27fab15e0fc2620e8cacab4 config-baseline.core.json
7cd9c908f066c143eab2a201efbc9640f483ab28bba92ddeca1d18cc2b528bc3 config-baseline.channel.json
-7825b56a5b3fcdbe2e09ef8fe5d9f12ac3598435afebe20413051e45b0d1968e config-baseline.plugin.json
+17eb3f8887193579ff32e35f9bd520ba2bd6049e52ab18855c5d41fcbf195d83 config-baseline.plugin.json
diff --git a/docs/docs.json b/docs/docs.json
index 9d6ea82f7ca..269c3a4f55a 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -1317,6 +1317,7 @@
"providers/groq",
"providers/huggingface",
"providers/inferrs",
+ "providers/inworld",
"providers/kilocode",
"providers/litellm",
"providers/lmstudio",
diff --git a/docs/providers/inworld.md b/docs/providers/inworld.md
new file mode 100644
index 00000000000..d0e6e964c2f
--- /dev/null
+++ b/docs/providers/inworld.md
@@ -0,0 +1,115 @@
+---
+summary: "Inworld streaming text-to-speech for OpenClaw replies"
+read_when:
+ - You want Inworld speech synthesis for outbound replies
+ - You need PCM telephony or OGG_OPUS voice-note output from Inworld
+title: "Inworld"
+---
+
+Inworld is a streaming text-to-speech (TTS) provider. In OpenClaw it
+synthesizes outbound reply audio (MP3 by default, OGG_OPUS for voice notes)
+and PCM audio for telephony channels such as Voice Call.
+
+OpenClaw posts to Inworld's streaming TTS endpoint, concatenates the
+returned base64 audio chunks into a single buffer, and hands the result to
+the standard reply-audio pipeline.
+
+| Detail | Value |
+| ------------- | ----------------------------------------------------------- |
+| Website | [inworld.ai](https://inworld.ai) |
+| Docs | [docs.inworld.ai/tts/tts](https://docs.inworld.ai/tts/tts) |
+| Auth | `INWORLD_API_KEY` (HTTP Basic, Base64 dashboard credential) |
+| Default voice | `Sarah` |
+| Default model | `inworld-tts-1.5-max` |
+
+## Getting started
+
+
+
+ Copy the credential from your Inworld dashboard (Workspace > API Keys)
+ and set it as an env var. The value is sent verbatim as the HTTP Basic
+ credential, so do not Base64-encode it again or convert it to a bearer
+ token.
+
+ ```
+ INWORLD_API_KEY=<base64-credential-from-dashboard>
+ ```
+
+
+
+ ```json5
+ {
+ messages: {
+ tts: {
+ auto: "always",
+ provider: "inworld",
+ providers: {
+ inworld: {
+ voiceId: "Sarah",
+ modelId: "inworld-tts-1.5-max",
+ },
+ },
+ },
+ },
+ }
+ ```
+
+
+ Send a reply through any connected channel. OpenClaw synthesizes the
+ audio with Inworld and delivers it as MP3 (or OGG_OPUS when the channel
+ expects a voice note).
+
+
+
+## Configuration options
+
+| Option | Path | Description |
+| ------------- | -------------------------------------------- | ----------------------------------------------------------------- |
+| `apiKey` | `messages.tts.providers.inworld.apiKey` | Base64 dashboard credential. Falls back to `INWORLD_API_KEY`. |
+| `baseUrl` | `messages.tts.providers.inworld.baseUrl` | Override Inworld API base URL (default `https://api.inworld.ai`). |
+| `voiceId` | `messages.tts.providers.inworld.voiceId` | Voice identifier (default `Sarah`). |
+| `modelId` | `messages.tts.providers.inworld.modelId` | TTS model id (default `inworld-tts-1.5-max`). |
+| `temperature` | `messages.tts.providers.inworld.temperature` | Sampling temperature `0..2` (optional). |
+
+## Notes
+
+
+
+ Inworld uses HTTP Basic auth with a single Base64-encoded credential
+ string. Copy it verbatim from the Inworld dashboard. The provider sends
+ it as `Authorization: Basic <apiKey>` without any further encoding, so
+ do not Base64-encode it yourself and do not pass a bearer-style token.
+ See [TTS auth notes](/tools/tts#inworld-primary) for the same callout.
+
+
+ Supported model ids: `inworld-tts-1.5-max` (default),
+ `inworld-tts-1.5-mini`, `inworld-tts-1-max`, `inworld-tts-1`.
+
+
+ Replies use MP3 by default. When the channel target is `voice-note`
+ OpenClaw asks Inworld for `OGG_OPUS` so the audio plays as a native
+ voice bubble. Telephony synthesis uses raw `PCM` at 22050 Hz to feed
+ the telephony bridge.
+
+
+ Override the API host with `messages.tts.providers.inworld.baseUrl`.
+ Trailing slashes are stripped before requests are sent.
+
+
+
+## Related
+
+
+
+ TTS overview, providers, and `messages.tts` config.
+
+
+ Full config reference including `messages.tts` settings.
+
+
+ All bundled OpenClaw providers.
+
+
+ Common issues and debugging steps.
+
+
diff --git a/docs/tools/tts.md b/docs/tools/tts.md
index 8995e9afafa..a2e23e22fc8 100644
--- a/docs/tools/tts.md
+++ b/docs/tools/tts.md
@@ -7,7 +7,7 @@ read_when:
title: "Text-to-speech"
---
-OpenClaw can convert outbound replies into audio using ElevenLabs, Google Gemini, Gradium, Local CLI, Microsoft, MiniMax, OpenAI, Vydra, xAI, or Xiaomi MiMo.
+OpenClaw can convert outbound replies into audio using ElevenLabs, Google Gemini, Gradium, Inworld, Local CLI, Microsoft, MiniMax, OpenAI, Vydra, xAI, or Xiaomi MiMo.
It works anywhere OpenClaw can send audio.
## Supported services
@@ -15,6 +15,7 @@ It works anywhere OpenClaw can send audio.
- **ElevenLabs** (primary or fallback provider)
- **Google Gemini** (primary or fallback provider; uses Gemini API TTS)
- **Gradium** (primary or fallback provider; supports voice-note and telephony output)
+- **Inworld** (primary or fallback provider; uses the Inworld streaming TTS API)
- **Local CLI** (primary or fallback provider; runs a configured local TTS command)
- **Microsoft** (primary or fallback provider; current bundled implementation uses `node-edge-tts`)
- **MiniMax** (primary or fallback provider; uses the T2A v2 API)
@@ -38,11 +39,12 @@ or ElevenLabs.
## Optional keys
-If you want OpenAI, ElevenLabs, Google Gemini, Gradium, MiniMax, Vydra, xAI, or Xiaomi MiMo:
+If you want ElevenLabs, Google Gemini, Gradium, Inworld, MiniMax, OpenAI, Vydra, xAI, or Xiaomi MiMo:
- `ELEVENLABS_API_KEY` (or `XI_API_KEY`)
- `GEMINI_API_KEY` (or `GOOGLE_API_KEY`)
- `GRADIUM_API_KEY`
+- `INWORLD_API_KEY`
- `MINIMAX_API_KEY`; MiniMax TTS also accepts Token Plan auth via
`MINIMAX_OAUTH_TOKEN`, `MINIMAX_CODE_PLAN_KEY`, or
`MINIMAX_CODING_API_KEY`
@@ -64,6 +66,7 @@ so that provider must also be authenticated if you enable summaries.
- [ElevenLabs Text to Speech](https://elevenlabs.io/docs/api-reference/text-to-speech)
- [ElevenLabs Authentication](https://elevenlabs.io/docs/api-reference/authentication)
- [Gradium](/providers/gradium)
+- [Inworld TTS API](https://docs.inworld.ai/tts/tts)
- [MiniMax T2A v2 API](https://platform.minimaxi.com/document/T2A%20V2)
- [Xiaomi MiMo speech synthesis](/providers/xiaomi#text-to-speech)
- [node-edge-tts](https://github.com/SchneeHertz/node-edge-tts)
@@ -217,6 +220,35 @@ by the bundled Google image-generation provider. Resolution order is
`messages.tts.providers.google.apiKey` -> `models.providers.google.apiKey` ->
`GEMINI_API_KEY` -> `GOOGLE_API_KEY`.
+### Inworld primary
+
+```json5
+{
+ messages: {
+ tts: {
+ auto: "always",
+ provider: "inworld",
+ providers: {
+ inworld: {
+ apiKey: "inworld_api_key",
+ baseUrl: "https://api.inworld.ai",
+ voiceId: "Sarah",
+ modelId: "inworld-tts-1.5-max",
+ temperature: 0.8,
+ },
+ },
+ },
+ },
+}
+```
+
+The `apiKey` value must be the Base64-encoded credential string copied
+verbatim from the Inworld dashboard (Workspace > API Keys). The provider
+sends it as `Authorization: Basic <apiKey>` without any additional
+encoding, so do not pass a raw bearer token and do not Base64-encode it
+yourself. The key falls back to the `INWORLD_API_KEY` env var. See
+[Inworld provider](/providers/inworld) for full setup.
+
### xAI primary
```json5
@@ -415,7 +447,7 @@ Then run:
- `tagged` only sends audio when the reply includes `[[tts:key=value]]` directives or a `[[tts:text]]...[[/tts:text]]` block.
- `enabled`: legacy toggle (doctor migrates this to `auto`).
- `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
-- `provider`: speech provider id such as `"elevenlabs"`, `"google"`, `"gradium"`, `"microsoft"`, `"minimax"`, `"openai"`, `"vydra"`, `"xai"`, or `"xiaomi"` (fallback is automatic).
+- `provider`: speech provider id such as `"elevenlabs"`, `"google"`, `"gradium"`, `"inworld"`, `"microsoft"`, `"minimax"`, `"openai"`, `"vydra"`, `"xai"`, or `"xiaomi"` (fallback is automatic).
- If `provider` is **unset**, OpenClaw uses the first configured speech provider in registry auto-select order.
- Legacy `provider: "edge"` config is repaired by `openclaw doctor --fix` and
rewritten to `provider: "microsoft"`.
@@ -429,7 +461,7 @@ Then run:
- `maxTextLength`: hard cap for TTS input (chars). `/tts audio` fails if exceeded.
- `timeoutMs`: request timeout (ms).
- `prefsPath`: override the local prefs JSON path (provider/limit/summary).
-- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `GEMINI_API_KEY`/`GOOGLE_API_KEY`, `GRADIUM_API_KEY`, `MINIMAX_API_KEY`, `OPENAI_API_KEY`, `VYDRA_API_KEY`, `XAI_API_KEY`, `XIAOMI_API_KEY`).
+- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `GEMINI_API_KEY`/`GOOGLE_API_KEY`, `GRADIUM_API_KEY`, `INWORLD_API_KEY`, `MINIMAX_API_KEY`, `OPENAI_API_KEY`, `VYDRA_API_KEY`, `XAI_API_KEY`, `XIAOMI_API_KEY`).
- `providers.elevenlabs.baseUrl`: override ElevenLabs API base URL.
- `providers.openai.baseUrl`: override the OpenAI TTS endpoint.
- Resolution order: `messages.tts.providers.openai.baseUrl` -> `OPENAI_TTS_BASE_URL` -> `https://api.openai.com/v1`
@@ -453,6 +485,10 @@ Then run:
- `providers.tts-local-cli.timeoutMs`: command timeout in milliseconds (default `120000`).
- `providers.tts-local-cli.cwd`: optional command working directory.
- `providers.tts-local-cli.env`: optional string environment overrides for the command.
+- `providers.inworld.baseUrl`: override Inworld API base URL (default `https://api.inworld.ai`).
+- `providers.inworld.voiceId`: Inworld voice identifier (default `Sarah`).
+- `providers.inworld.modelId`: Inworld TTS model (default `inworld-tts-1.5-max`; also supports `inworld-tts-1.5-mini`, `inworld-tts-1-max`, `inworld-tts-1`).
+- `providers.inworld.temperature`: sampling temperature `0..2` (optional).
- `providers.google.model`: Gemini TTS model (default `gemini-3.1-flash-tts-preview`).
- `providers.google.voiceName`: Gemini prebuilt voice name (default `Kore`; `voice` is also accepted).
- `providers.google.audioProfile`: natural-language style prompt prepended before the spoken text.
@@ -586,6 +622,7 @@ These override `messages.tts.*` for that host.
with `ffmpeg`.
- **Google Gemini**: Gemini API TTS returns raw 24kHz PCM. OpenClaw wraps it as WAV for audio attachments, transcodes it to 48kHz Opus for voice-note targets, and returns PCM directly for Talk/telephony.
- **Gradium**: WAV for audio attachments, Opus for voice-note targets, and `ulaw_8000` at 8 kHz for telephony.
+- **Inworld**: MP3 for normal audio attachments, native `OGG_OPUS` for voice-note targets, and raw `PCM` at 22050 Hz for Talk/telephony.
- **xAI**: MP3 by default; `responseFormat` may be `mp3`, `wav`, `pcm`, `mulaw`, or `alaw`. OpenClaw uses xAI's batch REST TTS endpoint and returns a complete audio attachment; xAI's streaming TTS WebSocket is not used by this provider path. Native Opus voice-note format is not supported by this path.
- **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`).
- The bundled transport accepts an `outputFormat`, but not all formats are available from the service.
diff --git a/extensions/inworld/index.ts b/extensions/inworld/index.ts
new file mode 100644
index 00000000000..891e95109f6
--- /dev/null
+++ b/extensions/inworld/index.ts
@@ -0,0 +1,11 @@
+import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
+import { buildInworldSpeechProvider } from "./speech-provider.js";
+
+// Plugin entry point: declares the "inworld" plugin and, when registered,
+// adds the speech provider built by buildInworldSpeechProvider().
+export default definePluginEntry({
+ id: "inworld",
+ name: "Inworld Speech",
+ description: "Bundled Inworld speech provider",
+ register(api) {
+ api.registerSpeechProvider(buildInworldSpeechProvider());
+ },
+});
diff --git a/extensions/inworld/inworld.live.test.ts b/extensions/inworld/inworld.live.test.ts
new file mode 100644
index 00000000000..6cb3736f14b
--- /dev/null
+++ b/extensions/inworld/inworld.live.test.ts
@@ -0,0 +1,84 @@
+import { describe, expect, it } from "vitest";
+import { isLiveTestEnabled } from "../../src/agents/live-test-helpers.js";
+import {
+ registerProviderPlugin,
+ requireRegisteredProvider,
+} from "../../test/helpers/plugins/provider-registration.js";
+import plugin from "./index.js";
+
+// The live suite needs both the live-test switch and a non-empty credential;
+// otherwise the whole suite is registered as skipped.
+const INWORLD_API_KEY = (process.env.INWORLD_API_KEY ?? "").trim();
+const LIVE = isLiveTestEnabled() && INWORLD_API_KEY !== "";
+const describeLive = LIVE ? describe : describe.skip;
+
+/** Registers the Inworld plugin through the shared provider-registration test helper. */
+const registerInworldPlugin = () => {
+  const registration = { plugin, id: "inworld", name: "Inworld" };
+  return registerProviderPlugin(registration);
+};
+
+// Live suite: talks to the real Inworld API using INWORLD_API_KEY. It is
+// registered through describeLive, so it is skipped unless LIVE is true.
+describeLive("inworld plugin live", () => {
+ // Voice listing must return at least one voice and include the id "Sarah".
+ it("lists voices through the registered speech provider", async () => {
+ const { speechProviders } = await registerInworldPlugin();
+ const provider = requireRegisteredProvider(speechProviders, "inworld");
+
+ const voices = await provider.listVoices?.({
+ apiKey: INWORLD_API_KEY,
+ });
+
+ expect(voices?.length).toBeGreaterThan(0);
+ expect(voices).toEqual(expect.arrayContaining([expect.objectContaining({ id: "Sarah" })]));
+ }, 120_000);
+
+ // Exercises the three output paths in sequence with one shared provider config:
+ // audio-file (MP3), voice-note (Ogg/Opus) and telephony (raw PCM).
+ it("synthesizes MP3, native voice-note Ogg/Opus, and telephony PCM", async () => {
+ const { speechProviders } = await registerInworldPlugin();
+ const provider = requireRegisteredProvider(speechProviders, "inworld");
+ const providerConfig = {
+ apiKey: INWORLD_API_KEY,
+ voiceId: "Sarah",
+ modelId: "inworld-tts-1.5-max",
+ };
+
+ const audioFile = await provider.synthesize({
+ text: "OpenClaw Inworld text to speech integration test OK.",
+ cfg: { plugins: { enabled: true } } as never,
+ providerConfig,
+ target: "audio-file",
+ timeoutMs: 90_000,
+ });
+
+ expect(audioFile.outputFormat).toBe("mp3");
+ expect(audioFile.fileExtension).toBe(".mp3");
+ expect(audioFile.voiceCompatible).toBe(false);
+ expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(512);
+ // The payload must not begin with a "RIFF" header.
+ expect(audioFile.audioBuffer.subarray(0, 4).toString("ascii")).not.toBe("RIFF");
+
+ const voiceNote = await provider.synthesize({
+ text: "OpenClaw Inworld voice note integration test OK.",
+ cfg: { plugins: { enabled: true } } as never,
+ providerConfig,
+ target: "voice-note",
+ timeoutMs: 90_000,
+ });
+
+ expect(voiceNote.outputFormat).toBe("ogg_opus");
+ expect(voiceNote.fileExtension).toBe(".ogg");
+ expect(voiceNote.voiceCompatible).toBe(true);
+ expect(voiceNote.audioBuffer.byteLength).toBeGreaterThan(128);
+ // The first four bytes must be the "OggS" marker.
+ expect(voiceNote.audioBuffer.subarray(0, 4).toString("ascii")).toBe("OggS");
+
+ // synthesizeTelephony is optional on the provider, hence the optional call
+ // followed by an explicit failure when nothing comes back.
+ const telephony = await provider.synthesizeTelephony?.({
+ text: "OpenClaw Inworld telephony check OK.",
+ cfg: { plugins: { enabled: true } } as never,
+ providerConfig,
+ timeoutMs: 90_000,
+ });
+ if (!telephony) {
+ throw new Error("Inworld telephony synthesis did not return audio");
+ }
+ expect(telephony.outputFormat).toBe("pcm");
+ expect(telephony.sampleRate).toBe(22_050);
+ expect(telephony.audioBuffer.byteLength).toBeGreaterThan(512);
+ expect(telephony.audioBuffer.subarray(0, 4).toString("ascii")).not.toBe("RIFF");
+ }, 180_000);
+});
diff --git a/extensions/inworld/openclaw.plugin.json b/extensions/inworld/openclaw.plugin.json
new file mode 100644
index 00000000000..3b64bd56715
--- /dev/null
+++ b/extensions/inworld/openclaw.plugin.json
@@ -0,0 +1,40 @@
+{
+ "id": "inworld",
+ "enabledByDefault": true,
+ "name": "Inworld",
+ "description": "Inworld streaming text-to-speech (MP3, OGG_OPUS, PCM telephony).",
+ "providerAuthEnvVars": {
+ "inworld": ["INWORLD_API_KEY"]
+ },
+ "contracts": {
+ "speechProviders": ["inworld"]
+ },
+ "configSchema": {
+ "type": "object",
+ "additionalProperties": false,
+ "properties": {
+ "apiKey": {
+ "type": "string",
+ "description": "Inworld API key. Must be the Base64 credential string from the Inworld dashboard (used as Authorization: Basic <apiKey>). Falls back to INWORLD_API_KEY env var."
+ },
+ "baseUrl": {
+ "type": "string",
+ "description": "Override Inworld API base URL (default https://api.inworld.ai)."
+ },
+ "voiceId": {
+ "type": "string",
+ "description": "Voice identifier (default Sarah)."
+ },
+ "modelId": {
+ "type": "string",
+ "description": "TTS model id (default inworld-tts-1.5-max)."
+ },
+ "temperature": {
+ "type": "number",
+ "minimum": 0,
+ "maximum": 2,
+ "description": "Sampling temperature 0..2."
+ }
+ }
+ }
+}
diff --git a/extensions/inworld/package.json b/extensions/inworld/package.json
new file mode 100644
index 00000000000..51c68d55d10
--- /dev/null
+++ b/extensions/inworld/package.json
@@ -0,0 +1,15 @@
+{
+ "name": "@openclaw/inworld-speech",
+ "version": "2026.4.16",
+ "private": true,
+ "description": "OpenClaw Inworld speech plugin",
+ "type": "module",
+ "devDependencies": {
+ "@openclaw/plugin-sdk": "workspace:*"
+ },
+ "openclaw": {
+ "extensions": [
+ "./index.ts"
+ ]
+ }
+}
diff --git a/extensions/inworld/speech-provider.test.ts b/extensions/inworld/speech-provider.test.ts
new file mode 100644
index 00000000000..2bbd401b5a6
--- /dev/null
+++ b/extensions/inworld/speech-provider.test.ts
@@ -0,0 +1,213 @@
+import { afterEach, describe, expect, it, vi } from "vitest";
+
+// The mock functions are created through vi.hoisted so they can be referenced
+// from inside the vi.mock factory below.
+const { inworldTTSMock, listInworldVoicesMock } = vi.hoisted(() => ({
+  inworldTTSMock: vi.fn(),
+  listInworldVoicesMock: vi.fn(),
+}));
+
+// Swap only the two network-facing functions of ./tts.js; every other export
+// (constants, helpers) is passed through from the real module.
+vi.mock("./tts.js", async (importOriginal) => {
+  // The explicit type argument gives `actual` the module's shape; without it
+  // the result is untyped and cannot be spread under strict type checking.
+  const actual = await importOriginal<typeof import("./tts.js")>();
+  return {
+    ...actual,
+    inworldTTS: inworldTTSMock,
+    listInworldVoices: listInworldVoicesMock,
+  };
+});
+
+import { buildInworldSpeechProvider } from "./speech-provider.js";
+
+// Unit tests for the provider object returned by buildInworldSpeechProvider().
+// The ./tts.js network functions are mocked, so no request leaves the process.
+describe("buildInworldSpeechProvider", () => {
+  const originalEnv = process.env.INWORLD_API_KEY;
+
+  afterEach(() => {
+    // process.env converts assigned values to strings, so writing `undefined`
+    // back would leave the literal text "undefined" in the variable. Delete it
+    // when it was not set before the suite started.
+    if (originalEnv === undefined) {
+      delete process.env.INWORLD_API_KEY;
+    } else {
+      process.env.INWORLD_API_KEY = originalEnv;
+    }
+    inworldTTSMock.mockReset();
+    listInworldVoicesMock.mockReset();
+    vi.restoreAllMocks();
+  });
+
+  it("reports configured when INWORLD_API_KEY env var is set", () => {
+    process.env.INWORLD_API_KEY = "test-key";
+    const provider = buildInworldSpeechProvider();
+    expect(
+      provider.isConfigured({
+        providerConfig: {},
+        timeoutMs: 30_000,
+      }),
+    ).toBe(true);
+  });
+
+  it("reports configured when providerConfig apiKey is set", () => {
+    delete process.env.INWORLD_API_KEY;
+    const provider = buildInworldSpeechProvider();
+    expect(
+      provider.isConfigured({
+        providerConfig: { apiKey: "config-key" },
+        timeoutMs: 30_000,
+      }),
+    ).toBe(true);
+  });
+
+  it("reports not configured when no key is available", () => {
+    delete process.env.INWORLD_API_KEY;
+    const provider = buildInworldSpeechProvider();
+    expect(
+      provider.isConfigured({
+        providerConfig: {},
+        timeoutMs: 30_000,
+      }),
+    ).toBe(false);
+  });
+
+  it("has correct provider metadata", () => {
+    const provider = buildInworldSpeechProvider();
+    expect(provider.id).toBe("inworld");
+    expect(provider.label).toBe("Inworld");
+    expect(provider.autoSelectOrder).toBe(30);
+    expect(provider.models).toContain("inworld-tts-1.5-max");
+    expect(provider.models).toContain("inworld-tts-1.5-mini");
+  });
+
+  it("normalizes provider-owned speech config from raw provider config", () => {
+    const provider = buildInworldSpeechProvider();
+    const resolved = provider.resolveConfig?.({
+      cfg: {} as never,
+      timeoutMs: 30_000,
+      rawConfig: {
+        providers: {
+          inworld: {
+            apiKey: "basic-key",
+            baseUrl: "https://custom.inworld.example.com/",
+            voiceId: "Ashley",
+            modelId: "inworld-tts-1.5-mini",
+            temperature: 0.8,
+          },
+        },
+      },
+    });
+
+    // The trailing slash on baseUrl is expected to be removed by normalization.
+    expect(resolved).toEqual({
+      apiKey: "basic-key",
+      baseUrl: "https://custom.inworld.example.com",
+      voiceId: "Ashley",
+      modelId: "inworld-tts-1.5-mini",
+      temperature: 0.8,
+    });
+  });
+
+  it("parses Inworld TTS directive overrides", () => {
+    const provider = buildInworldSpeechProvider();
+    const policy = {
+      enabled: true,
+      allowText: true,
+      allowProvider: true,
+      allowVoice: true,
+      allowModelId: true,
+      allowVoiceSettings: true,
+      allowNormalization: true,
+      allowSeed: true,
+    };
+
+    expect(provider.parseDirectiveToken?.({ key: "voice", value: "Ashley", policy })).toEqual({
+      handled: true,
+      overrides: { voiceId: "Ashley" },
+    });
+    expect(
+      provider.parseDirectiveToken?.({
+        key: "model",
+        value: "inworld-tts-1.5-mini",
+        policy,
+      }),
+    ).toEqual({
+      handled: true,
+      overrides: { modelId: "inworld-tts-1.5-mini" },
+    });
+    expect(provider.parseDirectiveToken?.({ key: "temperature", value: "0.7", policy })).toEqual({
+      handled: true,
+      overrides: { temperature: 0.7 },
+    });
+  });
+
+  it("warns on invalid directive temperature", () => {
+    const provider = buildInworldSpeechProvider();
+    expect(
+      provider.parseDirectiveToken?.({
+        key: "temperature",
+        value: "3",
+        policy: {
+          enabled: true,
+          allowText: true,
+          allowProvider: true,
+          allowVoice: true,
+          allowModelId: true,
+          allowVoiceSettings: true,
+          allowNormalization: true,
+          allowSeed: true,
+        },
+      }),
+    ).toEqual({
+      handled: true,
+      warnings: ['invalid Inworld temperature "3"'],
+    });
+  });
+
+  it("synthesizes voice-note targets with native OGG_OPUS output", async () => {
+    inworldTTSMock.mockResolvedValueOnce(Buffer.from("opus"));
+    const provider = buildInworldSpeechProvider();
+
+    // Overrides use the `voice` / `model` alias keys on purpose.
+    const result = await provider.synthesize?.({
+      text: "Hello",
+      cfg: {} as never,
+      providerConfig: { apiKey: "key", voiceId: "Sarah", modelId: "inworld-tts-1.5-max" },
+      providerOverrides: { voice: "Ashley", model: "inworld-tts-1.5-mini", temperature: 0.6 },
+      target: "voice-note",
+      timeoutMs: 30_000,
+    });
+
+    expect(inworldTTSMock).toHaveBeenCalledWith({
+      text: "Hello",
+      apiKey: "key",
+      baseUrl: "https://api.inworld.ai",
+      voiceId: "Ashley",
+      modelId: "inworld-tts-1.5-mini",
+      audioEncoding: "OGG_OPUS",
+      temperature: 0.6,
+      timeoutMs: 30_000,
+    });
+    expect(result).toEqual({
+      audioBuffer: Buffer.from("opus"),
+      outputFormat: "ogg_opus",
+      fileExtension: ".ogg",
+      voiceCompatible: true,
+    });
+  });
+
+  it("synthesizes telephony PCM at 22050 Hz", async () => {
+    inworldTTSMock.mockResolvedValueOnce(Buffer.from("pcm"));
+    const provider = buildInworldSpeechProvider();
+
+    const result = await provider.synthesizeTelephony?.({
+      text: "Hello",
+      cfg: {} as never,
+      providerConfig: { apiKey: "key", voiceId: "Sarah", modelId: "inworld-tts-1.5-max" },
+      timeoutMs: 30_000,
+    });
+
+    expect(inworldTTSMock).toHaveBeenCalledWith({
+      text: "Hello",
+      apiKey: "key",
+      baseUrl: "https://api.inworld.ai",
+      voiceId: "Sarah",
+      modelId: "inworld-tts-1.5-max",
+      audioEncoding: "PCM",
+      sampleRateHertz: 22_050,
+      temperature: undefined,
+      timeoutMs: 30_000,
+    });
+    expect(result).toEqual({
+      audioBuffer: Buffer.from("pcm"),
+      outputFormat: "pcm",
+      sampleRate: 22_050,
+    });
+  });
+});
diff --git a/extensions/inworld/speech-provider.ts b/extensions/inworld/speech-provider.ts
new file mode 100644
index 00000000000..f9c28a91e46
--- /dev/null
+++ b/extensions/inworld/speech-provider.ts
@@ -0,0 +1,221 @@
+import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
+import type {
+ SpeechDirectiveTokenParseContext,
+ SpeechProviderConfig,
+ SpeechProviderOverrides,
+ SpeechProviderPlugin,
+} from "openclaw/plugin-sdk/speech-core";
+import { asFiniteNumber, asObject, trimToUndefined } from "openclaw/plugin-sdk/speech-core";
+import {
+ DEFAULT_INWORLD_MODEL_ID,
+ DEFAULT_INWORLD_VOICE_ID,
+ type InworldAudioEncoding,
+ INWORLD_TTS_MODELS,
+ inworldTTS,
+ listInworldVoices,
+ normalizeInworldBaseUrl,
+} from "./tts.js";
+
+// Normalized Inworld provider settings used inside this module. baseUrl,
+// voiceId and modelId are always present; apiKey and temperature are optional.
+type InworldProviderConfig = {
+ apiKey?: string;
+ baseUrl: string;
+ voiceId: string;
+ modelId: string;
+ temperature?: number;
+};
+
+// Per-request overrides that may replace the configured voice, model or
+// temperature; every field is optional.
+type InworldProviderOverrides = {
+ voiceId?: string;
+ modelId?: string;
+ temperature?: number;
+};
+
+/**
+ * Builds the normalized Inworld config from a raw TTS config object.
+ *
+ * The provider section is read from `rawConfig.providers.inworld` first and
+ * from `rawConfig.inworld` otherwise. A missing voice or model id falls back to
+ * DEFAULT_INWORLD_VOICE_ID / DEFAULT_INWORLD_MODEL_ID, and the base URL is
+ * always passed through normalizeInworldBaseUrl.
+ *
+ * @param rawConfig Untyped config object; unknown keys are ignored.
+ * @returns The normalized provider config.
+ */
+function normalizeInworldProviderConfig(rawConfig: Record<string, unknown>): InworldProviderConfig {
+  const providers = asObject(rawConfig.providers);
+  const raw = asObject(providers?.inworld) ?? asObject(rawConfig.inworld);
+  return {
+    apiKey: normalizeResolvedSecretInputString({
+      value: raw?.apiKey,
+      path: "messages.tts.providers.inworld.apiKey",
+    }),
+    baseUrl: normalizeInworldBaseUrl(trimToUndefined(raw?.baseUrl)),
+    voiceId: trimToUndefined(raw?.voiceId) ?? DEFAULT_INWORLD_VOICE_ID,
+    modelId: trimToUndefined(raw?.modelId) ?? DEFAULT_INWORLD_MODEL_ID,
+    temperature: asFiniteNumber(raw?.temperature),
+  };
+}
+
+/**
+ * Reads a provider config object handed to the provider at call time and fills
+ * every missing field from the defaults produced by
+ * normalizeInworldProviderConfig({}).
+ */
+function readInworldProviderConfig(config: SpeechProviderConfig): InworldProviderConfig {
+  const fallback = normalizeInworldProviderConfig({});
+
+  const apiKey = trimToUndefined(config.apiKey) ?? fallback.apiKey;
+  const requestedBaseUrl = trimToUndefined(config.baseUrl) ?? fallback.baseUrl;
+  const voiceId = trimToUndefined(config.voiceId) ?? fallback.voiceId;
+  const modelId = trimToUndefined(config.modelId) ?? fallback.modelId;
+  const temperature = asFiniteNumber(config.temperature) ?? fallback.temperature;
+
+  return {
+    apiKey,
+    baseUrl: normalizeInworldBaseUrl(requestedBaseUrl),
+    voiceId,
+    modelId,
+    temperature,
+  };
+}
+
+/**
+ * Extracts the Inworld-relevant overrides from a generic override bag.
+ * `voiceId` / `modelId` take precedence over the `voice` / `model` aliases.
+ * Returns an empty object when no overrides were supplied.
+ */
+function readInworldOverrides(
+  overrides: SpeechProviderOverrides | undefined,
+): InworldProviderOverrides {
+  if (!overrides) {
+    return {};
+  }
+
+  const requestedVoice = overrides.voiceId ?? overrides.voice;
+  const requestedModel = overrides.modelId ?? overrides.model;
+  return {
+    voiceId: trimToUndefined(requestedVoice),
+    modelId: trimToUndefined(requestedModel),
+    temperature: asFiniteNumber(overrides.temperature),
+  };
+}
+
+/**
+ * Interprets a single TTS directive token for the Inworld provider.
+ *
+ * Recognized keys are the voice aliases, the model aliases and `temperature`.
+ * A recognized key is always reported as handled; when the matching policy
+ * flag is off it is handled without producing an override.
+ *
+ * @param ctx Directive key, raw string value and the active policy flags.
+ * @returns `handled: false` for unknown keys, otherwise `handled: true` plus
+ *   either an override or a warning for an invalid temperature.
+ */
+function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
+  handled: boolean;
+  overrides?: SpeechProviderOverrides;
+  warnings?: string[];
+} {
+  switch (ctx.key) {
+    case "voice":
+    case "voiceid":
+    case "voice_id":
+    case "inworld_voice":
+    case "inworldvoice":
+      if (!ctx.policy.allowVoice) {
+        return { handled: true };
+      }
+      return { handled: true, overrides: { voiceId: ctx.value } };
+    case "model":
+    case "modelid":
+    case "model_id":
+    case "inworld_model":
+    case "inworldmodel":
+      if (!ctx.policy.allowModelId) {
+        return { handled: true };
+      }
+      return { handled: true, overrides: { modelId: ctx.value } };
+    case "temperature": {
+      if (!ctx.policy.allowVoiceSettings) {
+        return { handled: true };
+      }
+      // Number("") and Number("  ") evaluate to 0, which would pass the range
+      // check below; treat a blank value as invalid instead of as 0.
+      const rawValue = ctx.value.trim();
+      const temperature = rawValue === "" ? Number.NaN : Number(rawValue);
+      if (!Number.isFinite(temperature) || temperature < 0 || temperature > 2) {
+        return { handled: true, warnings: [`invalid Inworld temperature "${ctx.value}"`] };
+      }
+      return { handled: true, overrides: { temperature } };
+    }
+    default:
+      return { handled: false };
+  }
+}
+
+/**
+ * Creates the Inworld speech provider plugin object.
+ *
+ * The provider resolves its settings from `messages.tts` style config, accepts
+ * directive and talk-mode overrides, lists voices, and synthesizes audio as
+ * MP3 (default), OGG_OPUS (voice-note target) or PCM at 22050 Hz (telephony).
+ * The API key comes from config first and from INWORLD_API_KEY otherwise.
+ */
+export function buildInworldSpeechProvider(): SpeechProviderPlugin {
+  // Shared credential lookup: the candidate wins, the env var is the fallback.
+  const requireApiKey = (candidate: string | undefined): string => {
+    const apiKey = candidate || process.env.INWORLD_API_KEY;
+    if (!apiKey) {
+      throw new Error("Inworld API key missing");
+    }
+    return apiKey;
+  };
+
+  return {
+    id: "inworld",
+    label: "Inworld",
+    autoSelectOrder: 30,
+    models: INWORLD_TTS_MODELS,
+    resolveConfig: ({ rawConfig }) => normalizeInworldProviderConfig(rawConfig),
+    parseDirectiveToken,
+    resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
+      // Start from the normalized base TTS config and layer talk settings on top,
+      // field by field, only where the talk config actually provides a value.
+      const resolved: InworldProviderConfig = normalizeInworldProviderConfig(baseTtsConfig);
+
+      if (talkProviderConfig.apiKey !== undefined) {
+        const talkApiKey = normalizeResolvedSecretInputString({
+          value: talkProviderConfig.apiKey,
+          path: "talk.providers.inworld.apiKey",
+        });
+        if (talkApiKey !== undefined) {
+          resolved.apiKey = talkApiKey;
+        }
+      }
+
+      const talkBaseUrl = trimToUndefined(talkProviderConfig.baseUrl);
+      if (talkBaseUrl != null) {
+        resolved.baseUrl = normalizeInworldBaseUrl(talkBaseUrl);
+      }
+
+      const talkVoiceId = trimToUndefined(talkProviderConfig.voiceId);
+      if (talkVoiceId != null) {
+        resolved.voiceId = talkVoiceId;
+      }
+
+      const talkModelId = trimToUndefined(talkProviderConfig.modelId);
+      if (talkModelId != null) {
+        resolved.modelId = talkModelId;
+      }
+
+      const talkTemperature = asFiniteNumber(talkProviderConfig.temperature);
+      if (talkTemperature != null) {
+        resolved.temperature = talkTemperature;
+      }
+
+      return resolved;
+    },
+    resolveTalkOverrides: ({ params }) => {
+      // Only keys with a usable value are added, so absent params stay absent.
+      const talkOverrides: InworldProviderOverrides = {};
+
+      const voiceId = trimToUndefined(params.voiceId);
+      if (voiceId != null) {
+        talkOverrides.voiceId = voiceId;
+      }
+
+      const modelId = trimToUndefined(params.modelId);
+      if (modelId != null) {
+        talkOverrides.modelId = modelId;
+      }
+
+      const temperature = asFiniteNumber(params.temperature);
+      if (temperature != null) {
+        talkOverrides.temperature = temperature;
+      }
+
+      return talkOverrides;
+    },
+    listVoices: async (req) => {
+      const config = req.providerConfig ? readInworldProviderConfig(req.providerConfig) : undefined;
+      // Request-level key first, then the configured key, then the env var.
+      const apiKey = requireApiKey(req.apiKey || config?.apiKey);
+      const baseUrl = req.baseUrl ?? config?.baseUrl;
+      return listInworldVoices({ apiKey, baseUrl });
+    },
+    isConfigured: ({ providerConfig }) => {
+      const { apiKey } = readInworldProviderConfig(providerConfig);
+      return Boolean(apiKey || process.env.INWORLD_API_KEY);
+    },
+    synthesize: async (req) => {
+      const config = readInworldProviderConfig(req.providerConfig);
+      const overrides = readInworldOverrides(req.providerOverrides);
+      const apiKey = requireApiKey(config.apiKey);
+
+      // Voice notes are requested as OGG_OPUS; every other target gets MP3.
+      const isVoiceNote = req.target === "voice-note";
+      const audioEncoding: InworldAudioEncoding = isVoiceNote ? "OGG_OPUS" : "MP3";
+
+      const audioBuffer = await inworldTTS({
+        text: req.text,
+        apiKey,
+        baseUrl: config.baseUrl,
+        voiceId: overrides.voiceId ?? config.voiceId,
+        modelId: overrides.modelId ?? config.modelId,
+        audioEncoding,
+        temperature: overrides.temperature ?? config.temperature,
+        timeoutMs: req.timeoutMs,
+      });
+
+      return {
+        audioBuffer,
+        outputFormat: audioEncoding.toLowerCase(),
+        fileExtension: isVoiceNote ? ".ogg" : ".mp3",
+        voiceCompatible: isVoiceNote,
+      };
+    },
+    synthesizeTelephony: async (req) => {
+      const config = readInworldProviderConfig(req.providerConfig);
+      const apiKey = requireApiKey(config.apiKey);
+
+      // Telephony output is raw PCM at a fixed 22050 Hz; only configured
+      // voice/model/temperature are used on this path.
+      const telephonySampleRate = 22_050;
+      const audioBuffer = await inworldTTS({
+        text: req.text,
+        apiKey,
+        baseUrl: config.baseUrl,
+        voiceId: config.voiceId,
+        modelId: config.modelId,
+        audioEncoding: "PCM",
+        sampleRateHertz: telephonySampleRate,
+        temperature: config.temperature,
+        timeoutMs: req.timeoutMs,
+      });
+
+      return { audioBuffer, outputFormat: "pcm", sampleRate: telephonySampleRate };
+    },
+  };
+}
diff --git a/extensions/inworld/tsconfig.json b/extensions/inworld/tsconfig.json
new file mode 100644
index 00000000000..b8a85a99ac3
--- /dev/null
+++ b/extensions/inworld/tsconfig.json
@@ -0,0 +1,16 @@
+{
+ "extends": "../tsconfig.package-boundary.base.json",
+ "compilerOptions": {
+ "rootDir": "."
+ },
+ "include": ["./*.ts", "./src/**/*.ts"],
+ "exclude": [
+ "./**/*.test.ts",
+ "./dist/**",
+ "./node_modules/**",
+ "./src/test-support/**",
+ "./src/**/*test-helpers.ts",
+ "./src/**/*test-harness.ts",
+ "./src/**/*test-support.ts"
+ ]
+}
diff --git a/extensions/inworld/tts.test.ts b/extensions/inworld/tts.test.ts
new file mode 100644
index 00000000000..f3fadeddfe7
--- /dev/null
+++ b/extensions/inworld/tts.test.ts
@@ -0,0 +1,312 @@
+import { afterEach, describe, expect, it, vi } from "vitest";
+
+const { fetchWithSsrFGuardMock } = vi.hoisted(() => ({ // built inside vi.hoisted so the vi.mock factory below can reference it
+ fetchWithSsrFGuardMock: vi.fn(),
+}));
+
+vi.mock("openclaw/plugin-sdk/ssrf-runtime", async (importOriginal) => { // partial mock of the SSRF runtime module
+ const actual = await importOriginal();
+ return {
+ ...actual, // every other export keeps its original implementation
+ fetchWithSsrFGuard: fetchWithSsrFGuardMock, // only the guarded fetch is replaced
+ };
+});
+
+import { inworldTTS, listInworldVoices } from "./tts.js";
+
+type GuardRequest = { // fields the tests read from the first argument passed to the fetchWithSsrFGuard mock
+ url: string;
+ init?: RequestInit;
+ auditContext?: string;
+ policy?: unknown; // kept opaque; the tests only compare it with toEqual
+ timeoutMs?: number;
+};
+
+function queueGuardedResponse(response: Response): { release: ReturnType<typeof vi.fn> } {
+  const release = vi.fn(async () => {}); // spy handed back so tests can assert the dispatcher was released
+  fetchWithSsrFGuardMock.mockResolvedValueOnce({ response, release }); // served to the next guarded fetch only
+  return { release };
+}
+
+function lastGuardRequest(): GuardRequest {
+  const latestArgs = fetchWithSsrFGuardMock.mock.calls.at(-1); // undefined when the mock has no recorded calls
+  if (latestArgs !== undefined) {
+    return latestArgs[0] as GuardRequest; // the guard is invoked with a single options object
+  }
+  throw new Error("fetchWithSsrFGuard was not called");
+}
+
+function readRequestBody(request: GuardRequest): string {
+  const payload = request.init?.body; // undefined when the request carried no init or no body
+  if (typeof payload === "string") {
+    return payload;
+  }
+  throw new Error("expected request body to be a string");
+}
+
+describe("listInworldVoices", () => { // covers response mapping, error reporting, query building and dispatcher release
+ afterEach(() => {
+ fetchWithSsrFGuardMock.mockClear(); // drop recorded calls so lastGuardRequest() only sees the current test
+ vi.restoreAllMocks();
+ });
+
+ it("maps Inworld voice metadata into speech voice options", async () => {
+ queueGuardedResponse(
+ new Response(
+ JSON.stringify({
+ voices: [
+ {
+ voiceId: "Dennis",
+ displayName: "Dennis",
+ description: "Middle-aged man with a smooth, calm and friendly voice",
+ langCode: "EN_US",
+ tags: ["male", "middle-aged", "smooth", "calm", "friendly"], // only the "male"/"female" tag is surfaced, as gender
+ source: "SYSTEM", // not part of the mapped result asserted below
+ },
+ {
+ voiceId: "Ashley",
+ displayName: "Ashley",
+ description: "A warm, natural female voice",
+ langCode: "EN_US",
+ tags: ["female", "warm", "natural"],
+ source: "SYSTEM",
+ },
+ ],
+ }),
+ { status: 200 },
+ ),
+ );
+
+ const voices = await listInworldVoices({ apiKey: "test-key" });
+
+ expect(voices).toEqual([
+ {
+ id: "Dennis",
+ name: "Dennis",
+ description: "Middle-aged man with a smooth, calm and friendly voice",
+ locale: "EN_US",
+ gender: "male",
+ },
+ {
+ id: "Ashley",
+ name: "Ashley",
+ description: "A warm, natural female voice",
+ locale: "EN_US",
+ gender: "female",
+ },
+ ]);
+ const request = lastGuardRequest();
+ expect(request.url).toBe("https://api.inworld.ai/voices/v1/voices"); // default base URL, no query string
+ expect(request.auditContext).toBe("inworld-voices");
+ expect(request.policy).toEqual({ hostnameAllowlist: ["api.inworld.ai"] }); // allowlist holds only the base URL host
+ const headers = new Headers(request.init?.headers);
+ expect(headers.get("authorization")).toBe("Basic test-key"); // apiKey is sent verbatim after "Basic "
+ });
+
+ it("throws on API errors with response body", async () => {
+ queueGuardedResponse(new Response("service unavailable", { status: 503 }));
+
+ await expect(listInworldVoices({ apiKey: "test-key" })).rejects.toThrow(
+ "Inworld voices API error (503): service unavailable",
+ );
+ });
+
+ it("filters out voices with empty voiceId", async () => {
+ queueGuardedResponse(
+ new Response(
+ JSON.stringify({
+ voices: [
+ { voiceId: "", displayName: "Empty" }, // expected to be dropped
+ { voiceId: "Dennis", displayName: "Dennis" },
+ ],
+ }),
+ { status: 200 },
+ ),
+ );
+
+ const voices = await listInworldVoices({ apiKey: "test-key" });
+ expect(voices).toHaveLength(1);
+ expect(voices[0].id).toBe("Dennis");
+ });
+
+ it("returns empty array when no voices present", async () => {
+ queueGuardedResponse(new Response(JSON.stringify({}), { status: 200 })); // body without a "voices" key
+
+ const voices = await listInworldVoices({ apiKey: "test-key" });
+ expect(voices).toEqual([]);
+ });
+
+ it("passes language filter as query parameter", async () => {
+ queueGuardedResponse(new Response(JSON.stringify({ voices: [] }), { status: 200 }));
+
+ await listInworldVoices({ apiKey: "test-key", language: "EN_US" });
+
+ expect(lastGuardRequest().url).toBe("https://api.inworld.ai/voices/v1/voices?languages=EN_US");
+ });
+
+ it("releases the guarded dispatcher after success", async () => {
+ const { release } = queueGuardedResponse(
+ new Response(JSON.stringify({ voices: [] }), { status: 200 }),
+ );
+
+ await listInworldVoices({ apiKey: "test-key" });
+
+ expect(release).toHaveBeenCalledTimes(1);
+ });
+});
+
+describe("inworldTTS", () => { // covers chunk concatenation, error paths, request shape and dispatcher release
+ afterEach(() => {
+ fetchWithSsrFGuardMock.mockClear(); // drop recorded calls so lastGuardRequest() only sees the current test
+ vi.restoreAllMocks();
+ });
+
+ it("concatenates base64 audio chunks from streaming response", async () => {
+ const chunk1 = Buffer.from("audio-chunk-1").toString("base64");
+ const chunk2 = Buffer.from("audio-chunk-2").toString("base64");
+ const body = [
+ JSON.stringify({ result: { audioContent: chunk1 } }),
+ JSON.stringify({ result: { audioContent: chunk2 } }),
+ ].join("\n"); // newline-delimited JSON: one object per line
+
+ queueGuardedResponse(new Response(body, { status: 200 }));
+
+ const buffer = await inworldTTS({
+ text: "Hello world",
+ apiKey: "test-key",
+ });
+
+ expect(buffer).toEqual(
+ Buffer.concat([Buffer.from("audio-chunk-1"), Buffer.from("audio-chunk-2")]), // decoded chunks, in stream order
+ );
+ });
+
+ it("throws on HTTP errors with response body", async () => {
+ queueGuardedResponse(new Response("bad request body", { status: 400 }));
+
+ await expect(inworldTTS({ text: "test", apiKey: "test-key" })).rejects.toThrow(
+ "Inworld TTS API error (400): bad request body",
+ );
+ });
+
+ it("throws on in-stream errors", async () => {
+ const body = JSON.stringify({
+ error: { code: 3, message: "Invalid voice ID" },
+ });
+ queueGuardedResponse(new Response(body, { status: 200 })); // the error arrives inside a 200 response body
+
+ await expect(inworldTTS({ text: "test", apiKey: "test-key" })).rejects.toThrow(
+ "Inworld TTS stream error (3): Invalid voice ID",
+ );
+ });
+
+ it("throws on empty audio response", async () => {
+ const body = JSON.stringify({ result: { audioContent: "" } }); // empty audioContent yields no chunk
+ queueGuardedResponse(new Response(body, { status: 200 }));
+
+ await expect(inworldTTS({ text: "test", apiKey: "test-key" })).rejects.toThrow(
+ "Inworld TTS returned no audio data",
+ );
+ });
+
+ it("throws descriptive error on non-JSON line in stream", async () => {
+ queueGuardedResponse(new Response("Rate limited", { status: 200 })); // plain-text body on a 200 response
+
+ await expect(inworldTTS({ text: "test", apiKey: "test-key" })).rejects.toThrow(
+ "Inworld TTS stream parse error: unexpected non-JSON line:",
+ );
+ });
+
+ it("sends correct request body with defaults", async () => {
+ const chunk = Buffer.from("audio").toString("base64");
+ queueGuardedResponse(
+ new Response(JSON.stringify({ result: { audioContent: chunk } }), { status: 200 }),
+ );
+
+ await inworldTTS({ text: "Hello", apiKey: "test-key" });
+
+ const request = lastGuardRequest();
+ expect(request.url).toBe("https://api.inworld.ai/tts/v1/voice:stream");
+ expect(request.auditContext).toBe("inworld-tts");
+ expect(request.policy).toEqual({ hostnameAllowlist: ["api.inworld.ai"] });
+ expect(request.init?.method).toBe("POST");
+ const headers = new Headers(request.init?.headers);
+ expect(headers.get("authorization")).toBe("Basic test-key"); // apiKey is sent verbatim after "Basic "
+ expect(headers.get("content-type")).toBe("application/json");
+ expect(JSON.parse(readRequestBody(request))).toEqual({ // exact match: no temperature or sampleRateHertz keys by default
+ text: "Hello",
+ voiceId: "Sarah",
+ modelId: "inworld-tts-1.5-max",
+ audioConfig: { audioEncoding: "MP3" },
+ });
+ });
+
+ it("includes temperature and sampleRateHertz when provided", async () => {
+ const chunk = Buffer.from("audio").toString("base64");
+ queueGuardedResponse(
+ new Response(JSON.stringify({ result: { audioContent: chunk } }), { status: 200 }),
+ );
+
+ await inworldTTS({
+ text: "Hello",
+ apiKey: "test-key",
+ voiceId: "Ashley",
+ modelId: "inworld-tts-1.5-mini",
+ audioEncoding: "PCM",
+ sampleRateHertz: 22_050,
+ temperature: 0.8,
+ });
+
+ const callBody = JSON.parse(readRequestBody(lastGuardRequest()));
+ expect(callBody.voiceId).toBe("Ashley");
+ expect(callBody.modelId).toBe("inworld-tts-1.5-mini");
+ expect(callBody.audioConfig.audioEncoding).toBe("PCM");
+ expect(callBody.audioConfig.sampleRateHertz).toBe(22_050); // nested under audioConfig
+ expect(callBody.temperature).toBe(0.8); // top-level field of the request body
+ });
+
+ it("uses custom base URL", async () => {
+ const chunk = Buffer.from("audio").toString("base64");
+ queueGuardedResponse(
+ new Response(JSON.stringify({ result: { audioContent: chunk } }), { status: 200 }),
+ );
+
+ await inworldTTS({
+ text: "Hello",
+ apiKey: "test-key",
+ baseUrl: "https://custom.inworld.example.com/", // trailing slash must not produce a double slash in the URL
+ });
+
+ expect(lastGuardRequest().url).toBe("https://custom.inworld.example.com/tts/v1/voice:stream");
+ expect(lastGuardRequest().policy).toEqual({
+ hostnameAllowlist: ["custom.inworld.example.com"], // the allowlist follows the custom host
+ });
+ });
+
+ it("skips empty lines in streaming response", async () => {
+ const chunk = Buffer.from("audio").toString("base64");
+ const body = `\n${JSON.stringify({ result: { audioContent: chunk } })}\n\n`; // blank lines before and after the payload
+ queueGuardedResponse(new Response(body, { status: 200 }));
+
+ const buffer = await inworldTTS({ text: "test", apiKey: "test-key" });
+ expect(buffer).toEqual(Buffer.from("audio"));
+ });
+
+ it("releases the guarded dispatcher after success", async () => {
+ const chunk = Buffer.from("audio").toString("base64");
+ const { release } = queueGuardedResponse(
+ new Response(JSON.stringify({ result: { audioContent: chunk } }), { status: 200 }),
+ );
+
+ await inworldTTS({ text: "test", apiKey: "test-key" });
+
+ expect(release).toHaveBeenCalledTimes(1);
+ });
+
+ it("releases the guarded dispatcher after failure", async () => {
+ const { release } = queueGuardedResponse(new Response("fail", { status: 500 }));
+
+ await expect(inworldTTS({ text: "test", apiKey: "test-key" })).rejects.toThrow();
+ expect(release).toHaveBeenCalledTimes(1); // release must run on the error path too
+ });
+});
diff --git a/extensions/inworld/tts.ts b/extensions/inworld/tts.ts
new file mode 100644
index 00000000000..e5009d1e8b5
--- /dev/null
+++ b/extensions/inworld/tts.ts
@@ -0,0 +1,190 @@
+import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech-core";
+import { fetchWithSsrFGuard, type SsrFPolicy } from "openclaw/plugin-sdk/ssrf-runtime";
+
+export const DEFAULT_INWORLD_BASE_URL = "https://api.inworld.ai"; // returned by normalizeInworldBaseUrl when no base URL is given
+export const DEFAULT_INWORLD_VOICE_ID = "Sarah"; // voiceId inworldTTS sends when params.voiceId is nullish
+export const DEFAULT_INWORLD_MODEL_ID = "inworld-tts-1.5-max"; // modelId inworldTTS sends when params.modelId is nullish
+
+export const INWORLD_TTS_MODELS = [ // known model ids; includes DEFAULT_INWORLD_MODEL_ID
+ "inworld-tts-1.5-max",
+ "inworld-tts-1.5-mini",
+ "inworld-tts-1-max",
+ "inworld-tts-1",
+] as const;
+
+export type InworldAudioEncoding = // values accepted for inworldTTS's audioEncoding parameter
+ | "MP3" // used by inworldTTS when no encoding is passed
+ | "OGG_OPUS"
+ | "LINEAR16"
+ | "PCM"
+ | "WAV"
+ | "ALAW"
+ | "MULAW"
+ | "FLAC";
+
+// Trims whitespace and trailing slashes from baseUrl; a missing or empty result yields DEFAULT_INWORLD_BASE_URL.
+export function normalizeInworldBaseUrl(baseUrl?: string): string {
+  return (baseUrl ?? "").trim().replace(/\/+$/, "") || DEFAULT_INWORLD_BASE_URL;
+}
+
+// Allowlists only the base URL's own hostname; returns undefined when the URL is unparsable or not HTTP(S).
+function ssrfPolicyFromInworldBaseUrl(baseUrl: string): SsrFPolicy | undefined {
+  let target: URL;
+  try {
+    target = new URL(baseUrl);
+  } catch {
+    return undefined;
+  }
+  const isHttp = target.protocol === "http:" || target.protocol === "https:";
+  return isHttp ? { hostnameAllowlist: [target.hostname] } : undefined;
+}
+
+/**
+ * Calls the Inworld streaming TTS endpoint (`{baseUrl}/tts/v1/voice:stream`) and concatenates every
+ * audio chunk into one Buffer. The body is newline-delimited JSON; each line carries base64 audio in
+ * `result.audioContent`. Throws on a non-2xx status, a non-JSON line, an in-stream `error`, or no audio.
+ */
+export async function inworldTTS(params: {
+  text: string;
+  apiKey: string;
+  baseUrl?: string;
+  voiceId?: string;
+  modelId?: string;
+  audioEncoding?: InworldAudioEncoding;
+  sampleRateHertz?: number;
+  temperature?: number;
+  timeoutMs?: number;
+}): Promise<Buffer> {
+  const baseUrl = normalizeInworldBaseUrl(params.baseUrl);
+  const url = `${baseUrl}/tts/v1/voice:stream`;
+  const requestBody = JSON.stringify({
+    text: params.text,
+    voiceId: params.voiceId ?? DEFAULT_INWORLD_VOICE_ID,
+    modelId: params.modelId ?? DEFAULT_INWORLD_MODEL_ID,
+    audioConfig: {
+      audioEncoding: params.audioEncoding ?? "MP3",
+      ...(params.sampleRateHertz && { sampleRateHertz: params.sampleRateHertz }), // left out when unset or 0
+    },
+    ...(params.temperature != null && { temperature: params.temperature }), // 0 is sent; only null/undefined is left out
+  });
+
+  const { response, release } = await fetchWithSsrFGuard({
+    url,
+    init: {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        // apiKey is the Base64-encoded credential string copied from the
+        // Inworld dashboard; it is sent verbatim as the HTTP Basic
+        // credential. Do not Base64-encode it here, and do not normalize
+        // bearer-style tokens.
+        Authorization: `Basic ${params.apiKey}`,
+      },
+      body: requestBody,
+    },
+    timeoutMs: params.timeoutMs,
+    policy: ssrfPolicyFromInworldBaseUrl(baseUrl), // allowlists only the host of the normalized base URL
+    auditContext: "inworld-tts",
+  });
+
+  try {
+    if (!response.ok) {
+      const errorBody = await response.text().catch(() => ""); // best effort: an unreadable body becomes ""
+      throw new Error(`Inworld TTS API error (${response.status}): ${errorBody}`);
+    }
+
+    const body = await response.text(); // the whole stream is buffered before it is parsed
+    const chunks: Buffer[] = [];
+
+    for (const line of body.split("\n")) {
+      const trimmed = line.trim();
+      if (!trimmed) {
+        continue;
+      }
+
+      let parsed: {
+        result?: { audioContent?: string };
+        error?: { code?: number; message?: string };
+      };
+      try {
+        parsed = JSON.parse(trimmed) as typeof parsed;
+      } catch {
+        throw new Error(
+          `Inworld TTS stream parse error: unexpected non-JSON line: ${trimmed.slice(0, 80)}`,
+        );
+      }
+
+      if (parsed.error) {
+        throw new Error(`Inworld TTS stream error (${parsed.error.code}): ${parsed.error.message}`);
+      }
+
+      if (parsed.result?.audioContent) {
+        chunks.push(Buffer.from(parsed.result.audioContent, "base64"));
+      }
+    }
+
+    if (chunks.length === 0) {
+      throw new Error("Inworld TTS returned no audio data");
+    }
+
+    return Buffer.concat(chunks);
+  } finally {
+    await release();
+  }
+}
+
+/**
+ * Fetches the Inworld voice catalogue (`GET {baseUrl}/voices/v1/voices`, plus `?languages=<language>`
+ * when a language is given) and maps each entry to a SpeechVoiceOption. Entries whose trimmed `voiceId`
+ * is empty are dropped; a non-2xx response throws with the status code and response body.
+ */
+export async function listInworldVoices(params: {
+  apiKey: string;
+  baseUrl?: string;
+  language?: string;
+  timeoutMs?: number;
+}): Promise<SpeechVoiceOption[]> {
+  const baseUrl = normalizeInworldBaseUrl(params.baseUrl);
+  const langParam = params.language ? `?languages=${encodeURIComponent(params.language)}` : "";
+  const url = `${baseUrl}/voices/v1/voices${langParam}`;
+
+  const { response, release } = await fetchWithSsrFGuard({
+    url,
+    init: { method: "GET", headers: { Authorization: `Basic ${params.apiKey}` } }, // key sent verbatim, as in inworldTTS
+    timeoutMs: params.timeoutMs,
+    policy: ssrfPolicyFromInworldBaseUrl(baseUrl),
+    auditContext: "inworld-voices",
+  });
+
+  try {
+    if (!response.ok) {
+      const errorBody = await response.text().catch(() => "");
+      throw new Error(`Inworld voices API error (${response.status}): ${errorBody}`);
+    }
+
+    const json = (await response.json()) as {
+      voices?: Array<{
+        voiceId?: string;
+        displayName?: string;
+        description?: string;
+        langCode?: string;
+        tags?: string[];
+        source?: string;
+      }>;
+    };
+
+    return Array.isArray(json.voices)
+      ? json.voices
+          .map((voice) => ({
+            id: voice.voiceId?.trim() ?? "",
+            name: voice.displayName?.trim() || undefined,
+            description: voice.description?.trim() || undefined,
+            locale: voice.langCode || undefined,
+            gender: voice.tags?.find((t) => t === "male" || t === "female") || undefined, // first male/female tag
+          }))
+          .filter((voice) => voice.id.length > 0)
+      : [];
+  } finally {
+    await release();
+  }
+}
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 5ebedc4822b..ec1e96d2742 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -675,6 +675,12 @@ importers:
specifier: workspace:*
version: link:../../packages/plugin-sdk
+ extensions/inworld:
+ devDependencies:
+ '@openclaw/plugin-sdk':
+ specifier: workspace:*
+ version: link:../../packages/plugin-sdk
+
extensions/irc:
devDependencies:
'@openclaw/plugin-sdk':
diff --git a/src/channels/plugins/module-loader.test.ts b/src/channels/plugins/module-loader.test.ts
index 4ee33f12d26..66463bba47c 100644
--- a/src/channels/plugins/module-loader.test.ts
+++ b/src/channels/plugins/module-loader.test.ts
@@ -3,6 +3,7 @@ import os from "node:os";
import path from "node:path";
import { afterEach, describe, expect, it, vi } from "vitest";
import { importFreshModule } from "../../../test/helpers/import-fresh.ts";
+import { shouldExpectNativeJitiForJavaScriptTestRuntime } from "../../test-utils/jiti-runtime.js";
import {
isJavaScriptModulePath,
resolveCompiledBundledModulePath,
@@ -92,7 +93,7 @@ describe("channel plugin module loader helpers", () => {
expect(createJiti).not.toHaveBeenCalled();
});
- it("uses native Jiti import for Windows dist loads", async () => {
+ it("uses the runtime-supported Jiti boundary for Windows dist loads", async () => {
const createJiti = vi.fn(() => vi.fn(() => ({ ok: true })));
vi.doMock("jiti", () => ({
createJiti,
@@ -119,7 +120,7 @@ describe("channel plugin module loader helpers", () => {
expect(createJiti).toHaveBeenCalledWith(
expect.any(String),
expect.objectContaining({
- tryNative: true,
+ tryNative: shouldExpectNativeJitiForJavaScriptTestRuntime(),
}),
);
} finally {
diff --git a/src/cli/program/preaction.test.ts b/src/cli/program/preaction.test.ts
index 49342b01ab9..a57e08f7771 100644
--- a/src/cli/program/preaction.test.ts
+++ b/src/cli/program/preaction.test.ts
@@ -463,8 +463,8 @@ describe("registerPreActionHooks", () => {
});
await runPreAction({
- parseArgv: ["agents", "list"],
- processArgv: ["node", "openclaw", "agents", "list", "--json"],
+ parseArgv: ["message", "send"],
+ processArgv: ["node", "openclaw", "message", "send", "--json"],
});
expect(ensurePluginRegistryLoadedMock).toHaveBeenCalled();
diff --git a/src/plugin-sdk/facade-loader.test.ts b/src/plugin-sdk/facade-loader.test.ts
index 98e8de84680..05b6a0150b4 100644
--- a/src/plugin-sdk/facade-loader.test.ts
+++ b/src/plugin-sdk/facade-loader.test.ts
@@ -1,6 +1,7 @@
import fs from "node:fs";
import path from "node:path";
import { afterEach, describe, expect, it, vi } from "vitest";
+import { shouldExpectNativeJitiForJavaScriptTestRuntime } from "../test-utils/jiti-runtime.js";
import {
listImportedBundledPluginFacadeIds,
loadBundledPluginPublicSurfaceModuleSync,
@@ -126,7 +127,7 @@ describe("plugin-sdk facade loader", () => {
expect(listImportedFacadeRuntimeIds()).toEqual(["demo"]);
});
- it("uses native Jiti import for Windows dist facade loads", () => {
+ it("uses the runtime-supported Jiti boundary for Windows dist facade loads", () => {
const dir = createTempDirSync("openclaw-facade-loader-windows-dist-");
const bundledPluginsDir = path.join(dir, "dist");
fs.mkdirSync(path.join(bundledPluginsDir, "demo"), { recursive: true });
@@ -158,7 +159,7 @@ describe("plugin-sdk facade loader", () => {
expect(createJitiCalls[0]?.[0]).toEqual(expect.any(String));
expect(createJitiCalls[0]?.[1]).toEqual(
expect.objectContaining({
- tryNative: true,
+ tryNative: shouldExpectNativeJitiForJavaScriptTestRuntime(),
}),
);
} finally {
diff --git a/src/plugins/doctor-contract-registry.test.ts b/src/plugins/doctor-contract-registry.test.ts
index 44d0a3470f1..e51c3578fbe 100644
--- a/src/plugins/doctor-contract-registry.test.ts
+++ b/src/plugins/doctor-contract-registry.test.ts
@@ -1,6 +1,7 @@
import fs from "node:fs";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import { shouldExpectNativeJitiForJavaScriptTestRuntime } from "../test-utils/jiti-runtime.js";
import { cleanupTrackedTempDirs, makeTrackedTempDir } from "./test-helpers/fs-fixtures.js";
import {
getRegistryJitiMocks,
@@ -34,7 +35,7 @@ describe("doctor-contract-registry getJiti", () => {
clearPluginDoctorContractRegistryCache();
});
- it("uses native jiti loading on Windows for contract-api modules", () => {
+ it("uses the runtime-supported Jiti boundary on Windows for contract-api modules", () => {
const pluginRoot = makeTempDir();
fs.writeFileSync(path.join(pluginRoot, "contract-api.js"), "export default {};\n", "utf-8");
mocks.loadPluginManifestRegistry.mockReturnValue({
@@ -42,6 +43,7 @@ describe("doctor-contract-registry getJiti", () => {
diagnostics: [],
});
const platformSpy = vi.spyOn(process, "platform", "get").mockReturnValue("win32");
+ const expectedTryNative = shouldExpectNativeJitiForJavaScriptTestRuntime();
try {
listPluginDoctorLegacyConfigRules({
@@ -56,7 +58,7 @@ describe("doctor-contract-registry getJiti", () => {
expect(mocks.createJiti.mock.calls[0]?.[0]).toBe(path.join(pluginRoot, "contract-api.js"));
expect(mocks.createJiti.mock.calls[0]?.[1]).toEqual(
expect.objectContaining({
- tryNative: true,
+ tryNative: expectedTryNative,
}),
);
});
diff --git a/src/plugins/setup-registry.test.ts b/src/plugins/setup-registry.test.ts
index e2326258aaf..a332123705d 100644
--- a/src/plugins/setup-registry.test.ts
+++ b/src/plugins/setup-registry.test.ts
@@ -1,6 +1,7 @@
import fs from "node:fs";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import { shouldExpectNativeJitiForJavaScriptTestRuntime } from "../test-utils/jiti-runtime.js";
import { cleanupTrackedTempDirs, makeTrackedTempDir } from "./test-helpers/fs-fixtures.js";
import {
getRegistryJitiMocks,
@@ -176,7 +177,7 @@ describe("setup-registry getJiti", () => {
clearPluginSetupRegistryCache();
});
- it("uses native jiti loading on Windows for setup-api modules", () => {
+ it("uses the runtime-supported Jiti boundary on Windows for setup-api modules", () => {
const pluginRoot = makeTempDir();
fs.writeFileSync(path.join(pluginRoot, "setup-api.js"), "export default {};\n", "utf-8");
mocks.loadPluginManifestRegistry.mockReturnValue({
@@ -185,6 +186,7 @@ describe("setup-registry getJiti", () => {
});
const platformSpy = vi.spyOn(process, "platform", "get").mockReturnValue("win32");
const restoreVersions = forceNodeRuntimeVersionsForTest();
+ const expectedTryNative = shouldExpectNativeJitiForJavaScriptTestRuntime();
try {
resolvePluginSetupRegistry({
@@ -200,7 +202,7 @@ describe("setup-registry getJiti", () => {
expect(mocks.createJiti.mock.calls[0]?.[0]).toBe(path.join(pluginRoot, "setup-api.js"));
expect(mocks.createJiti.mock.calls[0]?.[1]).toEqual(
expect.objectContaining({
- tryNative: true,
+ tryNative: expectedTryNative,
}),
);
});
diff --git a/src/test-utils/jiti-runtime.ts b/src/test-utils/jiti-runtime.ts
new file mode 100644
index 00000000000..f6051312b63
--- /dev/null
+++ b/src/test-utils/jiti-runtime.ts
@@ -0,0 +1,5 @@
+// True only when process.versions has no string `bun` entry and process.platform is not "win32".
+export function shouldExpectNativeJitiForJavaScriptTestRuntime(): boolean {
+  const runningUnderBun = typeof (process.versions as { bun?: string }).bun === "string";
+  return !runningUnderBun && process.platform !== "win32";
+}