diff --git a/.env.example b/.env.example index f7ed308c713..de02b73a595 100644 --- a/.env.example +++ b/.env.example @@ -82,4 +82,5 @@ OPENCLAW_GATEWAY_TOKEN= # ELEVENLABS_API_KEY=... # XI_API_KEY=... # alias for ElevenLabs +# INWORLD_API_KEY=... # DEEPGRAM_API_KEY=... diff --git a/.github/labeler.yml b/.github/labeler.yml index ebd77d64925..b55c5fe3b21 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -307,6 +307,11 @@ - changed-files: - any-glob-to-any-file: - "extensions/huggingface/**" +"extensions: inworld": + - changed-files: + - any-glob-to-any-file: + - "extensions/inworld/**" + - "docs/providers/inworld.md" "extensions: kilocode": - changed-files: - any-glob-to-any-file: diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a36aca45d1..7afec93e23d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,6 +54,7 @@ Docs: https://docs.openclaw.ai - Providers/Xiaomi: add MiMo TTS as a bundled speech provider with MP3/WAV output and voice-note Opus transcoding. Fixes #52376. (#55614) Thanks @zoujiejun. - Providers/ElevenLabs: include `eleven_v3` in the bundled TTS model catalog so model selection surfaces can offer ElevenLabs v3. (#68321) Thanks @itsuzef. - Providers/Local CLI TTS: add a bundled local command speech provider with file/stdout input, voice-note Opus conversion, and telephony PCM output. (#56239) Thanks @solar2ain. +- Providers/Inworld: add Inworld as a bundled speech provider with streaming TTS synthesis, voice listing, voice-note output, and PCM telephony output. (#55972) Thanks @cshape. - Android/Talk Mode: expose Talk Mode in the Voice tab with runtime-owned voice capture modes and microphone foreground-service escalation. Thanks @alex-latitude. - Providers/LiteLLM: register `litellm` as an image-generation provider so `image_generate model=litellm/...` calls and `agents.defaults.imageGenerationModel.fallbacks` entries resolve through the LiteLLM proxy. Thanks @zqchris. 
- Codex harness: require Codex app-server `0.125.0` or newer and cover native MCP `PreToolUse`, `PostToolUse`, and `PermissionRequest` payloads through the OpenClaw hook relay. diff --git a/docs/.generated/config-baseline.sha256 b/docs/.generated/config-baseline.sha256 index 6bc941f4abb..20265cbfe32 100644 --- a/docs/.generated/config-baseline.sha256 +++ b/docs/.generated/config-baseline.sha256 @@ -1,4 +1,4 @@ -9ac3d271f9bfa9611557f0b52e4d0a600693bdd1de75cc1bafc320fc4d4f0075 config-baseline.json +0b0d796bceddfb9e2929518ba84af626da7f5d75c392a217041f36e850c4e74f config-baseline.json 271fdf1d6652927e0fc160a6f25276bf6dccb8f1b27fab15e0fc2620e8cacab4 config-baseline.core.json 7cd9c908f066c143eab2a201efbc9640f483ab28bba92ddeca1d18cc2b528bc3 config-baseline.channel.json -7825b56a5b3fcdbe2e09ef8fe5d9f12ac3598435afebe20413051e45b0d1968e config-baseline.plugin.json +17eb3f8887193579ff32e35f9bd520ba2bd6049e52ab18855c5d41fcbf195d83 config-baseline.plugin.json diff --git a/docs/docs.json b/docs/docs.json index 9d6ea82f7ca..269c3a4f55a 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -1317,6 +1317,7 @@ "providers/groq", "providers/huggingface", "providers/inferrs", + "providers/inworld", "providers/kilocode", "providers/litellm", "providers/lmstudio", diff --git a/docs/providers/inworld.md b/docs/providers/inworld.md new file mode 100644 index 00000000000..d0e6e964c2f --- /dev/null +++ b/docs/providers/inworld.md @@ -0,0 +1,115 @@ +--- +summary: "Inworld streaming text-to-speech for OpenClaw replies" +read_when: + - You want Inworld speech synthesis for outbound replies + - You need PCM telephony or OGG_OPUS voice-note output from Inworld +title: "Inworld" +--- + +Inworld is a streaming text-to-speech (TTS) provider. In OpenClaw it +synthesizes outbound reply audio (MP3 by default, OGG_OPUS for voice notes) +and PCM audio for telephony channels such as Voice Call. 
+ +OpenClaw posts to Inworld's streaming TTS endpoint, concatenates the +returned base64 audio chunks into a single buffer, and hands the result to +the standard reply-audio pipeline. + +| Detail | Value | +| ------------- | ----------------------------------------------------------- | +| Website | [inworld.ai](https://inworld.ai) | +| Docs | [docs.inworld.ai/tts/tts](https://docs.inworld.ai/tts/tts) | +| Auth | `INWORLD_API_KEY` (HTTP Basic, Base64 dashboard credential) | +| Default voice | `Sarah` | +| Default model | `inworld-tts-1.5-max` | + +## Getting started + + + + Copy the credential from your Inworld dashboard (Workspace > API Keys) + and set it as an env var. The value is sent verbatim as the HTTP Basic + credential, so do not Base64-encode it again or convert it to a bearer + token. + + ``` + INWORLD_API_KEY= + ``` + + + + ```json5 + { + messages: { + tts: { + auto: "always", + provider: "inworld", + providers: { + inworld: { + voiceId: "Sarah", + modelId: "inworld-tts-1.5-max", + }, + }, + }, + }, + } + ``` + + + Send a reply through any connected channel. OpenClaw synthesizes the + audio with Inworld and delivers it as MP3 (or OGG_OPUS when the channel + expects a voice note). + + + +## Configuration options + +| Option | Path | Description | +| ------------- | -------------------------------------------- | ----------------------------------------------------------------- | +| `apiKey` | `messages.tts.providers.inworld.apiKey` | Base64 dashboard credential. Falls back to `INWORLD_API_KEY`. | +| `baseUrl` | `messages.tts.providers.inworld.baseUrl` | Override Inworld API base URL (default `https://api.inworld.ai`). | +| `voiceId` | `messages.tts.providers.inworld.voiceId` | Voice identifier (default `Sarah`). | +| `modelId` | `messages.tts.providers.inworld.modelId` | TTS model id (default `inworld-tts-1.5-max`). | +| `temperature` | `messages.tts.providers.inworld.temperature` | Sampling temperature `0..2` (optional). 
| + +## Notes + + + + Inworld uses HTTP Basic auth with a single Base64-encoded credential + string. Copy it verbatim from the Inworld dashboard. The provider sends + it as `Authorization: Basic <apiKey>` without any further encoding, so + do not Base64-encode it yourself and do not pass a bearer-style token. + See [TTS auth notes](/tools/tts#inworld-primary) for the same callout. + + + Supported model ids: `inworld-tts-1.5-max` (default), + `inworld-tts-1.5-mini`, `inworld-tts-1-max`, `inworld-tts-1`. + + + Replies use MP3 by default. When the channel target is `voice-note` + OpenClaw asks Inworld for `OGG_OPUS` so the audio plays as a native + voice bubble. Telephony synthesis uses raw `PCM` at 22050 Hz to feed + the telephony bridge. + + + Override the API host with `messages.tts.providers.inworld.baseUrl`. + Trailing slashes are stripped before requests are sent. + + + +## Related + + + + TTS overview, providers, and `messages.tts` config. + + + Full config reference including `messages.tts` settings. + + + All bundled OpenClaw providers. + + + Common issues and debugging steps. + + diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 8995e9afafa..a2e23e22fc8 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -7,7 +7,7 @@ read_when: title: "Text-to-speech" --- -OpenClaw can convert outbound replies into audio using ElevenLabs, Google Gemini, Gradium, Local CLI, Microsoft, MiniMax, OpenAI, Vydra, xAI, or Xiaomi MiMo. +OpenClaw can convert outbound replies into audio using ElevenLabs, Google Gemini, Gradium, Inworld, Local CLI, Microsoft, MiniMax, OpenAI, Vydra, xAI, or Xiaomi MiMo. It works anywhere OpenClaw can send audio. ## Supported services @@ -15,6 +15,7 @@ It works anywhere OpenClaw can send audio. 
- **ElevenLabs** (primary or fallback provider) - **Google Gemini** (primary or fallback provider; uses Gemini API TTS) - **Gradium** (primary or fallback provider; supports voice-note and telephony output) +- **Inworld** (primary or fallback provider; uses the Inworld streaming TTS API) - **Local CLI** (primary or fallback provider; runs a configured local TTS command) - **Microsoft** (primary or fallback provider; current bundled implementation uses `node-edge-tts`) - **MiniMax** (primary or fallback provider; uses the T2A v2 API) @@ -38,11 +39,12 @@ or ElevenLabs. ## Optional keys -If you want OpenAI, ElevenLabs, Google Gemini, Gradium, MiniMax, Vydra, xAI, or Xiaomi MiMo: +If you want ElevenLabs, Google Gemini, Gradium, Inworld, MiniMax, OpenAI, Vydra, xAI, or Xiaomi MiMo: - `ELEVENLABS_API_KEY` (or `XI_API_KEY`) - `GEMINI_API_KEY` (or `GOOGLE_API_KEY`) - `GRADIUM_API_KEY` +- `INWORLD_API_KEY` - `MINIMAX_API_KEY`; MiniMax TTS also accepts Token Plan auth via `MINIMAX_OAUTH_TOKEN`, `MINIMAX_CODE_PLAN_KEY`, or `MINIMAX_CODING_API_KEY` @@ -64,6 +66,7 @@ so that provider must also be authenticated if you enable summaries. - [ElevenLabs Text to Speech](https://elevenlabs.io/docs/api-reference/text-to-speech) - [ElevenLabs Authentication](https://elevenlabs.io/docs/api-reference/authentication) - [Gradium](/providers/gradium) +- [Inworld TTS API](https://docs.inworld.ai/tts/tts) - [MiniMax T2A v2 API](https://platform.minimaxi.com/document/T2A%20V2) - [Xiaomi MiMo speech synthesis](/providers/xiaomi#text-to-speech) - [node-edge-tts](https://github.com/SchneeHertz/node-edge-tts) @@ -217,6 +220,35 @@ by the bundled Google image-generation provider. Resolution order is `messages.tts.providers.google.apiKey` -> `models.providers.google.apiKey` -> `GEMINI_API_KEY` -> `GOOGLE_API_KEY`. 
+### Inworld primary + +```json5 +{ + messages: { + tts: { + auto: "always", + provider: "inworld", + providers: { + inworld: { + apiKey: "inworld_api_key", + baseUrl: "https://api.inworld.ai", + voiceId: "Sarah", + modelId: "inworld-tts-1.5-max", + temperature: 0.8, + }, + }, + }, + }, +} +``` + +The `apiKey` value must be the Base64-encoded credential string copied +verbatim from the Inworld dashboard (Workspace > API Keys). The provider +sends it as `Authorization: Basic <apiKey>` without any additional +encoding, so do not pass a raw bearer token and do not Base64-encode it +yourself. The key falls back to the `INWORLD_API_KEY` env var. See +[Inworld provider](/providers/inworld) for full setup. + ### xAI primary ```json5 @@ -415,7 +447,7 @@ Then run: - `tagged` only sends audio when the reply includes `[[tts:key=value]]` directives or a `[[tts:text]]...[[/tts:text]]` block. - `enabled`: legacy toggle (doctor migrates this to `auto`). - `mode`: `"final"` (default) or `"all"` (includes tool/block replies). -- `provider`: speech provider id such as `"elevenlabs"`, `"google"`, `"gradium"`, `"microsoft"`, `"minimax"`, `"openai"`, `"vydra"`, `"xai"`, or `"xiaomi"` (fallback is automatic). +- `provider`: speech provider id such as `"elevenlabs"`, `"google"`, `"gradium"`, `"inworld"`, `"microsoft"`, `"minimax"`, `"openai"`, `"vydra"`, `"xai"`, or `"xiaomi"` (fallback is automatic). - If `provider` is **unset**, OpenClaw uses the first configured speech provider in registry auto-select order. - Legacy `provider: "edge"` config is repaired by `openclaw doctor --fix` and rewritten to `provider: "microsoft"`. @@ -429,7 +461,7 @@ Then run: - `maxTextLength`: hard cap for TTS input (chars). `/tts audio` fails if exceeded. - `timeoutMs`: request timeout (ms). - `prefsPath`: override the local prefs JSON path (provider/limit/summary). 
-- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `GEMINI_API_KEY`/`GOOGLE_API_KEY`, `GRADIUM_API_KEY`, `MINIMAX_API_KEY`, `OPENAI_API_KEY`, `VYDRA_API_KEY`, `XAI_API_KEY`, `XIAOMI_API_KEY`). +- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `GEMINI_API_KEY`/`GOOGLE_API_KEY`, `GRADIUM_API_KEY`, `INWORLD_API_KEY`, `MINIMAX_API_KEY`, `OPENAI_API_KEY`, `VYDRA_API_KEY`, `XAI_API_KEY`, `XIAOMI_API_KEY`). - `providers.elevenlabs.baseUrl`: override ElevenLabs API base URL. - `providers.openai.baseUrl`: override the OpenAI TTS endpoint. - Resolution order: `messages.tts.providers.openai.baseUrl` -> `OPENAI_TTS_BASE_URL` -> `https://api.openai.com/v1` @@ -453,6 +485,10 @@ Then run: - `providers.tts-local-cli.timeoutMs`: command timeout in milliseconds (default `120000`). - `providers.tts-local-cli.cwd`: optional command working directory. - `providers.tts-local-cli.env`: optional string environment overrides for the command. +- `providers.inworld.baseUrl`: override Inworld API base URL (default `https://api.inworld.ai`). +- `providers.inworld.voiceId`: Inworld voice identifier (default `Sarah`). +- `providers.inworld.modelId`: Inworld TTS model (default `inworld-tts-1.5-max`; also supports `inworld-tts-1.5-mini`, `inworld-tts-1-max`, `inworld-tts-1`). +- `providers.inworld.temperature`: sampling temperature `0..2` (optional). - `providers.google.model`: Gemini TTS model (default `gemini-3.1-flash-tts-preview`). - `providers.google.voiceName`: Gemini prebuilt voice name (default `Kore`; `voice` is also accepted). - `providers.google.audioProfile`: natural-language style prompt prepended before the spoken text. @@ -586,6 +622,7 @@ These override `messages.tts.*` for that host. with `ffmpeg`. - **Google Gemini**: Gemini API TTS returns raw 24kHz PCM. OpenClaw wraps it as WAV for audio attachments, transcodes it to 48kHz Opus for voice-note targets, and returns PCM directly for Talk/telephony. 
- **Gradium**: WAV for audio attachments, Opus for voice-note targets, and `ulaw_8000` at 8 kHz for telephony. +- **Inworld**: MP3 for normal audio attachments, native `OGG_OPUS` for voice-note targets, and raw `PCM` at 22050 Hz for Talk/telephony. - **xAI**: MP3 by default; `responseFormat` may be `mp3`, `wav`, `pcm`, `mulaw`, or `alaw`. OpenClaw uses xAI's batch REST TTS endpoint and returns a complete audio attachment; xAI's streaming TTS WebSocket is not used by this provider path. Native Opus voice-note format is not supported by this path. - **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`). - The bundled transport accepts an `outputFormat`, but not all formats are available from the service. diff --git a/extensions/inworld/index.ts b/extensions/inworld/index.ts new file mode 100644 index 00000000000..891e95109f6 --- /dev/null +++ b/extensions/inworld/index.ts @@ -0,0 +1,11 @@ +import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry"; +import { buildInworldSpeechProvider } from "./speech-provider.js"; + +export default definePluginEntry({ + id: "inworld", + name: "Inworld Speech", + description: "Bundled Inworld speech provider", + register(api) { + api.registerSpeechProvider(buildInworldSpeechProvider()); + }, +}); diff --git a/extensions/inworld/inworld.live.test.ts b/extensions/inworld/inworld.live.test.ts new file mode 100644 index 00000000000..6cb3736f14b --- /dev/null +++ b/extensions/inworld/inworld.live.test.ts @@ -0,0 +1,84 @@ +import { describe, expect, it } from "vitest"; +import { isLiveTestEnabled } from "../../src/agents/live-test-helpers.js"; +import { + registerProviderPlugin, + requireRegisteredProvider, +} from "../../test/helpers/plugins/provider-registration.js"; +import plugin from "./index.js"; + +const INWORLD_API_KEY = process.env.INWORLD_API_KEY?.trim() ?? ""; +const LIVE = isLiveTestEnabled() && INWORLD_API_KEY.length > 0; +const describeLive = LIVE ? 
describe : describe.skip; + +const registerInworldPlugin = () => + registerProviderPlugin({ + plugin, + id: "inworld", + name: "Inworld", + }); + +describeLive("inworld plugin live", () => { + it("lists voices through the registered speech provider", async () => { + const { speechProviders } = await registerInworldPlugin(); + const provider = requireRegisteredProvider(speechProviders, "inworld"); + + const voices = await provider.listVoices?.({ + apiKey: INWORLD_API_KEY, + }); + + expect(voices?.length).toBeGreaterThan(0); + expect(voices).toEqual(expect.arrayContaining([expect.objectContaining({ id: "Sarah" })])); + }, 120_000); + + it("synthesizes MP3, native voice-note Ogg/Opus, and telephony PCM", async () => { + const { speechProviders } = await registerInworldPlugin(); + const provider = requireRegisteredProvider(speechProviders, "inworld"); + const providerConfig = { + apiKey: INWORLD_API_KEY, + voiceId: "Sarah", + modelId: "inworld-tts-1.5-max", + }; + + const audioFile = await provider.synthesize({ + text: "OpenClaw Inworld text to speech integration test OK.", + cfg: { plugins: { enabled: true } } as never, + providerConfig, + target: "audio-file", + timeoutMs: 90_000, + }); + + expect(audioFile.outputFormat).toBe("mp3"); + expect(audioFile.fileExtension).toBe(".mp3"); + expect(audioFile.voiceCompatible).toBe(false); + expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(512); + expect(audioFile.audioBuffer.subarray(0, 4).toString("ascii")).not.toBe("RIFF"); + + const voiceNote = await provider.synthesize({ + text: "OpenClaw Inworld voice note integration test OK.", + cfg: { plugins: { enabled: true } } as never, + providerConfig, + target: "voice-note", + timeoutMs: 90_000, + }); + + expect(voiceNote.outputFormat).toBe("ogg_opus"); + expect(voiceNote.fileExtension).toBe(".ogg"); + expect(voiceNote.voiceCompatible).toBe(true); + expect(voiceNote.audioBuffer.byteLength).toBeGreaterThan(128); + expect(voiceNote.audioBuffer.subarray(0, 
4).toString("ascii")).toBe("OggS"); + + const telephony = await provider.synthesizeTelephony?.({ + text: "OpenClaw Inworld telephony check OK.", + cfg: { plugins: { enabled: true } } as never, + providerConfig, + timeoutMs: 90_000, + }); + if (!telephony) { + throw new Error("Inworld telephony synthesis did not return audio"); + } + expect(telephony.outputFormat).toBe("pcm"); + expect(telephony.sampleRate).toBe(22_050); + expect(telephony.audioBuffer.byteLength).toBeGreaterThan(512); + expect(telephony.audioBuffer.subarray(0, 4).toString("ascii")).not.toBe("RIFF"); + }, 180_000); +}); diff --git a/extensions/inworld/openclaw.plugin.json b/extensions/inworld/openclaw.plugin.json new file mode 100644 index 00000000000..3b64bd56715 --- /dev/null +++ b/extensions/inworld/openclaw.plugin.json @@ -0,0 +1,40 @@ +{ + "id": "inworld", + "enabledByDefault": true, + "name": "Inworld", + "description": "Inworld streaming text-to-speech (MP3, OGG_OPUS, PCM telephony).", + "providerAuthEnvVars": { + "inworld": ["INWORLD_API_KEY"] + }, + "contracts": { + "speechProviders": ["inworld"] + }, + "configSchema": { + "type": "object", + "additionalProperties": false, + "properties": { + "apiKey": { + "type": "string", + "description": "Inworld API key. Must be the Base64 credential string from the Inworld dashboard (used as Authorization: Basic ). Falls back to INWORLD_API_KEY env var." + }, + "baseUrl": { + "type": "string", + "description": "Override Inworld API base URL (default https://api.inworld.ai)." + }, + "voiceId": { + "type": "string", + "description": "Voice identifier (default Sarah)." + }, + "modelId": { + "type": "string", + "description": "TTS model id (default inworld-tts-1.5-max)." + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2, + "description": "Sampling temperature 0..2." 
+      }
+    }
+  }
+}
diff --git a/extensions/inworld/package.json b/extensions/inworld/package.json
new file mode 100644
index 00000000000..51c68d55d10
--- /dev/null
+++ b/extensions/inworld/package.json
@@ -0,0 +1,15 @@
+{
+  "name": "@openclaw/inworld-speech",
+  "version": "2026.4.16",
+  "private": true,
+  "description": "OpenClaw Inworld speech plugin",
+  "type": "module",
+  "devDependencies": {
+    "@openclaw/plugin-sdk": "workspace:*"
+  },
+  "openclaw": {
+    "extensions": [
+      "./index.ts"
+    ]
+  }
+}
diff --git a/extensions/inworld/speech-provider.test.ts b/extensions/inworld/speech-provider.test.ts
new file mode 100644
index 00000000000..2bbd401b5a6
--- /dev/null
+++ b/extensions/inworld/speech-provider.test.ts
@@ -0,0 +1,218 @@
+import { afterEach, describe, expect, it, vi } from "vitest";
+
+const { inworldTTSMock, listInworldVoicesMock } = vi.hoisted(() => ({
+  inworldTTSMock: vi.fn(),
+  listInworldVoicesMock: vi.fn(),
+}));
+
+vi.mock("./tts.js", async (importOriginal) => {
+  const actual = await importOriginal<typeof import("./tts.js")>();
+  return {
+    ...actual,
+    inworldTTS: inworldTTSMock,
+    listInworldVoices: listInworldVoicesMock,
+  };
+});
+
+import { buildInworldSpeechProvider } from "./speech-provider.js";
+
+describe("buildInworldSpeechProvider", () => {
+  const originalEnv = process.env.INWORLD_API_KEY;
+
+  afterEach(() => {
+    // Assigning undefined to process.env stores the string "undefined", so delete instead.
+    if (originalEnv === undefined) {
+      delete process.env.INWORLD_API_KEY;
+    } else {
+      process.env.INWORLD_API_KEY = originalEnv;
+    }
+    inworldTTSMock.mockReset();
+    listInworldVoicesMock.mockReset();
+    vi.restoreAllMocks();
+  });
+
+  it("reports configured when INWORLD_API_KEY env var is set", () => {
+    process.env.INWORLD_API_KEY = "test-key";
+    const provider = buildInworldSpeechProvider();
+    expect(
+      provider.isConfigured({
+        providerConfig: {},
+        timeoutMs: 30_000,
+      }),
+    ).toBe(true);
+  });
+
+  it("reports configured when providerConfig apiKey is set", () => {
+    delete process.env.INWORLD_API_KEY;
+    const provider = buildInworldSpeechProvider();
+    expect(
+      provider.isConfigured({
+        providerConfig: { apiKey: "config-key" },
+        timeoutMs:
30_000,
+      }),
+    ).toBe(true);
+  });
+
+  it("reports not configured when no key is available", () => {
+    delete process.env.INWORLD_API_KEY;
+    const provider = buildInworldSpeechProvider();
+    expect(
+      provider.isConfigured({
+        providerConfig: {},
+        timeoutMs: 30_000,
+      }),
+    ).toBe(false);
+  });
+
+  it("has correct provider metadata", () => {
+    const provider = buildInworldSpeechProvider();
+    expect(provider.id).toBe("inworld");
+    expect(provider.label).toBe("Inworld");
+    expect(provider.autoSelectOrder).toBe(30);
+    expect(provider.models).toContain("inworld-tts-1.5-max");
+    expect(provider.models).toContain("inworld-tts-1.5-mini");
+  });
+
+  it("normalizes provider-owned speech config from raw provider config", () => {
+    const provider = buildInworldSpeechProvider();
+    const resolved = provider.resolveConfig?.({
+      cfg: {} as never,
+      timeoutMs: 30_000,
+      rawConfig: {
+        providers: {
+          inworld: {
+            apiKey: "basic-key",
+            baseUrl: "https://custom.inworld.example.com/",
+            voiceId: "Ashley",
+            modelId: "inworld-tts-1.5-mini",
+            temperature: 0.8,
+          },
+        },
+      },
+    });
+
+    expect(resolved).toEqual({
+      apiKey: "basic-key",
+      baseUrl: "https://custom.inworld.example.com",
+      voiceId: "Ashley",
+      modelId: "inworld-tts-1.5-mini",
+      temperature: 0.8,
+    });
+  });
+
+  it("parses Inworld TTS directive overrides", () => {
+    const provider = buildInworldSpeechProvider();
+    const policy = {
+      enabled: true,
+      allowText: true,
+      allowProvider: true,
+      allowVoice: true,
+      allowModelId: true,
+      allowVoiceSettings: true,
+      allowNormalization: true,
+      allowSeed: true,
+    };
+
+    expect(provider.parseDirectiveToken?.({ key: "voice", value: "Ashley", policy })).toEqual({
+      handled: true,
+      overrides: { voiceId: "Ashley" },
+    });
+    expect(
+      provider.parseDirectiveToken?.({
+        key: "model",
+        value: "inworld-tts-1.5-mini",
+        policy,
+      }),
+    ).toEqual({
+      handled: true,
+      overrides: { modelId: "inworld-tts-1.5-mini" },
+    });
+    expect(provider.parseDirectiveToken?.({ key: "temperature", value: "0.7", policy })).toEqual({
+      handled: true,
+      overrides: { temperature: 0.7 },
+    });
+  });
+
+  it("warns on invalid directive temperature", () => {
+    const provider = buildInworldSpeechProvider();
+    expect(
+      provider.parseDirectiveToken?.({
+        key: "temperature",
+        value: "3",
+        policy: {
+          enabled: true,
+          allowText: true,
+          allowProvider: true,
+          allowVoice: true,
+          allowModelId: true,
+          allowVoiceSettings: true,
+          allowNormalization: true,
+          allowSeed: true,
+        },
+      }),
+    ).toEqual({
+      handled: true,
+      warnings: ['invalid Inworld temperature "3"'],
+    });
+  });
+
+  it("synthesizes voice-note targets with native OGG_OPUS output", async () => {
+    inworldTTSMock.mockResolvedValueOnce(Buffer.from("opus"));
+    const provider = buildInworldSpeechProvider();
+
+    const result = await provider.synthesize?.({
+      text: "Hello",
+      cfg: {} as never,
+      providerConfig: { apiKey: "key", voiceId: "Sarah", modelId: "inworld-tts-1.5-max" },
+      providerOverrides: { voice: "Ashley", model: "inworld-tts-1.5-mini", temperature: 0.6 },
+      target: "voice-note",
+      timeoutMs: 30_000,
+    });
+
+    expect(inworldTTSMock).toHaveBeenCalledWith({
+      text: "Hello",
+      apiKey: "key",
+      baseUrl: "https://api.inworld.ai",
+      voiceId: "Ashley",
+      modelId: "inworld-tts-1.5-mini",
+      audioEncoding: "OGG_OPUS",
+      temperature: 0.6,
+      timeoutMs: 30_000,
+    });
+    expect(result).toEqual({
+      audioBuffer: Buffer.from("opus"),
+      outputFormat: "ogg_opus",
+      fileExtension: ".ogg",
+      voiceCompatible: true,
+    });
+  });
+
+  it("synthesizes telephony PCM at 22050 Hz", async () => {
+    inworldTTSMock.mockResolvedValueOnce(Buffer.from("pcm"));
+    const provider = buildInworldSpeechProvider();
+
+    const result = await provider.synthesizeTelephony?.({
+      text: "Hello",
+      cfg: {} as never,
+      providerConfig: { apiKey: "key", voiceId: "Sarah", modelId: "inworld-tts-1.5-max" },
+      timeoutMs: 30_000,
+    });
+
+    expect(inworldTTSMock).toHaveBeenCalledWith({
+      text: "Hello",
+      apiKey: "key",
+      baseUrl: "https://api.inworld.ai",
+      voiceId: "Sarah",
+      modelId: "inworld-tts-1.5-max",
+      audioEncoding: "PCM",
+      sampleRateHertz: 22_050,
+      temperature: undefined,
+      timeoutMs: 30_000,
+    });
+    expect(result).toEqual({
+      audioBuffer: Buffer.from("pcm"),
+      outputFormat: "pcm",
+      sampleRate: 22_050,
+    });
+  });
+});
diff --git a/extensions/inworld/speech-provider.ts b/extensions/inworld/speech-provider.ts
new file mode 100644
index 00000000000..f9c28a91e46
--- /dev/null
+++ b/extensions/inworld/speech-provider.ts
@@ -0,0 +1,255 @@
+import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
+import type {
+  SpeechDirectiveTokenParseContext,
+  SpeechProviderConfig,
+  SpeechProviderOverrides,
+  SpeechProviderPlugin,
+} from "openclaw/plugin-sdk/speech-core";
+import { asFiniteNumber, asObject, trimToUndefined } from "openclaw/plugin-sdk/speech-core";
+import {
+  DEFAULT_INWORLD_MODEL_ID,
+  DEFAULT_INWORLD_VOICE_ID,
+  type InworldAudioEncoding,
+  INWORLD_TTS_MODELS,
+  inworldTTS,
+  listInworldVoices,
+  normalizeInworldBaseUrl,
+} from "./tts.js";
+
+/** Resolved provider settings; `apiKey` and `temperature` are the only optional fields. */
+type InworldProviderConfig = {
+  apiKey?: string;
+  baseUrl: string;
+  voiceId: string;
+  modelId: string;
+  temperature?: number;
+};
+
+/** Per-request overrides read from a request's `providerOverrides`. */
+type InworldProviderOverrides = {
+  voiceId?: string;
+  modelId?: string;
+  temperature?: number;
+};
+
+/**
+ * Builds the provider config from raw TTS config, reading `providers.inworld`
+ * first and a top-level `inworld` key as the fallback. Missing voice and model
+ * ids take the defaults exported by `./tts.js`.
+ */
+function normalizeInworldProviderConfig(rawConfig: Record<string, unknown>): InworldProviderConfig {
+  const providers = asObject(rawConfig.providers);
+  const raw = asObject(providers?.inworld) ?? asObject(rawConfig.inworld);
+  return {
+    apiKey: normalizeResolvedSecretInputString({
+      value: raw?.apiKey,
+      path: "messages.tts.providers.inworld.apiKey",
+    }),
+    baseUrl: normalizeInworldBaseUrl(trimToUndefined(raw?.baseUrl)),
+    voiceId: trimToUndefined(raw?.voiceId) ?? DEFAULT_INWORLD_VOICE_ID,
+    modelId: trimToUndefined(raw?.modelId) ?? DEFAULT_INWORLD_MODEL_ID,
+    temperature: asFiniteNumber(raw?.temperature),
+  };
+}
+
+/** Re-reads an already resolved provider config, filling gaps with the defaults. */
+function readInworldProviderConfig(config: SpeechProviderConfig): InworldProviderConfig {
+  const defaults = normalizeInworldProviderConfig({});
+  return {
+    apiKey: trimToUndefined(config.apiKey) ?? defaults.apiKey,
+    baseUrl: normalizeInworldBaseUrl(trimToUndefined(config.baseUrl) ?? defaults.baseUrl),
+    voiceId: trimToUndefined(config.voiceId) ?? defaults.voiceId,
+    modelId: trimToUndefined(config.modelId) ?? defaults.modelId,
+    temperature: asFiniteNumber(config.temperature) ?? defaults.temperature,
+  };
+}
+
+/**
+ * Picks the API key for a request: explicit candidates first (in order), then
+ * the `INWORLD_API_KEY` env var. Blank or whitespace-only values are skipped.
+ */
+function resolveInworldApiKey(...candidates: Array<string | undefined>): string | undefined {
+  for (const candidate of candidates) {
+    const key = trimToUndefined(candidate);
+    if (key) {
+      return key;
+    }
+  }
+  return trimToUndefined(process.env.INWORLD_API_KEY);
+}
+
+/** Extracts Inworld overrides, accepting `voice`/`model` as aliases for the id keys. */
+function readInworldOverrides(
+  overrides: SpeechProviderOverrides | undefined,
+): InworldProviderOverrides {
+  if (!overrides) {
+    return {};
+  }
+  return {
+    voiceId: trimToUndefined(overrides.voiceId ?? overrides.voice),
+    modelId: trimToUndefined(overrides.modelId ?? overrides.model),
+    temperature: asFiniteNumber(overrides.temperature),
+  };
+}
+
+/**
+ * Parses one TTS directive token. Recognized keys always report
+ * `handled: true`; the override is only returned when the policy allows it.
+ */
+function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
+  handled: boolean;
+  overrides?: SpeechProviderOverrides;
+  warnings?: string[];
+} {
+  switch (ctx.key) {
+    case "voice":
+    case "voiceid":
+    case "voice_id":
+    case "inworld_voice":
+    case "inworldvoice":
+      if (!ctx.policy.allowVoice) {
+        return { handled: true };
+      }
+      return { handled: true, overrides: { voiceId: ctx.value } };
+    case "model":
+    case "modelid":
+    case "model_id":
+    case "inworld_model":
+    case "inworldmodel":
+      if (!ctx.policy.allowModelId) {
+        return { handled: true };
+      }
+      return { handled: true, overrides: { modelId: ctx.value } };
+    case "temperature": {
+      if (!ctx.policy.allowVoiceSettings) {
+        return { handled: true };
+      }
+      // Number("") and Number("  ") are 0, so reject blank values before converting.
+      const rawTemperature = ctx.value.trim();
+      const temperature = rawTemperature === "" ? Number.NaN : Number(rawTemperature);
+      if (!Number.isFinite(temperature) || temperature < 0 || temperature > 2) {
+        return { handled: true, warnings: [`invalid Inworld temperature "${ctx.value}"`] };
+      }
+      return { handled: true, overrides: { temperature } };
+    }
+    default:
+      return { handled: false };
+  }
+}
+
+/**
+ * Creates the Inworld speech provider: MP3 for regular replies, OGG_OPUS for
+ * voice notes, and raw PCM at 22050 Hz for telephony.
+ */
+export function buildInworldSpeechProvider(): SpeechProviderPlugin {
+  return {
+    id: "inworld",
+    label: "Inworld",
+    autoSelectOrder: 30,
+    models: INWORLD_TTS_MODELS,
+    resolveConfig: ({ rawConfig }) => normalizeInworldProviderConfig(rawConfig),
+    parseDirectiveToken,
+    resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
+      const base = normalizeInworldProviderConfig(baseTtsConfig);
+      const resolvedApiKey =
+        talkProviderConfig.apiKey === undefined
+          ? undefined
+          : normalizeResolvedSecretInputString({
+              value: talkProviderConfig.apiKey,
+              path: "talk.providers.inworld.apiKey",
+            });
+      return {
+        ...base,
+        ...(resolvedApiKey === undefined ? {} : { apiKey: resolvedApiKey }),
+        ...(trimToUndefined(talkProviderConfig.baseUrl) == null
+          ? {}
+          : { baseUrl: normalizeInworldBaseUrl(trimToUndefined(talkProviderConfig.baseUrl)) }),
+        ...(trimToUndefined(talkProviderConfig.voiceId) == null
+          ? {}
+          : { voiceId: trimToUndefined(talkProviderConfig.voiceId) }),
+        ...(trimToUndefined(talkProviderConfig.modelId) == null
+          ? {}
+          : { modelId: trimToUndefined(talkProviderConfig.modelId) }),
+        ...(asFiniteNumber(talkProviderConfig.temperature) == null
+          ? {}
+          : { temperature: asFiniteNumber(talkProviderConfig.temperature) }),
+      };
+    },
+    resolveTalkOverrides: ({ params }) => ({
+      ...(trimToUndefined(params.voiceId) == null
+        ? {}
+        : { voiceId: trimToUndefined(params.voiceId) }),
+      ...(trimToUndefined(params.modelId) == null
+        ? {}
+        : { modelId: trimToUndefined(params.modelId) }),
+      ...(asFiniteNumber(params.temperature) == null
+        ? {}
+        : { temperature: asFiniteNumber(params.temperature) }),
+    }),
+    listVoices: async (req) => {
+      const config = req.providerConfig ? readInworldProviderConfig(req.providerConfig) : undefined;
+      const apiKey = resolveInworldApiKey(req.apiKey, config?.apiKey);
+      if (!apiKey) {
+        throw new Error("Inworld API key missing");
+      }
+      return listInworldVoices({
+        apiKey,
+        baseUrl: req.baseUrl ?? config?.baseUrl,
+      });
+    },
+    isConfigured: ({ providerConfig }) =>
+      Boolean(resolveInworldApiKey(readInworldProviderConfig(providerConfig).apiKey)),
+    synthesize: async (req) => {
+      const config = readInworldProviderConfig(req.providerConfig);
+      const overrides = readInworldOverrides(req.providerOverrides);
+      const apiKey = resolveInworldApiKey(config.apiKey);
+      if (!apiKey) {
+        throw new Error("Inworld API key missing");
+      }
+
+      // Voice notes need native Ogg/Opus; every other target gets MP3.
+      const useOpus = req.target === "voice-note";
+      const audioEncoding: InworldAudioEncoding = useOpus ? "OGG_OPUS" : "MP3";
+
+      const audioBuffer = await inworldTTS({
+        text: req.text,
+        apiKey,
+        baseUrl: config.baseUrl,
+        voiceId: overrides.voiceId ?? config.voiceId,
+        modelId: overrides.modelId ?? config.modelId,
+        audioEncoding,
+        temperature: overrides.temperature ?? config.temperature,
+        timeoutMs: req.timeoutMs,
+      });
+
+      return {
+        audioBuffer,
+        outputFormat: audioEncoding.toLowerCase(),
+        fileExtension: useOpus ? ".ogg" : ".mp3",
+        voiceCompatible: useOpus,
+      };
+    },
+    synthesizeTelephony: async (req) => {
+      const config = readInworldProviderConfig(req.providerConfig);
+      const apiKey = resolveInworldApiKey(config.apiKey);
+      if (!apiKey) {
+        throw new Error("Inworld API key missing");
+      }
+
+      const sampleRate = 22_050;
+      const audioBuffer = await inworldTTS({
+        text: req.text,
+        apiKey,
+        baseUrl: config.baseUrl,
+        voiceId: config.voiceId,
+        modelId: config.modelId,
+        audioEncoding: "PCM",
+        sampleRateHertz: sampleRate,
+        temperature: config.temperature,
+        timeoutMs: req.timeoutMs,
+      });
+
+      return { audioBuffer, outputFormat: "pcm", sampleRate };
+    },
+  };
+}
diff --git a/extensions/inworld/tsconfig.json b/extensions/inworld/tsconfig.json
new file mode 100644
index 00000000000..b8a85a99ac3
--- /dev/null
+++ b/extensions/inworld/tsconfig.json
@@ -0,0 +1,16 @@
+{
+  "extends": "../tsconfig.package-boundary.base.json",
+  "compilerOptions": {
+    "rootDir": "."
+ }, + "include": ["./*.ts", "./src/**/*.ts"], + "exclude": [ + "./**/*.test.ts", + "./dist/**", + "./node_modules/**", + "./src/test-support/**", + "./src/**/*test-helpers.ts", + "./src/**/*test-harness.ts", + "./src/**/*test-support.ts" + ] +} diff --git a/extensions/inworld/tts.test.ts b/extensions/inworld/tts.test.ts new file mode 100644 index 00000000000..f3fadeddfe7 --- /dev/null +++ b/extensions/inworld/tts.test.ts @@ -0,0 +1,312 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; + +const { fetchWithSsrFGuardMock } = vi.hoisted(() => ({ + fetchWithSsrFGuardMock: vi.fn(), +})); + +vi.mock("openclaw/plugin-sdk/ssrf-runtime", async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + fetchWithSsrFGuard: fetchWithSsrFGuardMock, + }; +}); + +import { inworldTTS, listInworldVoices } from "./tts.js"; + +type GuardRequest = { + url: string; + init?: RequestInit; + auditContext?: string; + policy?: unknown; + timeoutMs?: number; +}; + +function queueGuardedResponse(response: Response): { release: ReturnType } { + const release = vi.fn(async () => {}); + fetchWithSsrFGuardMock.mockResolvedValueOnce({ response, release }); + return { release }; +} + +function lastGuardRequest(): GuardRequest { + const call = fetchWithSsrFGuardMock.mock.calls.at(-1); + if (!call) { + throw new Error("fetchWithSsrFGuard was not called"); + } + return call[0] as GuardRequest; +} + +function readRequestBody(request: GuardRequest): string { + const body = request.init?.body; + if (typeof body !== "string") { + throw new Error("expected request body to be a string"); + } + return body; +} + +describe("listInworldVoices", () => { + afterEach(() => { + fetchWithSsrFGuardMock.mockClear(); + vi.restoreAllMocks(); + }); + + it("maps Inworld voice metadata into speech voice options", async () => { + queueGuardedResponse( + new Response( + JSON.stringify({ + voices: [ + { + voiceId: "Dennis", + displayName: "Dennis", + description: "Middle-aged man 
with a smooth, calm and friendly voice", + langCode: "EN_US", + tags: ["male", "middle-aged", "smooth", "calm", "friendly"], + source: "SYSTEM", + }, + { + voiceId: "Ashley", + displayName: "Ashley", + description: "A warm, natural female voice", + langCode: "EN_US", + tags: ["female", "warm", "natural"], + source: "SYSTEM", + }, + ], + }), + { status: 200 }, + ), + ); + + const voices = await listInworldVoices({ apiKey: "test-key" }); + + expect(voices).toEqual([ + { + id: "Dennis", + name: "Dennis", + description: "Middle-aged man with a smooth, calm and friendly voice", + locale: "EN_US", + gender: "male", + }, + { + id: "Ashley", + name: "Ashley", + description: "A warm, natural female voice", + locale: "EN_US", + gender: "female", + }, + ]); + const request = lastGuardRequest(); + expect(request.url).toBe("https://api.inworld.ai/voices/v1/voices"); + expect(request.auditContext).toBe("inworld-voices"); + expect(request.policy).toEqual({ hostnameAllowlist: ["api.inworld.ai"] }); + const headers = new Headers(request.init?.headers); + expect(headers.get("authorization")).toBe("Basic test-key"); + }); + + it("throws on API errors with response body", async () => { + queueGuardedResponse(new Response("service unavailable", { status: 503 })); + + await expect(listInworldVoices({ apiKey: "test-key" })).rejects.toThrow( + "Inworld voices API error (503): service unavailable", + ); + }); + + it("filters out voices with empty voiceId", async () => { + queueGuardedResponse( + new Response( + JSON.stringify({ + voices: [ + { voiceId: "", displayName: "Empty" }, + { voiceId: "Dennis", displayName: "Dennis" }, + ], + }), + { status: 200 }, + ), + ); + + const voices = await listInworldVoices({ apiKey: "test-key" }); + expect(voices).toHaveLength(1); + expect(voices[0].id).toBe("Dennis"); + }); + + it("returns empty array when no voices present", async () => { + queueGuardedResponse(new Response(JSON.stringify({}), { status: 200 })); + + const voices = await 
listInworldVoices({ apiKey: "test-key" }); + expect(voices).toEqual([]); + }); + + it("passes language filter as query parameter", async () => { + queueGuardedResponse(new Response(JSON.stringify({ voices: [] }), { status: 200 })); + + await listInworldVoices({ apiKey: "test-key", language: "EN_US" }); + + expect(lastGuardRequest().url).toBe("https://api.inworld.ai/voices/v1/voices?languages=EN_US"); + }); + + it("releases the guarded dispatcher after success", async () => { + const { release } = queueGuardedResponse( + new Response(JSON.stringify({ voices: [] }), { status: 200 }), + ); + + await listInworldVoices({ apiKey: "test-key" }); + + expect(release).toHaveBeenCalledTimes(1); + }); +}); + +describe("inworldTTS", () => { + afterEach(() => { + fetchWithSsrFGuardMock.mockClear(); + vi.restoreAllMocks(); + }); + + it("concatenates base64 audio chunks from streaming response", async () => { + const chunk1 = Buffer.from("audio-chunk-1").toString("base64"); + const chunk2 = Buffer.from("audio-chunk-2").toString("base64"); + const body = [ + JSON.stringify({ result: { audioContent: chunk1 } }), + JSON.stringify({ result: { audioContent: chunk2 } }), + ].join("\n"); + + queueGuardedResponse(new Response(body, { status: 200 })); + + const buffer = await inworldTTS({ + text: "Hello world", + apiKey: "test-key", + }); + + expect(buffer).toEqual( + Buffer.concat([Buffer.from("audio-chunk-1"), Buffer.from("audio-chunk-2")]), + ); + }); + + it("throws on HTTP errors with response body", async () => { + queueGuardedResponse(new Response("bad request body", { status: 400 })); + + await expect(inworldTTS({ text: "test", apiKey: "test-key" })).rejects.toThrow( + "Inworld TTS API error (400): bad request body", + ); + }); + + it("throws on in-stream errors", async () => { + const body = JSON.stringify({ + error: { code: 3, message: "Invalid voice ID" }, + }); + queueGuardedResponse(new Response(body, { status: 200 })); + + await expect(inworldTTS({ text: "test", apiKey: 
"test-key" })).rejects.toThrow( + "Inworld TTS stream error (3): Invalid voice ID", + ); + }); + + it("throws on empty audio response", async () => { + const body = JSON.stringify({ result: { audioContent: "" } }); + queueGuardedResponse(new Response(body, { status: 200 })); + + await expect(inworldTTS({ text: "test", apiKey: "test-key" })).rejects.toThrow( + "Inworld TTS returned no audio data", + ); + }); + + it("throws descriptive error on non-JSON line in stream", async () => { + queueGuardedResponse(new Response("Rate limited", { status: 200 })); + + await expect(inworldTTS({ text: "test", apiKey: "test-key" })).rejects.toThrow( + "Inworld TTS stream parse error: unexpected non-JSON line:", + ); + }); + + it("sends correct request body with defaults", async () => { + const chunk = Buffer.from("audio").toString("base64"); + queueGuardedResponse( + new Response(JSON.stringify({ result: { audioContent: chunk } }), { status: 200 }), + ); + + await inworldTTS({ text: "Hello", apiKey: "test-key" }); + + const request = lastGuardRequest(); + expect(request.url).toBe("https://api.inworld.ai/tts/v1/voice:stream"); + expect(request.auditContext).toBe("inworld-tts"); + expect(request.policy).toEqual({ hostnameAllowlist: ["api.inworld.ai"] }); + expect(request.init?.method).toBe("POST"); + const headers = new Headers(request.init?.headers); + expect(headers.get("authorization")).toBe("Basic test-key"); + expect(headers.get("content-type")).toBe("application/json"); + expect(JSON.parse(readRequestBody(request))).toEqual({ + text: "Hello", + voiceId: "Sarah", + modelId: "inworld-tts-1.5-max", + audioConfig: { audioEncoding: "MP3" }, + }); + }); + + it("includes temperature and sampleRateHertz when provided", async () => { + const chunk = Buffer.from("audio").toString("base64"); + queueGuardedResponse( + new Response(JSON.stringify({ result: { audioContent: chunk } }), { status: 200 }), + ); + + await inworldTTS({ + text: "Hello", + apiKey: "test-key", + voiceId: "Ashley", + 
modelId: "inworld-tts-1.5-mini", + audioEncoding: "PCM", + sampleRateHertz: 22_050, + temperature: 0.8, + }); + + const callBody = JSON.parse(readRequestBody(lastGuardRequest())); + expect(callBody.voiceId).toBe("Ashley"); + expect(callBody.modelId).toBe("inworld-tts-1.5-mini"); + expect(callBody.audioConfig.audioEncoding).toBe("PCM"); + expect(callBody.audioConfig.sampleRateHertz).toBe(22_050); + expect(callBody.temperature).toBe(0.8); + }); + + it("uses custom base URL", async () => { + const chunk = Buffer.from("audio").toString("base64"); + queueGuardedResponse( + new Response(JSON.stringify({ result: { audioContent: chunk } }), { status: 200 }), + ); + + await inworldTTS({ + text: "Hello", + apiKey: "test-key", + baseUrl: "https://custom.inworld.example.com/", + }); + + expect(lastGuardRequest().url).toBe("https://custom.inworld.example.com/tts/v1/voice:stream"); + expect(lastGuardRequest().policy).toEqual({ + hostnameAllowlist: ["custom.inworld.example.com"], + }); + }); + + it("skips empty lines in streaming response", async () => { + const chunk = Buffer.from("audio").toString("base64"); + const body = `\n${JSON.stringify({ result: { audioContent: chunk } })}\n\n`; + queueGuardedResponse(new Response(body, { status: 200 })); + + const buffer = await inworldTTS({ text: "test", apiKey: "test-key" }); + expect(buffer).toEqual(Buffer.from("audio")); + }); + + it("releases the guarded dispatcher after success", async () => { + const chunk = Buffer.from("audio").toString("base64"); + const { release } = queueGuardedResponse( + new Response(JSON.stringify({ result: { audioContent: chunk } }), { status: 200 }), + ); + + await inworldTTS({ text: "test", apiKey: "test-key" }); + + expect(release).toHaveBeenCalledTimes(1); + }); + + it("releases the guarded dispatcher after failure", async () => { + const { release } = queueGuardedResponse(new Response("fail", { status: 500 })); + + await expect(inworldTTS({ text: "test", apiKey: "test-key" })).rejects.toThrow(); + 
expect(release).toHaveBeenCalledTimes(1); + }); +}); diff --git a/extensions/inworld/tts.ts b/extensions/inworld/tts.ts new file mode 100644 index 00000000000..e5009d1e8b5 --- /dev/null +++ b/extensions/inworld/tts.ts @@ -0,0 +1,190 @@ +import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech-core"; +import { fetchWithSsrFGuard, type SsrFPolicy } from "openclaw/plugin-sdk/ssrf-runtime"; + +export const DEFAULT_INWORLD_BASE_URL = "https://api.inworld.ai"; +export const DEFAULT_INWORLD_VOICE_ID = "Sarah"; +export const DEFAULT_INWORLD_MODEL_ID = "inworld-tts-1.5-max"; + +export const INWORLD_TTS_MODELS = [ + "inworld-tts-1.5-max", + "inworld-tts-1.5-mini", + "inworld-tts-1-max", + "inworld-tts-1", +] as const; + +export type InworldAudioEncoding = + | "MP3" + | "OGG_OPUS" + | "LINEAR16" + | "PCM" + | "WAV" + | "ALAW" + | "MULAW" + | "FLAC"; + +export function normalizeInworldBaseUrl(baseUrl?: string): string { + const trimmed = baseUrl?.trim(); + return trimmed?.replace(/\/+$/, "") || DEFAULT_INWORLD_BASE_URL; +} + +function ssrfPolicyFromInworldBaseUrl(baseUrl: string): SsrFPolicy | undefined { + try { + const parsed = new URL(baseUrl); + if (parsed.protocol !== "http:" && parsed.protocol !== "https:") { + return undefined; + } + return { hostnameAllowlist: [parsed.hostname] }; + } catch { + return undefined; + } +} + +/** + * Calls the Inworld streaming TTS endpoint and concatenates every audio chunk + * into a single buffer. The stream returns newline-delimited JSON, each line + * carrying base64 audio in `result.audioContent`. 
+ */ +export async function inworldTTS(params: { + text: string; + apiKey: string; + baseUrl?: string; + voiceId?: string; + modelId?: string; + audioEncoding?: InworldAudioEncoding; + sampleRateHertz?: number; + temperature?: number; + timeoutMs?: number; +}): Promise { + const baseUrl = normalizeInworldBaseUrl(params.baseUrl); + const url = `${baseUrl}/tts/v1/voice:stream`; + const requestBody = JSON.stringify({ + text: params.text, + voiceId: params.voiceId ?? DEFAULT_INWORLD_VOICE_ID, + modelId: params.modelId ?? DEFAULT_INWORLD_MODEL_ID, + audioConfig: { + audioEncoding: params.audioEncoding ?? "MP3", + ...(params.sampleRateHertz && { sampleRateHertz: params.sampleRateHertz }), + }, + ...(params.temperature != null && { temperature: params.temperature }), + }); + + const { response, release } = await fetchWithSsrFGuard({ + url, + init: { + method: "POST", + headers: { + "Content-Type": "application/json", + // apiKey is the Base64-encoded credential string copied from the + // Inworld dashboard; it is sent verbatim as the HTTP Basic + // credential. Do not Base64-encode it here, and do not normalize + // bearer-style tokens. 
+ Authorization: `Basic ${params.apiKey}`, + }, + body: requestBody, + }, + timeoutMs: params.timeoutMs, + policy: ssrfPolicyFromInworldBaseUrl(baseUrl), + auditContext: "inworld-tts", + }); + + try { + if (!response.ok) { + const errorBody = await response.text().catch(() => ""); + throw new Error(`Inworld TTS API error (${response.status}): ${errorBody}`); + } + + const body = await response.text(); + const chunks: Buffer[] = []; + + for (const line of body.split("\n")) { + const trimmed = line.trim(); + if (!trimmed) { + continue; + } + + let parsed: { + result?: { audioContent?: string }; + error?: { code?: number; message?: string }; + }; + try { + parsed = JSON.parse(trimmed) as typeof parsed; + } catch { + throw new Error( + `Inworld TTS stream parse error: unexpected non-JSON line: ${trimmed.slice(0, 80)}`, + ); + } + + if (parsed.error) { + throw new Error(`Inworld TTS stream error (${parsed.error.code}): ${parsed.error.message}`); + } + + if (parsed.result?.audioContent) { + chunks.push(Buffer.from(parsed.result.audioContent, "base64")); + } + } + + if (chunks.length === 0) { + throw new Error("Inworld TTS returned no audio data"); + } + + return Buffer.concat(chunks); + } finally { + await release(); + } +} + +export async function listInworldVoices(params: { + apiKey: string; + baseUrl?: string; + language?: string; + timeoutMs?: number; +}): Promise { + const baseUrl = normalizeInworldBaseUrl(params.baseUrl); + const langParam = params.language ? 
`?languages=${encodeURIComponent(params.language)}` : ""; + const url = `${baseUrl}/voices/v1/voices${langParam}`; + + const { response, release } = await fetchWithSsrFGuard({ + url, + init: { + method: "GET", + headers: { + Authorization: `Basic ${params.apiKey}`, + }, + }, + timeoutMs: params.timeoutMs, + policy: ssrfPolicyFromInworldBaseUrl(baseUrl), + auditContext: "inworld-voices", + }); + + try { + if (!response.ok) { + const errorBody = await response.text().catch(() => ""); + throw new Error(`Inworld voices API error (${response.status}): ${errorBody}`); + } + + const json = (await response.json()) as { + voices?: Array<{ + voiceId?: string; + displayName?: string; + description?: string; + langCode?: string; + tags?: string[]; + source?: string; + }>; + }; + + return Array.isArray(json.voices) + ? json.voices + .map((voice) => ({ + id: voice.voiceId?.trim() ?? "", + name: voice.displayName?.trim() || undefined, + description: voice.description?.trim() || undefined, + locale: voice.langCode || undefined, + gender: voice.tags?.find((t) => t === "male" || t === "female") || undefined, + })) + .filter((voice) => voice.id.length > 0) + : []; + } finally { + await release(); + } +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 5ebedc4822b..ec1e96d2742 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -675,6 +675,12 @@ importers: specifier: workspace:* version: link:../../packages/plugin-sdk + extensions/inworld: + devDependencies: + '@openclaw/plugin-sdk': + specifier: workspace:* + version: link:../../packages/plugin-sdk + extensions/irc: devDependencies: '@openclaw/plugin-sdk': diff --git a/src/channels/plugins/module-loader.test.ts b/src/channels/plugins/module-loader.test.ts index 4ee33f12d26..66463bba47c 100644 --- a/src/channels/plugins/module-loader.test.ts +++ b/src/channels/plugins/module-loader.test.ts @@ -3,6 +3,7 @@ import os from "node:os"; import path from "node:path"; import { afterEach, describe, expect, it, vi } from "vitest"; import { 
importFreshModule } from "../../../test/helpers/import-fresh.ts"; +import { shouldExpectNativeJitiForJavaScriptTestRuntime } from "../../test-utils/jiti-runtime.js"; import { isJavaScriptModulePath, resolveCompiledBundledModulePath, @@ -92,7 +93,7 @@ describe("channel plugin module loader helpers", () => { expect(createJiti).not.toHaveBeenCalled(); }); - it("uses native Jiti import for Windows dist loads", async () => { + it("uses the runtime-supported Jiti boundary for Windows dist loads", async () => { const createJiti = vi.fn(() => vi.fn(() => ({ ok: true }))); vi.doMock("jiti", () => ({ createJiti, @@ -119,7 +120,7 @@ describe("channel plugin module loader helpers", () => { expect(createJiti).toHaveBeenCalledWith( expect.any(String), expect.objectContaining({ - tryNative: true, + tryNative: shouldExpectNativeJitiForJavaScriptTestRuntime(), }), ); } finally { diff --git a/src/cli/program/preaction.test.ts b/src/cli/program/preaction.test.ts index 49342b01ab9..a57e08f7771 100644 --- a/src/cli/program/preaction.test.ts +++ b/src/cli/program/preaction.test.ts @@ -463,8 +463,8 @@ describe("registerPreActionHooks", () => { }); await runPreAction({ - parseArgv: ["agents", "list"], - processArgv: ["node", "openclaw", "agents", "list", "--json"], + parseArgv: ["message", "send"], + processArgv: ["node", "openclaw", "message", "send", "--json"], }); expect(ensurePluginRegistryLoadedMock).toHaveBeenCalled(); diff --git a/src/plugin-sdk/facade-loader.test.ts b/src/plugin-sdk/facade-loader.test.ts index 98e8de84680..05b6a0150b4 100644 --- a/src/plugin-sdk/facade-loader.test.ts +++ b/src/plugin-sdk/facade-loader.test.ts @@ -1,6 +1,7 @@ import fs from "node:fs"; import path from "node:path"; import { afterEach, describe, expect, it, vi } from "vitest"; +import { shouldExpectNativeJitiForJavaScriptTestRuntime } from "../test-utils/jiti-runtime.js"; import { listImportedBundledPluginFacadeIds, loadBundledPluginPublicSurfaceModuleSync, @@ -126,7 +127,7 @@ describe("plugin-sdk 
facade loader", () => { expect(listImportedFacadeRuntimeIds()).toEqual(["demo"]); }); - it("uses native Jiti import for Windows dist facade loads", () => { + it("uses the runtime-supported Jiti boundary for Windows dist facade loads", () => { const dir = createTempDirSync("openclaw-facade-loader-windows-dist-"); const bundledPluginsDir = path.join(dir, "dist"); fs.mkdirSync(path.join(bundledPluginsDir, "demo"), { recursive: true }); @@ -158,7 +159,7 @@ describe("plugin-sdk facade loader", () => { expect(createJitiCalls[0]?.[0]).toEqual(expect.any(String)); expect(createJitiCalls[0]?.[1]).toEqual( expect.objectContaining({ - tryNative: true, + tryNative: shouldExpectNativeJitiForJavaScriptTestRuntime(), }), ); } finally { diff --git a/src/plugins/doctor-contract-registry.test.ts b/src/plugins/doctor-contract-registry.test.ts index 44d0a3470f1..e51c3578fbe 100644 --- a/src/plugins/doctor-contract-registry.test.ts +++ b/src/plugins/doctor-contract-registry.test.ts @@ -1,6 +1,7 @@ import fs from "node:fs"; import path from "node:path"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { shouldExpectNativeJitiForJavaScriptTestRuntime } from "../test-utils/jiti-runtime.js"; import { cleanupTrackedTempDirs, makeTrackedTempDir } from "./test-helpers/fs-fixtures.js"; import { getRegistryJitiMocks, @@ -34,7 +35,7 @@ describe("doctor-contract-registry getJiti", () => { clearPluginDoctorContractRegistryCache(); }); - it("uses native jiti loading on Windows for contract-api modules", () => { + it("uses the runtime-supported Jiti boundary on Windows for contract-api modules", () => { const pluginRoot = makeTempDir(); fs.writeFileSync(path.join(pluginRoot, "contract-api.js"), "export default {};\n", "utf-8"); mocks.loadPluginManifestRegistry.mockReturnValue({ @@ -42,6 +43,7 @@ describe("doctor-contract-registry getJiti", () => { diagnostics: [], }); const platformSpy = vi.spyOn(process, "platform", "get").mockReturnValue("win32"); + const 
expectedTryNative = shouldExpectNativeJitiForJavaScriptTestRuntime(); try { listPluginDoctorLegacyConfigRules({ @@ -56,7 +58,7 @@ describe("doctor-contract-registry getJiti", () => { expect(mocks.createJiti.mock.calls[0]?.[0]).toBe(path.join(pluginRoot, "contract-api.js")); expect(mocks.createJiti.mock.calls[0]?.[1]).toEqual( expect.objectContaining({ - tryNative: true, + tryNative: expectedTryNative, }), ); }); diff --git a/src/plugins/setup-registry.test.ts b/src/plugins/setup-registry.test.ts index e2326258aaf..a332123705d 100644 --- a/src/plugins/setup-registry.test.ts +++ b/src/plugins/setup-registry.test.ts @@ -1,6 +1,7 @@ import fs from "node:fs"; import path from "node:path"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { shouldExpectNativeJitiForJavaScriptTestRuntime } from "../test-utils/jiti-runtime.js"; import { cleanupTrackedTempDirs, makeTrackedTempDir } from "./test-helpers/fs-fixtures.js"; import { getRegistryJitiMocks, @@ -176,7 +177,7 @@ describe("setup-registry getJiti", () => { clearPluginSetupRegistryCache(); }); - it("uses native jiti loading on Windows for setup-api modules", () => { + it("uses the runtime-supported Jiti boundary on Windows for setup-api modules", () => { const pluginRoot = makeTempDir(); fs.writeFileSync(path.join(pluginRoot, "setup-api.js"), "export default {};\n", "utf-8"); mocks.loadPluginManifestRegistry.mockReturnValue({ @@ -185,6 +186,7 @@ describe("setup-registry getJiti", () => { }); const platformSpy = vi.spyOn(process, "platform", "get").mockReturnValue("win32"); const restoreVersions = forceNodeRuntimeVersionsForTest(); + const expectedTryNative = shouldExpectNativeJitiForJavaScriptTestRuntime(); try { resolvePluginSetupRegistry({ @@ -200,7 +202,7 @@ describe("setup-registry getJiti", () => { expect(mocks.createJiti.mock.calls[0]?.[0]).toBe(path.join(pluginRoot, "setup-api.js")); expect(mocks.createJiti.mock.calls[0]?.[1]).toEqual( expect.objectContaining({ - tryNative: true, + 
tryNative: expectedTryNative, }), ); }); diff --git a/src/test-utils/jiti-runtime.ts b/src/test-utils/jiti-runtime.ts new file mode 100644 index 00000000000..f6051312b63 --- /dev/null +++ b/src/test-utils/jiti-runtime.ts @@ -0,0 +1,5 @@ +export function shouldExpectNativeJitiForJavaScriptTestRuntime(): boolean { + return ( + typeof (process.versions as { bun?: string }).bun !== "string" && process.platform !== "win32" + ); +}