// Mirror of https://github.com/AventurasTeam/Aventuras.git (synced 2026-04-28 11:50:15 +00:00)
// 577 lines, 15 KiB, TypeScript
/**
 * Text-To-Speech (TTS) Service
 *
 * Provides audio generation via OpenAI-compatible APIs (and others) with streaming support.
 * Designed for extensibility to support multiple TTS providers.
 */
|
|
|
|
import { PROVIDERS } from '../sdk/providers/config';
|
|
import type { APIProfile } from "$lib/types";
|
|
import { corsFetch } from "$lib/services/discovery/utils";
|
|
|
|
/**
 * TTS configuration - matches TTSServiceSettings in settings.svelte.ts.
 */
export interface TTSSettings {
  /** Master switch; `AITTSService.initialize` creates no provider when this is false. */
  enabled: boolean;
  /**
   * Base URL of the OpenAI-compatible API. When empty, the OpenAI-compatible
   * provider falls back to the OpenRouter base URL.
   */
  endpoint: string;
  /** API key sent as a `Bearer` token by the OpenAI-compatible provider. */
  apiKey: string;
  /** Model identifier sent with OpenAI-compatible speech requests. */
  model: string;
  /** Default voice id, used by `AITTSService` when the caller does not pass one. */
  voice: string;
  /**
   * Speech speed. Sent in the request body for OpenAI-compatible APIs and
   * applied as the audio playback rate for the Google provider.
   */
  speed: number;
  /** Not read in this file; presumably consumed by the UI layer (TODO confirm). */
  autoPlay: boolean;
  /** Text-filtering option; not read in this file (TODO confirm where it is applied). */
  excludedCharacters: string;
  /** Text-filtering option; not read in this file (TODO confirm where it is applied). */
  removeHtmlTags: boolean;
  /** Text-filtering option; not read in this file (TODO confirm where it is applied). */
  removeAllHtmlContent: boolean;
  /** Text-filtering option; not read in this file (TODO confirm where it is applied). */
  htmlTagsToRemoveContent: string;
  /** Selects the provider: `"google"` for Google Translate, otherwise OpenAI-compatible. */
  provider: "openai" | "google";
}
|
|
|
|
/**
 * A selectable voice (or, for Google Translate, a language) offered by a provider.
 */
export interface TTSVoice {
  /** Human-readable label. */
  name: string;
  /** Identifier passed to the provider as the `voice` argument. */
  id: string;
  /** Language tag associated with the voice (e.g. "en-US"). */
  lang: string;
}
|
|
|
|
/**
 * One message of a chunked TTS stream.
 * NOTE(review): not referenced elsewhere in this file; the field semantics
 * below are inferred from the names and should be confirmed against consumers.
 */
export interface TTSStreamChunk {
  /** Discriminator: a data chunk, the end-of-stream marker, or an error. */
  type: "data" | "end" | "error";
  /** Audio bytes; presumably present when `type` is "data". */
  data?: ArrayBuffer;
  /** Error description; presumably present when `type` is "error". */
  error?: string;
}
|
|
|
|
/**
 * Base TTS Provider - extend this for custom implementations.
 *
 * Subclasses supply voice discovery and speech generation; this base class
 * owns playback through a single HTML audio element at a time.
 */
export abstract class TTSProvider {
  /** Audio element of the playback currently in progress, if any. */
  private currentAudio: HTMLAudioElement | null = null;
  /** Object URL backing `currentAudio`; tracked so `stopAudio()` can release it. */
  private currentAudioUrl: string | null = null;

  /** Human-readable provider name. */
  abstract get name(): string;

  /**
   * Get available voices for this provider
   */
  abstract getAvailableVoices(): Promise<TTSVoice[]>;

  /**
   * Generate TTS audio and return as blob
   */
  abstract generateSpeech(text: string, voice: string): Promise<Blob>;

  /**
   * Play audio blob using HTML Audio element.
   *
   * IMPORTANT: We create a fresh audio element each time to avoid crashes
   * on Linux with newer GStreamer versions when changing audio source.
   *
   * Any playback already in progress is stopped first. The returned promise
   * resolves when playback reaches the end and rejects when loading or
   * playing fails. If playback is interrupted through `stopAudio()` (which a
   * later `playAudio()` call also triggers) the promise is left pending.
   *
   * @param blob - The audio blob to play
   * @param onProgress - Optional callback receiving playback progress as a percentage (0-100)
   * @param playbackRate - Optional playback speed
   */
  async playAudio(
    blob: Blob,
    onProgress?: (progress: number) => void,
    playbackRate = 1.0,
  ): Promise<void> {
    return new Promise((resolve, reject) => {
      // Stop any previous playback and destroy old element
      this.stopAudio();

      const url = URL.createObjectURL(blob);

      // Create a FRESH audio element each time (workaround for GStreamer crash)
      const audio = new Audio();
      this.currentAudio = audio;
      this.currentAudioUrl = url;

      // Callbacks from an element that has been stopped or replaced are ignored
      // (prevents false errors when stopAudio() clears an old element).
      const isCurrent = (): boolean => this.currentAudio === audio;

      // Detach handlers, free the object URL and forget the element.
      const release = (): void => {
        URL.revokeObjectURL(url);
        audio.oncanplaythrough = null;
        audio.onended = null;
        audio.onerror = null;
        audio.ontimeupdate = null;
        this.currentAudio = null;
        this.currentAudioUrl = null;
      };

      audio.onerror = (e) => {
        if (!isCurrent()) return;
        console.error("[TTS] Audio playback error:", e);
        release();
        reject(new Error("Failed to play audio"));
      };

      audio.onended = () => {
        if (!isCurrent()) return;
        release();
        resolve();
      };

      if (onProgress) {
        audio.ontimeupdate = () => {
          if (audio.duration) {
            onProgress((audio.currentTime / audio.duration) * 100);
          }
        };
      }

      // Wait for audio to be ready before setting playbackRate and playing
      audio.oncanplaythrough = () => {
        if (!isCurrent()) return;

        try {
          if (playbackRate !== 1.0) {
            audio.playbackRate = playbackRate;
          }
          audio.play().catch((err) => {
            if (!isCurrent()) return;
            console.error("[TTS] Play failed:", err);
            release();
            reject(err);
          });
        } catch (err) {
          if (!isCurrent()) return;
          console.error("[TTS] Error during playback setup:", err);
          release();
          reject(err);
        }
      };

      audio.src = url;
      audio.load();
    });
  }

  /**
   * Stop current playback and release the element's object URL.
   * Safe to call when nothing is playing.
   */
  stopAudio(): void {
    const audio = this.currentAudio;
    const url = this.currentAudioUrl;

    // Clear references FIRST to prevent callbacks from firing
    this.currentAudio = null;
    this.currentAudioUrl = null;

    if (audio) {
      try {
        // Clear all handlers to prevent any callbacks
        audio.oncanplaythrough = null;
        audio.onended = null;
        audio.onerror = null;
        audio.ontimeupdate = null;

        audio.pause();
        audio.src = '';
      } catch {
        // Ignore errors during cleanup
      }
    }

    // The end/error handlers that would normally revoke the URL were just
    // detached, so it has to be released here or it leaks for the page lifetime.
    if (url) {
      URL.revokeObjectURL(url);
    }
  }

  /**
   * Get current playback state.
   *
   * @returns `playing` is true while the element is not paused; `progress` is
   * the element's `currentTime` and `duration` its `duration` (0 when unknown).
   * All values are false/0 when nothing is loaded.
   */
  getPlaybackState(): { playing: boolean; progress: number; duration: number } {
    if (!this.currentAudio) {
      return { playing: false, progress: 0, duration: 0 };
    }
    return {
      playing: !this.currentAudio.paused,
      progress: this.currentAudio.currentTime,
      duration: this.currentAudio.duration || 0,
    };
  }
}
|
|
|
|
/**
 * Split text into chunks for TTS (Google Translate has ~200 char limit).
 * Similar to SillyTavern's splitRecursive approach.
 *
 * Each chunk is cut at the last occurrence, within the first `maxLength`
 * UTF-16 code units, of the highest-priority delimiter present (paragraph
 * break, line break, sentence end, comma, space). When no delimiter is found
 * the text is cut at `maxLength`, moved by one code unit if that position
 * would fall inside a surrogate pair. Chunks are trimmed and empty chunks
 * are dropped.
 *
 * @param text - Text to split; empty input yields an empty array
 * @param maxLength - Maximum chunk length in UTF-16 code units (fractions are floored)
 * @returns The chunks in their original order
 * @throws RangeError if the text needs splitting and `maxLength` is not a number >= 1
 */
function splitTextForTTS(text: string, maxLength = 200): string[] {
  if (!text || text.length === 0) return [];
  if (text.length <= maxLength) return [text];

  // A limit below 1 (or NaN) can never make progress and would loop forever.
  if (!(maxLength >= 1)) {
    throw new RangeError(
      `splitTextForTTS: maxLength must be >= 1, got ${maxLength}`,
    );
  }
  const limit = Math.floor(maxLength);

  // Priority: paragraph breaks, sentence ends, commas, spaces
  const delimiters = ["\n\n", "\n", ". ", "! ", "? ", ", ", " "];
  const chunks: string[] = [];

  let remaining = text;
  while (remaining.length > 0) {
    if (remaining.length <= limit) {
      chunks.push(remaining);
      break;
    }

    // The search window is the same for every delimiter, so build it once.
    const searchRange = remaining.substring(0, limit);

    let splitIndex = -1;
    // Try each delimiter in priority order
    for (const delimiter of delimiters) {
      const lastIndex = searchRange.lastIndexOf(delimiter);
      if (lastIndex > 0) {
        splitIndex = lastIndex + delimiter.length;
        break;
      }
    }

    // If no delimiter found, force split at the limit
    if (splitIndex === -1) {
      splitIndex = limit;

      // Never cut between a high and a low surrogate: a lone surrogate makes
      // encodeURIComponent() throw URIError when the chunk is put in a URL.
      const before = remaining.charCodeAt(splitIndex - 1);
      const after = remaining.charCodeAt(splitIndex);
      const cutsSurrogatePair =
        before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
      if (cutsSurrogatePair) {
        // Back off by one unit; with a limit of 1 keep the pair whole instead
        // so the loop still advances.
        splitIndex += splitIndex > 1 ? -1 : 1;
      }
    }

    const chunk = remaining.substring(0, splitIndex).trim();
    if (chunk.length > 0) {
      chunks.push(chunk);
    }
    remaining = remaining.substring(splitIndex).trim();
  }

  return chunks;
}
|
|
|
|
/**
 * Languages offered by the Google Translate provider, exposed as voices.
 * Each entry uses the language code as both its `id` and its `lang`.
 */
export const GOOGLE_TRANSLATE_LANGUAGES: TTSVoice[] = (
  [
    ["English (US)", "en"],
    ["English (UK)", "en-GB"],
    ["Italian", "it"],
    ["Spanish", "es"],
    ["French", "fr"],
    ["German", "de"],
    ["Japanese", "ja"],
    ["Korean", "ko"],
    ["Chinese (Simplified)", "zh-CN"],
    ["Chinese (Traditional)", "zh-TW"],
    ["Russian", "ru"],
    ["Portuguese", "pt"],
    ["Dutch", "nl"],
    ["Polish", "pl"],
    ["Turkish", "tr"],
  ] as ReadonlyArray<readonly [string, string]>
).map(([label, code]) => ({ name: label, id: code, lang: code }));
|
|
|
|
/**
 * Google Translate TTS Provider
 * Unofficial API usage (similar to SillyTavern implementation)
 */
export class GoogleTranslateTTSProvider extends TTSProvider {
  /** Settings the provider was created with (not read by this class itself). */
  private settings: TTSSettings;

  constructor(settings: TTSSettings) {
    super();
    this.settings = settings;
  }

  override get name(): string {
    return "Google Translate";
  }

  /**
   * Returns the static list of supported languages, exposed as voices.
   */
  override async getAvailableVoices(): Promise<TTSVoice[]> {
    return GOOGLE_TRANSLATE_LANGUAGES;
  }

  /**
   * Generate speech by requesting one audio clip per text chunk and
   * concatenating the results.
   *
   * @param text - Text to speak; must contain non-whitespace characters
   * @param voice - Target language code, sent as the `tl` query parameter
   * @returns A single "audio/mpeg" blob made of all chunk responses in order
   * @throws Error if the text is empty or any chunk request returns a non-OK status
   */
  override async generateSpeech(text: string, voice: string): Promise<Blob> {
    if (!text || text.trim().length === 0) {
      throw new Error("TTS: Cannot generate speech for empty text");
    }

    // Split text into chunks for Google's ~200 char limit
    const chunks = splitTextForTTS(text, 200);
    const audioBuffers: ArrayBuffer[] = [];

    // Escape the language code too, so an unexpected value cannot add or
    // override query parameters. Plain codes such as "en-GB" are unchanged.
    const language = encodeURIComponent(voice);

    // Chunks are fetched one after another so the audio stays in text order.
    for (const chunk of chunks) {
      const encodedText = encodeURIComponent(chunk);
      const url = `https://translate.google.com/translate_tts?ie=UTF-8&q=${encodedText}&tl=${language}&client=tw-ob`;

      // Use corsFetch to bypass CORS restrictions in Tauri
      const response = await corsFetch(url);

      if (!response.ok) {
        const error = await response.text();
        throw new Error(
          `Google TTS generation failed: ${response.status} - ${error}`,
        );
      }

      audioBuffers.push(await response.arrayBuffer());
    }

    // Concatenate all audio chunks into a single blob
    return new Blob(audioBuffers, { type: "audio/mpeg" });
  }
}
|
|
|
|
/**
 * OpenAI-compatible TTS Provider
 * Supports OpenAI, OpenRouter, and any OpenAI-compatible endpoint
 */
export class OpenAICompatibleTTSProvider extends TTSProvider {
  /** Path of the speech-generation route, relative to the API base URL. */
  private static readonly SPEECH_PATH = "/audio/speech";

  private settings: TTSSettings;
  /** Voice lists already resolved, keyed by speech endpoint URL. */
  private voiceCache: Map<string, TTSVoice[]> = new Map();

  constructor(settings: TTSSettings) {
    super();
    this.settings = settings;
  }

  override get name(): string {
    return "OpenAI Compatible";
  }

  /**
   * Get endpoint URL.
   *
   * @returns The configured endpoint normalised to end with `/audio/speech`,
   * or the OpenRouter speech URL when no endpoint is configured.
   */
  private getEndpoint(): string {
    const speechPath = OpenAICompatibleTTSProvider.SPEECH_PATH;

    // If no custom endpoint, use OpenRouter default
    if (!this.settings.endpoint) {
      return `${PROVIDERS.openrouter.baseUrl}${speechPath}`;
    }

    // Ensure endpoint ends with /audio/speech; drop every trailing slash first
    // so "https://host/v1//" does not produce a double slash in the path.
    const url = this.settings.endpoint.replace(/\/+$/, "");
    return url.endsWith(speechPath) ? url : `${url}${speechPath}`;
  }

  /**
   * Verify settings are valid.
   *
   * @throws Error if the API key, model or voice is missing
   */
  private validateSettings(): void {
    if (!this.settings.apiKey) {
      throw new Error("TTS: No API key configured");
    }
    if (!this.settings.model) {
      throw new Error("TTS: No model selected");
    }
    if (!this.settings.voice) {
      throw new Error("TTS: No voice selected");
    }
  }

  /**
   * Get request headers (JSON content type plus Bearer authorization).
   */
  private getHeaders(): HeadersInit {
    return {
      "Content-Type": "application/json",
      Authorization: `Bearer ${this.settings.apiKey}`,
    };
  }

  /**
   * Get available voices - cached per endpoint.
   *
   * Queries the provider's `/models` route and exposes every model whose id
   * contains "tts" as a voice. When the request fails, or yields no such
   * model, OpenAI's standard voice set is returned instead. Either result is
   * cached for the endpoint.
   */
  override async getAvailableVoices(): Promise<TTSVoice[]> {
    const endpoint = this.getEndpoint();

    // Return cached voices if available
    const cached = this.voiceCache.get(endpoint);
    if (cached) {
      return cached;
    }

    // Default voices matching OpenAI's standard set
    const defaultVoices: TTSVoice[] = [
      { name: "Alloy", id: "alloy", lang: "en-US" },
      { name: "Echo", id: "echo", lang: "en-US" },
      { name: "Fable", id: "fable", lang: "en-US" },
      { name: "Onyx", id: "onyx", lang: "en-US" },
      { name: "Nova", id: "nova", lang: "en-US" },
      { name: "Shimmer", id: "shimmer", lang: "en-US" },
    ];

    // Try to fetch custom voices from provider (optional)
    try {
      // getEndpoint() always ends with the speech path, so strip that suffix
      // to get back to the API base URL.
      const baseUrl = endpoint.slice(
        0,
        -OpenAICompatibleTTSProvider.SPEECH_PATH.length,
      );
      const response = await fetch(`${baseUrl}/models`, {
        headers: this.getHeaders(),
      });

      if (response.ok) {
        // The payload is untrusted JSON: narrow it step by step so a single
        // malformed entry is skipped instead of discarding the whole list.
        const payload: unknown = await response.json();
        const models = (payload as { data?: unknown } | null)?.data;

        if (Array.isArray(models)) {
          const voices: TTSVoice[] = [];
          for (const model of models as unknown[]) {
            const id = (model as { id?: unknown } | null)?.id;
            if (typeof id === "string" && id.includes("tts")) {
              voices.push({ name: id, id, lang: "en-US" });
            }
          }

          if (voices.length > 0) {
            this.voiceCache.set(endpoint, voices);
            return voices;
          }
        }
      }
    } catch (err) {
      console.warn("[TTS] Failed to fetch custom voices, using defaults", err);
    }

    this.voiceCache.set(endpoint, defaultVoices);
    return defaultVoices;
  }

  /**
   * Generate TTS audio blob.
   *
   * @param text - Text to speak; must contain non-whitespace characters
   * @param voice - Voice id sent in the request body
   * @returns The response body as a blob (MP3 is requested)
   * @throws Error if settings are incomplete, the text is empty, or the API
   * responds with a non-OK status
   */
  override async generateSpeech(text: string, voice: string): Promise<Blob> {
    this.validateSettings();

    if (!text || text.trim().length === 0) {
      throw new Error("TTS: Cannot generate speech for empty text");
    }

    const response = await fetch(this.getEndpoint(), {
      method: "POST",
      headers: this.getHeaders(),
      body: JSON.stringify({
        model: this.settings.model,
        input: text,
        voice: voice,
        speed: this.settings.speed,
        response_format: "mp3",
      }),
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`TTS generation failed: ${response.status} - ${error}`);
    }

    return response.blob();
  }
}
|
|
|
|
/**
 * TTS Service - Main API for the application
 * Manages TTS operations and provider lifecycle
 */
export class AITTSService {
  private provider: TTSProvider | null = null;
  private settings: TTSSettings | null = null;
  private isPlaying = false;
  /** Most recently generated audio from `generateAndPlay`. */
  private currentAudio: Blob | null = null;
  /**
   * Id of the latest `generateAndPlay` request. It is bumped by every new
   * request and by `stopPlayback()`, which invalidates requests still
   * waiting for their audio to be generated.
   */
  private playbackRequestId = 0;

  /**
   * Initialize service with settings.
   *
   * Creates the provider selected by `settings.provider` and validates it by
   * fetching its voices. When TTS is disabled the provider is cleared.
   *
   * @throws Rethrows any error raised while creating or validating the provider
   */
  async initialize(settings: TTSSettings): Promise<void> {
    this.settings = settings;

    if (!settings.enabled) {
      this.provider = null;
      return;
    }

    try {
      if (settings.provider === "google") {
        this.provider = new GoogleTranslateTTSProvider(settings);
      } else {
        this.provider = new OpenAICompatibleTTSProvider(settings);
      }

      // Validate by fetching voices
      await this.provider.getAvailableVoices();
    } catch (error) {
      console.error("[TTSService] Failed to initialize provider:", error);
      this.provider = null;
      throw error;
    }
  }

  /**
   * Update settings and reinitialize.
   *
   * @param settings - Fields to override in the current settings
   * @throws Error if the service has never been initialized
   */
  async updateSettings(settings: Partial<TTSSettings>): Promise<void> {
    if (!this.settings) {
      throw new Error("TTS service not initialized");
    }

    this.settings = { ...this.settings, ...settings };
    await this.initialize(this.settings);
  }

  /**
   * Check if service is ready (TTS enabled and a provider created).
   */
  isReady(): boolean {
    return (this.settings?.enabled ?? false) && !!this.provider;
  }

  /**
   * Get available voices.
   *
   * @throws Error if no provider is initialized
   */
  async getAvailableVoices(): Promise<TTSVoice[]> {
    if (!this.provider) {
      throw new Error("TTS provider not initialized");
    }
    return this.provider.getAvailableVoices();
  }

  /**
   * Generate and play TTS audio.
   *
   * If `stopPlayback()` is called, or a newer `generateAndPlay` request is
   * made, while the audio is still being generated, the generated audio is
   * discarded and the call returns without playing it.
   *
   * @param text - Text to speak
   * @param voice - Voice id; defaults to the configured voice
   * @param onProgress - Optional playback progress callback, forwarded to the provider
   * @throws Error if the service is not ready; generation and playback errors propagate
   */
  async generateAndPlay(
    text: string,
    voice?: string,
    onProgress?: (progress: number) => void,
  ): Promise<void> {
    if (!this.provider || !this.settings) {
      throw new Error("TTS service not ready");
    }

    const voiceToUse = voice || this.settings.voice;
    // For Google TTS, speed is applied client-side via playbackRate
    // For OpenAI-compatible APIs, speed is handled server-side during generation
    const playbackRate =
      this.settings.provider === "google" ? this.settings.speed : 1.0;

    const requestId = ++this.playbackRequestId;

    try {
      this.isPlaying = true;
      const blob = await this.provider.generateSpeech(text, voiceToUse);

      // Generation is asynchronous: skip playback if this request was stopped
      // or superseded in the meantime, or if the provider has been removed.
      const player = this.provider;
      if (requestId !== this.playbackRequestId || !player) {
        return;
      }

      this.currentAudio = blob;
      await player.playAudio(blob, onProgress, playbackRate);
    } finally {
      // Only the latest request may clear the flag; an older request that
      // finishes late must not mark a newer, still active one as stopped.
      if (requestId === this.playbackRequestId) {
        this.isPlaying = false;
      }
    }
  }

  /**
   * Generate TTS audio without playing.
   *
   * @param text - Text to speak
   * @param voice - Voice id; defaults to the configured voice
   * @throws Error if the service is not ready
   */
  async generateSpeech(text: string, voice?: string): Promise<Blob> {
    if (!this.provider || !this.settings) {
      throw new Error("TTS service not ready");
    }

    const voiceToUse = voice || this.settings.voice;
    return this.provider.generateSpeech(text, voiceToUse);
  }

  /**
   * Stop playback and cancel any request still generating its audio.
   */
  stopPlayback(): void {
    if (!this.provider) return;
    // Invalidate in-flight requests so they do not start playing afterwards.
    this.playbackRequestId++;
    this.provider.stopAudio();
    this.isPlaying = false;
  }

  /**
   * Check if currently playing (true from the start of generation until
   * playback ends, fails or is stopped).
   */
  isCurrentlyPlaying(): boolean {
    return this.isPlaying;
  }

  /**
   * Get current playback progress as reported by the provider, or an idle
   * state when no provider exists.
   */
  getPlaybackProgress(): {
    playing: boolean;
    progress: number;
    duration: number;
  } {
    if (!this.provider) {
      return { playing: false, progress: 0, duration: 0 };
    }
    return this.provider.getPlaybackState();
  }
}

// Export singleton instance
export const aiTTSService = new AITTSService();
|