// Aventuras/src/lib/services/ai/utils/TTSService.ts

/**
 * Text-To-Speech (TTS) Service
 * Provides audio generation via OpenAI-compatible APIs (and others) with streaming support.
 * Designed for extensibility to support multiple TTS providers.
 */

import { PROVIDERS } from '../sdk/providers/config';
import type { APIProfile } from "$lib/types";
import { corsFetch } from "$lib/services/discovery/utils";

/**
 * TTS configuration - matches TTSServiceSettings in settings.svelte.ts.
 *
 * Several fields are only stored here and are not read anywhere in this file;
 * those are marked below.
 */
export interface TTSSettings {
  /** When false, `AITTSService.initialize` leaves the service without a provider. */
  enabled: boolean;
  /**
   * Endpoint for the OpenAI-compatible provider. `/audio/speech` is appended
   * when missing; when empty, the OpenRouter base URL from `PROVIDERS` is used.
   */
  endpoint: string;
  /** Sent as a `Bearer` token by the OpenAI-compatible provider. */
  apiKey: string;
  /** Model identifier sent in OpenAI-compatible speech requests. */
  model: string;
  /** Default voice, used when a call does not pass a voice explicitly. */
  voice: string;
  /**
   * Speech rate. Sent as `speed` in OpenAI-compatible requests; applied as the
   * audio element's playback rate for the Google provider.
   */
  speed: number;
  /** Not read in this file — presumably consumed by the UI; confirm against callers. */
  autoPlay: boolean;
  /** Not read in this file — presumably a text-filtering option applied by callers (TODO confirm). */
  excludedCharacters: string;
  /** Not read in this file — presumably a text-filtering option applied by callers (TODO confirm). */
  removeHtmlTags: boolean;
  /** Not read in this file — presumably a text-filtering option applied by callers (TODO confirm). */
  removeAllHtmlContent: boolean;
  /** Not read in this file — presumably a text-filtering option applied by callers (TODO confirm). */
  htmlTagsToRemoveContent: string;
  /**
   * Provider selection: "google" creates a `GoogleTranslateTTSProvider`,
   * any other value creates an `OpenAICompatibleTTSProvider`.
   */
  provider: "openai" | "google";
}

/**
 * A selectable voice as returned by `TTSProvider.getAvailableVoices()`.
 */
export interface TTSVoice {
  /** Human-readable label (for example "Alloy" or "English (UK)"). */
  name: string;
  /**
   * Provider-specific identifier. Presumably the value callers pass as the
   * `voice` argument of `generateSpeech` — confirm against the UI code.
   */
  id: string;
  /** Language code or tag for the voice, e.g. "en", "en-GB" or "en-US". */
  lang: string;
}

/**
 * One message of a streamed TTS response.
 *
 * NOTE(review): this type is exported but not referenced anywhere else in this
 * file; the field semantics below are inferred from the names and should be
 * confirmed against whatever consumes it.
 */
export interface TTSStreamChunk {
  /** Message kind: payload, end-of-stream marker, or failure. */
  type: "data" | "end" | "error";
  /** Audio bytes; presumably present only when `type` is "data". */
  data?: ArrayBuffer;
  /** Error description; presumably present only when `type` is "error". */
  error?: string;
}

/**
 * Base TTS provider - extend this for custom implementations.
 *
 * Subclasses implement voice discovery and speech generation. Playback is
 * handled here with a single `HTMLAudioElement` at a time, fed from an object
 * URL created for the audio blob.
 */
export abstract class TTSProvider {
  /** Element that is currently loading or playing, or `null` when idle. */
  private currentAudio: HTMLAudioElement | null = null;

  /**
   * Object URL backing `currentAudio`. Tracked on the instance so that
   * `stopAudio()` can release it; without this, every interrupted playback
   * would leave its blob pinned by a URL that is never revoked.
   */
  private currentObjectUrl: string | null = null;

  /** Human-readable provider name. */
  abstract get name(): string;

  /**
   * Get available voices for this provider
   */
  abstract getAvailableVoices(): Promise<TTSVoice[]>;

  /**
   * Generate TTS audio and return as blob
   */
  abstract generateSpeech(text: string, voice: string): Promise<Blob>;

  /**
   * Play audio blob using HTML Audio element
   * IMPORTANT: We create a fresh audio element each time to avoid crashes
   * on Linux with newer GStreamer versions when changing audio source.
   *
   * Any playback already in progress on this provider is stopped first.
   *
   * NOTE: if this playback is interrupted by `stopAudio()` (which a later
   * `playAudio()` call also triggers), the returned promise is neither
   * resolved nor rejected, because all handlers are detached before the
   * element is torn down.
   *
   * @param blob - The audio blob to play
   * @param onProgress - Optional callback receiving progress as a percentage
   *   (0-100) on every `timeupdate` once the duration is known
   * @param playbackRate - Optional playback speed
   * @returns Resolves when the audio finishes playing; rejects when the
   *   element reports an error or `play()` fails
   */
  async playAudio(
    blob: Blob,
    onProgress?: (progress: number) => void,
    playbackRate = 1.0,
  ): Promise<void> {
    return new Promise((resolve, reject) => {
      // Stop any previous playback and destroy old element
      this.stopAudio();

      const url = URL.createObjectURL(blob);

      // Create a FRESH audio element each time (workaround for GStreamer crash)
      const audio = new Audio();
      this.currentAudio = audio;
      this.currentObjectUrl = url;

      const cleanup = () => {
        URL.revokeObjectURL(url);
        // Forget the URL so a later stopAudio() does not revoke it again.
        if (this.currentObjectUrl === url) {
          this.currentObjectUrl = null;
        }
        audio.oncanplaythrough = null;
        audio.onended = null;
        audio.onerror = null;
        audio.ontimeupdate = null;
      };

      audio.onerror = (e) => {
        // Only handle error if this is still the current audio element
        // (prevents false errors when stopAudio() clears an old element)
        if (this.currentAudio !== audio) return;

        console.error("[TTS] Audio playback error:", e);
        cleanup();
        this.currentAudio = null;
        reject(new Error("Failed to play audio"));
      };

      audio.onended = () => {
        // Only handle ended if this is still the current audio element
        if (this.currentAudio !== audio) return;

        cleanup();
        this.currentAudio = null;
        resolve();
      };

      if (onProgress) {
        audio.ontimeupdate = () => {
          // duration is NaN/0 until metadata is available
          if (audio.duration) {
            onProgress((audio.currentTime / audio.duration) * 100);
          }
        };
      }

      // Wait for audio to be ready before setting playbackRate and playing
      audio.oncanplaythrough = () => {
        // Only handle if this is still the current audio element
        if (this.currentAudio !== audio) return;

        try {
          if (playbackRate !== 1.0) {
            audio.playbackRate = playbackRate;
          }
          audio.play().catch((err) => {
            if (this.currentAudio !== audio) return;
            console.error("[TTS] Play failed:", err);
            cleanup();
            this.currentAudio = null;
            reject(err);
          });
        } catch (err) {
          if (this.currentAudio !== audio) return;
          console.error("[TTS] Error during playback setup:", err);
          cleanup();
          this.currentAudio = null;
          reject(err);
        }
      };

      audio.src = url;
      audio.load();
    });
  }

  /**
   * Stop current playback and release the element's object URL.
   * Safe to call when nothing is playing.
   */
  stopAudio(): void {
    const audio = this.currentAudio;
    const url = this.currentObjectUrl;

    // Clear references FIRST to prevent callbacks from firing
    this.currentAudio = null;
    this.currentObjectUrl = null;

    if (audio) {
      try {
        // Clear all handlers to prevent any callbacks
        audio.oncanplaythrough = null;
        audio.onended = null;
        audio.onerror = null;
        audio.ontimeupdate = null;

        audio.pause();
        audio.src = '';
      } catch {
        // Ignore errors during cleanup
      }
    }

    // Revoke after the element has dropped its source. The handlers that
    // would normally do this were detached above, so it must happen here.
    if (url !== null) {
      URL.revokeObjectURL(url);
    }
  }

  /**
   * Get current playback state
   *
   * @returns `playing` is true while the current element is not paused;
   *   `progress` is the current time and `duration` the element's duration
   *   (0 when unknown). All zero/false when no element is active.
   */
  getPlaybackState(): { playing: boolean; progress: number; duration: number } {
    if (!this.currentAudio) {
      return { playing: false, progress: 0, duration: 0 };
    }
    return {
      playing: !this.currentAudio.paused,
      progress: this.currentAudio.currentTime,
      duration: this.currentAudio.duration || 0,
    };
  }
}

/**
 * Split text into chunks for TTS (Google Translate has ~200 char limit)
 * Similar to SillyTavern's splitRecursive approach
 *
 * Each chunk is cut at the last delimiter found within the first `maxLength`
 * UTF-16 code units, trying delimiters in priority order (paragraph break,
 * line break, sentence end, comma, space). When none is found the text is cut
 * at `maxLength`, moved by one unit if that position would fall inside a
 * surrogate pair: a lone surrogate makes `encodeURIComponent` throw.
 *
 * @param text - Text to split; empty input yields an empty array
 * @param maxLength - Maximum chunk length in UTF-16 code units (floored).
 *   A chunk may exceed it by one unit only when `maxLength` is 1 and the chunk
 *   is a single surrogate pair.
 * @returns Trimmed, non-empty chunks in original order. Text that already fits
 *   is returned as-is (untrimmed) in a single-element array.
 * @throws RangeError when `text` is non-empty and `maxLength` is below 1 or NaN
 *   (such values previously caused an endless loop)
 */
function splitTextForTTS(text: string, maxLength = 200): string[] {
  if (!text || text.length === 0) return [];

  const limit = Math.floor(maxLength);
  // Written as a negated comparison so that NaN is rejected as well.
  if (!(limit >= 1)) {
    throw new RangeError(
      `splitTextForTTS: maxLength must be at least 1 (received ${maxLength})`,
    );
  }

  if (text.length <= limit) return [text];

  const isHighSurrogate = (code: number): boolean =>
    code >= 0xd800 && code <= 0xdbff;
  const isLowSurrogate = (code: number): boolean =>
    code >= 0xdc00 && code <= 0xdfff;

  const chunks: string[] = [];
  // Priority: paragraph breaks, sentence ends, commas, spaces
  const delimiters = ["\n\n", "\n", ". ", "! ", "? ", ", ", " "];

  let remaining = text;
  while (remaining.length > 0) {
    if (remaining.length <= limit) {
      chunks.push(remaining);
      break;
    }

    // The search window is the same for every delimiter, so build it once.
    const head = remaining.substring(0, limit);

    let splitIndex = -1;
    // Try each delimiter in priority order
    for (const delimiter of delimiters) {
      const lastIndex = head.lastIndexOf(delimiter);
      // Index 0 is skipped: splitting there would yield an empty chunk.
      if (lastIndex > 0) {
        splitIndex = lastIndex + delimiter.length;
        break;
      }
    }

    // If no delimiter found, force split at the limit
    if (splitIndex === -1) {
      splitIndex = limit;
      // Never separate the two halves of a surrogate pair. Prefer stepping
      // back; step forward only when stepping back would make no progress.
      if (
        isHighSurrogate(remaining.charCodeAt(splitIndex - 1)) &&
        isLowSurrogate(remaining.charCodeAt(splitIndex))
      ) {
        splitIndex += splitIndex > 1 ? -1 : 1;
      }
    }

    const chunk = remaining.substring(0, splitIndex).trim();
    if (chunk.length > 0) {
      chunks.push(chunk);
    }
    remaining = remaining.substring(splitIndex).trim();
  }

  return chunks;
}

/**
 * Languages offered by the Google Translate provider.
 * Each entry uses the same language code for both `id` and `lang`, so the
 * list is built from (label, code) pairs.
 */
export const GOOGLE_TRANSLATE_LANGUAGES: TTSVoice[] = (
  [
    ["English (US)", "en"],
    ["English (UK)", "en-GB"],
    ["Italian", "it"],
    ["Spanish", "es"],
    ["French", "fr"],
    ["German", "de"],
    ["Japanese", "ja"],
    ["Korean", "ko"],
    ["Chinese (Simplified)", "zh-CN"],
    ["Chinese (Traditional)", "zh-TW"],
    ["Russian", "ru"],
    ["Portuguese", "pt"],
    ["Dutch", "nl"],
    ["Polish", "pl"],
    ["Turkish", "tr"],
  ] as const
).map(([label, code]) => ({ name: label, id: code, lang: code }));

/**
 * Google Translate TTS Provider
 * Unofficial API usage (similar to SillyTavern implementation)
 *
 * Text is split into chunks of at most 200 characters, each chunk is fetched
 * separately, and the resulting audio buffers are concatenated into one blob.
 */
export class GoogleTranslateTTSProvider extends TTSProvider {
  /** Stored for parity with other providers; not read by this class. */
  private settings: TTSSettings;

  constructor(settings: TTSSettings) {
    super();
    this.settings = settings;
  }

  override get name(): string {
    return "Google Translate";
  }

  /** Returns the static `GOOGLE_TRANSLATE_LANGUAGES` list. */
  override async getAvailableVoices(): Promise<TTSVoice[]> {
    return GOOGLE_TRANSLATE_LANGUAGES;
  }

  /**
   * Generate speech for `text` in the language given by `voice`.
   *
   * @param text - Text to speak; must contain non-whitespace characters
   * @param voice - Language code sent as the `tl` query parameter
   * @returns A single `audio/mpeg` blob containing all chunks in order
   * @throws Error when `text` is empty or any chunk request is not OK
   */
  override async generateSpeech(text: string, voice: string): Promise<Blob> {
    if (!text || text.trim().length === 0) {
      throw new Error("TTS: Cannot generate speech for empty text");
    }

    // Split text into chunks for Google's ~200 char limit
    const chunks = splitTextForTTS(text, 200);
    // Encode the language code like the text itself, so an unexpected value
    // cannot inject or break query parameters.
    const language = encodeURIComponent(voice);
    const audioBuffers: ArrayBuffer[] = [];

    // Chunks are requested one after another; buffer order is playback order.
    for (const chunk of chunks) {
      const encodedText = encodeURIComponent(chunk);
      const url = `https://translate.google.com/translate_tts?ie=UTF-8&q=${encodedText}&tl=${language}&client=tw-ob`;

      // Use corsFetch to bypass CORS restrictions in Tauri
      const response = await corsFetch(url);

      if (!response.ok) {
        const error = await response.text();
        throw new Error(
          `Google TTS generation failed: ${response.status} - ${error}`,
        );
      }

      audioBuffers.push(await response.arrayBuffer());
    }

    // Concatenate all audio chunks into a single blob
    return new Blob(audioBuffers, { type: "audio/mpeg" });
  }
}

/**
 * OpenAI-compatible TTS Provider
 * Supports OpenAI, OpenRouter, and any OpenAI-compatible endpoint
 */
export class OpenAICompatibleTTSProvider extends TTSProvider {
  /** Path suffix that every resolved speech endpoint ends with. */
  private static readonly SPEECH_PATH = "/audio/speech";

  private settings: TTSSettings;
  /** Voice lists keyed by resolved endpoint URL. */
  private voiceCache: Map<string, TTSVoice[]> = new Map();

  constructor(settings: TTSSettings) {
    super();
    this.settings = settings;
  }

  override get name(): string {
    return "OpenAI Compatible";
  }

  /**
   * Get endpoint URL
   *
   * @returns The configured endpoint with surrounding whitespace and trailing
   *   slashes removed and `/audio/speech` appended when missing, or the
   *   OpenRouter speech endpoint when no endpoint is configured.
   */
  private getEndpoint(): string {
    const configured = (this.settings.endpoint ?? "").trim();

    // If no custom endpoint, use OpenRouter default
    if (!configured) {
      return `${PROVIDERS.openrouter.baseUrl}${OpenAICompatibleTTSProvider.SPEECH_PATH}`;
    }

    // Ensure endpoint ends with /audio/speech
    const url = configured.replace(/\/+$/, "");
    return url.endsWith(OpenAICompatibleTTSProvider.SPEECH_PATH)
      ? url
      : `${url}${OpenAICompatibleTTSProvider.SPEECH_PATH}`;
  }

  /**
   * Verify settings are valid
   *
   * @throws Error when the API key, model or voice setting is missing
   */
  private validateSettings(): void {
    if (!this.settings.apiKey) {
      throw new Error("TTS: No API key configured");
    }
    if (!this.settings.model) {
      throw new Error("TTS: No model selected");
    }
    if (!this.settings.voice) {
      throw new Error("TTS: No voice selected");
    }
  }

  /**
   * Get request headers
   */
  private getHeaders(): HeadersInit {
    return {
      "Content-Type": "application/json",
      Authorization: `Bearer ${this.settings.apiKey}`,
    };
  }

  /**
   * Get available voices - cached per endpoint
   *
   * Queries the provider's `/models` listing and returns every model whose id
   * contains "tts" as a voice entry. Falls back to OpenAI's standard voice set
   * when the request fails, is not OK, or yields no such model. Never throws:
   * failures are logged as warnings.
   */
  override async getAvailableVoices(): Promise<TTSVoice[]> {
    const endpoint = this.getEndpoint();

    // Return cached voices if available
    const cached = this.voiceCache.get(endpoint);
    if (cached) {
      return cached;
    }

    // Default voices matching OpenAI's standard set
    const defaultVoices: TTSVoice[] = [
      { name: "Alloy", id: "alloy", lang: "en-US" },
      { name: "Echo", id: "echo", lang: "en-US" },
      { name: "Fable", id: "fable", lang: "en-US" },
      { name: "Onyx", id: "onyx", lang: "en-US" },
      { name: "Nova", id: "nova", lang: "en-US" },
      { name: "Shimmer", id: "shimmer", lang: "en-US" },
    ];

    // getEndpoint() guarantees the suffix, so strip exactly that suffix. A
    // first-occurrence replace would corrupt URLs that contain the path twice.
    const baseUrl = endpoint.slice(
      0,
      -OpenAICompatibleTTSProvider.SPEECH_PATH.length,
    );

    // Try to fetch custom voices from provider (optional)
    try {
      const response = await fetch(`${baseUrl}/models`, {
        headers: this.getHeaders(),
      });

      if (response.ok) {
        const payload: unknown = await response.json();
        const models = (payload as { data?: unknown } | null)?.data;

        if (Array.isArray(models)) {
          const voices: TTSVoice[] = [];
          for (const model of models) {
            // Skip malformed entries instead of discarding the whole listing.
            const id = (model as { id?: unknown } | null)?.id;
            if (typeof id === "string" && id.includes("tts")) {
              voices.push({ name: id, id, lang: "en-US" });
            }
          }

          if (voices.length > 0) {
            this.voiceCache.set(endpoint, voices);
            return voices;
          }
        }
      }
    } catch (err) {
      console.warn("[TTS] Failed to fetch custom voices, using defaults", err);
    }

    this.voiceCache.set(endpoint, defaultVoices);
    return defaultVoices;
  }

  /**
   * Generate TTS audio blob
   *
   * @param text - Text to speak; must contain non-whitespace characters
   * @param voice - Voice identifier sent in the request body
   * @returns The response body as a blob (MP3 is requested)
   * @throws Error when settings are incomplete, `text` is empty, or the
   *   endpoint responds with a non-OK status
   */
  override async generateSpeech(text: string, voice: string): Promise<Blob> {
    this.validateSettings();

    if (!text || text.trim().length === 0) {
      throw new Error("TTS: Cannot generate speech for empty text");
    }

    const response = await fetch(this.getEndpoint(), {
      method: "POST",
      headers: this.getHeaders(),
      body: JSON.stringify({
        model: this.settings.model,
        input: text,
        voice: voice,
        speed: this.settings.speed,
        response_format: "mp3",
      }),
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`TTS generation failed: ${response.status} - ${error}`);
    }

    return response.blob();
  }
}

/**
 * TTS Service - Main API for the application
 * Manages TTS operations and provider lifecycle
 *
 * Only the most recent `generateAndPlay()` request is considered live: a newer
 * request, `stopPlayback()` or `initialize()` invalidates earlier ones.
 */
export class AITTSService {
  private provider: TTSProvider | null = null;
  private settings: TTSSettings | null = null;
  private isPlaying = false;
  /** Most recently generated blob; written by `generateAndPlay`, never read here. */
  private currentAudio: Blob | null = null;
  /**
   * Bumped whenever in-flight `generateAndPlay()` requests become stale.
   * Each request remembers the value it started with and compares it after
   * every await.
   */
  private playbackGeneration = 0;

  /**
   * Invalidate in-flight requests and stop audio on the current provider.
   */
  private cancelPlayback(): void {
    this.playbackGeneration++;
    this.isPlaying = false;
    this.provider?.stopAudio();
  }

  /**
   * Initialize service with settings
   *
   * Any playback on the previously active provider is stopped first, so that
   * replacing the provider cannot orphan an audio element that is still
   * playing and can no longer be reached by `stopPlayback()`.
   *
   * @throws Rethrows any error raised while creating the provider or
   *   fetching its voices; the service is left without a provider.
   */
  async initialize(settings: TTSSettings): Promise<void> {
    this.cancelPlayback();
    this.settings = settings;

    if (!settings.enabled) {
      this.provider = null;
      return;
    }

    try {
      if (settings.provider === "google") {
        this.provider = new GoogleTranslateTTSProvider(settings);
      } else {
        this.provider = new OpenAICompatibleTTSProvider(settings);
      }

      // Validate by fetching voices
      await this.provider.getAvailableVoices();
    } catch (error) {
      console.error("[TTSService] Failed to initialize provider:", error);
      this.provider = null;
      throw error;
    }
  }

  /**
   * Update settings and reinitialize if needed
   *
   * @param settings - Fields to merge over the current settings
   * @throws Error when `initialize()` has never been called
   */
  async updateSettings(settings: Partial<TTSSettings>): Promise<void> {
    if (!this.settings) {
      throw new Error("TTS service not initialized");
    }

    this.settings = { ...this.settings, ...settings };
    await this.initialize(this.settings);
  }

  /**
   * Check if service is ready
   */
  isReady(): boolean {
    return (this.settings?.enabled ?? false) && !!this.provider;
  }

  /**
   * Get available voices
   *
   * @throws Error when no provider is active
   */
  async getAvailableVoices(): Promise<TTSVoice[]> {
    if (!this.provider) {
      throw new Error("TTS provider not initialized");
    }
    return this.provider.getAvailableVoices();
  }

  /**
   * Generate and play TTS audio
   *
   * If the request is invalidated while the audio is still being generated
   * (by `stopPlayback()`, `initialize()` or a newer `generateAndPlay()`), the
   * generated audio is discarded and the call returns without playing it.
   *
   * @param text - Text to speak
   * @param voice - Voice to use; defaults to the configured voice
   * @param onProgress - Optional playback progress callback (percentage)
   * @throws Error when the service has no provider or settings; generation and
   *   playback errors from the provider propagate unchanged
   */
  async generateAndPlay(
    text: string,
    voice?: string,
    onProgress?: (progress: number) => void,
  ): Promise<void> {
    // Capture both up front: they may be replaced while this call is awaiting.
    const provider = this.provider;
    const settings = this.settings;
    if (!provider || !settings) {
      throw new Error("TTS service not ready");
    }

    const voiceToUse = voice || settings.voice;
    // For Google TTS, speed is applied client-side via playbackRate
    // For OpenAI-compatible APIs, speed is handled server-side during generation
    const playbackRate = settings.provider === "google" ? settings.speed : 1.0;

    const generation = ++this.playbackGeneration;
    this.isPlaying = true;
    try {
      const blob = await provider.generateSpeech(text, voiceToUse);

      // Stopped or superseded while generating: do not start playback.
      if (generation !== this.playbackGeneration) return;

      this.currentAudio = blob;
      await provider.playAudio(blob, onProgress, playbackRate);
    } finally {
      // Only the live request may clear the flag; a stale one finishing late
      // must not mark a newer, still-active request as stopped.
      if (generation === this.playbackGeneration) {
        this.isPlaying = false;
      }
    }
  }

  /**
   * Generate TTS audio without playing
   *
   * @throws Error when the service has no provider or settings
   */
  async generateSpeech(text: string, voice?: string): Promise<Blob> {
    if (!this.provider || !this.settings) {
      throw new Error("TTS service not ready");
    }

    const voiceToUse = voice || this.settings.voice;
    return this.provider.generateSpeech(text, voiceToUse);
  }

  /**
   * Stop playback
   *
   * Also cancels a `generateAndPlay()` request that is still generating, so
   * its audio does not start after the stop.
   */
  stopPlayback(): void {
    this.cancelPlayback();
  }

  /**
   * Check if currently playing
   *
   * True from the start of a live `generateAndPlay()` request (including the
   * generation phase) until it finishes, fails or is cancelled.
   */
  isCurrentlyPlaying(): boolean {
    return this.isPlaying;
  }

  /**
   * Get current playback progress
   */
  getPlaybackProgress(): {
    playing: boolean;
    progress: number;
    duration: number;
  } {
    if (!this.provider) {
      return { playing: false, progress: 0, duration: 0 };
    }
    return this.provider.getPlaybackState();
  }
}

/**
 * Shared singleton instance of the TTS service.
 * It is created without settings or a provider; `initialize()` must be called
 * before speech can be generated or played.
 */
export const aiTTSService = new AITTSService();