tts.cpp merged and working in kcpp!

This commit is contained in:
Concedo 2025-08-17 18:09:28 +08:00
parent 52606e9b1d
commit bcaf379509
5 changed files with 90 additions and 11 deletions

View file

@ -5333,13 +5333,13 @@ def show_gui():
audio_tab = tabcontent["Audio"] audio_tab = tabcontent["Audio"]
makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.") makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.")
whisper_model_var.trace_add("write", gui_changed_modelfile) whisper_model_var.trace_add("write", gui_changed_modelfile)
makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech Required):", "Select OuteTTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.") makefileentry(audio_tab, "TTS Model (Text-To-Speech):", "Select TTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a TTS GGUF model file on disk to be loaded for Narration.")
tts_model_var.trace_add("write", gui_changed_modelfile) tts_model_var.trace_add("write", gui_changed_modelfile)
makelabelentry(audio_tab, "OuteTTS Threads:" , tts_threads_var, 5, 50,padx=290,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.") makelabelentry(audio_tab, "TTS Threads:" , tts_threads_var, 5, 50,padx=290,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.")
makelabelentry(audio_tab, "OuteTTS Max Tokens:" , ttsmaxlen_var, 7, 50,padx=290,singleline=True,tooltip="Max allowed audiotokens to generate per TTS request.") makelabelentry(audio_tab, "TTS Max Tokens:" , ttsmaxlen_var, 7, 50,padx=290,singleline=True,tooltip="Max allowed audiotokens to generate per TTS request.")
makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 9, 0,tooltiptxt="Uses the GPU for TTS.") makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 9, 0,tooltiptxt="Uses the GPU for TTS.")
ttsgpu_var.trace_add("write", gui_changed_modelfile) ttsgpu_var.trace_add("write", gui_changed_modelfile)
makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech Required):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 11, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.") makefileentry(audio_tab, "WavTokenizer Model (Required for OuteTTS):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 11, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
wavtokenizer_var.trace_add("write", gui_changed_modelfile) wavtokenizer_var.trace_add("write", gui_changed_modelfile)
admin_tab = tabcontent["Admin"] admin_tab = tabcontent["Admin"]
@ -7610,7 +7610,7 @@ if __name__ == '__main__':
whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="") whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="")
ttsparsergroup = parser.add_argument_group('TTS Narration Commands') ttsparsergroup = parser.add_argument_group('TTS Narration Commands')
ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="") ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the TTS Text-To-Speech GGUF model.", default="")
ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="") ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="")
ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true') ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true')
ttsparsergroup.add_argument("--ttsmaxlen", help="Limit number of audio tokens generated with TTS.", type=int, default=default_ttsmaxlen) ttsparsergroup.add_argument("--ttsmaxlen", help="Limit number of audio tokens generated with TTS.", type=int, default=default_ttsmaxlen)

View file

@ -499,6 +499,7 @@ static int nthreads = 4;
static int tts_max_len = 4096; static int tts_max_len = 4096;
//ttscpp specific //ttscpp specific
static bool is_ttscpp_file = false;
static generation_configuration * ttscpp_config = nullptr; static generation_configuration * ttscpp_config = nullptr;
static struct tts_runner * ttscpp_runner = nullptr; static struct tts_runner * ttscpp_runner = nullptr;
@ -539,7 +540,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
std::string modelfile_cts = inputs.cts_model_filename; std::string modelfile_cts = inputs.cts_model_filename;
std::string detectedarch = gguf_get_model_arch(modelfile_ttc); std::string detectedarch = gguf_get_model_arch(modelfile_ttc);
bool is_ttscpp_file = false; is_ttscpp_file = false;
if (detectedarch!="" && SUPPORTED_ARCHITECTURES.find(detectedarch) != SUPPORTED_ARCHITECTURES.end()) { if (detectedarch!="" && SUPPORTED_ARCHITECTURES.find(detectedarch) != SUPPORTED_ARCHITECTURES.end()) {
is_ttscpp_file = true; is_ttscpp_file = true;
printf("\nLoading TTS.CPP Model Arch: %s \n", detectedarch.c_str()); printf("\nLoading TTS.CPP Model Arch: %s \n", detectedarch.c_str());
@ -556,7 +557,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
// tts init // tts init
if (is_ttscpp_file) { if (is_ttscpp_file) {
ttscpp_config = new generation_configuration("af_alloy", 50, 1.0, 1.0, true, "", 0, 1.0); ttscpp_config = new generation_configuration("am_adam", 50, 1.0, 1.0, true, "", 0, 1.0);
ttscpp_runner = runner_from_file(modelfile_ttc, inputs.threads, ttscpp_config, true); ttscpp_runner = runner_from_file(modelfile_ttc, inputs.threads, ttscpp_config, true);
if (ttscpp_runner == nullptr) { if (ttscpp_runner == nullptr) {
printf("\nTTS Load Error: Failed to initialize TTSCPP!\n"); printf("\nTTS Load Error: Failed to initialize TTSCPP!\n");
@ -640,7 +641,72 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
return true; return true;
} }
// Generate narration audio through the TTS.CPP backend (Kokoro/Orpheus-style models).
// Maps the request's speaker_seed to a named voice, runs generation, and returns the
// resulting audio as a base64-encoded WAV in output.data (status 1 on success, 0 on failure).
static tts_generation_outputs ttstype_generate_ttscpp(const tts_generation_inputs inputs)
{
    tts_generation_outputs output;
    if(ttscpp_runner==nullptr || ttscpp_config==nullptr)
    {
        // Model was never loaded (or load failed) — bail out with an empty result.
        printf("\nWarning: KCPP TTSCPP not initialized! Make sure TTS model is loaded successfully.\n");
        output.data = "";
        output.status = 0;
        return output;
    }
    int speaker_seed = inputs.speaker_seed;
    std::string voiceused = "am_adam"; // default voice for unmapped speaker IDs
    std::string prompt = inputs.prompt;
    double ttstime = 0;
    timer_start();
    // Map the numeric speaker ID onto a named voice; anything outside 1-5 keeps the default.
    switch(speaker_seed)
    {
        case 1:
        voiceused = "am_adam";
        break;
        case 2:
        voiceused = "af_alloy";
        break;
        case 3:
        voiceused = "af_jessica";
        break;
        case 4:
        voiceused = "bm_george";
        break;
        case 5:
        voiceused = "bf_isabella";
        break;
    }
    if(ttsdebugmode==1 && !tts_is_quiet)
    {
        printf("\nUsing Speaker ID: %d, Voice: %s", speaker_seed, voiceused.c_str());
        printf("\nInput: %s\n", prompt.c_str());
    }
    ttscpp_config->voice = voiceused;
    tts_response response_data;
    int errorres = generate(ttscpp_runner, prompt, &response_data, ttscpp_config); // returns 0 on success
    if(errorres==0)
    {
        ttstime = timer_check();
        // BUGFIX: the format string has two specifiers (%d, %.2f) but only one argument was
        // passed — %d swallowed the double and %.2f read garbage (UB). Supply the sample count.
        printf("\nTTS Generated %d samples of audio in %.2fs.\n", (int)response_data.n_outputs, ttstime);
        std::vector<float> wavdat = std::vector(response_data.data, response_data.data + response_data.n_outputs);
        last_generated_audio = save_wav16_base64(wavdat, ttscpp_runner->sampling_rate);
        output.data = last_generated_audio.c_str(); // points into the static string, valid until next generation
        output.status = 1;
        last_generation_settings_audio_seed = 0;
        last_generation_settings_speaker_seed = speaker_seed;
        last_generation_settings_prompt = std::string(prompt);
        total_tts_gens += 1;
        return output;
    }
    else
    {
        printf("\nError: TTSCPP generation failed\n");
        output.data = "";
        output.status = 0;
        return output;
    }
}
static tts_generation_outputs ttstype_generate_outetts(const tts_generation_inputs inputs)
{ {
tts_generation_outputs output; tts_generation_outputs output;
@ -1051,3 +1117,12 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
return output; return output;
} }
} }
// Public TTS entry point: routes the request to whichever backend matched the
// loaded model's architecture (TTS.CPP if a supported arch was detected at load
// time, otherwise the OuteTTS pipeline).
tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
{
    return is_ttscpp_file ? ttstype_generate_ttscpp(inputs)
                          : ttstype_generate_outetts(inputs);
}

View file

@ -1389,7 +1389,8 @@ std::vector<std::vector<uint32_t>> kokoro_runner::tokenize_chunks(std::vector<st
int kokoro_runner::generate(std::string prompt, struct tts_response * response, std::string voice, std::string voice_code) { int kokoro_runner::generate(std::string prompt, struct tts_response * response, std::string voice, std::string voice_code) {
if (model->voices.find(voice) == model->voices.end()) { if (model->voices.find(voice) == model->voices.end()) {
TTS_ABORT("Failed to find Kokoro voice '%s' aborting.\n", voice.c_str()); fprintf(stdout,"\nFailed to find Kokoro voice '%s' aborting.\n", voice.c_str());
return -1;
} else { } else {
// if the language changed then we should change the phonemization voice // if the language changed then we should change the phonemization voice
if (phmzr->mode == ESPEAK && kctx->voice[0] != voice[0]) { if (phmzr->mode == ESPEAK && kctx->voice[0] != voice[0]) {

View file

@ -409,7 +409,8 @@ int orpheus_runner::generate(std::string sentence, struct tts_response * respons
// it should be possible to update the max context window size, but currently it is extremely unlikely that a single prompt will // it should be possible to update the max context window size, but currently it is extremely unlikely that a single prompt will
// surpass the default size. // surpass the default size.
if (batch.tokens.size() > model->max_context_length) { if (batch.tokens.size() > model->max_context_length) {
TTS_ABORT("The prompt was too large for the default context window. Try splitting up or shortenning the prompt."); fprintf(stdout,"The prompt was too large for the default context window. Try splitting up or shortenning the prompt.");
return -1;
} }
octx->reset(); octx->reset();
generation_sampler->reset(); generation_sampler->reset();
@ -427,7 +428,8 @@ void orpheus_runner::configure_generation(generation_configuration * config) {
generation_sampler->top_k = config->top_k; generation_sampler->top_k = config->top_k;
generation_sampler->top_p = config->top_p; generation_sampler->top_p = config->top_p;
if (std::find(orpheus_voices.begin(), orpheus_voices.end(), config->voice) == orpheus_voices.end() && !config->voice.empty()) { if (std::find(orpheus_voices.begin(), orpheus_voices.end(), config->voice) == orpheus_voices.end() && !config->voice.empty()) {
TTS_ABORT("Voice '%s' is not a valid voice for Orpheus.", config->voice.c_str()); fprintf(stdout,"Voice '%s' is not a valid voice for Orpheus. Defaulting to zoe.", config->voice.c_str());
config->voice = "zoe";
} }
octx->voice = config->voice; octx->voice = config->voice;
} }

View file

@ -162,6 +162,7 @@ struct tts_runner * runner_from_file(const std::string & fname, int n_threads, g
} }
} }
//returns 0 on success
int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config) { int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config) {
switch(runner->arch) { switch(runner->arch) {
case PARLER_TTS_ARCH: case PARLER_TTS_ARCH: