tts.cpp merged and working in kcpp!

This commit is contained in:
Concedo 2025-08-17 18:09:28 +08:00
parent 52606e9b1d
commit bcaf379509
5 changed files with 90 additions and 11 deletions

View file

@ -5333,13 +5333,13 @@ def show_gui():
audio_tab = tabcontent["Audio"] audio_tab = tabcontent["Audio"]
makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.") makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.")
whisper_model_var.trace_add("write", gui_changed_modelfile) whisper_model_var.trace_add("write", gui_changed_modelfile)
makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech Required):", "Select OuteTTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.") makefileentry(audio_tab, "TTS Model (Text-To-Speech):", "Select TTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a TTS GGUF model file on disk to be loaded for Narration.")
tts_model_var.trace_add("write", gui_changed_modelfile) tts_model_var.trace_add("write", gui_changed_modelfile)
makelabelentry(audio_tab, "OuteTTS Threads:" , tts_threads_var, 5, 50,padx=290,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.") makelabelentry(audio_tab, "TTS Threads:" , tts_threads_var, 5, 50,padx=290,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.")
makelabelentry(audio_tab, "OuteTTS Max Tokens:" , ttsmaxlen_var, 7, 50,padx=290,singleline=True,tooltip="Max allowed audiotokens to generate per TTS request.") makelabelentry(audio_tab, "TTS Max Tokens:" , ttsmaxlen_var, 7, 50,padx=290,singleline=True,tooltip="Max allowed audiotokens to generate per TTS request.")
makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 9, 0,tooltiptxt="Uses the GPU for TTS.") makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 9, 0,tooltiptxt="Uses the GPU for TTS.")
ttsgpu_var.trace_add("write", gui_changed_modelfile) ttsgpu_var.trace_add("write", gui_changed_modelfile)
makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech Required):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 11, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.") makefileentry(audio_tab, "WavTokenizer Model (Required for OuteTTS):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 11, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
wavtokenizer_var.trace_add("write", gui_changed_modelfile) wavtokenizer_var.trace_add("write", gui_changed_modelfile)
admin_tab = tabcontent["Admin"] admin_tab = tabcontent["Admin"]
@ -7610,7 +7610,7 @@ if __name__ == '__main__':
whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="") whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="")
ttsparsergroup = parser.add_argument_group('TTS Narration Commands') ttsparsergroup = parser.add_argument_group('TTS Narration Commands')
ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="") ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the TTS Text-To-Speech GGUF model.", default="")
ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="") ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="")
ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true') ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true')
ttsparsergroup.add_argument("--ttsmaxlen", help="Limit number of audio tokens generated with TTS.", type=int, default=default_ttsmaxlen) ttsparsergroup.add_argument("--ttsmaxlen", help="Limit number of audio tokens generated with TTS.", type=int, default=default_ttsmaxlen)

View file

@ -499,6 +499,7 @@ static int nthreads = 4;
static int tts_max_len = 4096; static int tts_max_len = 4096;
//ttscpp specific //ttscpp specific
static bool is_ttscpp_file = false;
static generation_configuration * ttscpp_config = nullptr; static generation_configuration * ttscpp_config = nullptr;
static struct tts_runner * ttscpp_runner = nullptr; static struct tts_runner * ttscpp_runner = nullptr;
@ -539,7 +540,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
std::string modelfile_cts = inputs.cts_model_filename; std::string modelfile_cts = inputs.cts_model_filename;
std::string detectedarch = gguf_get_model_arch(modelfile_ttc); std::string detectedarch = gguf_get_model_arch(modelfile_ttc);
bool is_ttscpp_file = false; is_ttscpp_file = false;
if (detectedarch!="" && SUPPORTED_ARCHITECTURES.find(detectedarch) != SUPPORTED_ARCHITECTURES.end()) { if (detectedarch!="" && SUPPORTED_ARCHITECTURES.find(detectedarch) != SUPPORTED_ARCHITECTURES.end()) {
is_ttscpp_file = true; is_ttscpp_file = true;
printf("\nLoading TTS.CPP Model Arch: %s \n", detectedarch.c_str()); printf("\nLoading TTS.CPP Model Arch: %s \n", detectedarch.c_str());
@ -556,7 +557,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
// tts init // tts init
if (is_ttscpp_file) { if (is_ttscpp_file) {
ttscpp_config = new generation_configuration("af_alloy", 50, 1.0, 1.0, true, "", 0, 1.0); ttscpp_config = new generation_configuration("am_adam", 50, 1.0, 1.0, true, "", 0, 1.0);
ttscpp_runner = runner_from_file(modelfile_ttc, inputs.threads, ttscpp_config, true); ttscpp_runner = runner_from_file(modelfile_ttc, inputs.threads, ttscpp_config, true);
if (ttscpp_runner == nullptr) { if (ttscpp_runner == nullptr) {
printf("\nTTS Load Error: Failed to initialize TTSCPP!\n"); printf("\nTTS Load Error: Failed to initialize TTSCPP!\n");
@ -640,7 +641,72 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
return true; return true;
} }
// Generate narration audio through the TTS.CPP backend (Kokoro/Orpheus-style models).
// Maps the request's speaker_seed to a named voice, runs generation, and returns the
// resulting audio as a base64-encoded WAV in output.data (status 1 on success, 0 on failure).
static tts_generation_outputs ttstype_generate_ttscpp(const tts_generation_inputs inputs)
{
    tts_generation_outputs output;
    if(ttscpp_runner==nullptr || ttscpp_config==nullptr)
    {
        // Model was never loaded (or load failed) — bail out with an empty result.
        printf("\nWarning: KCPP TTSCPP not initialized! Make sure TTS model is loaded successfully.\n");
        output.data = "";
        output.status = 0;
        return output;
    }
    int speaker_seed = inputs.speaker_seed;
    std::string voiceused = "am_adam"; // default voice for unmapped speaker IDs
    std::string prompt = inputs.prompt;
    double ttstime = 0;
    timer_start();
    // Map the numeric speaker ID onto a named voice; anything outside 1-5 keeps the default.
    switch(speaker_seed)
    {
        case 1:
        voiceused = "am_adam";
        break;
        case 2:
        voiceused = "af_alloy";
        break;
        case 3:
        voiceused = "af_jessica";
        break;
        case 4:
        voiceused = "bm_george";
        break;
        case 5:
        voiceused = "bf_isabella";
        break;
    }
    if(ttsdebugmode==1 && !tts_is_quiet)
    {
        printf("\nUsing Speaker ID: %d, Voice: %s", speaker_seed, voiceused.c_str());
        printf("\nInput: %s\n", prompt.c_str());
    }
    ttscpp_config->voice = voiceused;
    tts_response response_data;
    int errorres = generate(ttscpp_runner, prompt, &response_data, ttscpp_config); // returns 0 on success
    if(errorres==0)
    {
        ttstime = timer_check();
        // BUGFIX: the format string has two specifiers (%d, %.2f) but only one argument was
        // passed — %d swallowed the double and %.2f read garbage (UB). Supply the sample count.
        printf("\nTTS Generated %d samples of audio in %.2fs.\n", (int)response_data.n_outputs, ttstime);
        std::vector<float> wavdat = std::vector(response_data.data, response_data.data + response_data.n_outputs);
        last_generated_audio = save_wav16_base64(wavdat, ttscpp_runner->sampling_rate);
        output.data = last_generated_audio.c_str(); // points into the static string, valid until next generation
        output.status = 1;
        last_generation_settings_audio_seed = 0;
        last_generation_settings_speaker_seed = speaker_seed;
        last_generation_settings_prompt = std::string(prompt);
        total_tts_gens += 1;
        return output;
    }
    else
    {
        printf("\nError: TTSCPP generation failed\n");
        output.data = "";
        output.status = 0;
        return output;
    }
}
static tts_generation_outputs ttstype_generate_outetts(const tts_generation_inputs inputs)
{ {
tts_generation_outputs output; tts_generation_outputs output;
@ -1051,3 +1117,12 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
return output; return output;
} }
} }
// Public TTS entry point: routes the request to whichever backend matched the
// loaded model's architecture (TTS.CPP if a supported arch was detected at load
// time, otherwise the OuteTTS pipeline).
tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
{
    return is_ttscpp_file ? ttstype_generate_ttscpp(inputs)
                          : ttstype_generate_outetts(inputs);
}

View file

@ -1389,7 +1389,8 @@ std::vector<std::vector<uint32_t>> kokoro_runner::tokenize_chunks(std::vector<st
int kokoro_runner::generate(std::string prompt, struct tts_response * response, std::string voice, std::string voice_code) { int kokoro_runner::generate(std::string prompt, struct tts_response * response, std::string voice, std::string voice_code) {
if (model->voices.find(voice) == model->voices.end()) { if (model->voices.find(voice) == model->voices.end()) {
TTS_ABORT("Failed to find Kokoro voice '%s' aborting.\n", voice.c_str()); fprintf(stdout,"\nFailed to find Kokoro voice '%s' aborting.\n", voice.c_str());
return -1;
} else { } else {
// if the language changed then we should change the phonemization voice // if the language changed then we should change the phonemization voice
if (phmzr->mode == ESPEAK && kctx->voice[0] != voice[0]) { if (phmzr->mode == ESPEAK && kctx->voice[0] != voice[0]) {

View file

@ -409,7 +409,8 @@ int orpheus_runner::generate(std::string sentence, struct tts_response * respons
// it should be possible to update the max context window size, but currently it is extremely unlikely that a single prompt will // it should be possible to update the max context window size, but currently it is extremely unlikely that a single prompt will
// surpass the default size. // surpass the default size.
if (batch.tokens.size() > model->max_context_length) { if (batch.tokens.size() > model->max_context_length) {
TTS_ABORT("The prompt was too large for the default context window. Try splitting up or shortenning the prompt."); fprintf(stdout,"The prompt was too large for the default context window. Try splitting up or shortenning the prompt.");
return -1;
} }
octx->reset(); octx->reset();
generation_sampler->reset(); generation_sampler->reset();
@ -427,7 +428,8 @@ void orpheus_runner::configure_generation(generation_configuration * config) {
generation_sampler->top_k = config->top_k; generation_sampler->top_k = config->top_k;
generation_sampler->top_p = config->top_p; generation_sampler->top_p = config->top_p;
if (std::find(orpheus_voices.begin(), orpheus_voices.end(), config->voice) == orpheus_voices.end() && !config->voice.empty()) { if (std::find(orpheus_voices.begin(), orpheus_voices.end(), config->voice) == orpheus_voices.end() && !config->voice.empty()) {
TTS_ABORT("Voice '%s' is not a valid voice for Orpheus.", config->voice.c_str()); fprintf(stdout,"Voice '%s' is not a valid voice for Orpheus. Defaulting to zoe.", config->voice.c_str());
config->voice = "zoe";
} }
octx->voice = config->voice; octx->voice = config->voice;
} }

View file

@ -162,6 +162,7 @@ struct tts_runner * runner_from_file(const std::string & fname, int n_threads, g
} }
} }
//returns 0 on success
int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config) { int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config) {
switch(runner->arch) { switch(runner->arch) {
case PARLER_TTS_ARCH: case PARLER_TTS_ARCH: