mirror of https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 09:04:36 +00:00
tts.cpp merged and working in kcpp!
This commit is contained in:
parent 52606e9b1d
commit bcaf379509
5 changed files with 90 additions and 11 deletions
koboldcpp.py (10 changes)
@@ -5333,13 +5333,13 @@ def show_gui():
     audio_tab = tabcontent["Audio"]
     makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.")
     whisper_model_var.trace_add("write", gui_changed_modelfile)
-    makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech Required):", "Select OuteTTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.")
+    makefileentry(audio_tab, "TTS Model (Text-To-Speech):", "Select TTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a TTS GGUF model file on disk to be loaded for Narration.")
     tts_model_var.trace_add("write", gui_changed_modelfile)
-    makelabelentry(audio_tab, "OuteTTS Threads:" , tts_threads_var, 5, 50,padx=290,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.")
+    makelabelentry(audio_tab, "TTS Threads:" , tts_threads_var, 5, 50,padx=290,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.")
-    makelabelentry(audio_tab, "OuteTTS Max Tokens:" , ttsmaxlen_var, 7, 50,padx=290,singleline=True,tooltip="Max allowed audiotokens to generate per TTS request.")
+    makelabelentry(audio_tab, "TTS Max Tokens:" , ttsmaxlen_var, 7, 50,padx=290,singleline=True,tooltip="Max allowed audiotokens to generate per TTS request.")
     makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 9, 0,tooltiptxt="Uses the GPU for TTS.")
     ttsgpu_var.trace_add("write", gui_changed_modelfile)
-    makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech Required):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 11, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
+    makefileentry(audio_tab, "WavTokenizer Model (Required for OuteTTS):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 11, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
     wavtokenizer_var.trace_add("write", gui_changed_modelfile)

     admin_tab = tabcontent["Admin"]
@@ -7610,7 +7610,7 @@ if __name__ == '__main__':
     whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="")

     ttsparsergroup = parser.add_argument_group('TTS Narration Commands')
-    ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="")
+    ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the TTS Text-To-Speech GGUF model.", default="")
     ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="")
     ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true')
     ttsparsergroup.add_argument("--ttsmaxlen", help="Limit number of audio tokens generated with TTS.", type=int, default=default_ttsmaxlen)
@@ -499,6 +499,7 @@ static int nthreads = 4;
 static int tts_max_len = 4096;

 //ttscpp specific
+static bool is_ttscpp_file = false;
 static generation_configuration * ttscpp_config = nullptr;
 static struct tts_runner * ttscpp_runner = nullptr;

@@ -539,7 +540,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
     std::string modelfile_cts = inputs.cts_model_filename;
     std::string detectedarch = gguf_get_model_arch(modelfile_ttc);

-    bool is_ttscpp_file = false;
+    is_ttscpp_file = false;
     if (detectedarch!="" && SUPPORTED_ARCHITECTURES.find(detectedarch) != SUPPORTED_ARCHITECTURES.end()) {
         is_ttscpp_file = true;
         printf("\nLoading TTS.CPP Model Arch: %s \n", detectedarch.c_str());
@@ -556,7 +557,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)

     // tts init
     if (is_ttscpp_file) {
-        ttscpp_config = new generation_configuration("af_alloy", 50, 1.0, 1.0, true, "", 0, 1.0);
+        ttscpp_config = new generation_configuration("am_adam", 50, 1.0, 1.0, true, "", 0, 1.0);
         ttscpp_runner = runner_from_file(modelfile_ttc, inputs.threads, ttscpp_config, true);
         if (ttscpp_runner == nullptr) {
             printf("\nTTS Load Error: Failed to initialize TTSCPP!\n");
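Taken together, the hunks above form the whole TTS.cpp load path: detect the GGUF architecture, build a generation_configuration, and create a runner via runner_from_file. The sketch below is a minimal, hedged reconstruction of that flow using only the call shapes visible in this diff (generation_configuration, runner_from_file, generate, tts_response); the include name "tts.h", the model filename, and the meaning of the numeric constructor arguments are placeholders and assumptions, not facts about the repo.

// Hedged sketch: load a TTS.cpp model and synthesize one line of text.
#include <cstdio>
#include <string>
#include "tts.h" // assumed header exposing tts.cpp's public API

int main() {
    // Same constructor arguments as the loader hunk above; "am_adam" is the
    // default voice, the numeric arguments are copied verbatim from the diff.
    generation_configuration * cfg =
        new generation_configuration("am_adam", 50, 1.0, 1.0, true, "", 0, 1.0);

    // Build a runner straight from a GGUF file; nullptr signals a load failure.
    struct tts_runner * runner = runner_from_file("model.gguf", 4, cfg, true);
    if (runner == nullptr) {
        printf("TTS load failed\n");
        return 1;
    }

    // generate() returns 0 on success and fills the response with float samples.
    tts_response response;
    if (generate(runner, "Hello from KoboldCpp.", &response, cfg) == 0) {
        printf("Generated %d samples at %d Hz\n",
               (int)response.n_outputs, (int)runner->sampling_rate);
    }
    return 0;
}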
@@ -640,7 +641,72 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
     return true;
 }

-tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
+static tts_generation_outputs ttstype_generate_ttscpp(const tts_generation_inputs inputs)
+{
+    tts_generation_outputs output;
+    if(ttscpp_runner==nullptr || ttscpp_config==nullptr)
+    {
+        printf("\nWarning: KCPP TTSCPP not initialized! Make sure TTS model is loaded successfully.\n");
+        output.data = "";
+        output.status = 0;
+        return output;
+    }
+    int speaker_seed = inputs.speaker_seed;
+    std::string voiceused = "am_adam";
+    std::string prompt = inputs.prompt;
+    double ttstime = 0;
+    timer_start();
+    switch(speaker_seed)
+    {
+        case 1:
+            voiceused = "am_adam";
+            break;
+        case 2:
+            voiceused = "af_alloy";
+            break;
+        case 3:
+            voiceused = "af_jessica";
+            break;
+        case 4:
+            voiceused = "bm_george";
+            break;
+        case 5:
+            voiceused = "bf_isabella";
+            break;
+    }
+    if(ttsdebugmode==1 && !tts_is_quiet)
+    {
+        printf("\nUsing Speaker ID: %d, Voice: %s", speaker_seed, voiceused.c_str());
+        printf("\nInput: %s\n", prompt.c_str());
+    }
+    ttscpp_config->voice = voiceused;
+
+    tts_response response_data;
+    int errorres = generate(ttscpp_runner, prompt, &response_data, ttscpp_config);
+    if(errorres==0)
+    {
+        ttstime = timer_check();
+        printf("\nTTS Generated audio in %.2fs.\n", ttstime);
+        std::vector<float> wavdat = std::vector(response_data.data, response_data.data + response_data.n_outputs);
+        last_generated_audio = save_wav16_base64(wavdat, ttscpp_runner->sampling_rate);
+        output.data = last_generated_audio.c_str();
+        output.status = 1;
+        last_generation_settings_audio_seed = 0;
+        last_generation_settings_speaker_seed = speaker_seed;
+        last_generation_settings_prompt = std::string(prompt);
+        total_tts_gens += 1;
+        return output;
+    }
+    else
+    {
+        printf("\nError: TTSCPP generation failed\n");
+        output.data = "";
+        output.status = 0;
+        return output;
+    }
+}
+
+static tts_generation_outputs ttstype_generate_outetts(const tts_generation_inputs inputs)
 {
     tts_generation_outputs output;

@@ -1051,3 +1117,12 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
     return output;
     }
 }
+
+tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
+{
+    if (is_ttscpp_file) {
+        return ttstype_generate_ttscpp(inputs);
+    } else {
+        return ttstype_generate_outetts(inputs);
+    }
+}
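After this hunk, ttstype_generate is only a thin router: models detected as TTS.cpp architectures go through ttstype_generate_ttscpp, everything else keeps the old OuteTTS path. A hedged caller-side fragment follows; the prompt and speaker_seed fields of tts_generation_inputs appear in this diff, while anything else about the struct or its construction is assumed.

// Illustrative fragment only; assumes the kcpp adapter declarations for
// tts_generation_inputs / ttstype_generate are in scope.
static void narrate_example() {
    tts_generation_inputs in = {};
    in.prompt = "Welcome to KoboldCpp narration.";
    in.speaker_seed = 3; // 3 maps to "af_jessica" in the switch added above
    tts_generation_outputs out = ttstype_generate(in); // routes to TTS.cpp or OuteTTS
    if (out.status == 1) {
        // out.data holds the base64-encoded 16-bit WAV from save_wav16_base64
    }
}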
@@ -1389,7 +1389,8 @@ std::vector<std::vector<uint32_t>> kokoro_runner::tokenize_chunks(std::vector<st

 int kokoro_runner::generate(std::string prompt, struct tts_response * response, std::string voice, std::string voice_code) {
     if (model->voices.find(voice) == model->voices.end()) {
-        TTS_ABORT("Failed to find Kokoro voice '%s' aborting.\n", voice.c_str());
+        fprintf(stdout,"\nFailed to find Kokoro voice '%s' aborting.\n", voice.c_str());
+        return -1;
     } else {
         // if the language changed then we should change the phonemization voice
         if (phmzr->mode == ESPEAK && kctx->voice[0] != voice[0]) {
@@ -409,7 +409,8 @@ int orpheus_runner::generate(std::string sentence, struct tts_response * respons
     // it should be possible to update the max context window size, but currently it is extremely unlikely that a single prompt will
     // surpass the default size.
     if (batch.tokens.size() > model->max_context_length) {
-        TTS_ABORT("The prompt was too large for the default context window. Try splitting up or shortenning the prompt.");
+        fprintf(stdout,"The prompt was too large for the default context window. Try splitting up or shortening the prompt.");
+        return -1;
     }
     octx->reset();
     generation_sampler->reset();
@@ -427,7 +428,8 @@ void orpheus_runner::configure_generation(generation_configuration * config) {
     generation_sampler->top_k = config->top_k;
     generation_sampler->top_p = config->top_p;
     if (std::find(orpheus_voices.begin(), orpheus_voices.end(), config->voice) == orpheus_voices.end() && !config->voice.empty()) {
-        TTS_ABORT("Voice '%s' is not a valid voice for Orpheus.", config->voice.c_str());
+        fprintf(stdout,"Voice '%s' is not a valid voice for Orpheus. Defaulting to zoe.", config->voice.c_str());
+        config->voice = "zoe";
     }
     octx->voice = config->voice;
 }
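The Kokoro and Orpheus hunks above share one theme: TTS.CPP's hard TTS_ABORT calls, which would take the whole KoboldCpp process down, are replaced with a printed message plus a recoverable outcome (an error return code or a fallback voice). Below is a generic sketch of that pattern; both helper names are hypothetical and exist only for illustration.

// Hypothetical illustration of the abort-to-soft-failure pattern used above.
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

// Unknown voice: warn and fall back to a default instead of aborting.
static std::string pick_voice(const std::vector<std::string> & known,
                              const std::string & requested,
                              const std::string & fallback) {
    if (!requested.empty() &&
        std::find(known.begin(), known.end(), requested) == known.end()) {
        fprintf(stdout, "Voice '%s' is not valid, defaulting to %s.\n",
                requested.c_str(), fallback.c_str());
        return fallback;
    }
    return requested.empty() ? fallback : requested;
}

// Oversized prompt: report and return an error code the caller can surface.
static int check_prompt(size_t token_count, size_t max_context) {
    if (token_count > max_context) {
        fprintf(stdout, "The prompt was too large for the context window.\n");
        return -1;
    }
    return 0;
}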
@@ -162,6 +162,7 @@ struct tts_runner * runner_from_file(const std::string & fname, int n_threads, g
     }
 }

+//returns 0 on success
 int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config) {
     switch(runner->arch) {
         case PARLER_TTS_ARCH: