From 2abe11071bc580b3788c81055da11b47d8f185f9 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 18 Aug 2025 16:57:34 +0800 Subject: [PATCH] custom voice handling --- expose.h | 1 + koboldcpp.py | 2 ++ otherarch/tts_adapter.cpp | 45 ++++++++++++++++++++++++--------------- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/expose.h b/expose.h index 575b3306e..c27ecebbd 100644 --- a/expose.h +++ b/expose.h @@ -247,6 +247,7 @@ struct tts_generation_inputs const char * prompt = nullptr; const int speaker_seed = 0; const int audio_seed = 0; + const char * custom_speaker_voice = ""; const char * custom_speaker_text = ""; const char * custom_speaker_data = ""; }; diff --git a/koboldcpp.py b/koboldcpp.py index 161a1cae6..d2dcba3af 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -352,6 +352,7 @@ class tts_generation_inputs(ctypes.Structure): _fields_ = [("prompt", ctypes.c_char_p), ("speaker_seed", ctypes.c_int), ("audio_seed", ctypes.c_int), + ("custom_speaker_voice", ctypes.c_char_p), ("custom_speaker_text", ctypes.c_char_p), ("custom_speaker_data", ctypes.c_char_p)] @@ -1880,6 +1881,7 @@ def tts_generate(genparams): else: voice = simple_lcg_hash(voicestr.strip()) if voicestr else 1 inputs = tts_generation_inputs() + inputs.custom_speaker_voice = normalized_voice.encode("UTF-8") inputs.prompt = prompt.encode("UTF-8") inputs.speaker_seed = voice aseed = -1 diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp index 4e9d1c0ae..264a5a686 100644 --- a/otherarch/tts_adapter.cpp +++ b/otherarch/tts_adapter.cpp @@ -502,6 +502,7 @@ static int tts_max_len = 4096; static bool is_ttscpp_file = false; static generation_configuration * ttscpp_config = nullptr; static struct tts_runner * ttscpp_runner = nullptr; +static std::string detectedarch = ""; int total_tts_gens = 0; static std::string tts_executable_path = ""; @@ -540,7 +541,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs) std::string 
modelfile_ttc = inputs.ttc_model_filename; std::string modelfile_cts = inputs.cts_model_filename; - std::string detectedarch = gguf_get_model_arch(modelfile_ttc); + detectedarch = gguf_get_model_arch(modelfile_ttc); is_ttscpp_file = false; if (detectedarch!="" && SUPPORTED_ARCHITECTURES.find(detectedarch) != SUPPORTED_ARCHITECTURES.end()) { @@ -663,24 +664,34 @@ static tts_generation_outputs ttstype_generate_ttscpp(const tts_generation_input std::string prompt = inputs.prompt; double ttstime = 0; timer_start(); - switch(speaker_seed) + + std::vector<std::string> vmapper = {}; + std::vector<std::string> vpermitted = {}; + + if(detectedarch=="kokoro") { - case 1: - voiceused = "am_echo"; - break; - case 2: - voiceused = "af_alloy"; - break; - case 3: - voiceused = "af_jessica"; - break; - case 4: - voiceused = "bm_daniel"; - break; - case 5: - voiceused = "bf_isabella"; - break; + vmapper = {"am_echo","af_heart","af_alloy","bm_daniel","bf_isabella"}; + vpermitted = {"af_alloy", "af_aoede", "af_bella", "af_heart", "af_jessica", "af_kore", "af_nicole", "af_nova", "af_river", "af_sarah", "af_sky", "am_adam", "am_echo", "am_eric", "am_fenrir", "am_liam", "am_michael", "am_onyx", "am_puck", "am_santa", "bf_alice", "bf_emma", "bf_isabella", "bf_lily", "bm_daniel", "bm_fable", "bm_george", "bm_lewis"}; } + else if(detectedarch=="dia") + { + vmapper = {"zoe", "zac", "jess", "leo", "mia"}; + vpermitted = {"zoe", "zac","jess", "leo", "mia", "julia", "leah"}; + } + + if(speaker_seed>=1 && speaker_seed<=5 && vmapper.size()>=5) + { + voiceused = vmapper[speaker_seed-1]; + } + else if(vpermitted.size()>0) + { + //if we can match the voice, use it (a null custom_speaker_voice pointer is treated as an empty name, which never matches) + const std::string cspeaker = inputs.custom_speaker_voice ? inputs.custom_speaker_voice : ""; + if (std::find(vpermitted.begin(), vpermitted.end(), cspeaker) != vpermitted.end()) { + voiceused = cspeaker; + } + } + if(ttsdebugmode==1 && !tts_is_quiet) { printf("\nUsing Speaker ID: %d, Voice: %s", speaker_seed, voiceused.c_str());