custom voice handling

2025-09-10 17:14:36 +00:00 · 2025-08-18 16:57:34 +08:00 · 2025-08-18 16:57:34 +08:00 · 2abe11071b
commit 2abe11071b
parent 67ef5e6c02
3 changed files with 31 additions and 17 deletions
--- a/expose.h
+++ b/expose.h
@@ -247,6 +247,7 @@ struct tts_generation_inputs
    const char * prompt = nullptr;
    const int speaker_seed = 0;
    const int audio_seed = 0;
    const char * custom_speaker_voice = "";
    const char * custom_speaker_text = "";
    const char * custom_speaker_data = "";
 };
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -352,6 +352,7 @@ class tts_generation_inputs(ctypes.Structure):
    _fields_ = [("prompt", ctypes.c_char_p),
                ("speaker_seed", ctypes.c_int),
                ("audio_seed", ctypes.c_int),
                ("custom_speaker_voice", ctypes.c_char_p),
                ("custom_speaker_text", ctypes.c_char_p),
                ("custom_speaker_data", ctypes.c_char_p)]
@@ -1880,6 +1881,7 @@ def tts_generate(genparams):
    else:
        voice = simple_lcg_hash(voicestr.strip()) if voicestr else 1
    inputs = tts_generation_inputs()
    inputs.custom_speaker_voice = normalized_voice.encode("UTF-8")
    inputs.prompt = prompt.encode("UTF-8")
    inputs.speaker_seed = voice
    aseed = -1
--- a/otherarch/tts_adapter.cpp
+++ b/otherarch/tts_adapter.cpp
@@ -502,6 +502,7 @@ static int tts_max_len = 4096;
 static bool is_ttscpp_file = false;
 static generation_configuration * ttscpp_config = nullptr;
 static struct tts_runner * ttscpp_runner = nullptr;
 static std::string detectedarch = "";
 int total_tts_gens = 0;
 static std::string tts_executable_path = "";
@@ -540,7 +541,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
    std::string modelfile_ttc = inputs.ttc_model_filename;
    std::string modelfile_cts = inputs.cts_model_filename;
-    std::string detectedarch = gguf_get_model_arch(modelfile_ttc);
+    detectedarch = gguf_get_model_arch(modelfile_ttc);
    is_ttscpp_file = false;
    if (detectedarch!="" && SUPPORTED_ARCHITECTURES.find(detectedarch) != SUPPORTED_ARCHITECTURES.end()) {
@@ -663,24 +664,34 @@ static tts_generation_outputs ttstype_generate_ttscpp(const tts_generation_input
    std::string prompt = inputs.prompt;
    double ttstime = 0;
    timer_start();
-    switch(speaker_seed)
+
    std::vector<std::string> vmapper = {};
    std::vector<std::string> vpermitted = {};
    if(detectedarch=="kokoro")
    {
-        case 1:
+        vmapper = {"am_echo","af_heart","af_alloy","bm_daniel","bf_isabella"};
-            voiceused = "am_echo";
+        vpermitted = {"af_alloy", "af_aoede", "af_bella", "af_heart", "af_jessica", "af_kore", "af_nicole", "af_nova", "af_river", "af_sarah", "af_sky", "am_adam", "am_echo", "am_eric", "am_fenrir", "am_liam", "am_michael", "am_onyx", "am_puck", "am_santa", "bf_alice", "bf_emma", "bf_isabella", "bf_lily", "bm_daniel", "bm_fable", "bm_george", "bm_lewis"};
-            break;
-        case 2:
-            voiceused = "af_alloy";
-            break;
-        case 3:
-            voiceused = "af_jessica";
-            break;
-        case 4:
-            voiceused = "bm_daniel";
-            break;
-        case 5:
-            voiceused = "bf_isabella";
-            break;
    }
    else if(detectedarch=="dia")
    {
        vmapper = {"zoe", "zac", "jess", "leo", "mia"};
        vpermitted = {"zoe", "zac","jess", "leo", "mia", "julia", "leah"};
    }
    if(speaker_seed>=1 && speaker_seed<=5 && vmapper.size()>=5)
    {
        voiceused = vmapper[speaker_seed-1];
    }
    else if(vpermitted.size()>0)
    {
        //if we can match the voice, use it
        const std::string cspeaker = inputs.custom_speaker_voice;
        if (std::find(vpermitted.begin(), vpermitted.end(), cspeaker) != vpermitted.end()) {
            voiceused = cspeaker;
        }
    }
    if(ttsdebugmode==1 && !tts_is_quiet)
    {
        printf("\nUsing Speaker ID: %d, Voice: %s", speaker_seed, voiceused.c_str());