From 2abe11071bc580b3788c81055da11b47d8f185f9 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Mon, 18 Aug 2025 16:57:34 +0800
Subject: [PATCH] custom voice handling

---
 expose.h                  |  1 +
 koboldcpp.py              |  2 ++
 otherarch/tts_adapter.cpp | 45 ++++++++++++++++++++++++---------------
 3 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/expose.h b/expose.h
index 575b3306e..c27ecebbd 100644
--- a/expose.h
+++ b/expose.h
@@ -247,6 +247,7 @@ struct tts_generation_inputs
     const char * prompt = nullptr;
     const int speaker_seed = 0;
     const int audio_seed = 0;
+    const char * custom_speaker_voice = "";
     const char * custom_speaker_text = "";
     const char * custom_speaker_data = "";
 };
diff --git a/koboldcpp.py b/koboldcpp.py
index 161a1cae6..d2dcba3af 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -352,6 +352,7 @@ class tts_generation_inputs(ctypes.Structure):
     _fields_ = [("prompt", ctypes.c_char_p),
                 ("speaker_seed", ctypes.c_int),
                 ("audio_seed", ctypes.c_int),
+                ("custom_speaker_voice", ctypes.c_char_p),
                 ("custom_speaker_text", ctypes.c_char_p),
                 ("custom_speaker_data", ctypes.c_char_p)]
 
@@ -1880,6 +1881,7 @@ def tts_generate(genparams):
     else:
         voice = simple_lcg_hash(voicestr.strip()) if voicestr else 1
     inputs = tts_generation_inputs()
+    inputs.custom_speaker_voice = normalized_voice.encode("UTF-8")
     inputs.prompt = prompt.encode("UTF-8")
     inputs.speaker_seed = voice
     aseed = -1
diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp
index 4e9d1c0ae..264a5a686 100644
--- a/otherarch/tts_adapter.cpp
+++ b/otherarch/tts_adapter.cpp
@@ -502,6 +502,7 @@ static int tts_max_len = 4096;
 static bool is_ttscpp_file = false;
 static generation_configuration * ttscpp_config = nullptr;
 static struct tts_runner * ttscpp_runner = nullptr;
+static std::string detectedarch = "";
 
 int total_tts_gens = 0;
 static std::string tts_executable_path = "";
@@ -540,7 +541,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
 
     std::string modelfile_ttc = inputs.ttc_model_filename;
     std::string modelfile_cts = inputs.cts_model_filename;
-    std::string detectedarch = gguf_get_model_arch(modelfile_ttc);
+    detectedarch = gguf_get_model_arch(modelfile_ttc);
 
     is_ttscpp_file = false;
     if (detectedarch!="" && SUPPORTED_ARCHITECTURES.find(detectedarch) != SUPPORTED_ARCHITECTURES.end()) {
@@ -663,24 +664,34 @@ static tts_generation_outputs ttstype_generate_ttscpp(const tts_generation_input
     std::string prompt = inputs.prompt;
     double ttstime = 0;
     timer_start();
-    switch(speaker_seed)
+
+    std::vector<std::string> vmapper = {};
+    std::vector<std::string> vpermitted = {};
+
+    if(detectedarch=="kokoro")
     {
-        case 1:
-            voiceused = "am_echo";
-            break;
-        case 2:
-            voiceused = "af_alloy";
-            break;
-        case 3:
-            voiceused = "af_jessica";
-            break;
-        case 4:
-            voiceused = "bm_daniel";
-            break;
-        case 5:
-            voiceused = "bf_isabella";
-            break;
+        vmapper = {"am_echo","af_heart","af_alloy","bm_daniel","bf_isabella"};
+        vpermitted = {"af_alloy", "af_aoede", "af_bella", "af_heart", "af_jessica", "af_kore", "af_nicole", "af_nova", "af_river", "af_sarah", "af_sky", "am_adam", "am_echo", "am_eric", "am_fenrir", "am_liam", "am_michael", "am_onyx", "am_puck", "am_santa", "bf_alice", "bf_emma", "bf_isabella", "bf_lily", "bm_daniel", "bm_fable", "bm_george", "bm_lewis"};
     }
+    else if(detectedarch=="dia")
+    {
+        vmapper = {"zoe", "zac", "jess", "leo", "mia"};
+        vpermitted = {"zoe", "zac","jess", "leo", "mia", "julia", "leah"};
+    }
+
+    if(speaker_seed>=1 && speaker_seed<=5 && vmapper.size()>=5)
+    {
+        voiceused = vmapper[speaker_seed-1];
+    }
+    else if(vpermitted.size()>0)
+    {
+        // use the requested custom voice only if it is in the permitted list; otherwise voiceused is left unchanged
+        const std::string cspeaker = inputs.custom_speaker_voice;
+        if (std::find(vpermitted.begin(), vpermitted.end(), cspeaker) != vpermitted.end()) {
+            voiceused = cspeaker;
+        }
+    }
+
     if(ttsdebugmode==1 && !tts_is_quiet)
     {
         printf("\nUsing Speaker ID: %d, Voice: %s", speaker_seed, voiceused.c_str());