Improved the quality and sample rate of the default TTS voices

2025-09-10 09:04:36 +00:00 · 2025-01-17 18:45:16 +08:00 · 2025-01-17 18:45:16 +08:00 · e8570de0e6
commit e8570de0e6
parent 8d961bba29
5 changed files with 37 additions and 28 deletions
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -283,7 +283,8 @@ class whisper_generation_outputs(ctypes.Structure):
                ("data", ctypes.c_char_p)]

 class tts_load_model_inputs(ctypes.Structure):
-    _fields_ = [("ttc_model_filename", ctypes.c_char_p),
+    _fields_ = [("threads", ctypes.c_int),
+                ("ttc_model_filename", ctypes.c_char_p),
                ("cts_model_filename", ctypes.c_char_p),
                ("executable_path", ctypes.c_char_p),
                ("clblast_info", ctypes.c_int),
@ -1346,6 +1347,12 @@ def tts_load_model(ttc_model_filename,cts_model_filename):
    inputs.ttc_model_filename = ttc_model_filename.encode("UTF-8")
    inputs.cts_model_filename = cts_model_filename.encode("UTF-8")
    inputs.gpulayers = (999 if args.ttsgpu else 0)
+    thds = args.threads
+    if args.ttsthreads and args.ttsthreads > 0:
+        ttst = int(args.ttsthreads)
+        if ttst > 0:
+            thds = ttst
+    inputs.threads = thds
    inputs = set_backend_props(inputs)
    ret = handle.tts_load_model(inputs)
    return ret
@ -1357,7 +1364,7 @@ def tts_generate(genparams):
    prompt = prompt.strip()
    voice = 1
    voicestr = genparams.get("voice", genparams.get("speaker_wav", ""))
-    voice_mapping = ["kobo","cheery","sleepy","tutor","shouty","bored","record"]
+    voice_mapping = ["kobo","cheery","sleepy","shouty","chatty"]
    normalized_voice = voicestr.strip().lower() if voicestr else ""
    if normalized_voice in voice_mapping:
        voice = voice_mapping.index(normalized_voice) + 1
@ -2332,9 +2339,9 @@ Enter Prompt:<br>
           response_body = (json.dumps([]).encode())

        elif self.path.endswith(('/speakers_list')): #xtts compatible
-            response_body = (json.dumps(["kobo","cheery","sleepy","tutor","shouty","bored","record"]).encode()) #some random voices for them to enjoy
+            response_body = (json.dumps(["kobo","cheery","sleepy","shouty","chatty"]).encode()) #some random voices for them to enjoy
        elif self.path.endswith(('/speakers')): #xtts compatible
-            response_body = (json.dumps([{"name":"kobo","voice_id":"kobo","preview_url":""},{"name":"cheery","voice_id":"cheery","preview_url":""},{"name":"sleepy","voice_id":"sleepy","preview_url":""},{"name":"tutor","voice_id":"tutor","preview_url":""},{"name":"shouty","voice_id":"shouty","preview_url":""},{"name":"bored","voice_id":"bored","preview_url":""},{"name":"record","voice_id":"record","preview_url":""}]).encode()) #some random voices for them to enjoy
+            response_body = (json.dumps([{"name":"kobo","voice_id":"kobo","preview_url":""},{"name":"cheery","voice_id":"cheery","preview_url":""},{"name":"sleepy","voice_id":"sleepy","preview_url":""},{"name":"shouty","voice_id":"shouty","preview_url":""},{"name":"chatty","voice_id":"chatty","preview_url":""}]).encode()) #some random voices for them to enjoy
        elif self.path.endswith(('/get_tts_settings')): #xtts compatible
            response_body = (json.dumps({"temperature":0.75,"speed":1,"length_penalty":1,"repetition_penalty":1,"top_p":1,"top_k":4,"enable_text_splitting":True,"stream_chunk_size":100}).encode()) #some random voices for them to enjoy

@ -3158,6 +3165,7 @@ def show_gui():
    tts_model_var = ctk.StringVar()
    wavtokenizer_var = ctk.StringVar()
    ttsgpu_var = ctk.IntVar(value=0)
+    tts_threads_var = ctk.StringVar(value=str(default_threads))

    def tabbuttonaction(name):
        for t in tabcontent:
@ -3728,11 +3736,12 @@ def show_gui():
    audio_tab = tabcontent["Audio"]
    makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.")
    whisper_model_var.trace("w", gui_changed_modelfile)
-    makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech):", "Select OuteTTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.")
+    makelabelentry(audio_tab, "OuteTTS Threads:" , tts_threads_var, 3, 50,padx=290,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.")
+    makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech):", "Select OuteTTS GGUF Model File", tts_model_var, 5, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.")
    tts_model_var.trace("w", gui_changed_modelfile)
-    makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 5, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
+    makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 7, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
    wavtokenizer_var.trace("w", gui_changed_modelfile)
-    makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 7, 0,tooltiptxt="Uses the GPU for TTS.")
+    makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 9, 0,tooltiptxt="Uses the GPU for TTS.")
    ttsgpu_var.trace("w", gui_changed_modelfile)

    def kcpp_export_template():
@ -3760,6 +3769,7 @@ def show_gui():
        savdict["tensor_split"] = None
        savdict["draftgpusplit"] = None
        savdict["config"] = None
+        savdict["ttsthreads"] = 0
        filename = asksaveasfile(filetypes=file_type, defaultextension=file_type)
        if filename is None:
            return
@ -3950,6 +3960,7 @@ def show_gui():
            args.whispermodel = whisper_model_var.get()

        if tts_model_var.get() != "" and wavtokenizer_var.get() != "":
+            args.ttsthreads = (0 if tts_threads_var.get()=="" else int(tts_threads_var.get()))
            args.ttsmodel = tts_model_var.get()
            args.ttswavtokenizer = wavtokenizer_var.get()
            args.ttsgpu = (ttsgpu_var.get()==1)
@ -4114,6 +4125,7 @@ def show_gui():

        whisper_model_var.set(dict["whispermodel"] if ("whispermodel" in dict and dict["whispermodel"]) else "")

+        tts_threads_var.set(str(dict["ttsthreads"]) if ("ttsthreads" in dict and dict["ttsthreads"]) else str(default_threads))
        tts_model_var.set(dict["ttsmodel"] if ("ttsmodel" in dict and dict["ttsmodel"]) else "")
        wavtokenizer_var.set(dict["ttswavtokenizer"] if ("ttswavtokenizer" in dict and dict["ttswavtokenizer"]) else "")
        ttsgpu_var.set(dict["ttsgpu"] if ("ttsgpu" in dict) else 0)
@ -5527,6 +5539,7 @@ if __name__ == '__main__':
    ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="")
    ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="")
    ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true')
+    ttsparsergroup.add_argument("--ttsthreads", metavar=('[threads]'), help="Use a different number of threads for TTS if specified. Otherwise, has the same value as --threads.", type=int, default=0)

    deprecatedgroup = parser.add_argument_group('Deprecated Commands, DO NOT USE!')
    deprecatedgroup.add_argument("--hordeconfig", help=argparse.SUPPRESS, nargs='+')