added support for seeded tts voices

2025-09-10 17:14:36 +00:00 · 2025-01-13 19:11:34 +08:00 · 2025-01-13 19:11:34 +08:00 · 62e33d0bf7
commit 62e33d0bf7
parent b3de1598e7
2 changed files with 211 additions and 44 deletions
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -622,6 +622,16 @@ def bring_terminal_to_foreground():
        ctypes.windll.user32.ShowWindow(ctypes.windll.kernel32.GetConsoleWindow(), 9)
        ctypes.windll.user32.SetForegroundWindow(ctypes.windll.kernel32.GetConsoleWindow())

+def simple_lcg_hash(input_string): #deterministically maps any string to an integer between 10000 and 99998 inclusive (empty string gives 35343)
+    a = 1664525 #multiplier applied to the running state on every step
+    c = 1013904223 #constant increment added on every step
+    m = 89999  # Modulo: keeps the running state within 0..89998
+    hash_value = 25343 #fixed starting state, so the same string always yields the same result
+    for char in input_string:
+        hash_value = (a * hash_value + ord(char) + c) % m #fold each character's code point into the state
+    hash_value += 10000 #shift 0..89998 up to 10000..99998 so the result always has five digits
+    return hash_value
+
 def string_has_overlap(str_a, str_b, maxcheck):
    max_overlap = min(maxcheck, len(str_a), len(str_b))
    for i in range(1, max_overlap + 1):
@ -1331,11 +1341,13 @@ def tts_load_model(ttc_model_filename,cts_model_filename):
 def tts_generate(genparams):
    global args
    is_quiet = True if (args.quiet or args.debugmode == -1) else False
-    prompt = genparams.get("input", "")
+    prompt = genparams.get("input", genparams.get("text", ""))
    prompt = prompt.strip()
+    voicestr = genparams.get("voice", genparams.get("speaker_wav", ""))
+    voice = simple_lcg_hash(voicestr) if voicestr else 1
    inputs = tts_generation_inputs()
    inputs.prompt = prompt.encode("UTF-8")
-    inputs.speaker_seed = 0
+    inputs.speaker_seed = voice
    inputs.audio_seed = 0
    inputs.quiet = is_quiet
    ret = handle.tts_generate(inputs)
@ -2296,6 +2308,9 @@ Enter Prompt:<br>
        elif self.path.endswith('/sdapi/v1/upscalers'):
           response_body = (json.dumps([]).encode())

+        elif self.path.endswith(('/speakers_list')): #xtts compatible
+            response_body = (json.dumps(["kobo","bean","corn","spicy","lime","fire","metal","potato"]).encode()) #some random voices for them to enjoy
+
        elif self.path.endswith(('/api/tags')): #ollama compatible
            response_body = (json.dumps({"models":[{"name":"koboldcpp","model":friendlymodelname,"modified_at":"2024-07-19T15:26:55.6122841+08:00","size":394998579,"digest":"b5dc5e784f2a3ee1582373093acf69a2f4e2ac1710b253a001712b86a61f88bb","details":{"parent_model":"","format":"gguf","family":"koboldcpp","families":["koboldcpp"],"parameter_size":"128M","quantization_level":"Q4_0"}}]}).encode())

@ -2671,7 +2686,7 @@ Enter Prompt:<br>
            if self.path.endswith('/api/extra/transcribe') or self.path.endswith('/v1/audio/transcriptions'):
                is_transcribe = True

-            if self.path.endswith('/api/extra/tts') or self.path.endswith('/v1/audio/speech'):
+            if self.path.endswith('/api/extra/tts') or self.path.endswith('/v1/audio/speech') or self.path.endswith('/tts_to_audio'):
                is_tts = True

            if is_imggen or is_transcribe or is_tts or api_format > 0: