mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 09:04:36 +00:00
improved tts default voices quality and sample rate
This commit is contained in:
parent
8d961bba29
commit
e8570de0e6
5 changed files with 37 additions and 28 deletions
27
koboldcpp.py
27
koboldcpp.py
|
@@ -283,7 +283,8 @@ class whisper_generation_outputs(ctypes.Structure):
|
|||
("data", ctypes.c_char_p)]
|
||||
|
||||
class tts_load_model_inputs(ctypes.Structure):
|
||||
_fields_ = [("ttc_model_filename", ctypes.c_char_p),
|
||||
_fields_ = [("threads", ctypes.c_int),
|
||||
("ttc_model_filename", ctypes.c_char_p),
|
||||
("cts_model_filename", ctypes.c_char_p),
|
||||
("executable_path", ctypes.c_char_p),
|
||||
("clblast_info", ctypes.c_int),
|
||||
|
@@ -1346,6 +1347,12 @@ def tts_load_model(ttc_model_filename,cts_model_filename):
|
|||
inputs.ttc_model_filename = ttc_model_filename.encode("UTF-8")
|
||||
inputs.cts_model_filename = cts_model_filename.encode("UTF-8")
|
||||
inputs.gpulayers = (999 if args.ttsgpu else 0)
|
||||
thds = args.threads
|
||||
if args.ttsthreads and args.ttsthreads > 0:
|
||||
ttst = int(args.ttsthreads)
|
||||
if ttst > 0:
|
||||
thds = ttst
|
||||
inputs.threads = thds
|
||||
inputs = set_backend_props(inputs)
|
||||
ret = handle.tts_load_model(inputs)
|
||||
return ret
|
||||
|
@@ -1357,7 +1364,7 @@ def tts_generate(genparams):
|
|||
prompt = prompt.strip()
|
||||
voice = 1
|
||||
voicestr = genparams.get("voice", genparams.get("speaker_wav", ""))
|
||||
voice_mapping = ["kobo","cheery","sleepy","tutor","shouty","bored","record"]
|
||||
voice_mapping = ["kobo","cheery","sleepy","shouty","chatty"]
|
||||
normalized_voice = voicestr.strip().lower() if voicestr else ""
|
||||
if normalized_voice in voice_mapping:
|
||||
voice = voice_mapping.index(normalized_voice) + 1
|
||||
|
@@ -2332,9 +2339,9 @@ Enter Prompt:<br>
|
|||
response_body = (json.dumps([]).encode())
|
||||
|
||||
elif self.path.endswith(('/speakers_list')): #xtts compatible
|
||||
response_body = (json.dumps(["kobo","cheery","sleepy","tutor","shouty","bored","record"]).encode()) #some random voices for them to enjoy
|
||||
response_body = (json.dumps(["kobo","cheery","sleepy","shouty","chatty"]).encode()) #some random voices for them to enjoy
|
||||
elif self.path.endswith(('/speakers')): #xtts compatible
|
||||
response_body = (json.dumps([{"name":"kobo","voice_id":"kobo","preview_url":""},{"name":"cheery","voice_id":"cheery","preview_url":""},{"name":"sleepy","voice_id":"sleepy","preview_url":""},{"name":"tutor","voice_id":"tutor","preview_url":""},{"name":"shouty","voice_id":"shouty","preview_url":""},{"name":"bored","voice_id":"bored","preview_url":""},{"name":"record","voice_id":"record","preview_url":""}]).encode()) #some random voices for them to enjoy
|
||||
response_body = (json.dumps([{"name":"kobo","voice_id":"kobo","preview_url":""},{"name":"cheery","voice_id":"cheery","preview_url":""},{"name":"sleepy","voice_id":"sleepy","preview_url":""},{"name":"shouty","voice_id":"shouty","preview_url":""},{"name":"chatty","voice_id":"chatty","preview_url":""}]).encode()) #some random voices for them to enjoy
|
||||
elif self.path.endswith(('/get_tts_settings')): #xtts compatible
|
||||
response_body = (json.dumps({"temperature":0.75,"speed":1,"length_penalty":1,"repetition_penalty":1,"top_p":1,"top_k":4,"enable_text_splitting":True,"stream_chunk_size":100}).encode()) #some random voices for them to enjoy
|
||||
|
||||
|
@@ -3158,6 +3165,7 @@ def show_gui():
|
|||
tts_model_var = ctk.StringVar()
|
||||
wavtokenizer_var = ctk.StringVar()
|
||||
ttsgpu_var = ctk.IntVar(value=0)
|
||||
tts_threads_var = ctk.StringVar(value=str(default_threads))
|
||||
|
||||
def tabbuttonaction(name):
|
||||
for t in tabcontent:
|
||||
|
@@ -3728,11 +3736,12 @@ def show_gui():
|
|||
audio_tab = tabcontent["Audio"]
|
||||
makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.")
|
||||
whisper_model_var.trace("w", gui_changed_modelfile)
|
||||
makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech):", "Select OuteTTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.")
|
||||
makelabelentry(audio_tab, "OuteTTS Threads:" , tts_threads_var, 3, 50,padx=290,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.")
|
||||
makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech):", "Select OuteTTS GGUF Model File", tts_model_var, 5, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.")
|
||||
tts_model_var.trace("w", gui_changed_modelfile)
|
||||
makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 5, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
|
||||
makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 7, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
|
||||
wavtokenizer_var.trace("w", gui_changed_modelfile)
|
||||
makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 7, 0,tooltiptxt="Uses the GPU for TTS.")
|
||||
makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 9, 0,tooltiptxt="Uses the GPU for TTS.")
|
||||
ttsgpu_var.trace("w", gui_changed_modelfile)
|
||||
|
||||
def kcpp_export_template():
|
||||
|
@@ -3760,6 +3769,7 @@ def show_gui():
|
|||
savdict["tensor_split"] = None
|
||||
savdict["draftgpusplit"] = None
|
||||
savdict["config"] = None
|
||||
savdict["ttsthreads"] = 0
|
||||
filename = asksaveasfile(filetypes=file_type, defaultextension=file_type)
|
||||
if filename is None:
|
||||
return
|
||||
|
@@ -3950,6 +3960,7 @@ def show_gui():
|
|||
args.whispermodel = whisper_model_var.get()
|
||||
|
||||
if tts_model_var.get() != "" and wavtokenizer_var.get() != "":
|
||||
args.ttsthreads = (0 if tts_threads_var.get()=="" else int(tts_threads_var.get()))
|
||||
args.ttsmodel = tts_model_var.get()
|
||||
args.ttswavtokenizer = wavtokenizer_var.get()
|
||||
args.ttsgpu = (ttsgpu_var.get()==1)
|
||||
|
@@ -4114,6 +4125,7 @@ def show_gui():
|
|||
|
||||
whisper_model_var.set(dict["whispermodel"] if ("whispermodel" in dict and dict["whispermodel"]) else "")
|
||||
|
||||
tts_threads_var.set(str(dict["ttsthreads"]) if ("ttsthreads" in dict and dict["ttsthreads"]) else str(default_threads))
|
||||
tts_model_var.set(dict["ttsmodel"] if ("ttsmodel" in dict and dict["ttsmodel"]) else "")
|
||||
wavtokenizer_var.set(dict["ttswavtokenizer"] if ("ttswavtokenizer" in dict and dict["ttswavtokenizer"]) else "")
|
||||
ttsgpu_var.set(dict["ttsgpu"] if ("ttsgpu" in dict) else 0)
|
||||
|
@@ -5527,6 +5539,7 @@ if __name__ == '__main__':
|
|||
ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="")
|
||||
ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="")
|
||||
ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true')
|
||||
ttsparsergroup.add_argument("--ttsthreads", metavar=('[threads]'), help="Use a different number of threads for TTS if specified. Otherwise, has the same value as --threads.", type=int, default=0)
|
||||
|
||||
deprecatedgroup = parser.add_argument_group('Deprecated Commands, DO NOT USE!')
|
||||
deprecatedgroup.add_argument("--hordeconfig", help=argparse.SUPPRESS, nargs='+')
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue