mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
improved tts default voices quality and sample rate
This commit is contained in:
parent
8d961bba29
commit
e8570de0e6
5 changed files with 37 additions and 28 deletions
1
expose.h
1
expose.h
|
@ -205,6 +205,7 @@ struct whisper_generation_outputs
|
||||||
|
|
||||||
struct tts_load_model_inputs
|
struct tts_load_model_inputs
|
||||||
{
|
{
|
||||||
|
const int threads = 4;
|
||||||
const char * ttc_model_filename = nullptr;
|
const char * ttc_model_filename = nullptr;
|
||||||
const char * cts_model_filename = nullptr;
|
const char * cts_model_filename = nullptr;
|
||||||
const char * executable_path = nullptr;
|
const char * executable_path = nullptr;
|
||||||
|
|
|
@ -1463,7 +1463,7 @@
|
||||||
},
|
},
|
||||||
"voice": {
|
"voice": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "The voice to use when generating the audio. You can enter anything you like, a unique speaker will be generated. There are a few preset voices you can use: kobo,cheery,sleepy,tutor,shouty,bored,record"
|
"description": "The voice to use when generating the audio. You can enter anything you like, a unique speaker will be generated. There are a few preset voices you can use: kobo,cheery,sleepy,shouty,chatty"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"type": "object"
|
"type": "object"
|
||||||
|
|
|
@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
|
||||||
-->
|
-->
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
const LITEVER = 203;
|
const LITEVER = 204;
|
||||||
const urlParams = new URLSearchParams(window.location.search);
|
const urlParams = new URLSearchParams(window.location.search);
|
||||||
var localflag = true;
|
var localflag = true;
|
||||||
const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
|
const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
|
||||||
|
@ -12085,6 +12085,7 @@ initializeInstructUIFunctionality();
|
||||||
payload =
|
payload =
|
||||||
{
|
{
|
||||||
"input": text,
|
"input": text,
|
||||||
|
"nocache": true,
|
||||||
"voice": (document.getElementById("kcpp_tts_voice").value == "custom")?document.getElementById("kcpp_tts_voice_custom").value:document.getElementById("kcpp_tts_voice").value
|
"voice": (document.getElementById("kcpp_tts_voice").value == "custom")?document.getElementById("kcpp_tts_voice_custom").value:document.getElementById("kcpp_tts_voice").value
|
||||||
};
|
};
|
||||||
ttsheaders = get_kobold_header();
|
ttsheaders = get_kobold_header();
|
||||||
|
@ -20266,10 +20267,8 @@ initializeInstructUIFunctionality();
|
||||||
<option value="kobo" selected>kobo</option>
|
<option value="kobo" selected>kobo</option>
|
||||||
<option value="cheery">cheery</option>
|
<option value="cheery">cheery</option>
|
||||||
<option value="sleepy">sleepy</option>
|
<option value="sleepy">sleepy</option>
|
||||||
<option value="tutor">tutor</option>
|
|
||||||
<option value="shouty">shouty</option>
|
<option value="shouty">shouty</option>
|
||||||
<option value="bored">bored</option>
|
<option value="chatty">chatty</option>
|
||||||
<option value="record">record</option>
|
|
||||||
<option value="custom">custom</option>
|
<option value="custom">custom</option>
|
||||||
</select></td>
|
</select></td>
|
||||||
<td><input class="settinglabel miniinput" type="text" value="" placeholder="(Name)" id="kcpp_tts_voice_custom" style="margin-left:3px; height:18px; width:44px; padding: 2px;"></td></tr>
|
<td><input class="settinglabel miniinput" type="text" value="" placeholder="(Name)" id="kcpp_tts_voice_custom" style="margin-left:3px; height:18px; width:44px; padding: 2px;"></td></tr>
|
||||||
|
|
27
koboldcpp.py
27
koboldcpp.py
|
@ -283,7 +283,8 @@ class whisper_generation_outputs(ctypes.Structure):
|
||||||
("data", ctypes.c_char_p)]
|
("data", ctypes.c_char_p)]
|
||||||
|
|
||||||
class tts_load_model_inputs(ctypes.Structure):
|
class tts_load_model_inputs(ctypes.Structure):
|
||||||
_fields_ = [("ttc_model_filename", ctypes.c_char_p),
|
_fields_ = [("threads", ctypes.c_int),
|
||||||
|
("ttc_model_filename", ctypes.c_char_p),
|
||||||
("cts_model_filename", ctypes.c_char_p),
|
("cts_model_filename", ctypes.c_char_p),
|
||||||
("executable_path", ctypes.c_char_p),
|
("executable_path", ctypes.c_char_p),
|
||||||
("clblast_info", ctypes.c_int),
|
("clblast_info", ctypes.c_int),
|
||||||
|
@ -1346,6 +1347,12 @@ def tts_load_model(ttc_model_filename,cts_model_filename):
|
||||||
inputs.ttc_model_filename = ttc_model_filename.encode("UTF-8")
|
inputs.ttc_model_filename = ttc_model_filename.encode("UTF-8")
|
||||||
inputs.cts_model_filename = cts_model_filename.encode("UTF-8")
|
inputs.cts_model_filename = cts_model_filename.encode("UTF-8")
|
||||||
inputs.gpulayers = (999 if args.ttsgpu else 0)
|
inputs.gpulayers = (999 if args.ttsgpu else 0)
|
||||||
|
thds = args.threads
|
||||||
|
if args.ttsthreads and args.ttsthreads > 0:
|
||||||
|
ttst = int(args.ttsthreads)
|
||||||
|
if ttst > 0:
|
||||||
|
thds = ttst
|
||||||
|
inputs.threads = thds
|
||||||
inputs = set_backend_props(inputs)
|
inputs = set_backend_props(inputs)
|
||||||
ret = handle.tts_load_model(inputs)
|
ret = handle.tts_load_model(inputs)
|
||||||
return ret
|
return ret
|
||||||
|
@ -1357,7 +1364,7 @@ def tts_generate(genparams):
|
||||||
prompt = prompt.strip()
|
prompt = prompt.strip()
|
||||||
voice = 1
|
voice = 1
|
||||||
voicestr = genparams.get("voice", genparams.get("speaker_wav", ""))
|
voicestr = genparams.get("voice", genparams.get("speaker_wav", ""))
|
||||||
voice_mapping = ["kobo","cheery","sleepy","tutor","shouty","bored","record"]
|
voice_mapping = ["kobo","cheery","sleepy","shouty","chatty"]
|
||||||
normalized_voice = voicestr.strip().lower() if voicestr else ""
|
normalized_voice = voicestr.strip().lower() if voicestr else ""
|
||||||
if normalized_voice in voice_mapping:
|
if normalized_voice in voice_mapping:
|
||||||
voice = voice_mapping.index(normalized_voice) + 1
|
voice = voice_mapping.index(normalized_voice) + 1
|
||||||
|
@ -2332,9 +2339,9 @@ Enter Prompt:<br>
|
||||||
response_body = (json.dumps([]).encode())
|
response_body = (json.dumps([]).encode())
|
||||||
|
|
||||||
elif self.path.endswith(('/speakers_list')): #xtts compatible
|
elif self.path.endswith(('/speakers_list')): #xtts compatible
|
||||||
response_body = (json.dumps(["kobo","cheery","sleepy","tutor","shouty","bored","record"]).encode()) #some random voices for them to enjoy
|
response_body = (json.dumps(["kobo","cheery","sleepy","shouty","chatty"]).encode()) #some random voices for them to enjoy
|
||||||
elif self.path.endswith(('/speakers')): #xtts compatible
|
elif self.path.endswith(('/speakers')): #xtts compatible
|
||||||
response_body = (json.dumps([{"name":"kobo","voice_id":"kobo","preview_url":""},{"name":"cheery","voice_id":"cheery","preview_url":""},{"name":"sleepy","voice_id":"sleepy","preview_url":""},{"name":"tutor","voice_id":"tutor","preview_url":""},{"name":"shouty","voice_id":"shouty","preview_url":""},{"name":"bored","voice_id":"bored","preview_url":""},{"name":"record","voice_id":"record","preview_url":""}]).encode()) #some random voices for them to enjoy
|
response_body = (json.dumps([{"name":"kobo","voice_id":"kobo","preview_url":""},{"name":"cheery","voice_id":"cheery","preview_url":""},{"name":"sleepy","voice_id":"sleepy","preview_url":""},{"name":"shouty","voice_id":"shouty","preview_url":""},{"name":"chatty","voice_id":"chatty","preview_url":""}]).encode()) #some random voices for them to enjoy
|
||||||
elif self.path.endswith(('/get_tts_settings')): #xtts compatible
|
elif self.path.endswith(('/get_tts_settings')): #xtts compatible
|
||||||
response_body = (json.dumps({"temperature":0.75,"speed":1,"length_penalty":1,"repetition_penalty":1,"top_p":1,"top_k":4,"enable_text_splitting":True,"stream_chunk_size":100}).encode()) #some random voices for them to enjoy
|
response_body = (json.dumps({"temperature":0.75,"speed":1,"length_penalty":1,"repetition_penalty":1,"top_p":1,"top_k":4,"enable_text_splitting":True,"stream_chunk_size":100}).encode()) #some random voices for them to enjoy
|
||||||
|
|
||||||
|
@ -3158,6 +3165,7 @@ def show_gui():
|
||||||
tts_model_var = ctk.StringVar()
|
tts_model_var = ctk.StringVar()
|
||||||
wavtokenizer_var = ctk.StringVar()
|
wavtokenizer_var = ctk.StringVar()
|
||||||
ttsgpu_var = ctk.IntVar(value=0)
|
ttsgpu_var = ctk.IntVar(value=0)
|
||||||
|
tts_threads_var = ctk.StringVar(value=str(default_threads))
|
||||||
|
|
||||||
def tabbuttonaction(name):
|
def tabbuttonaction(name):
|
||||||
for t in tabcontent:
|
for t in tabcontent:
|
||||||
|
@ -3728,11 +3736,12 @@ def show_gui():
|
||||||
audio_tab = tabcontent["Audio"]
|
audio_tab = tabcontent["Audio"]
|
||||||
makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.")
|
makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.")
|
||||||
whisper_model_var.trace("w", gui_changed_modelfile)
|
whisper_model_var.trace("w", gui_changed_modelfile)
|
||||||
makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech):", "Select OuteTTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.")
|
makelabelentry(audio_tab, "OuteTTS Threads:" , tts_threads_var, 3, 50,padx=290,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.")
|
||||||
|
makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech):", "Select OuteTTS GGUF Model File", tts_model_var, 5, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.")
|
||||||
tts_model_var.trace("w", gui_changed_modelfile)
|
tts_model_var.trace("w", gui_changed_modelfile)
|
||||||
makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 5, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
|
makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 7, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
|
||||||
wavtokenizer_var.trace("w", gui_changed_modelfile)
|
wavtokenizer_var.trace("w", gui_changed_modelfile)
|
||||||
makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 7, 0,tooltiptxt="Uses the GPU for TTS.")
|
makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 9, 0,tooltiptxt="Uses the GPU for TTS.")
|
||||||
ttsgpu_var.trace("w", gui_changed_modelfile)
|
ttsgpu_var.trace("w", gui_changed_modelfile)
|
||||||
|
|
||||||
def kcpp_export_template():
|
def kcpp_export_template():
|
||||||
|
@ -3760,6 +3769,7 @@ def show_gui():
|
||||||
savdict["tensor_split"] = None
|
savdict["tensor_split"] = None
|
||||||
savdict["draftgpusplit"] = None
|
savdict["draftgpusplit"] = None
|
||||||
savdict["config"] = None
|
savdict["config"] = None
|
||||||
|
savdict["ttsthreads"] = 0
|
||||||
filename = asksaveasfile(filetypes=file_type, defaultextension=file_type)
|
filename = asksaveasfile(filetypes=file_type, defaultextension=file_type)
|
||||||
if filename is None:
|
if filename is None:
|
||||||
return
|
return
|
||||||
|
@ -3950,6 +3960,7 @@ def show_gui():
|
||||||
args.whispermodel = whisper_model_var.get()
|
args.whispermodel = whisper_model_var.get()
|
||||||
|
|
||||||
if tts_model_var.get() != "" and wavtokenizer_var.get() != "":
|
if tts_model_var.get() != "" and wavtokenizer_var.get() != "":
|
||||||
|
args.ttsthreads = (0 if tts_threads_var.get()=="" else int(tts_threads_var.get()))
|
||||||
args.ttsmodel = tts_model_var.get()
|
args.ttsmodel = tts_model_var.get()
|
||||||
args.ttswavtokenizer = wavtokenizer_var.get()
|
args.ttswavtokenizer = wavtokenizer_var.get()
|
||||||
args.ttsgpu = (ttsgpu_var.get()==1)
|
args.ttsgpu = (ttsgpu_var.get()==1)
|
||||||
|
@ -4114,6 +4125,7 @@ def show_gui():
|
||||||
|
|
||||||
whisper_model_var.set(dict["whispermodel"] if ("whispermodel" in dict and dict["whispermodel"]) else "")
|
whisper_model_var.set(dict["whispermodel"] if ("whispermodel" in dict and dict["whispermodel"]) else "")
|
||||||
|
|
||||||
|
tts_threads_var.set(str(dict["ttsthreads"]) if ("ttsthreads" in dict and dict["ttsthreads"]) else str(default_threads))
|
||||||
tts_model_var.set(dict["ttsmodel"] if ("ttsmodel" in dict and dict["ttsmodel"]) else "")
|
tts_model_var.set(dict["ttsmodel"] if ("ttsmodel" in dict and dict["ttsmodel"]) else "")
|
||||||
wavtokenizer_var.set(dict["ttswavtokenizer"] if ("ttswavtokenizer" in dict and dict["ttswavtokenizer"]) else "")
|
wavtokenizer_var.set(dict["ttswavtokenizer"] if ("ttswavtokenizer" in dict and dict["ttswavtokenizer"]) else "")
|
||||||
ttsgpu_var.set(dict["ttsgpu"] if ("ttsgpu" in dict) else 0)
|
ttsgpu_var.set(dict["ttsgpu"] if ("ttsgpu" in dict) else 0)
|
||||||
|
@ -5527,6 +5539,7 @@ if __name__ == '__main__':
|
||||||
ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="")
|
ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="")
|
||||||
ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="")
|
ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="")
|
||||||
ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true')
|
ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true')
|
||||||
|
ttsparsergroup.add_argument("--ttsthreads", metavar=('[threads]'), help="Use a different number of threads for TTS if specified. Otherwise, has the same value as --threads.", type=int, default=0)
|
||||||
|
|
||||||
deprecatedgroup = parser.add_argument_group('Deprecated Commands, DO NOT USE!')
|
deprecatedgroup = parser.add_argument_group('Deprecated Commands, DO NOT USE!')
|
||||||
deprecatedgroup.add_argument("--hordeconfig", help=argparse.SUPPRESS, nargs='+')
|
deprecatedgroup.add_argument("--hordeconfig", help=argparse.SUPPRESS, nargs='+')
|
||||||
|
|
|
@ -468,6 +468,7 @@ static int last_speaker_seed = -999;
|
||||||
static int cts_offset = 151672;
|
static int cts_offset = 151672;
|
||||||
static int space_id = 151670;
|
static int space_id = 151670;
|
||||||
static int code_terminate_id = 151670;
|
static int code_terminate_id = 151670;
|
||||||
|
static int nthreads = 4;
|
||||||
|
|
||||||
bool ttstype_load_model(const tts_load_model_inputs inputs)
|
bool ttstype_load_model(const tts_load_model_inputs inputs)
|
||||||
{
|
{
|
||||||
|
@ -508,7 +509,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
|
||||||
llama_model_params tts_model_params = llama_model_default_params();
|
llama_model_params tts_model_params = llama_model_default_params();
|
||||||
llama_context_params tts_ctx_params = llama_context_default_params();
|
llama_context_params tts_ctx_params = llama_context_default_params();
|
||||||
|
|
||||||
const int nthreads = 4;
|
nthreads = inputs.threads;
|
||||||
|
|
||||||
tts_model_params.use_mmap = false;
|
tts_model_params.use_mmap = false;
|
||||||
tts_model_params.use_mlock = false;
|
tts_model_params.use_mlock = false;
|
||||||
|
@ -686,7 +687,7 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
|
||||||
{
|
{
|
||||||
printf("\nReuse speaker ID=%d (%d tokens)...", last_speaker_seed, last_speaker_codes.size());
|
printf("\nReuse speaker ID=%d (%d tokens)...", last_speaker_seed, last_speaker_codes.size());
|
||||||
}
|
}
|
||||||
} else if (speaker_seed>=1 && speaker_seed<=7){ //special seeds
|
} else if (speaker_seed>=1 && speaker_seed<=5){ //special seeds
|
||||||
std::string speaker = "";
|
std::string speaker = "";
|
||||||
switch(speaker_seed)
|
switch(speaker_seed)
|
||||||
{
|
{
|
||||||
|
@ -694,22 +695,16 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
|
||||||
speaker = format_audiotokens("but<|t_0.31|><|code_start|><|1023|><|1474|><|17|><|121|><|1362|><|744|><|438|><|1319|><|744|><|1419|><|1246|><|923|><|1338|><|406|><|939|><|975|><|1491|><|965|><|1212|><|248|><|794|><|464|><|830|><|code_end|>\nthat<|t_0.13|><|code_start|><|1578|><|1773|><|660|><|1074|><|221|><|1803|><|142|><|914|><|798|><|485|><|code_end|>\nis<|t_0.11|><|code_start|><|737|><|794|><|1288|><|182|><|895|><|1653|><|448|><|471|><|code_end|>\nwhat<|t_0.12|><|code_start|><|1734|><|1306|><|779|><|490|><|525|><|1028|><|37|><|1633|><|1353|><|code_end|>\nit<|t_0.09|><|code_start|><|1343|><|898|><|270|><|1035|><|94|><|1409|><|388|><|code_end|>\nis<|t_0.23|><|code_start|><|694|><|695|><|577|><|692|><|1047|><|388|><|28|><|905|><|1155|><|50|><|1629|><|1775|><|1711|><|1729|><|404|><|1027|><|344|><|code_end|>",ttsver);
|
speaker = format_audiotokens("but<|t_0.31|><|code_start|><|1023|><|1474|><|17|><|121|><|1362|><|744|><|438|><|1319|><|744|><|1419|><|1246|><|923|><|1338|><|406|><|939|><|975|><|1491|><|965|><|1212|><|248|><|794|><|464|><|830|><|code_end|>\nthat<|t_0.13|><|code_start|><|1578|><|1773|><|660|><|1074|><|221|><|1803|><|142|><|914|><|798|><|485|><|code_end|>\nis<|t_0.11|><|code_start|><|737|><|794|><|1288|><|182|><|895|><|1653|><|448|><|471|><|code_end|>\nwhat<|t_0.12|><|code_start|><|1734|><|1306|><|779|><|490|><|525|><|1028|><|37|><|1633|><|1353|><|code_end|>\nit<|t_0.09|><|code_start|><|1343|><|898|><|270|><|1035|><|94|><|1409|><|388|><|code_end|>\nis<|t_0.23|><|code_start|><|694|><|695|><|577|><|692|><|1047|><|388|><|28|><|905|><|1155|><|50|><|1629|><|1775|><|1711|><|1729|><|404|><|1027|><|344|><|code_end|>",ttsver);
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
speaker = format_audiotokens("but<|t_0.23|><|code_start|><|762|><|612|><|316|><|1128|><|171|><|250|><|1765|><|60|><|1075|><|81|><|1159|><|140|><|81|><|1158|><|678|><|1639|><|970|><|code_end|>\nthat<|t_0.21|><|code_start|><|1254|><|460|><|378|><|1621|><|1477|><|210|><|270|><|571|><|179|><|324|><|408|><|81|><|642|><|408|><|794|><|1506|><|code_end|>\nis<|t_0.16|><|code_start|><|36|><|57|><|1132|><|881|><|844|><|260|><|79|><|1794|><|1195|><|333|><|1808|><|1375|><|code_end|>\nwhat<|t_0.23|><|code_start|><|485|><|1583|><|1091|><|736|><|668|><|1703|><|670|><|832|><|959|><|853|><|983|><|969|><|576|><|697|><|721|><|1032|><|990|><|code_end|>\nit<|t_0.16|><|code_start|><|772|><|741|><|794|><|1015|><|110|><|965|><|1060|><|62|><|1305|><|470|><|284|><|259|><|code_end|>\nis<|t_0.35|><|code_start|><|516|><|1099|><|405|><|1831|><|1051|><|1471|><|26|><|1207|><|809|><|0|><|1303|><|1329|><|1196|><|798|><|679|><|992|><|1358|><|930|><|1065|><|942|><|1573|><|823|><|823|><|1527|><|1617|><|865|><|code_end|>",ttsver);
|
speaker = format_audiotokens("but<|t_0.45|><|code_start|><|920|><|1824|><|1138|><|1387|><|1096|><|1712|><|1642|><|810|><|1685|><|620|><|954|><|584|><|23|><|1467|><|509|><|659|><|1598|><|465|><|567|><|1440|><|3|><|476|><|740|><|288|><|419|><|1440|><|1477|><|254|><|25|><|811|><|882|><|476|><|246|><|246|><|code_end|>\nthat<|t_0.17|><|code_start|><|419|><|1690|><|208|><|1044|><|300|><|1100|><|375|><|1222|><|371|><|1045|><|637|><|1719|><|314|><|code_end|>\nis<|t_0.12|><|code_start|><|319|><|1131|><|794|><|1103|><|1296|><|1615|><|1587|><|233|><|863|><|code_end|>\nwhat<|t_0.16|><|code_start|><|793|><|902|><|391|><|946|><|437|><|95|><|1133|><|110|><|58|><|853|><|1283|><|449|><|code_end|>\nit<|t_0.12|><|code_start|><|774|><|239|><|974|><|213|><|1095|><|1612|><|101|><|1569|><|882|><|code_end|>\nis<|t_0.32|><|code_start|><|1131|><|529|><|1144|><|774|><|1114|><|483|><|693|><|648|><|1112|><|1470|><|1112|><|319|><|1294|><|1417|><|1660|><|729|><|1789|><|1413|><|1728|><|554|><|273|><|736|><|640|><|1549|><|code_end|>",ttsver);
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 3:
|
||||||
speaker = format_audiotokens("but<|t_0.32|><|code_start|><|862|><|899|><|1601|><|1749|><|121|><|1176|><|1601|><|1007|><|1722|><|121|><|1142|><|1465|><|696|><|1284|><|1698|><|1275|><|860|><|113|><|590|><|1356|><|577|><|1346|><|1433|><|1779|><|code_end|>\nthat<|t_0.40|><|code_start|><|1248|><|1181|><|1792|><|735|><|1289|><|1346|><|975|><|1751|><|1587|><|1042|><|221|><|29|><|991|><|797|><|1184|><|1171|><|152|><|352|><|1119|><|1282|><|110|><|73|><|524|><|1424|><|1276|><|996|><|777|><|1119|><|1166|><|859|><|code_end|>\nis<|t_0.61|><|code_start|><|1666|><|1819|><|566|><|1333|><|1658|><|981|><|1705|><|1185|><|939|><|1813|><|899|><|1465|><|1176|><|712|><|1390|><|1578|><|1275|><|92|><|1729|><|1200|><|1615|><|1484|><|1200|><|1574|><|1307|><|1221|><|1606|><|1307|><|428|><|1759|><|1127|><|1574|><|1581|><|127|><|1507|><|1060|><|1769|><|34|><|1583|><|1579|><|1828|><|1580|><|652|><|1688|><|1527|><|1547|><|code_end|>\nwhat<|t_0.93|><|code_start|><|1691|><|731|><|1592|><|1573|><|1547|><|1617|><|1528|><|1547|><|1664|><|867|><|1571|><|1637|><|273|><|1354|><|1573|><|34|><|1724|><|1669|><|1538|><|1293|><|1623|><|1536|><|1233|><|1176|><|1348|><|1011|><|1722|><|899|><|1176|><|1419|><|899|><|1763|><|1293|><|1601|><|1543|><|939|><|1543|><|1419|><|799|><|1722|><|1233|><|1011|><|1543|><|1007|><|1176|><|1628|><|1114|><|1763|><|862|><|957|><|1693|><|274|><|1176|><|1719|><|805|><|1706|><|1472|><|1249|><|1365|><|877|><|269|><|197|><|1068|><|969|><|1591|><|1192|><|996|><|1764|><|1455|><|1643|><|code_end|>\nit<|t_0.15|><|code_start|><|804|><|1141|><|1566|><|1013|><|529|><|1650|><|1149|><|1744|><|763|><|1640|><|1692|><|code_end|>\nis<|t_0.40|><|code_start|><|1218|><|774|><|1576|><|1192|><|286|><|1831|><|1407|><|92|><|803|><|1311|><|26|><|546|><|1124|><|978|><|319|><|1062|><|1675|><|1608|><|1158|><|1456|><|1572|><|1199|><|1603|><|1592|><|1664|><|1586|><|1571|><|1354|><|34|><|1627|><|code_end|>",ttsver);
|
speaker = format_audiotokens("but<|t_0.21|><|code_start|><|348|><|1776|><|1620|><|1262|><|118|><|288|><|258|><|1407|><|1331|><|1102|><|664|><|1300|><|1647|><|1536|><|71|><|23|><|code_end|> \nthat<|t_0.19|><|code_start|><|3|><|1740|><|1253|><|1122|><|549|><|715|><|718|><|657|><|1136|><|1247|><|517|><|1333|><|815|><|634|><|code_end|>\nis<|t_0.12|><|code_start|><|1330|><|839|><|753|><|1826|><|1602|><|50|><|1441|><|889|><|948|><|code_end|>\nwhat<|t_0.16|><|code_start|><|899|><|869|><|250|><|894|><|876|><|1471|><|1308|><|1436|><|1328|><|1700|><|1425|><|1330|><|code_end|>\nit<|t_0.12|><|code_start|><|1027|><|1162|><|1344|><|1170|><|86|><|1562|><|1575|><|176|><|1186|><|code_end|>\nis<|t_0.25|><|code_start|><|361|><|1533|><|1697|><|903|><|333|><|1232|><|1337|><|1611|><|1196|><|0|><|1328|><|1245|><|1718|><|1635|><|1616|><|1599|><|1363|><|962|><|328|><|code_end|>",ttsver);
|
||||||
break;
|
break;
|
||||||
case 4:
|
case 4:
|
||||||
speaker = format_audiotokens("but<|t_0.24|><|code_start|><|710|><|505|><|555|><|1255|><|1474|><|1315|><|1740|><|530|><|1446|><|1651|><|991|><|186|><|1310|><|816|><|175|><|935|><|776|><|672|><|code_end|>\nthat<|t_0.40|><|code_start|><|1440|><|807|><|712|><|1525|><|177|><|584|><|1006|><|1288|><|1664|><|1732|><|951|><|79|><|797|><|790|><|172|><|1111|><|106|><|1222|><|186|><|186|><|1122|><|1153|><|81|><|1055|><|1355|><|1757|><|861|><|1067|><|971|><|563|><|code_end|>\nis<|t_0.36|><|code_start|><|915|><|396|><|869|><|1779|><|805|><|1489|><|1157|><|1142|><|1011|><|555|><|686|><|1578|><|1428|><|1624|><|1252|><|949|><|175|><|239|><|154|><|1280|><|716|><|1729|><|1445|><|1791|><|1679|><|1769|><|884|><|code_end|>\nwhat<|t_0.36|><|code_start|><|1710|><|1734|><|1364|><|1789|><|1805|><|1628|><|1025|><|859|><|1595|><|987|><|136|><|1584|><|635|><|1006|><|1789|><|552|><|871|><|1505|><|1206|><|474|><|705|><|803|><|1305|><|1595|><|627|><|1137|><|486|><|code_end|>\nit<|t_0.47|><|code_start|><|676|><|1746|><|1672|><|1465|><|1346|><|673|><|957|><|1293|><|1348|><|1628|><|710|><|1233|><|1628|><|727|><|1338|><|1536|><|673|><|686|><|1273|><|1114|><|1523|><|1338|><|1510|><|273|><|1487|><|1656|><|1573|><|1786|><|813|><|1284|><|1442|><|17|><|325|><|975|><|555|><|code_end|>\nis<|t_0.47|><|code_start|><|1747|><|1419|><|1465|><|1538|><|17|><|862|><|1419|><|986|><|1628|><|1157|><|933|><|1176|><|939|><|899|><|625|><|939|><|1085|><|101|><|1224|><|1744|><|1777|><|1462|><|176|><|1618|><|972|><|1623|><|1580|><|1252|><|1479|><|1702|><|1802|><|895|><|1673|><|1510|><|1513|><|code_end|>",ttsver);
|
|
||||||
break;
|
|
||||||
case 5:
|
|
||||||
speaker = format_audiotokens("but<|t_0.20|><|code_start|><|686|><|1288|><|1251|><|1428|><|481|><|702|><|1812|><|829|><|81|><|756|><|76|><|104|><|952|><|1723|><|1632|><|code_end|>\nthat<|t_0.20|><|code_start|><|1006|><|1067|><|1614|><|1810|><|887|><|43|><|1192|><|106|><|400|><|43|><|730|><|660|><|186|><|87|><|467|><|code_end|>\nis<|t_0.27|><|code_start|><|648|><|1625|><|9|><|685|><|243|><|106|><|996|><|990|><|228|><|809|><|1009|><|2|><|806|><|1325|><|1332|><|1766|><|202|><|725|><|416|><|822|><|code_end|>\nwhat<|t_0.36|><|code_start|><|1287|><|328|><|1241|><|1661|><|1651|><|1708|><|1740|><|1685|><|1715|><|1787|><|1381|><|197|><|1769|><|525|><|1000|><|234|><|364|><|115|><|212|><|632|><|1153|><|228|><|73|><|1002|><|1800|><|1277|><|1117|><|code_end|>\nit<|t_0.40|><|code_start|><|1830|><|1199|><|1282|><|1163|><|1195|><|1752|><|1092|><|1481|><|1003|><|513|><|1639|><|1805|><|1485|><|1645|><|195|><|1464|><|181|><|195|><|123|><|87|><|433|><|878|><|170|><|1265|><|375|><|1708|><|1739|><|1519|><|1185|><|1099|><|code_end|>\nis<|t_0.76|><|code_start|><|1748|><|1422|><|276|><|1337|><|1322|><|1519|><|1779|><|1067|><|1724|><|891|><|1205|><|1419|><|1144|><|1667|><|591|><|1003|><|1543|><|566|><|1390|><|426|><|1824|><|182|><|1138|><|52|><|129|><|1056|><|155|><|1056|><|1298|><|919|><|155|><|125|><|500|><|1022|><|571|><|315|><|400|><|100|><|617|><|295|><|757|><|324|><|592|><|1298|><|1310|><|57|><|876|><|1175|><|1353|><|1770|><|1649|><|1828|><|1637|><|362|><|1744|><|884|><|1027|><|code_end|>",ttsver);
|
speaker = format_audiotokens("but<|t_0.20|><|code_start|><|686|><|1288|><|1251|><|1428|><|481|><|702|><|1812|><|829|><|81|><|756|><|76|><|104|><|952|><|1723|><|1632|><|code_end|>\nthat<|t_0.20|><|code_start|><|1006|><|1067|><|1614|><|1810|><|887|><|43|><|1192|><|106|><|400|><|43|><|730|><|660|><|186|><|87|><|467|><|code_end|>\nis<|t_0.27|><|code_start|><|648|><|1625|><|9|><|685|><|243|><|106|><|996|><|990|><|228|><|809|><|1009|><|2|><|806|><|1325|><|1332|><|1766|><|202|><|725|><|416|><|822|><|code_end|>\nwhat<|t_0.36|><|code_start|><|1287|><|328|><|1241|><|1661|><|1651|><|1708|><|1740|><|1685|><|1715|><|1787|><|1381|><|197|><|1769|><|525|><|1000|><|234|><|364|><|115|><|212|><|632|><|1153|><|228|><|73|><|1002|><|1800|><|1277|><|1117|><|code_end|>\nit<|t_0.40|><|code_start|><|1830|><|1199|><|1282|><|1163|><|1195|><|1752|><|1092|><|1481|><|1003|><|513|><|1639|><|1805|><|1485|><|1645|><|195|><|1464|><|181|><|195|><|123|><|87|><|433|><|878|><|170|><|1265|><|375|><|1708|><|1739|><|1519|><|1185|><|1099|><|code_end|>\nis<|t_0.76|><|code_start|><|1748|><|1422|><|276|><|1337|><|1322|><|1519|><|1779|><|1067|><|1724|><|891|><|1205|><|1419|><|1144|><|1667|><|591|><|1003|><|1543|><|566|><|1390|><|426|><|1824|><|182|><|1138|><|52|><|129|><|1056|><|155|><|1056|><|1298|><|919|><|155|><|125|><|500|><|1022|><|571|><|315|><|400|><|100|><|617|><|295|><|757|><|324|><|592|><|1298|><|1310|><|57|><|876|><|1175|><|1353|><|1770|><|1649|><|1828|><|1637|><|362|><|1744|><|884|><|1027|><|code_end|>",ttsver);
|
||||||
break;
|
break;
|
||||||
case 6:
|
case 5:
|
||||||
speaker = format_audiotokens("but<|t_0.39|><|code_start|><|1338|><|1319|><|805|><|1176|><|799|><|591|><|325|><|1023|><|274|><|1348|><|1246|><|1176|><|591|><|555|><|758|><|591|><|438|><|710|><|727|><|1419|><|1157|><|1157|><|1293|><|633|><|1003|><|832|><|871|><|1399|><|1315|><|code_end|>\nthat<|t_0.20|><|code_start|><|1352|><|668|><|859|><|1793|><|1455|><|260|><|1117|><|260|><|186|><|1209|><|106|><|1098|><|260|><|1088|><|752|><|code_end|>\nis<|t_0.17|><|code_start|><|949|><|869|><|352|><|821|><|475|><|788|><|1150|><|1286|><|1079|><|1726|><|328|><|1624|><|1641|><|code_end|>\nwhat<|t_0.47|><|code_start|><|1175|><|1710|><|640|><|231|><|1781|><|884|><|1649|><|930|><|1270|><|1824|><|1383|><|1748|><|1011|><|1176|><|1023|><|986|><|1419|><|1425|><|686|><|899|><|627|><|1419|><|1023|><|799|><|1338|><|1163|><|1464|><|627|><|840|><|361|><|693|><|159|><|1041|><|562|><|1444|><|code_end|>\nit<|t_0.12|><|code_start|><|1078|><|685|><|982|><|277|><|1494|><|793|><|229|><|853|><|308|><|code_end|>\nis<|t_0.23|><|code_start|><|1291|><|1308|><|902|><|531|><|1022|><|231|><|992|><|1671|><|967|><|992|><|1646|><|1654|><|1791|><|701|><|1624|><|1565|><|1532|><|code_end|>",ttsver);
|
speaker = format_audiotokens("but<|t_0.68|><|code_start|><|1761|><|1164|><|1543|><|1677|><|1120|><|1634|><|1496|><|1639|><|1717|><|1306|><|1016|><|1713|><|976|><|1474|><|1817|><|976|><|1595|><|1255|><|584|><|1440|><|1121|><|287|><|91|><|44|><|246|><|160|><|1233|><|247|><|776|><|44|><|246|><|12|><|1352|><|866|><|168|><|71|><|246|><|246|><|804|><|933|><|168|><|193|><|44|><|1663|><|1097|><|411|><|1393|><|1326|><|21|><|342|><|118|><|code_end|>\nthat<|t_0.17|><|code_start|><|220|><|1750|><|1160|><|260|><|1738|><|300|><|291|><|989|><|147|><|1150|><|947|><|803|><|930|><|code_end|>\nis<|t_0.15|><|code_start|><|798|><|1632|><|412|><|1084|><|1166|><|1014|><|416|><|1637|><|415|><|1|><|1660|><|code_end|>\nwhat<|t_0.21|><|code_start|><|1412|><|707|><|572|><|1092|><|898|><|673|><|770|><|1787|><|994|><|983|><|1096|><|221|><|924|><|1323|><|1726|><|387|><|code_end|>\nit<|t_0.12|><|code_start|><|798|><|665|><|513|><|695|><|1410|><|337|><|237|><|1717|><|1353|><|code_end|>\nis<|t_0.24|><|code_start|><|1355|><|1084|><|65|><|1422|><|674|><|1280|><|940|><|1752|><|396|><|1431|><|1761|><|957|><|1440|><|634|><|333|><|1627|><|821|><|788|><|code_end|>",ttsver);
|
||||||
break;
|
|
||||||
case 7:
|
|
||||||
speaker = format_audiotokens("but<|t_0.31|><|code_start|><|174|><|544|><|68|><|391|><|131|><|187|><|559|><|534|><|223|><|1185|><|612|><|301|><|387|><|94|><|1224|><|1159|><|162|><|236|><|1133|><|774|><|888|><|144|><|1038|><|code_end|>\nthat<|t_0.20|><|code_start|><|223|><|77|><|1517|><|446|><|1207|><|140|><|873|><|147|><|1051|><|210|><|1216|><|147|><|1148|><|678|><|501|><|code_end|>\nis<|t_0.13|><|code_start|><|912|><|822|><|622|><|519|><|1017|><|546|><|1740|><|1823|><|1561|><|273|><|code_end|>\nwhat<|t_0.16|><|code_start|><|1571|><|1597|><|486|><|1417|><|130|><|747|><|1088|><|1045|><|580|><|239|><|431|><|40|><|code_end|>\nit<|t_0.12|><|code_start|><|1736|><|878|><|1159|><|1004|><|1168|><|594|><|544|><|77|><|1032|><|code_end|>\nis<|t_0.28|><|code_start|><|1088|><|873|><|1726|><|1099|><|1095|><|1412|><|1106|><|1317|><|1292|><|149|><|1429|><|967|><|873|><|1754|><|229|><|1046|><|1595|><|1003|><|1603|><|1529|><|101|><|code_end|>",ttsver);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
last_speaker_codes = common_tokenize(ttcvocab, speaker, false, true);
|
last_speaker_codes = common_tokenize(ttcvocab, speaker, false, true);
|
||||||
|
@ -910,7 +905,7 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
|
||||||
}
|
}
|
||||||
if(!inputs.quiet)
|
if(!inputs.quiet)
|
||||||
{
|
{
|
||||||
printf("\rTTS Generating (%d AudioTokens)", n_decode);
|
printf("\rTTS Generating (%d outputs)", n_decode);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -937,6 +932,7 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
|
||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
kcpp_embd_batch codebatch = kcpp_embd_batch(codes,0,false,true);
|
kcpp_embd_batch codebatch = kcpp_embd_batch(codes,0,false,true);
|
||||||
|
printf("\nRunning Vocoder (%d AudioTokens)", codes.size());
|
||||||
|
|
||||||
if (llama_decode(cts_ctx, codebatch.batch) != 0) {
|
if (llama_decode(cts_ctx, codebatch.batch) != 0) {
|
||||||
printf("\nError: TTS vocoder generation failed!\n");
|
printf("\nError: TTS vocoder generation failed!\n");
|
||||||
|
@ -949,15 +945,15 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
|
||||||
// spectral operations
|
// spectral operations
|
||||||
const int n_embd = llama_model_n_embd(model_cts);
|
const int n_embd = llama_model_n_embd(model_cts);
|
||||||
const float * embd = llama_get_embeddings(cts_ctx);
|
const float * embd = llama_get_embeddings(cts_ctx);
|
||||||
std::vector<float> audio = embd_to_audio(embd, n_codes, n_embd, 4);
|
std::vector<float> audio = embd_to_audio(embd, n_codes, n_embd, nthreads);
|
||||||
|
|
||||||
const int n_sr = 24000; // original sampling rate
|
const int n_sr = 24000; // original sampling rate
|
||||||
const int t_sr = 16000; //final target sampling rate
|
const int t_sr = 24000; //final target sampling rate
|
||||||
|
|
||||||
// zero out first x seconds depending on whether its seeded
|
// zero out first x seconds depending on whether its seeded
|
||||||
const int cutout = t_sr/4;
|
const int cutout = t_sr/4;
|
||||||
|
|
||||||
audio = resample_wav(audio,n_sr,t_sr); //resample to 16k
|
//audio = resample_wav(audio,n_sr,t_sr); //resample to 16k
|
||||||
|
|
||||||
for (int i = 0; i < cutout; ++i) {
|
for (int i = 0; i < cutout; ++i) {
|
||||||
audio[i] = 0.0f;
|
audio[i] = 0.0f;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue