diff --git a/kcpp_docs.embd b/kcpp_docs.embd
index 77b6c9a35..f8f88fd96 100644
--- a/kcpp_docs.embd
+++ b/kcpp_docs.embd
@@ -616,6 +616,8 @@
         "vision": false,
         "transcribe":false,
         "multiplayer": false,
+        "websearch":false,
+        "tts":false,
         },
         "schema": {
             "$ref": "#/components/schemas/KcppVersion"
@@ -1443,6 +1445,52 @@
             ]
         }
     },
+    "/api/extra/tts": {
+        "post": {
+            "description": "Creates text-to-speech audio from input text.",
+            "requestBody": {
+                "content": {
+                    "application/json": {
+                        "example": {
+                            "input": "hello world, how are you today?",
+                            "voice": "fire",
+                        },
+                        "schema": {
+                            "properties": {
+                                "input": {
+                                    "type": "string",
+                                    "description": "The text to generate audio for. Try to keep it short."
+                                },
+                                "voice": {
+                                    "type": "string",
+                                    "description": "The voice to use when generating the audio. You can enter anything you like; a unique speaker will be generated."
+                                }
+                            },
+                            "type": "object"
+                        }
+                    }
+                },
+                "required": true
+            },
+            "responses": {
+                "200": {
+                    "content": {
+                        "audio/wav": {
+                            "schema": {
+                                "type": "string",
+                                "format": "binary"
+                            }
+                        }
+                    },
+                    "description": "Successful request"
+                }
+            },
+            "summary": "Creates text-to-speech audio from input text.",
+            "tags": [
+                "api/extra"
+            ]
+        }
+    },
     "/props": {
         "get": {
             "summary": "Returns the Jinja template stored in the GGUF model, if found.",
@@ -1840,6 +1888,16 @@
             "responses": {"default": {"description": ""}}
         }
     },
+    "/v1/audio/speech": {
+        "post": {
+            "summary": "Generates Text-To-Speech audio from input text. Please refer to OpenAI documentation",
+            "description": "Generates Text-To-Speech audio from input text.\n\n This is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/audio/createSpeech](https://platform.openai.com/docs/api-reference/audio/createSpeech)",
+            "tags": [
+                "v1"
+            ],
+            "responses": {"default": {"description": ""}}
+        }
+    },
     },
     "servers": [
         {
diff --git a/klite.embd b/klite.embd
index 84ae4d91b..7896c073c 100644
--- a/klite.embd
+++ b/klite.embd
@@ -2030,6 +2030,15 @@ Current version indicated by LITEVER below.
     .color_orangeurl:focus {
         color: #ffedd3;
     }
+    .color_grayurl {
+        color: #9e9e9e;
+    }
+    .color_grayurl:hover {
+        color: #9f9f9f;
+    }
+    .color_grayurl:focus {
+        color: #9e9e9e;
+    }
 
     .color_orange {
         color: #f7a223;
@@ -2793,7 +2802,8 @@ Current version indicated by LITEVER below.
     const koboldcpp_transcribe_endpoint = "/api/extra/transcribe";
     const koboldcpp_tokenize_endpoint = "/api/extra/tokencount";
     const koboldcpp_perf_endpoint = "/api/extra/perf";
-    const koboldcpp_websearch_endpoint = "/api/extra/websearch"
+    const koboldcpp_websearch_endpoint = "/api/extra/websearch";
+    const koboldcpp_tts_endpoint = "/api/extra/tts";
 
     const oai_models_endpoint = "/models";
     const oai_submit_endpoint = "/completions";
@@ -2853,6 +2863,7 @@ Current version indicated by LITEVER below.
     const XTTS_ID = 1000;
     const ALLTALK_ID = 1001;
     const OAI_TTS_ID = 1002;
+    const KCPP_TTS_ID = 1003;
     const HD_RES_PX = 768;
     const NO_HD_RES_PX = 512;
     const AVATAR_PX = 384;
@@ -2965,6 +2976,7 @@ Current version indicated by LITEVER below.
     var voice_is_processing = false; //currently processing voice?
     let voiceprerecorder = null, voicerecorder = null, voice_is_speaking = false, voice_speaking_counter = 0;
     let preaudiobuffers = [], preaudioblobs = []; //will store 2 preblobs at a time
+    var koboldcpp_has_tts = false;
     var no_escape_html = false;
     var timetaken_timestamp = performance.now();
     var bg_silence = null;
@@ -3587,7 +3599,7 @@ Current version indicated by LITEVER below.
document.getElementById("lastreq1").innerHTML = document.getElementById("lastreq2").innerHTML = document.getElementById("lastreq3").innerHTML = - `KoboldAI Lite v${LITEVER} Web - Frontend for External API Services`; + `KoboldAI Lite v${LITEVER} Web - Frontend for External API Services`; trigger_abort_controller(); //first trigger sets it up @@ -5840,6 +5852,10 @@ initializeInstructUIFunctionality(); { return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.80") >= 0 && koboldcpp_has_websearch); } + function is_using_kcpp_with_tts() + { + return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.81") >= 0 && koboldcpp_has_tts); + } function is_using_web_lite() { return (window.location.hostname.includes("koboldai.net") || window.location.hostname.includes("lostruins.github.io")); @@ -9207,6 +9223,7 @@ initializeInstructUIFunctionality(); koboldcpp_has_whisper = (data.transcribe?true:false); koboldcpp_has_multiplayer = (data.multiplayer?true:false); koboldcpp_has_websearch = (data.websearch?true:false); + koboldcpp_has_tts = (data.tts?true:false); let has_password = (data.protected?true:false); let has_txt2img = (data.txt2img?true:false); let no_txt_model = (mdlname=="inactive"); @@ -9315,7 +9332,7 @@ initializeInstructUIFunctionality(); },()=>{ }); } - else if(localflag && no_txt_model && !has_txt2img && !koboldcpp_has_vision && !koboldcpp_has_whisper) + else if(localflag && no_txt_model && !has_txt2img && !koboldcpp_has_vision && !koboldcpp_has_whisper && !koboldcpp_has_tts) { msgboxYesNo("This KoboldCpp instance has no models loaded. You can still use the WebUI to edit or view existing stories.

Would you like to connect to an external API service?","No Models Loaded", ()=>{
@@ -10311,6 +10328,8 @@ initializeInstructUIFunctionality();
             ttshtml += "";
             ttshtml += "";
             ttshtml += "";
+            ttshtml += "";
+
             if ('speechSynthesis' in window) {
                 let voices = window.speechSynthesis.getVoices();
                 console.log("speech synth available: " + voices.length);
@@ -11894,6 +11913,7 @@ initializeInstructUIFunctionality();
         document.getElementById("xtts_container").classList.add("hidden");
         document.getElementById("oai_tts_container").classList.add("hidden");
         document.getElementById("alltalk_specific_controls").classList.add("hidden");
+        document.getElementById("kcpp_tts_container").classList.add("hidden");
 
         const selectedTTS = document.getElementById("ttsselect").value;
@@ -11910,6 +11930,15 @@ initializeInstructUIFunctionality();
         else if(selectedTTS == OAI_TTS_ID) {
             document.getElementById("oai_tts_container").classList.remove("hidden");
         }
+        else if(selectedTTS == KCPP_TTS_ID) {
+            document.getElementById("kcpp_tts_container").classList.remove("hidden");
+            if(is_using_kcpp_with_tts())
+            {
+                document.getElementById("nokcpptts").classList.add("hidden");
+            }else{
+                document.getElementById("nokcpptts").classList.remove("hidden");
+            }
+        }
     }
 
     // Fetch RVC voices for AllTalk
@@ -12014,27 +12043,44 @@ initializeInstructUIFunctionality();
             }
         }
 
-        if(ssval==XTTS_ID || ssval==ALLTALK_ID || ssval==OAI_TTS_ID) //xtts api server
+        if(ssval==XTTS_ID || ssval==ALLTALK_ID || ssval==OAI_TTS_ID || ssval==KCPP_TTS_ID) //xtts, alltalk, oai or kcpp tts
         {
             let is_xtts = (ssval==XTTS_ID);
             let is_oai_tts = (ssval==OAI_TTS_ID);
+            let is_kcpp_tts = (ssval==KCPP_TTS_ID);
             const audioContext = new (window.AudioContext || window.webkitAudioContext)();
-            if(is_oai_tts)
+            if(is_oai_tts || is_kcpp_tts)
             {
-                let payload =
+                let payload = {};
+                let ttsheaders = {};
+                let sub_endpt = "";
+                if(is_oai_tts)
                 {
-                    "model": document.getElementById("oai_tts_model").value,
-                    "input": text,
-                    "voice": document.getElementById("oai_tts_voice").value
-                };
-                let oaiheaders = {
-                    'Content-Type': 'application/json',
-                    'Authorization': 'Bearer ' + localsettings.saved_oai_tts_key
-                };
-                fetch(localsettings.saved_oai_tts_url, {
+                    sub_endpt = localsettings.saved_oai_tts_url;
+                    payload =
+                    {
+                        "model": document.getElementById("oai_tts_model").value,
+                        "input": text,
+                        "voice": document.getElementById("oai_tts_voice").value
+                    };
+                    ttsheaders = {
+                        'Content-Type': 'application/json',
+                        'Authorization': 'Bearer ' + localsettings.saved_oai_tts_key
+                    };
+                } else {
+                    sub_endpt = apply_proxy_url(custom_kobold_endpoint + koboldcpp_tts_endpoint);
+                    payload =
+                    {
+                        "input": text,
+                        "voice": document.getElementById("kcpp_tts_voice").value
+                    };
+                    ttsheaders = get_kobold_header();
+                }
+
+                fetch(sub_endpt, {
                     method: 'POST',
-                    headers: oaiheaders,
+                    headers: ttsheaders,
                     body: JSON.stringify(payload),
                 })
                 .then(response => response.arrayBuffer())
@@ -20199,6 +20245,14 @@ initializeInstructUIFunctionality();
                 TTS Voice
+
Narrate Both Sides
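Not part of the patch itself, but a quick way to exercise the route wired up above: the following minimal Python sketch POSTs to the new /api/extra/tts endpoint and saves the returned WAV. It assumes a local KoboldCpp instance (1.81+ with a TTS model loaded) listening on the default port 5001 with no --password set; the host, port, and lack of auth are assumptions, not something this diff guarantees.

# Sketch: exercise /api/extra/tts as documented in the kcpp_docs.embd hunk above.
# Assumptions: KoboldCpp >= 1.81 with TTS loaded, http://localhost:5001, no password.
import requests

resp = requests.post(
    "http://localhost:5001/api/extra/tts",
    json={
        "input": "hello world, how are you today?",  # keep it short, per the schema description
        "voice": "fire",  # any string works; an unseen name yields a newly generated speaker
    },
    timeout=120,
)
resp.raise_for_status()

# Per the documented response schema, a 200 carries raw audio/wav bytes.
with open("tts_out.wav", "wb") as f:
    f.write(resp.content)
print(f"wrote {len(resp.content)} bytes to tts_out.wav")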
diff --git a/koboldcpp.py b/koboldcpp.py
index e70a28139..b79c72649 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -783,7 +783,6 @@ def fetch_gpu_properties(testCL,testCU,testVK):
             FetchedCUdeviceMem = [line.split(",")[1].strip().split(" ")[0].strip() for line in output.splitlines()]
             FetchedCUfreeMem = [line.split(",")[2].strip().split(" ")[0].strip() for line in output.splitlines()]
         except Exception:
-            FetchedCUdevices = []
             FetchedCUdeviceMem = []
             FetchedCUfreeMem = []
             faileddetectvram = True
@@ -806,7 +805,6 @@ def fetch_gpu_properties(testCL,testCU,testVK):
             if getamdvram:
                 FetchedCUdeviceMem = [line.split(",")[1].strip() for line in getamdvram.splitlines()[1:] if line.strip()]
         except Exception:
-            FetchedCUdevices = []
             FetchedCUdeviceMem = []
             FetchedCUfreeMem = []
             faileddetectvram = True
@@ -817,6 +815,8 @@ def fetch_gpu_properties(testCL,testCU,testVK):
     for idx in range(0,4):
         if(len(FetchedCUdevices)>idx):
             CUDevicesNames[idx] = FetchedCUdevices[idx]
+    for idx in range(0,4):
+        if(len(FetchedCUdevices)>idx):
             if len(FetchedCUdeviceMem)>idx:
                 dmem = int(FetchedCUdeviceMem[idx]) if AMDgpu else (int(FetchedCUdeviceMem[idx])*1024*1024)
                 lowestcumem = dmem if lowestcumem==0 else (dmem if dmem<lowestcumem else lowestcumem)
diff --git a/otherarch/llama_v2.cpp b/otherarch/llama_v2.cpp
--- a/otherarch/llama_v2.cpp
+++ b/otherarch/llama_v2.cpp
@@ ... @@ std::vector<llama_v2_token> llama_v2_tokenize(struct llama_v2_context * ctx, const
     res.resize(n);
     return res;
-}
+}
\ No newline at end of file
diff --git a/otherarch/llama_v3.cpp b/otherarch/llama_v3.cpp
index d0dd2f994..b1e976afc 100644
--- a/otherarch/llama_v3.cpp
+++ b/otherarch/llama_v3.cpp
@@ -4414,3 +4414,16 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char *
     fputs(text, stderr);
     fflush(stderr);
 }
+
+//// stuff this here since it's just some obsolete junk ////
+static std::vector<uint8_t> kcpp_compute_buf;
+void kcpp_graph_compute_helper(struct ggml_v3_cgraph *graph, int n_threads)
+{
+    struct ggml_v3_cplan plan = ggml_v3_graph_plan(graph, n_threads);
+    if (plan.work_size > 0)
+    {
+        kcpp_compute_buf.resize(plan.work_size);
+        plan.work_data = kcpp_compute_buf.data();
+    }
+    ggml_v3_graph_compute(graph, &plan);
+}
\ No newline at end of file
diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp
index 4311346ac..7db7dbffb 100644
--- a/otherarch/tts_adapter.cpp
+++ b/otherarch/tts_adapter.cpp
@@ -574,6 +574,10 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
         }
     }
 
+    double ttstime = 0;
+    timer_start();
+
+
     if(!inputs.quiet && ttsdebugmode==1)
     {
         printf("\nInput: %s\n", prompt_clean.c_str());
@@ -591,6 +595,14 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
         {
             printf("\nReuse speaker ID=%d (%d tokens)...", last_speaker_seed, last_speaker_codes.size());
         }
+    } else if (speaker_seed==1){ //1 is a special seed
+        std::string speaker = "but<|t_0.31|><|code_start|><|1023|><|1474|><|17|><|121|><|1362|><|744|><|438|><|1319|><|744|><|1419|><|1246|><|923|><|1338|><|406|><|939|><|975|><|1491|><|965|><|1212|><|248|><|794|><|464|><|830|><|code_end|>\nthat<|t_0.13|><|code_start|><|1578|><|1773|><|660|><|1074|><|221|><|1803|><|142|><|914|><|798|><|485|><|code_end|>\nis<|t_0.11|><|code_start|><|737|><|794|><|1288|><|182|><|895|><|1653|><|448|><|471|><|code_end|>\nwhat<|t_0.12|><|code_start|><|1734|><|1306|><|779|><|490|><|525|><|1028|><|37|><|1633|><|1353|><|code_end|>\nit<|t_0.09|><|code_start|><|1343|><|898|><|270|><|1035|><|94|><|1409|><|388|><|code_end|>\nis<|t_0.23|><|code_start|><|694|><|695|><|577|><|692|><|1047|><|388|><|28|><|905|><|1155|><|50|><|1629|><|1775|><|1711|><|1729|><|404|><|1027|><|344|><|code_end|>";
+        last_speaker_codes = common_tokenize(model_ttc, speaker, false, true);
+        last_speaker_seed = speaker_seed;
+        if(!inputs.quiet && ttsdebugmode==1)
+        {
+            printf("\nSpecial ID=%d (%d tokens)...", last_speaker_seed, (int) last_speaker_codes.size());
+        }
     } else {
         //generate the voice texture of our new speaker
         last_speaker_codes.clear();
@@ -800,8 +812,8 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
 
     const int n_sr = 24000; // sampling rate
 
-    // zero out first 0.25 seconds or 0.05 depending on whether its seeded
-    const int cutout = (speaker_seed>0?(24000/4):(24000/20));
+    // zero out first 0.2 seconds or 0.05 depending on whether it's seeded
+    const int cutout = (speaker_seed>0?(24000/5):(24000/20));
     for (int i = 0; i < cutout; ++i) {
         audio[i] = 0.0f;
     }
@@ -811,10 +823,11 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
     }
 
     last_generated_audio = save_wav16_base64(audio, n_sr);
+    ttstime = timer_check();
 
     if(!inputs.quiet)
     {
-        printf("\nTTS Generated %d audio tokens.\n",(int) codes.size());
+        printf("\nTTS Generated %d audio tokens in %.2fs.\n",(int) codes.size(),ttstime);
     }
 
     output.data = last_generated_audio.c_str();
diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp
index f848eeabd..12296360e 100644
--- a/otherarch/utils.cpp
+++ b/otherarch/utils.cpp
@@ -225,18 +225,6 @@ bool should_transpose_layer(std::string name)
     return false;
 }
 
-static std::vector<uint8_t> kcpp_compute_buf;
-void kcpp_graph_compute_helper(struct ggml_v3_cgraph *graph, int n_threads)
-{
-    struct ggml_v3_cplan plan = ggml_v3_graph_plan(graph, n_threads);
-    if (plan.work_size > 0)
-    {
-        kcpp_compute_buf.resize(plan.work_size);
-        plan.work_data = kcpp_compute_buf.data();
-    }
-    ggml_v3_graph_compute(graph, &plan);
-}
-
 static const std::string kcpp_base64_chars =
     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
     "abcdefghijklmnopqrstuvwxyz"
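The OpenAI-compatible route registered in kcpp_docs.embd can be driven the same way. Below is a sketch under the same assumptions as the earlier example (local instance on port 5001, no auth configured); the "model" field follows the OpenAI createSpeech request shape linked in the docs hunk and is assumed here to be accepted but not used for model selection.

# Sketch: call the OpenAI-compatible /v1/audio/speech route documented above.
# Assumptions: http://localhost:5001, no --password; request fields follow the
# OpenAI createSpeech reference linked in the kcpp_docs.embd hunk.
import requests

resp = requests.post(
    "http://localhost:5001/v1/audio/speech",
    json={
        "model": "tts-1",  # OpenAI-shape field; assumed ignored by KoboldCpp
        "input": "testing one two three",
        "voice": "fire",
    },
    timeout=120,
)
resp.raise_for_status()

# Save the returned audio; the native route above documents audio/wav output.
with open("speech_out.wav", "wb") as f:
    f.write(resp.content)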