qwen3tts support reference audio

2026-05-19 08:00:25 +00:00 · 2026-02-21 17:30:21 +08:00 · 2026-02-21 17:30:21 +08:00 · 2db018a1d7
commit 2db018a1d7
parent 72219fdbf5
7 changed files with 409 additions and 419 deletions
--- a/embd_res/klite.embd
+++ b/embd_res/klite.embd
@ -4317,6 +4317,7 @@ Current version indicated by LITEVER below.
 	var websearch_in_progress = false;
 	var kcpp_tts_json = "";
 	var avoidwelcome = false;
+	var voicecloneb64 = "";

 	var localsettings = {
 		my_api_key: "0000000000", //put here so it can be saved and loaded in persistent mode
@ -5480,8 +5481,9 @@ Current version indicated by LITEVER below.
 		indexeddb_load("savedcustomcss",""),
 		indexeddb_load("savedusermod",""),
 		indexeddb_load("usermodprops",""),
-		indexeddb_load("samplerpresets","")
-	]).then(([loadedsettingsjson, loadedstorycompressed, loadedbackgroundimg, currcss, currmod, modpropsstr, loadedsamplerpresetsjson]) => {
+		indexeddb_load("samplerpresets",""),
+		indexeddb_load("voiceclone","")
+	]).then(([loadedsettingsjson, loadedstorycompressed, loadedbackgroundimg, currcss, currmod, modpropsstr, loadedsamplerpresetsjson, loadedvoiceclone]) => {
 	try
 	{
 		if (loadedsettingsjson != null && loadedsettingsjson != "" && loadedstorycompressed != null && loadedstorycompressed != "") {
@ -5529,6 +5531,10 @@ Current version indicated by LITEVER below.
 				document.getElementById("enhancedchatinterface").classList.add("transparentbg");
 				document.getElementById("enhancedchatinterface_inner").classList.add("transparentbg");
 			}
+			if(loadedvoiceclone && loadedvoiceclone!="")
+			{
+				voicecloneb64 = loadedvoiceclone;
+			}
 			loadok = true;
 		} else {
 			console.log("Skipped missing local save");
@ -17364,9 +17370,9 @@ Current version indicated by LITEVER below.
 		}
 	}

-	function set_voice_clone()
+	function set_voice_json()
 	{
-		inputBoxOkCancel("Set the Voice Clone JSON to clone an existing voice.<br><br><a href='https://github.com/LostRuins/koboldcpp/tree/concedo/examples/outetts/speakers' target='_blank'>You can download existing voice clone JSONs, or make your own.</span><br>","Apply Voice Clone JSON",kcpp_tts_json,"Paste JSON Here",()=>{
+		inputBoxOkCancel("OuteTTS ONLY - Set the OuteTTS Voice JSON to copy an existing voice.<br><br><a href='https://github.com/LostRuins/koboldcpp/tree/concedo/examples/outetts/speakers' target='_blank'>You can download existing voice JSONs, or make your own.</span><br>","Apply OuteTTS Voice JSON",kcpp_tts_json,"Paste JSON Here",()=>{
 		let userinput = getInputBoxValue().trim();
 		try
 		{
@ -17385,6 +17391,35 @@ Current version indicated by LITEVER below.
 		},true,true);
 	}

+	function set_voice_clone()
+	{
+		let finput = document.getElementById('addimgfileinput');
+		finput.click();
+		finput.onchange = (event) => {
+			if (event.target.files.length > 0 && event.target.files[0]) {
+				const file = event.target.files[0];
+				const fname = file.name;
+				const reader = new FileReader();
+				reader.onload = function(audio) {
+					let origAudio = audio.target.result;
+					convertAudioToCompressedBase64(origAudio,(newAudio,duration)=>{
+						indexeddb_save("voiceclone", newAudio);
+						voicecloneb64 = newAudio;
+						adjust_kcpptts_controls();
+					},64);
+				}
+				reader.readAsDataURL(file);
+			}
+			finput.value = "";
+		};
+	}
+	function clear_voice_clone() //discard the stored voice clone reference audio
+	{
+		indexeddb_save("voiceclone", ""); //overwrite the persisted copy with an empty value
+		voicecloneb64 = ""; //empty string means no reference audio is set
+		adjust_kcpptts_controls(); //refresh the voice clone buttons
+	}
+
 	function restore_retried_text()
 	{
 		if(retry_in_progress)
@ -17517,6 +17552,8 @@ Current version indicated by LITEVER below.
 			indexeddb_save("savedusermod","");
 			indexeddb_save("usermodprops","");
 			indexeddb_save("savedcustomcss", "");
+			indexeddb_save("voiceclone", "");
+			voicecloneb64 = "";
 			let styleElement = document.getElementById('custom_css');
 			styleElement.innerHTML = "";
 			show_welcome_panel();
@ -18593,10 +18630,23 @@ Current version indicated by LITEVER below.
 		} else {
 			document.getElementById("kcpp_tts_voice_custom").classList.add("hidden");
 		}
-		if (document.getElementById("kcpp_tts_voice").value == "voiceclone") {
-			document.getElementById("kcpp_tts_voice_clone").classList.remove("hidden");
+		if (document.getElementById("kcpp_tts_voice").value == "voicejson") {
+			document.getElementById("kcpp_tts_voice_json").classList.remove("hidden");
 		} else {
-			document.getElementById("kcpp_tts_voice_clone").classList.add("hidden");
+			document.getElementById("kcpp_tts_voice_json").classList.add("hidden");
+		}
+
+		document.getElementById("kcpp_tts_voice_clone").classList.add("hidden");
+		document.getElementById("kcpp_tts_voice_clone_clear").classList.add("hidden");
+		if (document.getElementById("kcpp_tts_voice").value == "voiceclone") {
+			if(voicecloneb64=="")
+			{
+				document.getElementById("kcpp_tts_voice_clone").classList.remove("hidden");
+			}
+			else
+			{
+				document.getElementById("kcpp_tts_voice_clone_clear").classList.remove("hidden");
+			}
 		}

 	}
@ -18779,6 +18829,7 @@ Current version indicated by LITEVER below.
 					};
 				} else {
 					sub_endpt = apply_proxy_url(custom_kobold_endpoint + koboldcpp_tts_endpoint);
+					let is_voicejson = (document.getElementById("kcpp_tts_voice").value == "voicejson");
 					let is_voiceclone = (document.getElementById("kcpp_tts_voice").value == "voiceclone");
 					let is_custom = (document.getElementById("kcpp_tts_voice").value == "custom");
 					payload =
@ -18786,10 +18837,14 @@ Current version indicated by LITEVER below.
 						"input": text,
 						"voice": (is_custom)?document.getElementById("kcpp_tts_voice_custom").value:document.getElementById("kcpp_tts_voice").value
 					};
-					if(is_voiceclone && vcjson)
+					if(is_voicejson && vcjson)
 					{
 						payload.speaker_json = vcjson;
 					}
+					if(is_voiceclone && voicecloneb64!="")
+					{
+						payload.reference_audio = voicecloneb64;
+					}
 					ttsheaders = get_kobold_header();
 				}

@ -22887,7 +22942,7 @@ Current version indicated by LITEVER below.

 	// AUDIO MANIPULATION FUNCTIONS
 	//convert any audio to a webm blob (high compression)
-	function convertAudioToCompressedBase64(inputBase64, onDone) {
+	function convertAudioToCompressedBase64(inputBase64, onDone, audio_quality=40) { //quality is kbps
 		// Step 1: Convert base64 string to Blob
 		const matches = inputBase64.match(/^data:(audio\/[a-zA-Z0-9-]+);base64,(.+)$/);
 		if (!matches) {
@ -22927,7 +22982,7 @@ Current version indicated by LITEVER below.
 				}

 				const durationInSeconds = buffer.duration;
-				const mp3encoder = new lamejs.Mp3Encoder(1, samplefreq, 40); // mono, 16kHz, 40kbps
+				const mp3encoder = new lamejs.Mp3Encoder(1, samplefreq, audio_quality); // mono, samplefreq Hz, audio_quality kbps (default 40)
 				const sampleBlockSize = 1152; //can be anything but make it a multiple of 576 to make encoders life easier
 				let mp3Data = [];
 				for (let i = 0; i < samples.length; i += sampleBlockSize) {
@ -29793,12 +29848,15 @@ Current version indicated by LITEVER below.
 								<option value="shouty">shouty</option>
 								<option value="chatty">chatty</option>
 								<option value="custom">custom</option>
+								<option value="voicejson">voicejson</option>
 								<option value="voiceclone">voiceclone</option>
 								</select>
 							</div>
 							<div>
 							<input type="text" value="" placeholder="(Name)" id="kcpp_tts_voice_custom" style="margin-left:3px; width:56px;">
-							<button id="kcpp_tts_voice_clone" type="button" class="btn btn-primary" style="margin-left:3px; width:56px;" onclick="set_voice_clone()">Setup</button>
+							<button id="kcpp_tts_voice_json" type="button" class="btn btn-primary" style="margin-left:3px; width:56px;" onclick="set_voice_json()">Setup</button>
+							<button id="kcpp_tts_voice_clone" type="button" class="btn btn-primary" style="margin-left:3px; width:56px;" onclick="set_voice_clone()">Load</button>
+							<button id="kcpp_tts_voice_clone_clear" type="button" class="btn btn-primary bg_red" style="margin-left:3px; width:56px;" onclick="clear_voice_clone()">Clear</button>
 							</div>
 							</div>
 						</div>
--- a/expose.h
+++ b/expose.h
@ -291,6 +291,7 @@ struct tts_generation_inputs
    const char * custom_speaker_voice = "";
    const char * custom_speaker_text = "";
    const char * custom_speaker_data = "";
+    const char * reference_audio = "";
 };
 struct tts_generation_outputs
 {
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -403,7 +403,8 @@ class tts_generation_inputs(ctypes.Structure):
                ("audio_seed", ctypes.c_int),
                ("custom_speaker_voice", ctypes.c_char_p),
                ("custom_speaker_text", ctypes.c_char_p),
-                ("custom_speaker_data", ctypes.c_char_p)]
+                ("custom_speaker_data", ctypes.c_char_p),
+                ("reference_audio", ctypes.c_char_p)]

 class tts_generation_outputs(ctypes.Structure):
    _fields_ = [("status", ctypes.c_int),
@ -2248,7 +2249,8 @@ def tts_generate(genparams):
    prompt = genparams.get("input", genparams.get("text", ""))
    prompt = prompt.strip()
    voice = 1
-    speaker_json = tts_prepare_voice_json(genparams.get("speaker_json","")) #handle custom cloned voices
+    speaker_json = tts_prepare_voice_json(genparams.get("speaker_json","")) #handle custom json voices
+    reference_audio = genparams.get("reference_audio","") #for cloned voices in qwen3tts
    voicestr = genparams.get("voice", genparams.get("speaker_wav", ""))
    oai_voicemap = ["alloy","onyx","echo","nova","shimmer"] # map to kcpp defaults
    voice_mapping = ["kobo","cheery","sleepy","shouty","chatty"]
@ -2278,6 +2280,9 @@ def tts_generate(genparams):
    else:
        inputs.custom_speaker_text = "".encode("UTF-8")
        inputs.custom_speaker_data = "".encode("UTF-8")
+    if reference_audio and reference_audio.startswith("data:audio"):
+        reference_audio = reference_audio.split(",", 1)[1]
+    inputs.reference_audio = reference_audio.encode("UTF-8")
    ret = handle.tts_generate(inputs)
    outstr = ""
    if ret.status==1:
--- a/otherarch/qwen3tts/qwen3_tts.cpp
+++ b/otherarch/qwen3tts/qwen3_tts.cpp
@ -90,8 +90,7 @@ bool Qwen3TTS::load_models(const std::string & tts_model_path, const std::string
    transformer_loaded_ = false;
    decoder_loaded_ = false;

-    const char * low_mem_env = std::getenv("QWEN3_TTS_LOW_MEM");
-    low_mem_mode_ = low_mem_env && low_mem_env[0] != '\0' && low_mem_env[0] != '0';
+    low_mem_mode_ = false;
    if (low_mem_mode_) {
        fprintf(stderr, "  Low-memory mode enabled (lazy decoder + component unloads)\n");
    }
--- a/otherarch/qwen3tts/tts_transformer.cpp
+++ b/otherarch/qwen3tts/tts_transformer.cpp
--- a/otherarch/tts_adapter.cpp
+++ b/otherarch/tts_adapter.cpp
@ -1216,23 +1216,40 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
    }
    else
    {
+        double ttstime = 0;
+        timer_start();
+
        qwen3_tts::tts_result result;
        std::string prompt = inputs.prompt;
        qwen3_tts::tts_params qwen3tts_params;
-        double ttstime = 0;
-        timer_start();
+        std::string custom_reference_audio_str = inputs.reference_audio;
+        std::vector<float> custom_reference_audio_pcmf32;
+
+        if(custom_reference_audio_str!="")
+        {
+            std::vector<uint8_t> media_data_buffer = kcpp_base64_decode(custom_reference_audio_str);
+
+            //qwen3tts uses 24khz
+            bool ok = kcpp_decode_audio_from_buf(media_data_buffer.data(), media_data_buffer.size(), 24000, custom_reference_audio_pcmf32);
+            if (!ok) {
+                printf("\nError: Cannot read input audio file.\n");
+                output.data = "";
+                output.status = 0;
+                return output;
+            }
+        }
+
        if(!tts_is_quiet)
        {
            printf("\nTTS Generating...");
        }

-       // if (reference_audio.empty()) {
-        result = qwen3tts_runner.synthesize(prompt, qwen3tts_params);
-        // } else {
-        //     fprintf(stderr, "Synthesizing with voice cloning: \"%s\"\n", text.c_str());
-        //     fprintf(stderr, "Reference audio: %s\n", reference_audio.c_str());
-        //     result = tts.synthesize_with_voice(text, reference_audio, params);
-        // }
+        if (custom_reference_audio_pcmf32.empty()) {
+            result = qwen3tts_runner.synthesize(prompt, qwen3tts_params);
+        } else {
+            printf("\nUsing reference voice... (Warning, lengthy sample audio will be very slow. Use short clips!)\n");
+            result = qwen3tts_runner.synthesize_with_voice(prompt, custom_reference_audio_pcmf32.data(),custom_reference_audio_pcmf32.size(), qwen3tts_params);
+        }

        if (!result.success) {
            printf("\nError: TTS vocoder generation failed : %s\n", result.error_msg.c_str());
--- a/otherarch/whispercpp/whisper_adapter.cpp
+++ b/otherarch/whispercpp/whisper_adapter.cpp
@ -27,20 +27,8 @@ static std::string whisper_output_text = "";

 int total_transcribe_gens = 0;

-static bool is_wav_buffer(const std::string buf) {
-    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
-    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
-    if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") {
-        return false;
-    }
-    uint32_t chunk_size = *reinterpret_cast<const uint32_t*>(buf.data() + 4);
-    if (chunk_size + 8 != buf.size()) {
-        return false;
-    }
-    return true;
-}

-static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32)
+static bool read_audio(const std::string & b64data, std::vector<float>& pcmf32)
 {
    std::vector<uint8_t> media_data_buffer = kcpp_base64_decode(b64data);

@ -141,7 +129,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs

    std::vector<float> pcmf32;               // mono-channel F32 PCM

-    if (!::read_wav(b64data, pcmf32)) {
+    if (!::read_audio(b64data, pcmf32)) {
        printf("\nWhisper: Failed to read input wav data!\n");
        output.text = "";
        output.status = 0;