qwen3tts support reference audio

This commit is contained in:
Concedo 2026-02-21 17:30:21 +08:00
parent 72219fdbf5
commit 2db018a1d7
7 changed files with 409 additions and 419 deletions

View file

@ -4317,6 +4317,7 @@ Current version indicated by LITEVER below.
var websearch_in_progress = false;
var kcpp_tts_json = "";
var avoidwelcome = false;
var voicecloneb64 = "";
var localsettings = {
my_api_key: "0000000000", //put here so it can be saved and loaded in persistent mode
@ -5480,8 +5481,9 @@ Current version indicated by LITEVER below.
indexeddb_load("savedcustomcss",""),
indexeddb_load("savedusermod",""),
indexeddb_load("usermodprops",""),
indexeddb_load("samplerpresets","")
]).then(([loadedsettingsjson, loadedstorycompressed, loadedbackgroundimg, currcss, currmod, modpropsstr, loadedsamplerpresetsjson]) => {
indexeddb_load("samplerpresets",""),
indexeddb_load("voiceclone","")
]).then(([loadedsettingsjson, loadedstorycompressed, loadedbackgroundimg, currcss, currmod, modpropsstr, loadedsamplerpresetsjson, loadedvoiceclone]) => {
try
{
if (loadedsettingsjson != null && loadedsettingsjson != "" && loadedstorycompressed != null && loadedstorycompressed != "") {
@ -5529,6 +5531,10 @@ Current version indicated by LITEVER below.
document.getElementById("enhancedchatinterface").classList.add("transparentbg");
document.getElementById("enhancedchatinterface_inner").classList.add("transparentbg");
}
if(loadedvoiceclone && loadedvoiceclone!="")
{
voicecloneb64 = loadedvoiceclone;
}
loadok = true;
} else {
console.log("Skipped missing local save");
@ -17364,9 +17370,9 @@ Current version indicated by LITEVER below.
}
}
function set_voice_clone()
function set_voice_json()
{
inputBoxOkCancel("Set the Voice Clone JSON to clone an existing voice.<br><br><a href='https://github.com/LostRuins/koboldcpp/tree/concedo/examples/outetts/speakers' target='_blank'>You can download existing voice clone JSONs, or make your own.</span><br>","Apply Voice Clone JSON",kcpp_tts_json,"Paste JSON Here",()=>{
inputBoxOkCancel("OuteTTS ONLY - Set the OuteTTS Voice JSON to copy an existing voice.<br><br><a href='https://github.com/LostRuins/koboldcpp/tree/concedo/examples/outetts/speakers' target='_blank'>You can download existing voice JSONs, or make your own.</span><br>","Apply OuteTTS Voice JSON",kcpp_tts_json,"Paste JSON Here",()=>{
let userinput = getInputBoxValue().trim();
try
{
@ -17385,6 +17391,35 @@ Current version indicated by LITEVER below.
},true,true);
}
//Lets the user pick an audio file to use as the voice clone reference sample.
//The chosen file is read as a data URL, re-encoded via convertAudioToCompressedBase64,
//then stored both in indexeddb (key "voiceclone") and in the voicecloneb64 global.
function set_voice_clone()
{
	let finput = document.getElementById('addimgfileinput');
	//register the handler BEFORE opening the picker, so the selection can never arrive without a listener
	finput.onchange = (event) => {
		if (event.target.files.length > 0 && event.target.files[0]) {
			const file = event.target.files[0];
			const reader = new FileReader();
			reader.onload = function(audio) {
				let origAudio = audio.target.result; //data URL produced by readAsDataURL
				//64 is passed as audio_quality (kbps), higher than the converter's default
				convertAudioToCompressedBase64(origAudio,(newAudio,duration)=>{
					indexeddb_save("voiceclone", newAudio);
					voicecloneb64 = newAudio;
					adjust_kcpptts_controls();
				},64);
			}
			reader.readAsDataURL(file);
		}
		//reset the input so that picking the same file again still fires a change event
		finput.value = "";
	};
	finput.click();
}
//Discards the stored voice clone sample: blanks the persisted "voiceclone" entry
//and the in-memory voicecloneb64 copy, then calls adjust_kcpptts_controls()
//so the UI can reflect the change.
function clear_voice_clone()
{
	const noSample = "";
	indexeddb_save("voiceclone", noSample);
	voicecloneb64 = noSample;
	adjust_kcpptts_controls();
}
function restore_retried_text()
{
if(retry_in_progress)
@ -17517,6 +17552,8 @@ Current version indicated by LITEVER below.
indexeddb_save("savedusermod","");
indexeddb_save("usermodprops","");
indexeddb_save("savedcustomcss", "");
indexeddb_save("voiceclone", "");
voicecloneb64 = "";
let styleElement = document.getElementById('custom_css');
styleElement.innerHTML = "";
show_welcome_panel();
@ -18593,10 +18630,23 @@ Current version indicated by LITEVER below.
} else {
document.getElementById("kcpp_tts_voice_custom").classList.add("hidden");
}
if (document.getElementById("kcpp_tts_voice").value == "voiceclone") {
document.getElementById("kcpp_tts_voice_clone").classList.remove("hidden");
if (document.getElementById("kcpp_tts_voice").value == "voicejson") {
document.getElementById("kcpp_tts_voice_json").classList.remove("hidden");
} else {
document.getElementById("kcpp_tts_voice_clone").classList.add("hidden");
document.getElementById("kcpp_tts_voice_json").classList.add("hidden");
}
document.getElementById("kcpp_tts_voice_clone").classList.add("hidden");
document.getElementById("kcpp_tts_voice_clone_clear").classList.add("hidden");
if (document.getElementById("kcpp_tts_voice").value == "voiceclone") {
if(voicecloneb64=="")
{
document.getElementById("kcpp_tts_voice_clone").classList.remove("hidden");
}
else
{
document.getElementById("kcpp_tts_voice_clone_clear").classList.remove("hidden");
}
}
}
@ -18779,6 +18829,7 @@ Current version indicated by LITEVER below.
};
} else {
sub_endpt = apply_proxy_url(custom_kobold_endpoint + koboldcpp_tts_endpoint);
let is_voicejson = (document.getElementById("kcpp_tts_voice").value == "voicejson");
let is_voiceclone = (document.getElementById("kcpp_tts_voice").value == "voiceclone");
let is_custom = (document.getElementById("kcpp_tts_voice").value == "custom");
payload =
@ -18786,10 +18837,14 @@ Current version indicated by LITEVER below.
"input": text,
"voice": (is_custom)?document.getElementById("kcpp_tts_voice_custom").value:document.getElementById("kcpp_tts_voice").value
};
if(is_voiceclone && vcjson)
if(is_voicejson && vcjson)
{
payload.speaker_json = vcjson;
}
if(is_voiceclone && voicecloneb64!="")
{
payload.reference_audio = voicecloneb64;
}
ttsheaders = get_kobold_header();
}
@ -22887,7 +22942,7 @@ Current version indicated by LITEVER below.
// AUDIO MANIPULATION FUNCTIONS
//convert any audio to a webm blob (high compression)
function convertAudioToCompressedBase64(inputBase64, onDone) {
function convertAudioToCompressedBase64(inputBase64, onDone, audio_quality=40) { //quality is kbps
// Step 1: Convert base64 string to Blob
const matches = inputBase64.match(/^data:(audio\/[a-zA-Z0-9-]+);base64,(.+)$/);
if (!matches) {
@ -22927,7 +22982,7 @@ Current version indicated by LITEVER below.
}
const durationInSeconds = buffer.duration;
const mp3encoder = new lamejs.Mp3Encoder(1, samplefreq, 40); // mono, 16kHz, 40kbps
const mp3encoder = new lamejs.Mp3Encoder(1, samplefreq, audio_quality); // mono, 16kHz, 40kbps
const sampleBlockSize = 1152; //can be anything but make it a multiple of 576 to make encoders life easier
let mp3Data = [];
for (let i = 0; i < samples.length; i += sampleBlockSize) {
@ -29793,12 +29848,15 @@ Current version indicated by LITEVER below.
<option value="shouty">shouty</option>
<option value="chatty">chatty</option>
<option value="custom">custom</option>
<option value="voicejson">voicejson</option>
<option value="voiceclone">voiceclone</option>
</select>
</div>
<div>
<input type="text" value="" placeholder="(Name)" id="kcpp_tts_voice_custom" style="margin-left:3px; width:56px;">
<button id="kcpp_tts_voice_clone" type="button" class="btn btn-primary" style="margin-left:3px; width:56px;" onclick="set_voice_clone()">Setup</button>
<button id="kcpp_tts_voice_json" type="button" class="btn btn-primary" style="margin-left:3px; width:56px;" onclick="set_voice_json()">Setup</button>
<button id="kcpp_tts_voice_clone" type="button" class="btn btn-primary" style="margin-left:3px; width:56px;" onclick="set_voice_clone()">Load</button>
<button id="kcpp_tts_voice_clone_clear" type="button" class="btn btn-primary bg_red" style="margin-left:3px; width:56px;" onclick="clear_voice_clone()">Clear</button>
</div>
</div>
</div>

View file

@ -291,6 +291,7 @@ struct tts_generation_inputs
const char * custom_speaker_voice = "";
const char * custom_speaker_text = "";
const char * custom_speaker_data = "";
const char * reference_audio = "";
};
struct tts_generation_outputs
{

View file

@ -403,7 +403,8 @@ class tts_generation_inputs(ctypes.Structure):
("audio_seed", ctypes.c_int),
("custom_speaker_voice", ctypes.c_char_p),
("custom_speaker_text", ctypes.c_char_p),
("custom_speaker_data", ctypes.c_char_p)]
("custom_speaker_data", ctypes.c_char_p),
("reference_audio", ctypes.c_char_p)]
class tts_generation_outputs(ctypes.Structure):
_fields_ = [("status", ctypes.c_int),
@ -2248,7 +2249,8 @@ def tts_generate(genparams):
prompt = genparams.get("input", genparams.get("text", ""))
prompt = prompt.strip()
voice = 1
speaker_json = tts_prepare_voice_json(genparams.get("speaker_json","")) #handle custom cloned voices
speaker_json = tts_prepare_voice_json(genparams.get("speaker_json","")) #handle custom json voices
reference_audio = genparams.get("reference_audio","") #for cloned voices in qwen3tts
voicestr = genparams.get("voice", genparams.get("speaker_wav", ""))
oai_voicemap = ["alloy","onyx","echo","nova","shimmer"] # map to kcpp defaults
voice_mapping = ["kobo","cheery","sleepy","shouty","chatty"]
@ -2278,6 +2280,9 @@ def tts_generate(genparams):
else:
inputs.custom_speaker_text = "".encode("UTF-8")
inputs.custom_speaker_data = "".encode("UTF-8")
if reference_audio and reference_audio.startswith("data:audio"):
reference_audio = reference_audio.split(",", 1)[1]
inputs.reference_audio = reference_audio.encode("UTF-8")
ret = handle.tts_generate(inputs)
outstr = ""
if ret.status==1:

View file

@ -90,8 +90,7 @@ bool Qwen3TTS::load_models(const std::string & tts_model_path, const std::string
transformer_loaded_ = false;
decoder_loaded_ = false;
const char * low_mem_env = std::getenv("QWEN3_TTS_LOW_MEM");
low_mem_mode_ = low_mem_env && low_mem_env[0] != '\0' && low_mem_env[0] != '0';
low_mem_mode_ = false;
if (low_mem_mode_) {
fprintf(stderr, " Low-memory mode enabled (lazy decoder + component unloads)\n");
}

File diff suppressed because it is too large Load diff

View file

@ -1216,23 +1216,40 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
}
else
{
double ttstime = 0;
timer_start();
qwen3_tts::tts_result result;
std::string prompt = inputs.prompt;
qwen3_tts::tts_params qwen3tts_params;
double ttstime = 0;
timer_start();
std::string custom_reference_audio_str = inputs.reference_audio;
std::vector<float> custom_reference_audio_pcmf32;
if(custom_reference_audio_str!="")
{
std::vector<uint8_t> media_data_buffer = kcpp_base64_decode(custom_reference_audio_str);
//qwen3tts uses 24khz
bool ok = kcpp_decode_audio_from_buf(media_data_buffer.data(), media_data_buffer.size(), 24000, custom_reference_audio_pcmf32);
if (!ok) {
printf("\nError: Cannot read input audio file.\n");
output.data = "";
output.status = 0;
return output;
}
}
if(!tts_is_quiet)
{
printf("\nTTS Generating...");
}
// if (reference_audio.empty()) {
result = qwen3tts_runner.synthesize(prompt, qwen3tts_params);
// } else {
// fprintf(stderr, "Synthesizing with voice cloning: \"%s\"\n", text.c_str());
// fprintf(stderr, "Reference audio: %s\n", reference_audio.c_str());
// result = tts.synthesize_with_voice(text, reference_audio, params);
// }
if (custom_reference_audio_pcmf32.empty()) {
result = qwen3tts_runner.synthesize(prompt, qwen3tts_params);
} else {
printf("\nUsing reference voice... (Warning, lengthy sample audio will be very slow. Use short clips!)\n");
result = qwen3tts_runner.synthesize_with_voice(prompt, custom_reference_audio_pcmf32.data(),custom_reference_audio_pcmf32.size(), qwen3tts_params);
}
if (!result.success) {
printf("\nError: TTS vocoder generation failed : %s\n", result.error_msg.c_str());

View file

@ -27,20 +27,8 @@ static std::string whisper_output_text = "";
int total_transcribe_gens = 0;
// Returns true when `buf` looks like one complete RIFF/WAVE file: it carries the
// "RIFF" and "WAVE" tags and the chunk size stored in the header accounts for
// exactly the rest of the buffer.
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
static bool is_wav_buffer(const std::string & buf) {
    // Header layout: "RIFF" (4 bytes) | chunk size (4 bytes, little-endian) | "WAVE" (4 bytes).
    // compare() checks the tags in place, without allocating substr() temporaries.
    if (buf.size() < 12 || buf.compare(0, 4, "RIFF") != 0 || buf.compare(8, 4, "WAVE") != 0) {
        return false;
    }

    // Assemble the size one byte at a time: the field sits at an offset with no
    // alignment guarantee, and RIFF stores it little-endian whatever the host order is.
    const unsigned char * size_bytes = reinterpret_cast<const unsigned char *>(buf.data()) + 4;
    const uint32_t chunk_size = static_cast<uint32_t>(size_bytes[0])
                              | (static_cast<uint32_t>(size_bytes[1]) << 8)
                              | (static_cast<uint32_t>(size_bytes[2]) << 16)
                              | (static_cast<uint32_t>(size_bytes[3]) << 24);

    // The stored size excludes the 8 bytes taken by the "RIFF" tag and the size field itself.
    return chunk_size + 8 == buf.size();
}
static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32)
static bool read_audio(const std::string & b64data, std::vector<float>& pcmf32)
{
std::vector<uint8_t> media_data_buffer = kcpp_base64_decode(b64data);
@ -141,7 +129,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
std::vector<float> pcmf32; // mono-channel F32 PCM
if (!::read_wav(b64data, pcmf32)) {
if (!::read_audio(b64data, pcmf32)) {
printf("\nWhisper: Failed to read input wav data!\n");
output.text = "";
output.status = 0;