mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 08:00:25 +00:00
qwen3tts support reference audio
This commit is contained in:
parent
72219fdbf5
commit
2db018a1d7
7 changed files with 409 additions and 419 deletions
|
|
@ -4317,6 +4317,7 @@ Current version indicated by LITEVER below.
|
|||
var websearch_in_progress = false;
|
||||
var kcpp_tts_json = "";
|
||||
var avoidwelcome = false;
|
||||
var voicecloneb64 = "";
|
||||
|
||||
var localsettings = {
|
||||
my_api_key: "0000000000", //put here so it can be saved and loaded in persistent mode
|
||||
|
|
@ -5480,8 +5481,9 @@ Current version indicated by LITEVER below.
|
|||
indexeddb_load("savedcustomcss",""),
|
||||
indexeddb_load("savedusermod",""),
|
||||
indexeddb_load("usermodprops",""),
|
||||
indexeddb_load("samplerpresets","")
|
||||
]).then(([loadedsettingsjson, loadedstorycompressed, loadedbackgroundimg, currcss, currmod, modpropsstr, loadedsamplerpresetsjson]) => {
|
||||
indexeddb_load("samplerpresets",""),
|
||||
indexeddb_load("voiceclone","")
|
||||
]).then(([loadedsettingsjson, loadedstorycompressed, loadedbackgroundimg, currcss, currmod, modpropsstr, loadedsamplerpresetsjson, loadedvoiceclone]) => {
|
||||
try
|
||||
{
|
||||
if (loadedsettingsjson != null && loadedsettingsjson != "" && loadedstorycompressed != null && loadedstorycompressed != "") {
|
||||
|
|
@ -5529,6 +5531,10 @@ Current version indicated by LITEVER below.
|
|||
document.getElementById("enhancedchatinterface").classList.add("transparentbg");
|
||||
document.getElementById("enhancedchatinterface_inner").classList.add("transparentbg");
|
||||
}
|
||||
if(loadedvoiceclone && loadedvoiceclone!="")
|
||||
{
|
||||
voicecloneb64 = loadedvoiceclone;
|
||||
}
|
||||
loadok = true;
|
||||
} else {
|
||||
console.log("Skipped missing local save");
|
||||
|
|
@ -17364,9 +17370,9 @@ Current version indicated by LITEVER below.
|
|||
}
|
||||
}
|
||||
|
||||
function set_voice_clone()
|
||||
function set_voice_json()
|
||||
{
|
||||
inputBoxOkCancel("Set the Voice Clone JSON to clone an existing voice.<br><br><a href='https://github.com/LostRuins/koboldcpp/tree/concedo/examples/outetts/speakers' target='_blank'>You can download existing voice clone JSONs, or make your own.</span><br>","Apply Voice Clone JSON",kcpp_tts_json,"Paste JSON Here",()=>{
|
||||
inputBoxOkCancel("OuteTTS ONLY - Set the OuteTTS Voice JSON to copy an existing voice.<br><br><a href='https://github.com/LostRuins/koboldcpp/tree/concedo/examples/outetts/speakers' target='_blank'>You can download existing voice JSONs, or make your own.</span><br>","Apply OuteTTS Voice JSON",kcpp_tts_json,"Paste JSON Here",()=>{
|
||||
let userinput = getInputBoxValue().trim();
|
||||
try
|
||||
{
|
||||
|
|
@ -17385,6 +17391,35 @@ Current version indicated by LITEVER below.
|
|||
},true,true);
|
||||
}
|
||||
|
||||
// Lets the user pick an audio file to use as the voice-clone reference.
// The picked file is read as a data URL, re-encoded through
// convertAudioToCompressedBase64 (called with 64 as its third argument),
// saved under the "voiceclone" IndexedDB key, mirrored into the
// voicecloneb64 global, and the TTS controls are then refreshed.
// Reuses the shared 'addimgfileinput' file input element.
function set_voice_clone()
{
	let finput = document.getElementById('addimgfileinput');

	// Install the handler before opening the picker so the listener is
	// guaranteed to be in place when the change event is dispatched.
	finput.onchange = (event) => {
		if (event.target.files.length > 0 && event.target.files[0]) {
			const file = event.target.files[0];
			const reader = new FileReader();
			reader.onload = function(audio) {
				let origAudio = audio.target.result;
				convertAudioToCompressedBase64(origAudio,(newAudio,duration)=>{
					indexeddb_save("voiceclone", newAudio);
					voicecloneb64 = newAudio;
					adjust_kcpptts_controls();
				},64);
			}
			reader.readAsDataURL(file);
		}
		// Reset the input so picking the same file again still fires a change event.
		finput.value = "";
	};

	finput.click();
}
|
||||
// Discards the current voice-clone reference audio: blanks the persisted
// "voiceclone" entry, resets the in-memory voicecloneb64 copy, and refreshes
// the TTS controls.
function clear_voice_clone()
{
	const no_audio = "";
	indexeddb_save("voiceclone", no_audio);
	voicecloneb64 = no_audio;
	adjust_kcpptts_controls();
}
|
||||
|
||||
function restore_retried_text()
|
||||
{
|
||||
if(retry_in_progress)
|
||||
|
|
@ -17517,6 +17552,8 @@ Current version indicated by LITEVER below.
|
|||
indexeddb_save("savedusermod","");
|
||||
indexeddb_save("usermodprops","");
|
||||
indexeddb_save("savedcustomcss", "");
|
||||
indexeddb_save("voiceclone", "");
|
||||
voicecloneb64 = "";
|
||||
let styleElement = document.getElementById('custom_css');
|
||||
styleElement.innerHTML = "";
|
||||
show_welcome_panel();
|
||||
|
|
@ -18593,10 +18630,23 @@ Current version indicated by LITEVER below.
|
|||
} else {
|
||||
document.getElementById("kcpp_tts_voice_custom").classList.add("hidden");
|
||||
}
|
||||
if (document.getElementById("kcpp_tts_voice").value == "voiceclone") {
|
||||
document.getElementById("kcpp_tts_voice_clone").classList.remove("hidden");
|
||||
if (document.getElementById("kcpp_tts_voice").value == "voicejson") {
|
||||
document.getElementById("kcpp_tts_voice_json").classList.remove("hidden");
|
||||
} else {
|
||||
document.getElementById("kcpp_tts_voice_clone").classList.add("hidden");
|
||||
document.getElementById("kcpp_tts_voice_json").classList.add("hidden");
|
||||
}
|
||||
|
||||
document.getElementById("kcpp_tts_voice_clone").classList.add("hidden");
|
||||
document.getElementById("kcpp_tts_voice_clone_clear").classList.add("hidden");
|
||||
if (document.getElementById("kcpp_tts_voice").value == "voiceclone") {
|
||||
if(voicecloneb64=="")
|
||||
{
|
||||
document.getElementById("kcpp_tts_voice_clone").classList.remove("hidden");
|
||||
}
|
||||
else
|
||||
{
|
||||
document.getElementById("kcpp_tts_voice_clone_clear").classList.remove("hidden");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -18779,6 +18829,7 @@ Current version indicated by LITEVER below.
|
|||
};
|
||||
} else {
|
||||
sub_endpt = apply_proxy_url(custom_kobold_endpoint + koboldcpp_tts_endpoint);
|
||||
let is_voicejson = (document.getElementById("kcpp_tts_voice").value == "voicejson");
|
||||
let is_voiceclone = (document.getElementById("kcpp_tts_voice").value == "voiceclone");
|
||||
let is_custom = (document.getElementById("kcpp_tts_voice").value == "custom");
|
||||
payload =
|
||||
|
|
@ -18786,10 +18837,14 @@ Current version indicated by LITEVER below.
|
|||
"input": text,
|
||||
"voice": (is_custom)?document.getElementById("kcpp_tts_voice_custom").value:document.getElementById("kcpp_tts_voice").value
|
||||
};
|
||||
if(is_voiceclone && vcjson)
|
||||
if(is_voicejson && vcjson)
|
||||
{
|
||||
payload.speaker_json = vcjson;
|
||||
}
|
||||
if(is_voiceclone && voicecloneb64!="")
|
||||
{
|
||||
payload.reference_audio = voicecloneb64;
|
||||
}
|
||||
ttsheaders = get_kobold_header();
|
||||
}
|
||||
|
||||
|
|
@ -22887,7 +22942,7 @@ Current version indicated by LITEVER below.
|
|||
|
||||
// AUDIO MANIPULATION FUNCTIONS
|
||||
//convert any audio to a webm blob (high compression)
|
||||
function convertAudioToCompressedBase64(inputBase64, onDone) {
|
||||
function convertAudioToCompressedBase64(inputBase64, onDone, audio_quality=40) { //quality is kbps
|
||||
// Step 1: Convert base64 string to Blob
|
||||
const matches = inputBase64.match(/^data:(audio\/[a-zA-Z0-9-]+);base64,(.+)$/);
|
||||
if (!matches) {
|
||||
|
|
@ -22927,7 +22982,7 @@ Current version indicated by LITEVER below.
|
|||
}
|
||||
|
||||
const durationInSeconds = buffer.duration;
|
||||
const mp3encoder = new lamejs.Mp3Encoder(1, samplefreq, 40); // mono, 16kHz, 40kbps
|
||||
const mp3encoder = new lamejs.Mp3Encoder(1, samplefreq, audio_quality); // mono, 16kHz, 40kbps
|
||||
const sampleBlockSize = 1152; //can be anything but make it a multiple of 576 to make encoders life easier
|
||||
let mp3Data = [];
|
||||
for (let i = 0; i < samples.length; i += sampleBlockSize) {
|
||||
|
|
@ -29793,12 +29848,15 @@ Current version indicated by LITEVER below.
|
|||
<option value="shouty">shouty</option>
|
||||
<option value="chatty">chatty</option>
|
||||
<option value="custom">custom</option>
|
||||
<option value="voicejson">voicejson</option>
|
||||
<option value="voiceclone">voiceclone</option>
|
||||
</select>
|
||||
</div>
|
||||
<div>
|
||||
<input type="text" value="" placeholder="(Name)" id="kcpp_tts_voice_custom" style="margin-left:3px; width:56px;">
|
||||
<button id="kcpp_tts_voice_clone" type="button" class="btn btn-primary" style="margin-left:3px; width:56px;" onclick="set_voice_clone()">Setup</button>
|
||||
<button id="kcpp_tts_voice_json" type="button" class="btn btn-primary" style="margin-left:3px; width:56px;" onclick="set_voice_json()">Setup</button>
|
||||
<button id="kcpp_tts_voice_clone" type="button" class="btn btn-primary" style="margin-left:3px; width:56px;" onclick="set_voice_clone()">Load</button>
|
||||
<button id="kcpp_tts_voice_clone_clear" type="button" class="btn btn-primary bg_red" style="margin-left:3px; width:56px;" onclick="clear_voice_clone()">Clear</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
|||
1
expose.h
1
expose.h
|
|
@ -291,6 +291,7 @@ struct tts_generation_inputs
|
|||
const char * custom_speaker_voice = "";
|
||||
const char * custom_speaker_text = "";
|
||||
const char * custom_speaker_data = "";
|
||||
const char * reference_audio = "";
|
||||
};
|
||||
struct tts_generation_outputs
|
||||
{
|
||||
|
|
|
|||
|
|
@ -403,7 +403,8 @@ class tts_generation_inputs(ctypes.Structure):
|
|||
("audio_seed", ctypes.c_int),
|
||||
("custom_speaker_voice", ctypes.c_char_p),
|
||||
("custom_speaker_text", ctypes.c_char_p),
|
||||
("custom_speaker_data", ctypes.c_char_p)]
|
||||
("custom_speaker_data", ctypes.c_char_p),
|
||||
("reference_audio", ctypes.c_char_p)]
|
||||
|
||||
class tts_generation_outputs(ctypes.Structure):
|
||||
_fields_ = [("status", ctypes.c_int),
|
||||
|
|
@ -2248,7 +2249,8 @@ def tts_generate(genparams):
|
|||
prompt = genparams.get("input", genparams.get("text", ""))
|
||||
prompt = prompt.strip()
|
||||
voice = 1
|
||||
speaker_json = tts_prepare_voice_json(genparams.get("speaker_json","")) #handle custom cloned voices
|
||||
speaker_json = tts_prepare_voice_json(genparams.get("speaker_json","")) #handle custom json voices
|
||||
reference_audio = genparams.get("reference_audio","") #for cloned voices in qwen3tts
|
||||
voicestr = genparams.get("voice", genparams.get("speaker_wav", ""))
|
||||
oai_voicemap = ["alloy","onyx","echo","nova","shimmer"] # map to kcpp defaults
|
||||
voice_mapping = ["kobo","cheery","sleepy","shouty","chatty"]
|
||||
|
|
@ -2278,6 +2280,9 @@ def tts_generate(genparams):
|
|||
else:
|
||||
inputs.custom_speaker_text = "".encode("UTF-8")
|
||||
inputs.custom_speaker_data = "".encode("UTF-8")
|
||||
if reference_audio and reference_audio.startswith("data:audio"):
|
||||
reference_audio = reference_audio.split(",", 1)[1]
|
||||
inputs.reference_audio = reference_audio.encode("UTF-8")
|
||||
ret = handle.tts_generate(inputs)
|
||||
outstr = ""
|
||||
if ret.status==1:
|
||||
|
|
|
|||
|
|
@ -90,8 +90,7 @@ bool Qwen3TTS::load_models(const std::string & tts_model_path, const std::string
|
|||
transformer_loaded_ = false;
|
||||
decoder_loaded_ = false;
|
||||
|
||||
const char * low_mem_env = std::getenv("QWEN3_TTS_LOW_MEM");
|
||||
low_mem_mode_ = low_mem_env && low_mem_env[0] != '\0' && low_mem_env[0] != '0';
|
||||
low_mem_mode_ = false;
|
||||
if (low_mem_mode_) {
|
||||
fprintf(stderr, " Low-memory mode enabled (lazy decoder + component unloads)\n");
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -1216,23 +1216,40 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
|
|||
}
|
||||
else
|
||||
{
|
||||
double ttstime = 0;
|
||||
timer_start();
|
||||
|
||||
qwen3_tts::tts_result result;
|
||||
std::string prompt = inputs.prompt;
|
||||
qwen3_tts::tts_params qwen3tts_params;
|
||||
double ttstime = 0;
|
||||
timer_start();
|
||||
std::string custom_reference_audio_str = inputs.reference_audio;
|
||||
std::vector<float> custom_reference_audio_pcmf32;
|
||||
|
||||
if(custom_reference_audio_str!="")
|
||||
{
|
||||
std::vector<uint8_t> media_data_buffer = kcpp_base64_decode(custom_reference_audio_str);
|
||||
|
||||
//qwen3tts uses 24khz
|
||||
bool ok = kcpp_decode_audio_from_buf(media_data_buffer.data(), media_data_buffer.size(), 24000, custom_reference_audio_pcmf32);
|
||||
if (!ok) {
|
||||
printf("\nError: Cannot read input audio file.\n");
|
||||
output.data = "";
|
||||
output.status = 0;
|
||||
return output;
|
||||
}
|
||||
}
|
||||
|
||||
if(!tts_is_quiet)
|
||||
{
|
||||
printf("\nTTS Generating...");
|
||||
}
|
||||
|
||||
// if (reference_audio.empty()) {
|
||||
result = qwen3tts_runner.synthesize(prompt, qwen3tts_params);
|
||||
// } else {
|
||||
// fprintf(stderr, "Synthesizing with voice cloning: \"%s\"\n", text.c_str());
|
||||
// fprintf(stderr, "Reference audio: %s\n", reference_audio.c_str());
|
||||
// result = tts.synthesize_with_voice(text, reference_audio, params);
|
||||
// }
|
||||
if (custom_reference_audio_pcmf32.empty()) {
|
||||
result = qwen3tts_runner.synthesize(prompt, qwen3tts_params);
|
||||
} else {
|
||||
printf("\nUsing reference voice... (Warning, lengthy sample audio will be very slow. Use short clips!)\n");
|
||||
result = qwen3tts_runner.synthesize_with_voice(prompt, custom_reference_audio_pcmf32.data(),custom_reference_audio_pcmf32.size(), qwen3tts_params);
|
||||
}
|
||||
|
||||
if (!result.success) {
|
||||
printf("\nError: TTS vocoder generation failed : %s\n", result.error_msg.c_str());
|
||||
|
|
|
|||
|
|
@ -27,20 +27,8 @@ static std::string whisper_output_text = "";
|
|||
|
||||
int total_transcribe_gens = 0;
|
||||
|
||||
// Returns true when `buf` holds a complete RIFF/WAVE container: at least 12
// bytes long, starting with "RIFF", carrying "WAVE" at offset 8, and whose
// RIFF chunk size field (bytes 4..7) plus the 8-byte chunk header equals the
// buffer length exactly.
//
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
static bool is_wav_buffer(const std::string & buf) {
    constexpr size_t riff_header_size = 12; // "RIFF" + 4-byte size + "WAVE"
    if (buf.size() < riff_header_size ||
        buf.compare(0, 4, "RIFF") != 0 ||
        buf.compare(8, 4, "WAVE") != 0) {
        return false;
    }

    // Assemble the size field byte by byte: it is stored little-endian in the
    // file, and the string's storage gives no alignment guarantee for a
    // direct uint32_t load.
    const unsigned char * size_bytes = reinterpret_cast<const unsigned char *>(buf.data()) + 4;
    const uint32_t chunk_size =
        static_cast<uint32_t>(size_bytes[0]) |
        (static_cast<uint32_t>(size_bytes[1]) << 8) |
        (static_cast<uint32_t>(size_bytes[2]) << 16) |
        (static_cast<uint32_t>(size_bytes[3]) << 24);

    // Widen before adding the header length so the sum cannot wrap around.
    return static_cast<uint64_t>(chunk_size) + 8 == static_cast<uint64_t>(buf.size());
}
|
||||
|
||||
static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32)
|
||||
static bool read_audio(const std::string & b64data, std::vector<float>& pcmf32)
|
||||
{
|
||||
std::vector<uint8_t> media_data_buffer = kcpp_base64_decode(b64data);
|
||||
|
||||
|
|
@ -141,7 +129,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
|
|||
|
||||
std::vector<float> pcmf32; // mono-channel F32 PCM
|
||||
|
||||
if (!::read_wav(b64data, pcmf32)) {
|
||||
if (!::read_audio(b64data, pcmf32)) {
|
||||
printf("\nWhisper: Failed to read input wav data!\n");
|
||||
output.text = "";
|
||||
output.status = 0;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue