From 2db018a1d7cc3b06b2692497c7aa4dd5d74d3db5 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 21 Feb 2026 17:30:21 +0800 Subject: [PATCH] qwen3tts support reference audio --- embd_res/klite.embd | 80 ++- expose.h | 1 + koboldcpp.py | 9 +- otherarch/qwen3tts/qwen3_tts.cpp | 3 +- otherarch/qwen3tts/tts_transformer.cpp | 684 ++++++++++------------- otherarch/tts_adapter.cpp | 35 +- otherarch/whispercpp/whisper_adapter.cpp | 16 +- 7 files changed, 409 insertions(+), 419 deletions(-) diff --git a/embd_res/klite.embd b/embd_res/klite.embd index 7d9890925..f35687c75 100644 --- a/embd_res/klite.embd +++ b/embd_res/klite.embd @@ -4317,6 +4317,7 @@ Current version indicated by LITEVER below. var websearch_in_progress = false; var kcpp_tts_json = ""; var avoidwelcome = false; + var voicecloneb64 = ""; var localsettings = { my_api_key: "0000000000", //put here so it can be saved and loaded in persistent mode @@ -5480,8 +5481,9 @@ Current version indicated by LITEVER below. indexeddb_load("savedcustomcss",""), indexeddb_load("savedusermod",""), indexeddb_load("usermodprops",""), - indexeddb_load("samplerpresets","") - ]).then(([loadedsettingsjson, loadedstorycompressed, loadedbackgroundimg, currcss, currmod, modpropsstr, loadedsamplerpresetsjson]) => { + indexeddb_load("samplerpresets",""), + indexeddb_load("voiceclone","") + ]).then(([loadedsettingsjson, loadedstorycompressed, loadedbackgroundimg, currcss, currmod, modpropsstr, loadedsamplerpresetsjson, loadedvoiceclone]) => { try { if (loadedsettingsjson != null && loadedsettingsjson != "" && loadedstorycompressed != null && loadedstorycompressed != "") { @@ -5529,6 +5531,10 @@ Current version indicated by LITEVER below. document.getElementById("enhancedchatinterface").classList.add("transparentbg"); document.getElementById("enhancedchatinterface_inner").classList.add("transparentbg"); } + if(loadedvoiceclone && loadedvoiceclone!="") + { + voicecloneb64 = loadedvoiceclone; + } loadok = true; } else { console.log("Skipped missing local save"); @@ -17364,9 +17370,9 @@ Current version indicated by LITEVER below. } } - function set_voice_clone() + function set_voice_json() { - inputBoxOkCancel("Set the Voice Clone JSON to clone an existing voice.

You can download existing voice clone JSONs, or make your own.
","Apply Voice Clone JSON",kcpp_tts_json,"Paste JSON Here",()=>{ + inputBoxOkCancel("OuteTTS ONLY - Set the OuteTTS Voice JSON to copy an existing voice.

You can download existing voice JSONs, or make your own.
","Apply OuteTTS Voice JSON",kcpp_tts_json,"Paste JSON Here",()=>{ let userinput = getInputBoxValue().trim(); try { @@ -17385,6 +17391,35 @@ Current version indicated by LITEVER below. },true,true); } + function set_voice_clone() + { + let finput = document.getElementById('addimgfileinput'); + finput.click(); + finput.onchange = (event) => { + if (event.target.files.length > 0 && event.target.files[0]) { + const file = event.target.files[0]; + const fname = file.name; + const reader = new FileReader(); + reader.onload = function(audio) { + let origAudio = audio.target.result; + convertAudioToCompressedBase64(origAudio,(newAudio,duration)=>{ + indexeddb_save("voiceclone", newAudio); + voicecloneb64 = newAudio; + adjust_kcpptts_controls(); + },64); + } + reader.readAsDataURL(file); + } + finput.value = ""; + }; + } + function clear_voice_clone() + { + indexeddb_save("voiceclone", ""); + voicecloneb64 = ""; + adjust_kcpptts_controls(); + } + function restore_retried_text() { if(retry_in_progress) @@ -17517,6 +17552,8 @@ Current version indicated by LITEVER below. indexeddb_save("savedusermod",""); indexeddb_save("usermodprops",""); indexeddb_save("savedcustomcss", ""); + indexeddb_save("voiceclone", ""); + voicecloneb64 = ""; let styleElement = document.getElementById('custom_css'); styleElement.innerHTML = ""; show_welcome_panel(); @@ -18593,10 +18630,23 @@ Current version indicated by LITEVER below. } else { document.getElementById("kcpp_tts_voice_custom").classList.add("hidden"); } - if (document.getElementById("kcpp_tts_voice").value == "voiceclone") { - document.getElementById("kcpp_tts_voice_clone").classList.remove("hidden"); + if (document.getElementById("kcpp_tts_voice").value == "voicejson") { + document.getElementById("kcpp_tts_voice_json").classList.remove("hidden"); } else { - document.getElementById("kcpp_tts_voice_clone").classList.add("hidden"); + document.getElementById("kcpp_tts_voice_json").classList.add("hidden"); + } + + document.getElementById("kcpp_tts_voice_clone").classList.add("hidden"); + document.getElementById("kcpp_tts_voice_clone_clear").classList.add("hidden"); + if (document.getElementById("kcpp_tts_voice").value == "voiceclone") { + if(voicecloneb64=="") + { + document.getElementById("kcpp_tts_voice_clone").classList.remove("hidden"); + } + else + { + document.getElementById("kcpp_tts_voice_clone_clear").classList.remove("hidden"); + } } } @@ -18779,6 +18829,7 @@ Current version indicated by LITEVER below. }; } else { sub_endpt = apply_proxy_url(custom_kobold_endpoint + koboldcpp_tts_endpoint); + let is_voicejson = (document.getElementById("kcpp_tts_voice").value == "voicejson"); let is_voiceclone = (document.getElementById("kcpp_tts_voice").value == "voiceclone"); let is_custom = (document.getElementById("kcpp_tts_voice").value == "custom"); payload = @@ -18786,10 +18837,14 @@ Current version indicated by LITEVER below. "input": text, "voice": (is_custom)?document.getElementById("kcpp_tts_voice_custom").value:document.getElementById("kcpp_tts_voice").value }; - if(is_voiceclone && vcjson) + if(is_voicejson && vcjson) { payload.speaker_json = vcjson; } + if(is_voiceclone && voicecloneb64!="") + { + payload.reference_audio = voicecloneb64; + } ttsheaders = get_kobold_header(); } @@ -22887,7 +22942,7 @@ Current version indicated by LITEVER below. // AUDIO MANIPULATION FUNCTIONS //convert any audio to a webm blob (high compression) - function convertAudioToCompressedBase64(inputBase64, onDone) { + function convertAudioToCompressedBase64(inputBase64, onDone, audio_quality=40) { //quality is kbps // Step 1: Convert base64 string to Blob const matches = inputBase64.match(/^data:(audio\/[a-zA-Z0-9-]+);base64,(.+)$/); if (!matches) { @@ -22927,7 +22982,7 @@ Current version indicated by LITEVER below. } const durationInSeconds = buffer.duration; - const mp3encoder = new lamejs.Mp3Encoder(1, samplefreq, 40); // mono, 16kHz, 40kbps + const mp3encoder = new lamejs.Mp3Encoder(1, samplefreq, audio_quality); // mono, 16kHz, 40kbps const sampleBlockSize = 1152; //can be anything but make it a multiple of 576 to make encoders life easier let mp3Data = []; for (let i = 0; i < samples.length; i += sampleBlockSize) { @@ -29793,12 +29848,15 @@ Current version indicated by LITEVER below. +
- + + +
diff --git a/expose.h b/expose.h index 2b91bc23f..923e23c54 100644 --- a/expose.h +++ b/expose.h @@ -291,6 +291,7 @@ struct tts_generation_inputs const char * custom_speaker_voice = ""; const char * custom_speaker_text = ""; const char * custom_speaker_data = ""; + const char * reference_audio = ""; }; struct tts_generation_outputs { diff --git a/koboldcpp.py b/koboldcpp.py index 119f24eb3..4b723ac6f 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -403,7 +403,8 @@ class tts_generation_inputs(ctypes.Structure): ("audio_seed", ctypes.c_int), ("custom_speaker_voice", ctypes.c_char_p), ("custom_speaker_text", ctypes.c_char_p), - ("custom_speaker_data", ctypes.c_char_p)] + ("custom_speaker_data", ctypes.c_char_p), + ("reference_audio", ctypes.c_char_p)] class tts_generation_outputs(ctypes.Structure): _fields_ = [("status", ctypes.c_int), @@ -2248,7 +2249,8 @@ def tts_generate(genparams): prompt = genparams.get("input", genparams.get("text", "")) prompt = prompt.strip() voice = 1 - speaker_json = tts_prepare_voice_json(genparams.get("speaker_json","")) #handle custom cloned voices + speaker_json = tts_prepare_voice_json(genparams.get("speaker_json","")) #handle custom json voices + reference_audio = genparams.get("reference_audio","") #for cloned voices in qwen3tts voicestr = genparams.get("voice", genparams.get("speaker_wav", "")) oai_voicemap = ["alloy","onyx","echo","nova","shimmer"] # map to kcpp defaults voice_mapping = ["kobo","cheery","sleepy","shouty","chatty"] @@ -2278,6 +2280,9 @@ def tts_generate(genparams): else: inputs.custom_speaker_text = "".encode("UTF-8") inputs.custom_speaker_data = "".encode("UTF-8") + if reference_audio and reference_audio.startswith("data:audio"): + reference_audio = reference_audio.split(",", 1)[1] + inputs.reference_audio = reference_audio.encode("UTF-8") ret = handle.tts_generate(inputs) outstr = "" if ret.status==1: diff --git a/otherarch/qwen3tts/qwen3_tts.cpp b/otherarch/qwen3tts/qwen3_tts.cpp index f789388b8..7fbbdf351 100644 --- a/otherarch/qwen3tts/qwen3_tts.cpp +++ b/otherarch/qwen3tts/qwen3_tts.cpp @@ -90,8 +90,7 @@ bool Qwen3TTS::load_models(const std::string & tts_model_path, const std::string transformer_loaded_ = false; decoder_loaded_ = false; - const char * low_mem_env = std::getenv("QWEN3_TTS_LOW_MEM"); - low_mem_mode_ = low_mem_env && low_mem_env[0] != '\0' && low_mem_env[0] != '0'; + low_mem_mode_ = false; if (low_mem_mode_) { fprintf(stderr, " Low-memory mode enabled (lazy decoder + component unloads)\n"); } diff --git a/otherarch/qwen3tts/tts_transformer.cpp b/otherarch/qwen3tts/tts_transformer.cpp index cff2b5423..8242ba3ee 100644 --- a/otherarch/qwen3tts/tts_transformer.cpp +++ b/otherarch/qwen3tts/tts_transformer.cpp @@ -53,73 +53,41 @@ bool TTSTransformer::load_model(const std::string & model_path) { unload_model(); skip_ggml_code_pred_layers_ = false; -#if defined(__APPLE__) - const char * use_coreml_env = std::getenv("QWEN3_TTS_USE_COREML"); - bool coreml_disabled = false; - if (use_coreml_env && use_coreml_env[0] != '\0') { - std::string use_coreml = use_coreml_env; - std::transform(use_coreml.begin(), use_coreml.end(), use_coreml.begin(), - [](unsigned char c) { return (char) std::tolower(c); }); - coreml_disabled = use_coreml == "0" || use_coreml == "false" || - use_coreml == "off" || use_coreml == "no"; - } - - if (!coreml_disabled) { - std::string coreml_path; - const char * override_env = std::getenv("QWEN3_TTS_COREML_MODEL"); - if (override_env && override_env[0] != '\0') { - coreml_path = override_env; - } else { - size_t slash = model_path.find_last_of("/\\"); - const std::string model_dir = (slash == std::string::npos) ? "." : model_path.substr(0, slash); - coreml_path = model_dir + "/coreml/code_predictor.mlpackage"; - } - - struct stat st = {}; - if (stat(coreml_path.c_str(), &st) == 0) { - // Skip GGML code-predictor weights when CoreML package is present. - skip_ggml_code_pred_layers_ = true; - } else if (use_coreml_env && use_coreml_env[0] != '\0') { - // Explicit opt-in should remain strict to surface configuration errors. - skip_ggml_code_pred_layers_ = true; - } - } -#endif struct ggml_context * meta_ctx = nullptr; struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ &meta_ctx, }; - + struct gguf_context * ctx = gguf_init_from_file(model_path.c_str(), params); if (!ctx) { error_msg_ = "Failed to open GGUF file: " + model_path; return false; } - + if (!parse_config(ctx)) { gguf_free(ctx); if (meta_ctx) ggml_free(meta_ctx); return false; } - + if (!create_tensors(ctx)) { gguf_free(ctx); if (meta_ctx) ggml_free(meta_ctx); return false; } - + if (!load_tensor_data(model_path, ctx)) { free_transformer_model(model_); gguf_free(ctx); if (meta_ctx) ggml_free(meta_ctx); return false; } - + gguf_free(ctx); if (meta_ctx) ggml_free(meta_ctx); - + state_.backend = init_preferred_backend("TTSTransformer", &error_msg_); if (!state_.backend) { return false; @@ -135,7 +103,7 @@ bool TTSTransformer::load_model(const std::string & model_path) { return false; } } - + std::vector backends; backends.push_back(state_.backend); if (state_.backend_cpu) { @@ -146,13 +114,13 @@ bool TTSTransformer::load_model(const std::string & model_path) { error_msg_ = "Failed to create backend scheduler"; return false; } - + state_.compute_meta.resize(ggml_tensor_overhead() * QWEN3_TTS_MAX_NODES + ggml_graph_overhead()); if (!try_init_coreml_code_predictor(model_path)) { return false; } - + return true; } @@ -160,53 +128,7 @@ bool TTSTransformer::try_init_coreml_code_predictor(const std::string & model_pa use_coreml_code_predictor_ = false; coreml_code_predictor_path_.clear(); - const char * use_coreml_env = std::getenv("QWEN3_TTS_USE_COREML"); - bool coreml_disabled = false; - if (use_coreml_env && use_coreml_env[0] != '\0') { - std::string use_coreml = use_coreml_env; - std::transform(use_coreml.begin(), use_coreml.end(), use_coreml.begin(), - [](unsigned char c) { return (char) std::tolower(c); }); - coreml_disabled = use_coreml == "0" || use_coreml == "false" || - use_coreml == "off" || use_coreml == "no"; - } - - if (coreml_disabled) { - return true; - } - -#if !defined(__APPLE__) - if (use_coreml_env && use_coreml_env[0] != '\0') { - fprintf(stderr, " CoreML code predictor requested but this build is not on Apple platform\n"); - } return true; -#else - std::string coreml_path; - const char * override_env = std::getenv("QWEN3_TTS_COREML_MODEL"); - if (override_env && override_env[0] != '\0') { - coreml_path = override_env; - } else { - size_t slash = model_path.find_last_of("/\\"); - const std::string model_dir = (slash == std::string::npos) ? "." : model_path.substr(0, slash); - coreml_path = model_dir + "/coreml/code_predictor.mlpackage"; - } - - if (!coreml_code_predictor_.load(coreml_path, model_.config.n_codebooks - 1)) { - if (skip_ggml_code_pred_layers_) { - error_msg_ = "CoreML code predictor load failed in strict mode: " + coreml_code_predictor_.get_error(); - return false; - } else { - fprintf(stderr, " CoreML code predictor load failed: %s\n", - coreml_code_predictor_.get_error().c_str()); - fprintf(stderr, " Falling back to GGML code predictor\n"); - return true; - } - } - - use_coreml_code_predictor_ = true; - coreml_code_predictor_path_ = coreml_path; - fprintf(stderr, " CoreML code predictor enabled: %s\n", coreml_code_predictor_path_.c_str()); - return true; -#endif } bool TTSTransformer::parse_config(struct gguf_context * ctx) { @@ -219,7 +141,7 @@ bool TTSTransformer::parse_config(struct gguf_context * ctx) { } return default_val; }; - + auto get_f32_any = [&](std::initializer_list keys, float default_val) -> float { for (const char * key : keys) { int64_t idx = gguf_find_key(ctx, key); @@ -229,7 +151,7 @@ bool TTSTransformer::parse_config(struct gguf_context * ctx) { } return default_val; }; - + auto & cfg = model_.config; cfg.text_vocab_size = get_u32_any({ "qwen3-tts.text.vocab_size", @@ -339,43 +261,43 @@ bool TTSTransformer::parse_config(struct gguf_context * ctx) { "qwen3-tts.codec.language.english_id", "qwen3-tts.language_id", }, 2050); - + return true; } bool TTSTransformer::create_tensors(struct gguf_context * ctx) { const int64_t n_tensors = gguf_get_n_tensors(ctx); const auto & cfg = model_.config; - + const size_t ctx_size = n_tensors * ggml_tensor_overhead(); struct ggml_init_params params = { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; - + model_.ctx = ggml_init(params); if (!model_.ctx) { error_msg_ = "Failed to create GGML context"; return false; } - + model_.layers.resize(cfg.n_layers); model_.code_pred_layers.resize(cfg.code_pred_layers); model_.code_pred_embd.resize(cfg.n_codebooks - 1); model_.code_pred_head.resize(cfg.n_codebooks - 1); - + for (int64_t i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name(ctx, i); enum ggml_type type = gguf_get_tensor_type(ctx, i); - + int64_t ne[GGML_MAX_DIMS] = {1, 1, 1, 1}; int n_dims = 0; - + if (strstr(name, "spk_enc.") || strstr(name, "tok_")) { continue; } - + if (strstr(name, "talker.text_embd.weight")) { ne[0] = cfg.text_embd_dim; ne[1] = cfg.text_vocab_size; @@ -407,9 +329,9 @@ bool TTSTransformer::create_tensors(struct gguf_context * ctx) { n_dims = 1; } else if (strstr(name, "talker.blk.")) { int layer_idx = -1; - if (sscanf(name, "talker.blk.%d.", &layer_idx) == 1 && + if (sscanf(name, "talker.blk.%d.", &layer_idx) == 1 && layer_idx >= 0 && layer_idx < cfg.n_layers) { - + if (strstr(name, "attn_norm.weight")) { ne[0] = cfg.hidden_size; n_dims = 1; @@ -461,9 +383,9 @@ bool TTSTransformer::create_tensors(struct gguf_context * ctx) { continue; } int layer_idx = -1; - if (sscanf(name, "code_pred.blk.%d.", &layer_idx) == 1 && + if (sscanf(name, "code_pred.blk.%d.", &layer_idx) == 1 && layer_idx >= 0 && layer_idx < cfg.code_pred_layers) { - + if (strstr(name, "attn_norm.weight")) { ne[0] = cfg.hidden_size; n_dims = 1; @@ -542,7 +464,7 @@ bool TTSTransformer::create_tensors(struct gguf_context * ctx) { } else { continue; } - + struct ggml_tensor * tensor = ggml_new_tensor(model_.ctx, type, n_dims, ne); if (!tensor) { error_msg_ = "Failed to create tensor: " + std::string(name); @@ -550,7 +472,7 @@ bool TTSTransformer::create_tensors(struct gguf_context * ctx) { } ggml_set_name(tensor, name); model_.tensors[name] = tensor; - + if (strstr(name, "talker.text_embd.weight")) { model_.text_embd = tensor; } else if (strstr(name, "talker.text_proj.fc1.weight")) { @@ -617,7 +539,7 @@ bool TTSTransformer::create_tensors(struct gguf_context * ctx) { model_.code_pred_output_norm = tensor; } } - + return true; } @@ -626,109 +548,109 @@ bool TTSTransformer::load_tensor_data(const std::string & path, struct gguf_cont if (!backend) { return false; } - + model_.buffer = ggml_backend_alloc_ctx_tensors(model_.ctx, backend); if (!model_.buffer) { error_msg_ = "Failed to allocate tensor buffer"; release_preferred_backend(backend); return false; } - + FILE * f = fopen(path.c_str(), "rb"); if (!f) { error_msg_ = "Failed to open file for reading: " + path; release_preferred_backend(backend); return false; } - + const size_t data_offset = gguf_get_data_offset(ctx); const int64_t n_tensors = gguf_get_n_tensors(ctx); std::vector read_buf; - + for (int64_t i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name(ctx, i); size_t offset = gguf_get_tensor_offset(ctx, i); - + auto it = model_.tensors.find(name); if (it == model_.tensors.end()) { continue; } - + struct ggml_tensor * tensor = it->second; size_t nbytes = ggml_nbytes(tensor); - + read_buf.resize(nbytes); - + if (fseek(f, data_offset + offset, SEEK_SET) != 0) { error_msg_ = "Failed to seek to tensor data: " + std::string(name); fclose(f); release_preferred_backend(backend); return false; } - + if (fread(read_buf.data(), 1, nbytes, f) != nbytes) { error_msg_ = "Failed to read tensor data: " + std::string(name); fclose(f); release_preferred_backend(backend); return false; } - + ggml_backend_tensor_set(tensor, read_buf.data(), 0, nbytes); } - + fclose(f); release_preferred_backend(backend); - + return true; } bool TTSTransformer::init_kv_cache(int32_t n_ctx) { const auto & cfg = model_.config; - + free_tts_kv_cache(state_.cache); - + state_.cache.n_ctx = n_ctx; state_.cache.n_used = 0; state_.cache.head_dim = cfg.head_dim; state_.cache.n_kv_heads = cfg.n_key_value_heads; state_.cache.n_layers = cfg.n_layers; - + const size_t n_tensors = cfg.n_layers * 2; const size_t ctx_size = n_tensors * ggml_tensor_overhead(); - + struct ggml_init_params params = { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; - + state_.cache.ctx = ggml_init(params); if (!state_.cache.ctx) { error_msg_ = "Failed to create KV cache context"; return false; } - + state_.cache.k_cache.resize(cfg.n_layers); state_.cache.v_cache.resize(cfg.n_layers); - + for (int il = 0; il < cfg.n_layers; ++il) { state_.cache.k_cache[il] = ggml_new_tensor_3d( state_.cache.ctx, GGML_TYPE_F16, cfg.head_dim, cfg.n_key_value_heads, n_ctx); ggml_format_name(state_.cache.k_cache[il], "k_cache_%d", il); - + state_.cache.v_cache[il] = ggml_new_tensor_3d( state_.cache.ctx, GGML_TYPE_F16, cfg.head_dim, cfg.n_key_value_heads, n_ctx); ggml_format_name(state_.cache.v_cache[il], "v_cache_%d", il); } - + state_.cache.buffer = ggml_backend_alloc_ctx_tensors(state_.cache.ctx, state_.backend); if (!state_.cache.buffer) { error_msg_ = "Failed to allocate KV cache buffer"; return false; } - + return true; } @@ -738,51 +660,51 @@ void TTSTransformer::clear_kv_cache() { bool TTSTransformer::init_code_pred_kv_cache(int32_t n_ctx) { const auto & cfg = model_.config; - + free_tts_kv_cache(state_.code_pred_cache); - + state_.code_pred_cache.n_ctx = n_ctx; state_.code_pred_cache.n_used = 0; state_.code_pred_cache.head_dim = cfg.head_dim; state_.code_pred_cache.n_kv_heads = cfg.n_key_value_heads; state_.code_pred_cache.n_layers = cfg.code_pred_layers; - + const size_t n_tensors = cfg.code_pred_layers * 2; const size_t ctx_size = n_tensors * ggml_tensor_overhead(); - + struct ggml_init_params params = { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; - + state_.code_pred_cache.ctx = ggml_init(params); if (!state_.code_pred_cache.ctx) { error_msg_ = "Failed to create code predictor KV cache context"; return false; } - + state_.code_pred_cache.k_cache.resize(cfg.code_pred_layers); state_.code_pred_cache.v_cache.resize(cfg.code_pred_layers); - + for (int il = 0; il < cfg.code_pred_layers; ++il) { state_.code_pred_cache.k_cache[il] = ggml_new_tensor_3d( state_.code_pred_cache.ctx, GGML_TYPE_F16, cfg.head_dim, cfg.n_key_value_heads, n_ctx); ggml_format_name(state_.code_pred_cache.k_cache[il], "code_pred_k_cache_%d", il); - + state_.code_pred_cache.v_cache[il] = ggml_new_tensor_3d( state_.code_pred_cache.ctx, GGML_TYPE_F16, cfg.head_dim, cfg.n_key_value_heads, n_ctx); ggml_format_name(state_.code_pred_cache.v_cache[il], "code_pred_v_cache_%d", il); } - + state_.code_pred_cache.buffer = ggml_backend_alloc_ctx_tensors(state_.code_pred_cache.ctx, state_.backend); if (!state_.code_pred_cache.buffer) { error_msg_ = "Failed to allocate code predictor KV cache buffer"; return false; } - + return true; } @@ -1130,125 +1052,125 @@ struct ggml_cgraph * TTSTransformer::build_prefill_forward_graph(int32_t n_token const float eps = cfg.rms_norm_eps; const float rope_theta = cfg.rope_theta; const int n_layer = cfg.n_layers; - + struct ggml_init_params params = { /*.mem_size =*/ state_.compute_meta.size(), /*.mem_buffer =*/ state_.compute_meta.data(), /*.no_alloc =*/ true, }; - + struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, QWEN3_TTS_MAX_NODES, false); struct ggml_tensor * inp_prefill_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hidden_size, n_tokens); ggml_set_name(inp_prefill_embd, "inp_prefill_embd"); ggml_set_input(inp_prefill_embd); - + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ggml_set_name(inp_pos, "inp_pos"); ggml_set_input(inp_pos); struct ggml_tensor * cur = inp_prefill_embd; - + struct ggml_tensor * inpL = cur; - + const float KQscale = 1.0f / sqrtf(float(head_dim)); - + for (int il = 0; il < n_layer; ++il) { const auto & layer = model_.layers[il]; - + cur = ggml_rms_norm(ctx0, inpL, eps); cur = ggml_mul(ctx0, cur, layer.attn_norm); - + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.attn_q, cur); struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.attn_k, cur); struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.attn_v, cur); - + Qcur = ggml_reshape_3d(ctx0, Qcur, head_dim, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, head_dim, n_kv_head, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, head_dim, n_kv_head, n_tokens); - + if (layer.attn_q_norm) { Qcur = ggml_rms_norm(ctx0, Qcur, eps); Qcur = ggml_mul(ctx0, Qcur, layer.attn_q_norm); } - + if (layer.attn_k_norm) { Kcur = ggml_rms_norm(ctx0, Kcur, eps); Kcur = ggml_mul(ctx0, Kcur, layer.attn_k_norm); } - + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, head_dim, GGML_ROPE_TYPE_NEOX, 0, rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); - + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, head_dim, GGML_ROPE_TYPE_NEOX, 0, rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); - + struct ggml_tensor * k_cache = state_.cache.k_cache[il]; struct ggml_tensor * v_cache = state_.cache.v_cache[il]; - + struct ggml_tensor * k_cache_view = ggml_view_3d(ctx0, k_cache, head_dim, n_kv_head, n_tokens, k_cache->nb[1], k_cache->nb[2], n_past * k_cache->nb[2]); - + struct ggml_tensor * v_cache_view = ggml_view_3d(ctx0, v_cache, head_dim, n_kv_head, n_tokens, v_cache->nb[1], v_cache->nb[2], n_past * v_cache->nb[2]); - + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k_cache_view)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v_cache_view)); - + int n_kv = n_past + n_tokens; - + struct ggml_tensor * K = ggml_view_3d(ctx0, k_cache, head_dim, n_kv_head, n_kv, k_cache->nb[1], k_cache->nb[2], 0); - + struct ggml_tensor * V = ggml_view_3d(ctx0, v_cache, head_dim, n_kv_head, n_kv, v_cache->nb[1], v_cache->nb[2], 0); - + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); K = ggml_permute(ctx0, K, 0, 2, 1, 3); V = ggml_permute(ctx0, V, 0, 2, 1, 3); - + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); KQ = ggml_scale(ctx0, KQ, KQscale); KQ = ggml_diag_mask_inf(ctx0, KQ, n_past); KQ = ggml_soft_max(ctx0, KQ); - + V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); - + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); cur = ggml_cont_2d(ctx0, KQV, n_head * head_dim, n_tokens); - + cur = ggml_mul_mat(ctx0, layer.attn_output, cur); cur = ggml_add(ctx0, cur, inpL); struct ggml_tensor * inpFF = cur; - + cur = ggml_rms_norm(ctx0, inpFF, eps); cur = ggml_mul(ctx0, cur, layer.ffn_norm); - + struct ggml_tensor * gate = ggml_mul_mat(ctx0, layer.ffn_gate, cur); struct ggml_tensor * up = ggml_mul_mat(ctx0, layer.ffn_up, cur); - + gate = ggml_silu(ctx0, gate); - + cur = ggml_mul(ctx0, gate, up); - + struct ggml_tensor * ffn_down_f32 = ggml_cast(ctx0, layer.ffn_down, GGML_TYPE_F32); cur = ggml_mul_mat(ctx0, ffn_down_f32, cur); - + inpL = ggml_add(ctx0, cur, inpFF); } - + cur = inpL; - + cur = ggml_rms_norm(ctx0, cur, eps); cur = ggml_mul(ctx0, cur, model_.output_norm); ggml_set_name(cur, "hidden_states"); @@ -1257,11 +1179,11 @@ struct ggml_cgraph * TTSTransformer::build_prefill_forward_graph(int32_t n_token struct ggml_tensor * logits = ggml_mul_mat(ctx0, model_.codec_head, cur); ggml_set_name(logits, "logits"); ggml_set_output(logits); - + ggml_build_forward_expand(gf, logits); - + ggml_free(ctx0); - + return gf; } @@ -1275,138 +1197,138 @@ struct ggml_cgraph * TTSTransformer::build_step_graph(int32_t n_past) { const float rope_theta = cfg.rope_theta; const int n_layer = cfg.n_layers; const int n_tokens = 1; - + struct ggml_init_params params = { /*.mem_size =*/ state_.compute_meta.size(), /*.mem_buffer =*/ state_.compute_meta.data(), /*.no_alloc =*/ true, }; - + struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, QWEN3_TTS_MAX_NODES, false); struct ggml_tensor * inp_step_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hidden_size, 1); ggml_set_name(inp_step_embd, "inp_step_embd"); ggml_set_input(inp_step_embd); - + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, 1); ggml_set_name(inp_pos, "inp_pos"); ggml_set_input(inp_pos); struct ggml_tensor * cur = inp_step_embd; - + struct ggml_tensor * inpL = cur; - + const float KQscale = 1.0f / sqrtf(float(head_dim)); - + for (int il = 0; il < n_layer; ++il) { const auto & layer = model_.layers[il]; - + cur = ggml_rms_norm(ctx0, inpL, eps); cur = ggml_mul(ctx0, cur, layer.attn_norm); - + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.attn_q, cur); struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.attn_k, cur); struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.attn_v, cur); - + Qcur = ggml_reshape_3d(ctx0, Qcur, head_dim, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, head_dim, n_kv_head, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, head_dim, n_kv_head, n_tokens); - + if (layer.attn_q_norm) { Qcur = ggml_rms_norm(ctx0, Qcur, eps); Qcur = ggml_mul(ctx0, Qcur, layer.attn_q_norm); } - + if (layer.attn_k_norm) { Kcur = ggml_rms_norm(ctx0, Kcur, eps); Kcur = ggml_mul(ctx0, Kcur, layer.attn_k_norm); } - + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, head_dim, GGML_ROPE_TYPE_NEOX, 0, rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); - + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, head_dim, GGML_ROPE_TYPE_NEOX, 0, rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); - + struct ggml_tensor * k_cache = state_.cache.k_cache[il]; struct ggml_tensor * v_cache = state_.cache.v_cache[il]; - + struct ggml_tensor * k_cache_view = ggml_view_3d(ctx0, k_cache, head_dim, n_kv_head, n_tokens, k_cache->nb[1], k_cache->nb[2], n_past * k_cache->nb[2]); - + struct ggml_tensor * v_cache_view = ggml_view_3d(ctx0, v_cache, head_dim, n_kv_head, n_tokens, v_cache->nb[1], v_cache->nb[2], n_past * v_cache->nb[2]); - + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k_cache_view)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v_cache_view)); - + int n_kv = n_past + n_tokens; - + struct ggml_tensor * K = ggml_view_3d(ctx0, k_cache, head_dim, n_kv_head, n_kv, k_cache->nb[1], k_cache->nb[2], 0); - + struct ggml_tensor * V = ggml_view_3d(ctx0, v_cache, head_dim, n_kv_head, n_kv, v_cache->nb[1], v_cache->nb[2], 0); - + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); K = ggml_permute(ctx0, K, 0, 2, 1, 3); V = ggml_permute(ctx0, V, 0, 2, 1, 3); - + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); KQ = ggml_scale(ctx0, KQ, KQscale); KQ = ggml_diag_mask_inf(ctx0, KQ, n_past); KQ = ggml_soft_max(ctx0, KQ); - + V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); - + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); cur = ggml_cont_2d(ctx0, KQV, n_head * head_dim, n_tokens); - + cur = ggml_mul_mat(ctx0, layer.attn_output, cur); cur = ggml_add(ctx0, cur, inpL); struct ggml_tensor * inpFF = cur; - + cur = ggml_rms_norm(ctx0, inpFF, eps); cur = ggml_mul(ctx0, cur, layer.ffn_norm); - + struct ggml_tensor * gate = ggml_mul_mat(ctx0, layer.ffn_gate, cur); struct ggml_tensor * up = ggml_mul_mat(ctx0, layer.ffn_up, cur); - + gate = ggml_silu(ctx0, gate); - + cur = ggml_mul(ctx0, gate, up); - + struct ggml_tensor * ffn_down_f32 = ggml_cast(ctx0, layer.ffn_down, GGML_TYPE_F32); cur = ggml_mul_mat(ctx0, ffn_down_f32, cur); - + inpL = ggml_add(ctx0, cur, inpFF); } - + cur = inpL; - + cur = ggml_rms_norm(ctx0, cur, eps); cur = ggml_mul(ctx0, cur, model_.output_norm); ggml_set_name(cur, "hidden_states"); ggml_set_output(cur); - + struct ggml_tensor * logits = ggml_mul_mat(ctx0, model_.codec_head, cur); ggml_set_name(logits, "logits"); ggml_set_output(logits); - + ggml_build_forward_expand(gf, logits); - + ggml_free(ctx0); - + return gf; } @@ -1419,29 +1341,29 @@ struct ggml_cgraph * TTSTransformer::build_code_pred_graph(int32_t n_prev_codes) const float eps = cfg.rms_norm_eps; const int n_layer = cfg.code_pred_layers; const int n_codebooks = cfg.n_codebooks; - + struct ggml_init_params params = { /*.mem_size =*/ state_.compute_meta.size(), /*.mem_buffer =*/ state_.compute_meta.data(), /*.no_alloc =*/ true, }; - + struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, QWEN3_TTS_MAX_NODES, false); - + struct ggml_tensor * inp_hidden = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_size); ggml_set_name(inp_hidden, "inp_hidden"); ggml_set_input(inp_hidden); - + struct ggml_tensor * inp_prev_codes = nullptr; if (n_prev_codes > 0) { inp_prev_codes = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_prev_codes); ggml_set_name(inp_prev_codes, "inp_prev_codes"); ggml_set_input(inp_prev_codes); } - + struct ggml_tensor * cur = ggml_reshape_2d(ctx0, inp_hidden, hidden_size, 1); - + if (n_prev_codes > 0 && inp_prev_codes) { for (int cb = 0; cb < n_prev_codes && cb < n_codebooks - 1; ++cb) { struct ggml_tensor * code_idx = ggml_view_1d(ctx0, inp_prev_codes, 1, cb * sizeof(int32_t)); @@ -1449,71 +1371,71 @@ struct ggml_cgraph * TTSTransformer::build_code_pred_graph(int32_t n_prev_codes) cur = ggml_add(ctx0, cur, code_embd); } } - + struct ggml_tensor * inpL = cur; - + const float KQscale = 1.0f / sqrtf(float(head_dim)); - + for (int il = 0; il < n_layer; ++il) { const auto & layer = model_.code_pred_layers[il]; - + cur = ggml_rms_norm(ctx0, inpL, eps); cur = ggml_mul(ctx0, cur, layer.attn_norm); - + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.attn_q, cur); struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.attn_k, cur); struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.attn_v, cur); - + Qcur = ggml_reshape_3d(ctx0, Qcur, head_dim, n_head, 1); Kcur = ggml_reshape_3d(ctx0, Kcur, head_dim, n_kv_head, 1); Vcur = ggml_reshape_3d(ctx0, Vcur, head_dim, n_kv_head, 1); - + if (layer.attn_q_norm) { Qcur = ggml_rms_norm(ctx0, Qcur, eps); Qcur = ggml_mul(ctx0, Qcur, layer.attn_q_norm); } - + if (layer.attn_k_norm) { Kcur = ggml_rms_norm(ctx0, Kcur, eps); Kcur = ggml_mul(ctx0, Kcur, layer.attn_k_norm); } - + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); struct ggml_tensor * K = ggml_permute(ctx0, Kcur, 0, 2, 1, 3); struct ggml_tensor * V = ggml_permute(ctx0, Vcur, 0, 2, 1, 3); - + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); KQ = ggml_scale(ctx0, KQ, KQscale); KQ = ggml_soft_max(ctx0, KQ); - + V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); - + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); cur = ggml_cont_2d(ctx0, KQV, n_head * head_dim, 1); - + cur = ggml_mul_mat(ctx0, layer.attn_output, cur); cur = ggml_add(ctx0, cur, inpL); struct ggml_tensor * inpFF = cur; - + cur = ggml_rms_norm(ctx0, inpFF, eps); cur = ggml_mul(ctx0, cur, layer.ffn_norm); - + struct ggml_tensor * gate = ggml_mul_mat(ctx0, layer.ffn_gate, cur); struct ggml_tensor * up = ggml_mul_mat(ctx0, layer.ffn_up, cur); - + gate = ggml_silu(ctx0, gate); - + cur = ggml_mul(ctx0, gate, up); - + struct ggml_tensor * old_ffn_down_f32 = ggml_cast(ctx0, layer.ffn_down, GGML_TYPE_F32); cur = ggml_mul_mat(ctx0, old_ffn_down_f32, cur); - + inpL = ggml_add(ctx0, cur, inpFF); } - + cur = inpL; - + std::vector all_logits; for (int cb = 0; cb < n_codebooks - 1; ++cb) { struct ggml_tensor * cb_logits = ggml_mul_mat(ctx0, model_.code_pred_head[cb], cur); @@ -1521,13 +1443,13 @@ struct ggml_cgraph * TTSTransformer::build_code_pred_graph(int32_t n_prev_codes) ggml_set_output(cb_logits); all_logits.push_back(cb_logits); } - + for (auto * logits : all_logits) { ggml_build_forward_expand(gf, logits); } - + ggml_free(ctx0); - + return gf; } @@ -1541,137 +1463,137 @@ struct ggml_cgraph * TTSTransformer::build_code_pred_prefill_graph() { const float rope_theta = cfg.rope_theta; const int n_layer = cfg.code_pred_layers; const int n_tokens = 2; - + struct ggml_init_params params = { /*.mem_size =*/ state_.compute_meta.size(), /*.mem_buffer =*/ state_.compute_meta.data(), /*.no_alloc =*/ true, }; - + struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, QWEN3_TTS_MAX_NODES, false); - + // Input: past_hidden from talker [hidden_size] struct ggml_tensor * inp_hidden = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_size); ggml_set_name(inp_hidden, "inp_hidden"); ggml_set_input(inp_hidden); - + // Input: codebook 0 token embedding [hidden_size] (pre-computed using talker's codec_embd) struct ggml_tensor * inp_cb0_embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_size); ggml_set_name(inp_cb0_embd, "inp_cb0_embd"); ggml_set_input(inp_cb0_embd); - + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ggml_set_name(inp_pos, "inp_pos"); ggml_set_input(inp_pos); - + // Concatenate [past_hidden, cb0_embd] -> [2, hidden_size] struct ggml_tensor * hidden_2d = ggml_reshape_2d(ctx0, inp_hidden, hidden_size, 1); struct ggml_tensor * cb0_2d = ggml_reshape_2d(ctx0, inp_cb0_embd, hidden_size, 1); struct ggml_tensor * cur = ggml_concat(ctx0, hidden_2d, cb0_2d, 1); - + struct ggml_tensor * inpL = cur; - + const float KQscale = 1.0f / sqrtf(float(head_dim)); - + for (int il = 0; il < n_layer; ++il) { const auto & layer = model_.code_pred_layers[il]; - + cur = ggml_rms_norm(ctx0, inpL, eps); cur = ggml_mul(ctx0, cur, layer.attn_norm); - + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.attn_q, cur); struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.attn_k, cur); struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.attn_v, cur); - + Qcur = ggml_reshape_3d(ctx0, Qcur, head_dim, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, head_dim, n_kv_head, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, head_dim, n_kv_head, n_tokens); - + if (layer.attn_q_norm) { Qcur = ggml_rms_norm(ctx0, Qcur, eps); Qcur = ggml_mul(ctx0, Qcur, layer.attn_q_norm); } - + if (layer.attn_k_norm) { Kcur = ggml_rms_norm(ctx0, Kcur, eps); Kcur = ggml_mul(ctx0, Kcur, layer.attn_k_norm); } - + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, head_dim, GGML_ROPE_TYPE_NEOX, 0, rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); - + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, head_dim, GGML_ROPE_TYPE_NEOX, 0, rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); - + struct ggml_tensor * k_cache = state_.code_pred_cache.k_cache[il]; struct ggml_tensor * v_cache = state_.code_pred_cache.v_cache[il]; - + // Store at position 0 (prefill starts fresh) struct ggml_tensor * k_cache_view = ggml_view_3d(ctx0, k_cache, head_dim, n_kv_head, n_tokens, k_cache->nb[1], k_cache->nb[2], 0); - + struct ggml_tensor * v_cache_view = ggml_view_3d(ctx0, v_cache, head_dim, n_kv_head, n_tokens, v_cache->nb[1], v_cache->nb[2], 0); - + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k_cache_view)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v_cache_view)); - + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); struct ggml_tensor * K = ggml_permute(ctx0, Kcur, 0, 2, 1, 3); struct ggml_tensor * V = ggml_permute(ctx0, Vcur, 0, 2, 1, 3); - + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); KQ = ggml_scale(ctx0, KQ, KQscale); KQ = ggml_diag_mask_inf(ctx0, KQ, 0); KQ = ggml_soft_max(ctx0, KQ); - + V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); - + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); cur = ggml_cont_2d(ctx0, KQV, n_head * head_dim, n_tokens); - + cur = ggml_mul_mat(ctx0, layer.attn_output, cur); cur = ggml_add(ctx0, cur, inpL); struct ggml_tensor * inpFF = cur; - + cur = ggml_rms_norm(ctx0, inpFF, eps); cur = ggml_mul(ctx0, cur, layer.ffn_norm); - + struct ggml_tensor * gate = ggml_mul_mat(ctx0, layer.ffn_gate, cur); struct ggml_tensor * up = ggml_mul_mat(ctx0, layer.ffn_up, cur); - + gate = ggml_silu(ctx0, gate); - + cur = ggml_mul(ctx0, gate, up); - + struct ggml_tensor * ffn_down_f32 = ggml_cast(ctx0, layer.ffn_down, GGML_TYPE_F32); cur = ggml_mul_mat(ctx0, ffn_down_f32, cur); - + inpL = ggml_add(ctx0, cur, inpFF); } - + cur = inpL; - + cur = ggml_rms_norm(ctx0, cur, eps); cur = ggml_mul(ctx0, cur, model_.code_pred_output_norm); - - struct ggml_tensor * last_hidden = ggml_view_2d(ctx0, cur, hidden_size, 1, + + struct ggml_tensor * last_hidden = ggml_view_2d(ctx0, cur, hidden_size, 1, cur->nb[1], hidden_size * sizeof(float)); - + struct ggml_tensor * logits = ggml_mul_mat(ctx0, model_.code_pred_head[0], last_hidden); ggml_set_name(logits, "logits"); ggml_set_output(logits); - + ggml_build_forward_expand(gf, logits); - + ggml_free(ctx0); - + return gf; } @@ -1685,28 +1607,28 @@ struct ggml_cgraph * TTSTransformer::build_code_pred_step_graph(int32_t n_past, const float rope_theta = cfg.rope_theta; const int n_layer = cfg.code_pred_layers; const int n_tokens = 1; - + struct ggml_init_params params = { /*.mem_size =*/ state_.compute_meta.size(), /*.mem_buffer =*/ state_.compute_meta.data(), /*.no_alloc =*/ true, }; - + struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, QWEN3_TTS_MAX_NODES, false); - + struct ggml_tensor * inp_hidden = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_size); ggml_set_name(inp_hidden, "inp_hidden"); ggml_set_input(inp_hidden); - + struct ggml_tensor * inp_code = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, 1); ggml_set_name(inp_code, "inp_code"); ggml_set_input(inp_code); - + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, 1); ggml_set_name(inp_pos, "inp_pos"); ggml_set_input(inp_pos); - + struct ggml_tensor * cur; if (generation_step == 0) { cur = ggml_reshape_2d(ctx0, inp_hidden, hidden_size, 1); @@ -1714,117 +1636,117 @@ struct ggml_cgraph * TTSTransformer::build_code_pred_step_graph(int32_t n_past, cur = ggml_get_rows(ctx0, model_.code_pred_embd[generation_step - 1], inp_code); cur = ggml_reshape_2d(ctx0, cur, hidden_size, 1); } - + struct ggml_tensor * inpL = cur; - + const float KQscale = 1.0f / sqrtf(float(head_dim)); - + for (int il = 0; il < n_layer; ++il) { const auto & layer = model_.code_pred_layers[il]; - + cur = ggml_rms_norm(ctx0, inpL, eps); cur = ggml_mul(ctx0, cur, layer.attn_norm); - + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.attn_q, cur); struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.attn_k, cur); struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.attn_v, cur); - + Qcur = ggml_reshape_3d(ctx0, Qcur, head_dim, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, head_dim, n_kv_head, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, head_dim, n_kv_head, n_tokens); - + if (layer.attn_q_norm) { Qcur = ggml_rms_norm(ctx0, Qcur, eps); Qcur = ggml_mul(ctx0, Qcur, layer.attn_q_norm); } - + if (layer.attn_k_norm) { Kcur = ggml_rms_norm(ctx0, Kcur, eps); Kcur = ggml_mul(ctx0, Kcur, layer.attn_k_norm); } - + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, head_dim, GGML_ROPE_TYPE_NEOX, 0, rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); - + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, head_dim, GGML_ROPE_TYPE_NEOX, 0, rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); - + struct ggml_tensor * k_cache = state_.code_pred_cache.k_cache[il]; struct ggml_tensor * v_cache = state_.code_pred_cache.v_cache[il]; - + struct ggml_tensor * k_cache_view = ggml_view_3d(ctx0, k_cache, head_dim, n_kv_head, n_tokens, k_cache->nb[1], k_cache->nb[2], n_past * k_cache->nb[2]); - + struct ggml_tensor * v_cache_view = ggml_view_3d(ctx0, v_cache, head_dim, n_kv_head, n_tokens, v_cache->nb[1], v_cache->nb[2], n_past * v_cache->nb[2]); - + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k_cache_view)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v_cache_view)); - + int n_kv = n_past + n_tokens; - + struct ggml_tensor * K = ggml_view_3d(ctx0, k_cache, head_dim, n_kv_head, n_kv, k_cache->nb[1], k_cache->nb[2], 0); - + struct ggml_tensor * V = ggml_view_3d(ctx0, v_cache, head_dim, n_kv_head, n_kv, v_cache->nb[1], v_cache->nb[2], 0); - + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); K = ggml_permute(ctx0, K, 0, 2, 1, 3); V = ggml_permute(ctx0, V, 0, 2, 1, 3); - + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); KQ = ggml_scale(ctx0, KQ, KQscale); KQ = ggml_diag_mask_inf(ctx0, KQ, n_past); KQ = ggml_soft_max(ctx0, KQ); - + V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); - + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); cur = ggml_cont_2d(ctx0, KQV, n_head * head_dim, n_tokens); - + cur = ggml_mul_mat(ctx0, layer.attn_output, cur); cur = ggml_add(ctx0, cur, inpL); struct ggml_tensor * inpFF = cur; - + cur = ggml_rms_norm(ctx0, inpFF, eps); cur = ggml_mul(ctx0, cur, layer.ffn_norm); - + struct ggml_tensor * gate = ggml_mul_mat(ctx0, layer.ffn_gate, cur); struct ggml_tensor * up = ggml_mul_mat(ctx0, layer.ffn_up, cur); - + gate = ggml_silu(ctx0, gate); - + cur = ggml_mul(ctx0, gate, up); - + struct ggml_tensor * step_ffn_down_f32 = ggml_cast(ctx0, layer.ffn_down, GGML_TYPE_F32); cur = ggml_mul_mat(ctx0, step_ffn_down_f32, cur); - + inpL = ggml_add(ctx0, cur, inpFF); } - + cur = inpL; - + cur = ggml_rms_norm(ctx0, cur, eps); cur = ggml_mul(ctx0, cur, model_.code_pred_output_norm); - + struct ggml_tensor * logits = ggml_mul_mat(ctx0, model_.code_pred_head[generation_step], cur); ggml_set_name(logits, "logits"); ggml_set_output(logits); - + ggml_build_forward_expand(gf, logits); - + ggml_free(ctx0); - + return gf; } @@ -1843,19 +1765,19 @@ bool TTSTransformer::forward_prefill(const float * prefill_embd, int32_t n_token error_msg_ = "n_tokens must be > 0"; return false; } - + if (state_.cache.n_ctx == 0) { const int32_t min_ctx = std::max(256, n_past + n_tokens + 16); if (!init_kv_cache(min_ctx)) { return false; } } - + if (n_past + n_tokens > state_.cache.n_ctx) { error_msg_ = "Context length exceeded"; return false; } - + #ifdef QWEN3_TTS_TIMING using clk = std::chrono::high_resolution_clock; auto t0 = clk::now(), t1 = t0; @@ -1890,7 +1812,7 @@ bool TTSTransformer::forward_prefill(const float * prefill_embd, int32_t n_token ggml_backend_tensor_set(inp_prefill, prefill_embd, 0, (size_t)n_tokens * model_.config.hidden_size * sizeof(float)); } - + struct ggml_tensor * inp_pos = ggml_graph_get_tensor(gf, "inp_pos"); if (inp_pos) { std::vector positions(n_tokens); @@ -1916,7 +1838,7 @@ bool TTSTransformer::forward_prefill(const float * prefill_embd, int32_t n_token t1 = clk::now(); if (timing_) timing_->t_prefill_compute_ms += std::chrono::duration(t1 - t0).count(); #endif - + struct ggml_tensor * hidden = ggml_graph_get_tensor(gf, "hidden_states"); if (!hidden) { error_msg_ = "Failed to find hidden_states tensor"; @@ -1929,9 +1851,9 @@ bool TTSTransformer::forward_prefill(const float * prefill_embd, int32_t n_token #endif output.resize(n_tokens * model_.config.hidden_size); ggml_backend_tensor_get(hidden, output.data(), 0, output.size() * sizeof(float)); - + last_hidden_.resize(model_.config.hidden_size); - ggml_backend_tensor_get(hidden, last_hidden_.data(), + ggml_backend_tensor_get(hidden, last_hidden_.data(), (n_tokens - 1) * model_.config.hidden_size * sizeof(float), model_.config.hidden_size * sizeof(float)); @@ -1948,15 +1870,15 @@ bool TTSTransformer::forward_prefill(const float * prefill_embd, int32_t n_token (n_tokens - 1) * model_.config.codec_vocab_size * sizeof(float), model_.config.codec_vocab_size * sizeof(float)); } - + state_.cache.n_used = n_past + n_tokens; - + ggml_backend_sched_reset(state_.sched); #ifdef QWEN3_TTS_TIMING t1 = clk::now(); if (timing_) timing_->t_prefill_data_ms += std::chrono::duration(t1 - t0).count(); #endif - + return true; } @@ -2013,7 +1935,7 @@ bool TTSTransformer::forward_step(const float * step_embd, int32_t n_past, error_msg_ = "Context length exceeded"; return false; } - + #ifdef QWEN3_TTS_TIMING using clk = std::chrono::high_resolution_clock; auto t0 = clk::now(), t1 = t0; @@ -2048,7 +1970,7 @@ bool TTSTransformer::forward_step(const float * step_embd, int32_t n_past, ggml_backend_tensor_set(inp_step, step_embd, 0, model_.config.hidden_size * sizeof(float)); } - + struct ggml_tensor * inp_pos = ggml_graph_get_tensor(gf, "inp_pos"); if (inp_pos) { int32_t pos = n_past; @@ -2071,7 +1993,7 @@ bool TTSTransformer::forward_step(const float * step_embd, int32_t n_past, t1 = clk::now(); if (timing_) timing_->t_talker_compute_ms += std::chrono::duration(t1 - t0).count(); #endif - + struct ggml_tensor * hidden = ggml_graph_get_tensor(gf, "hidden_states"); #ifdef QWEN3_TTS_TIMING @@ -2079,31 +2001,31 @@ bool TTSTransformer::forward_step(const float * step_embd, int32_t n_past, #endif if (hidden) { last_hidden_.resize(model_.config.hidden_size); - ggml_backend_tensor_get(hidden, last_hidden_.data(), 0, + ggml_backend_tensor_get(hidden, last_hidden_.data(), 0, model_.config.hidden_size * sizeof(float)); if (hidden_out) { *hidden_out = last_hidden_; } } - + struct ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits"); if (!logits) { error_msg_ = "Failed to find logits tensor"; ggml_backend_sched_reset(state_.sched); return false; } - + output.resize(model_.config.codec_vocab_size); ggml_backend_tensor_get(logits, output.data(), 0, output.size() * sizeof(float)); - + state_.cache.n_used = n_past + 1; - + ggml_backend_sched_reset(state_.sched); #ifdef QWEN3_TTS_TIMING t1 = clk::now(); if (timing_) timing_->t_talker_data_ms += std::chrono::duration(t1 - t0).count(); #endif - + return true; } @@ -2133,37 +2055,37 @@ bool TTSTransformer::predict_codes(const float * hidden, const int32_t * prev_co error_msg_ = "Model not loaded"; return false; } - + const auto & cfg = model_.config; int n_prev = (prev_codes != nullptr) ? cfg.n_codebooks - 1 : 0; - + struct ggml_cgraph * gf = build_code_pred_graph(n_prev); - + if (!ggml_backend_sched_alloc_graph(state_.sched, gf)) { error_msg_ = "Failed to allocate code predictor graph"; return false; } - + struct ggml_tensor * inp_hidden = ggml_graph_get_tensor(gf, "inp_hidden"); if (inp_hidden) { ggml_backend_tensor_set(inp_hidden, hidden, 0, cfg.hidden_size * sizeof(float)); } - + if (n_prev > 0) { struct ggml_tensor * inp_prev = ggml_graph_get_tensor(gf, "inp_prev_codes"); if (inp_prev) { ggml_backend_tensor_set(inp_prev, prev_codes, 0, n_prev * sizeof(int32_t)); } } - + if (ggml_backend_sched_graph_compute(state_.sched, gf) != GGML_STATUS_SUCCESS) { error_msg_ = "Failed to compute code predictor graph"; ggml_backend_sched_reset(state_.sched); return false; } - + output.resize((cfg.n_codebooks - 1) * cfg.code_pred_vocab_size); - + for (int cb = 0; cb < cfg.n_codebooks - 1; ++cb) { char name[32]; snprintf(name, sizeof(name), "logits_cb%d", cb + 1); @@ -2173,9 +2095,9 @@ bool TTSTransformer::predict_codes(const float * hidden, const int32_t * prev_co 0, cfg.code_pred_vocab_size * sizeof(float)); } } - + ggml_backend_sched_reset(state_.sched); - + return true; } @@ -2314,7 +2236,7 @@ bool TTSTransformer::predict_codes_autoregressive(const float * hidden, int32_t error_msg_ = "Model not loaded"; return false; } - + const auto & cfg = model_.config; #ifdef QWEN3_TTS_TIMING @@ -2332,19 +2254,19 @@ bool TTSTransformer::predict_codes_autoregressive(const float * hidden, int32_t fprintf(stderr, " CoreML code predictor failed, falling back to GGML: %s\n", error_msg_.c_str()); use_coreml_code_predictor_ = false; } - + if (state_.code_pred_cache.n_ctx < 16) { if (!init_code_pred_kv_cache(16)) { return false; } } clear_code_pred_kv_cache(); - + output.resize(15); std::vector logits_data(cfg.code_pred_vocab_size); - + std::vector code_probs(cfg.code_pred_vocab_size); - + // Helper lambda: temperature + top-k sampling (or greedy if temperature <= 0) auto sample_or_argmax = [&](float * logits_ptr, int32_t vocab_size) -> int32_t { if (temperature <= 0.0f) { @@ -2385,7 +2307,7 @@ bool TTSTransformer::predict_codes_autoregressive(const float * hidden, int32_t std::discrete_distribution dist(code_probs.begin(), code_probs.begin() + vocab_size); return dist(rng_); }; - + std::vector cb0_embd(cfg.hidden_size); if (!lookup_single_embedding_row(model_.codec_embd, codebook_0_token, cb0_embd.data())) { return false; @@ -2394,7 +2316,7 @@ bool TTSTransformer::predict_codes_autoregressive(const float * hidden, int32_t t1 = clk::now(); if (timing_) timing_->t_code_pred_init_ms += std::chrono::duration(t1 - t0).count(); #endif - + // Prefill with 2 tokens [past_hidden, cb0_embd] { #ifdef QWEN3_TTS_TIMING @@ -2429,12 +2351,12 @@ bool TTSTransformer::predict_codes_autoregressive(const float * hidden, int32_t if (inp_hidden) { ggml_backend_tensor_set(inp_hidden, hidden, 0, cfg.hidden_size * sizeof(float)); } - + struct ggml_tensor * inp_cb0_embd = ggml_graph_get_tensor(gf, "inp_cb0_embd"); if (inp_cb0_embd) { ggml_backend_tensor_set(inp_cb0_embd, cb0_embd.data(), 0, cfg.hidden_size * sizeof(float)); } - + struct ggml_tensor * inp_pos = ggml_graph_get_tensor(gf, "inp_pos"); if (inp_pos) { int32_t positions[2] = {0, 1}; @@ -2457,7 +2379,7 @@ bool TTSTransformer::predict_codes_autoregressive(const float * hidden, int32_t t1 = clk::now(); if (timing_) timing_->t_code_pred_compute_ms += std::chrono::duration(t1 - t0).count(); #endif - + struct ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits"); if (!logits) { error_msg_ = "Failed to find logits tensor in prefill"; @@ -2468,11 +2390,11 @@ bool TTSTransformer::predict_codes_autoregressive(const float * hidden, int32_t #ifdef QWEN3_TTS_TIMING t0 = clk::now(); #endif - ggml_backend_tensor_get(logits, logits_data.data(), 0, + ggml_backend_tensor_get(logits, logits_data.data(), 0, cfg.code_pred_vocab_size * sizeof(float)); - + output[0] = sample_or_argmax(logits_data.data(), cfg.code_pred_vocab_size); - + ggml_backend_sched_reset(state_.sched); #ifdef QWEN3_TTS_TIMING t1 = clk::now(); @@ -2480,7 +2402,7 @@ bool TTSTransformer::predict_codes_autoregressive(const float * hidden, int32_t if (timing_) timing_->t_code_pred_prefill_ms += std::chrono::duration(t1 - t_pf_start).count(); #endif } - + // Generate 14 more tokens autoregressively #ifdef QWEN3_TTS_TIMING auto t_steps_start = clk::now(); @@ -2516,13 +2438,13 @@ bool TTSTransformer::predict_codes_autoregressive(const float * hidden, int32_t if (inp_hidden) { ggml_backend_tensor_set(inp_hidden, hidden, 0, cfg.hidden_size * sizeof(float)); } - + struct ggml_tensor * inp_code = ggml_graph_get_tensor(gf, "inp_code"); if (inp_code) { int32_t prev_code = output[step - 1]; ggml_backend_tensor_set(inp_code, &prev_code, 0, sizeof(int32_t)); } - + struct ggml_tensor * inp_pos = ggml_graph_get_tensor(gf, "inp_pos"); if (inp_pos) { int32_t pos = n_past; @@ -2545,7 +2467,7 @@ bool TTSTransformer::predict_codes_autoregressive(const float * hidden, int32_t t1 = clk::now(); if (timing_) timing_->t_code_pred_compute_ms += std::chrono::duration(t1 - t0).count(); #endif - + struct ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits"); if (!logits) { error_msg_ = "Failed to find logits tensor"; @@ -2556,11 +2478,11 @@ bool TTSTransformer::predict_codes_autoregressive(const float * hidden, int32_t #ifdef QWEN3_TTS_TIMING t0 = clk::now(); #endif - ggml_backend_tensor_get(logits, logits_data.data(), 0, + ggml_backend_tensor_get(logits, logits_data.data(), 0, cfg.code_pred_vocab_size * sizeof(float)); - + output[step] = sample_or_argmax(logits_data.data(), cfg.code_pred_vocab_size); - + ggml_backend_sched_reset(state_.sched); #ifdef QWEN3_TTS_TIMING t1 = clk::now(); @@ -2570,7 +2492,7 @@ bool TTSTransformer::predict_codes_autoregressive(const float * hidden, int32_t #ifdef QWEN3_TTS_TIMING if (timing_) timing_->t_code_pred_steps_ms += std::chrono::duration(clk::now() - t_steps_start).count(); #endif - + return true; } @@ -2605,7 +2527,7 @@ bool TTSTransformer::generate(const int32_t * text_tokens, int32_t n_tokens, output.clear(); return true; } - + const auto & cfg = model_.config; std::vector prefill_embd; @@ -2634,7 +2556,7 @@ bool TTSTransformer::generate(const int32_t * text_tokens, int32_t n_tokens, } } clear_kv_cache(); - + std::vector hidden_out; std::vector logits; @@ -2648,19 +2570,19 @@ bool TTSTransformer::generate(const int32_t * text_tokens, int32_t n_tokens, t1 = clk::now(); timing.t_prefill_forward_ms = std::chrono::duration(t1 - t0).count(); #endif - + output.clear(); output.reserve(max_len * cfg.n_codebooks); - + int32_t n_past = prefill_len; std::vector frame_codes(cfg.n_codebooks); std::unordered_set generated_cb0_tokens; const int32_t suppress_start = cfg.codec_vocab_size - 1024; - + std::vector probs(cfg.codec_vocab_size); std::vector step_embd(cfg.hidden_size, 0.0f); std::vector embd_row(cfg.hidden_size); - + for (int frame = 0; frame < max_len; ++frame) { // Suppress tokens in [codec_vocab_size - 1024, codec_vocab_size), except codec_eos_id for (int32_t i = suppress_start; i < cfg.codec_vocab_size; ++i) { @@ -2720,14 +2642,14 @@ bool TTSTransformer::generate(const int32_t * text_tokens, int32_t n_tokens, std::discrete_distribution dist(probs.begin(), probs.end()); next_token = dist(rng_); } - + if (next_token == cfg.codec_eos_id) { break; } - + frame_codes[0] = next_token; generated_cb0_tokens.insert(next_token); - + #ifdef QWEN3_TTS_TIMING t0 = clk::now(); #endif @@ -2739,11 +2661,11 @@ bool TTSTransformer::generate(const int32_t * text_tokens, int32_t n_tokens, t1 = clk::now(); timing.t_code_pred_ms += std::chrono::duration(t1 - t0).count(); #endif - + for (int cb = 1; cb < cfg.n_codebooks; ++cb) { frame_codes[cb] = codes_1_15[cb - 1]; } - + for (int cb = 0; cb < cfg.n_codebooks; ++cb) { output.push_back(frame_codes[cb]); } @@ -2799,10 +2721,10 @@ bool TTSTransformer::generate(const int32_t * text_tokens, int32_t n_tokens, t1 = clk::now(); timing.t_talker_forward_ms += std::chrono::duration(t1 - t0).count(); #endif - + n_past++; } - + #ifdef QWEN3_TTS_TIMING timing.t_generate_total_ms = std::chrono::duration(clk::now() - t_gen_start).count(); timing_ = nullptr; diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp index c31dcc4ba..c689c76de 100644 --- a/otherarch/tts_adapter.cpp +++ b/otherarch/tts_adapter.cpp @@ -1216,23 +1216,40 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp } else { + double ttstime = 0; + timer_start(); + qwen3_tts::tts_result result; std::string prompt = inputs.prompt; qwen3_tts::tts_params qwen3tts_params; - double ttstime = 0; - timer_start(); + std::string custom_reference_audio_str = inputs.reference_audio; + std::vector custom_reference_audio_pcmf32; + + if(custom_reference_audio_str!="") + { + std::vector media_data_buffer = kcpp_base64_decode(custom_reference_audio_str); + + //qwen3tts uses 24khz + bool ok = kcpp_decode_audio_from_buf(media_data_buffer.data(), media_data_buffer.size(), 24000, custom_reference_audio_pcmf32); + if (!ok) { + printf("\nError: Cannot read input audio file.\n"); + output.data = ""; + output.status = 0; + return output; + } + } + if(!tts_is_quiet) { printf("\nTTS Generating..."); } - // if (reference_audio.empty()) { - result = qwen3tts_runner.synthesize(prompt, qwen3tts_params); - // } else { - // fprintf(stderr, "Synthesizing with voice cloning: \"%s\"\n", text.c_str()); - // fprintf(stderr, "Reference audio: %s\n", reference_audio.c_str()); - // result = tts.synthesize_with_voice(text, reference_audio, params); - // } + if (custom_reference_audio_pcmf32.empty()) { + result = qwen3tts_runner.synthesize(prompt, qwen3tts_params); + } else { + printf("\nUsing reference voice... (Warning, lengthy sample audio will be very slow. Use short clips!)\n"); + result = qwen3tts_runner.synthesize_with_voice(prompt, custom_reference_audio_pcmf32.data(),custom_reference_audio_pcmf32.size(), qwen3tts_params); + } if (!result.success) { printf("\nError: TTS vocoder generation failed : %s\n", result.error_msg.c_str()); diff --git a/otherarch/whispercpp/whisper_adapter.cpp b/otherarch/whispercpp/whisper_adapter.cpp index abdee881b..b8f7237d4 100644 --- a/otherarch/whispercpp/whisper_adapter.cpp +++ b/otherarch/whispercpp/whisper_adapter.cpp @@ -27,20 +27,8 @@ static std::string whisper_output_text = ""; int total_transcribe_gens = 0; -static bool is_wav_buffer(const std::string buf) { - // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format - // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html - if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") { - return false; - } - uint32_t chunk_size = *reinterpret_cast(buf.data() + 4); - if (chunk_size + 8 != buf.size()) { - return false; - } - return true; -} -static bool read_wav(const std::string & b64data, std::vector& pcmf32) +static bool read_audio(const std::string & b64data, std::vector& pcmf32) { std::vector media_data_buffer = kcpp_base64_decode(b64data); @@ -141,7 +129,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs std::vector pcmf32; // mono-channel F32 PCM - if (!::read_wav(b64data, pcmf32)) { + if (!::read_audio(b64data, pcmf32)) { printf("\nWhisper: Failed to read input wav data!\n"); output.text = ""; output.status = 0;