From 313d37a6022b2c2ed713f8e7ecaf2fc4f61cc196 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 22 Feb 2026 00:43:57 +0800 Subject: [PATCH] cache used voices --- otherarch/qwen3tts/qwen3_tts.cpp | 21 +++++++++++++++++---- otherarch/qwen3tts/qwen3_tts.h | 2 +- otherarch/tts_adapter.cpp | 17 ++++------------- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/otherarch/qwen3tts/qwen3_tts.cpp b/otherarch/qwen3tts/qwen3_tts.cpp index 86a6ec183..14a96a519 100644 --- a/otherarch/qwen3tts/qwen3_tts.cpp +++ b/otherarch/qwen3tts/qwen3_tts.cpp @@ -202,10 +202,10 @@ tts_result Qwen3TTS::synthesize_with_voice(const std::string & text, return synthesize_with_voice(text, ref_samples.data(), (int32_t)ref_samples.size(), params); } -static std::vector<float> speaker_embedding; +static std::map<std::size_t, std::vector<float>> speaker_embeddings; tts_result Qwen3TTS::synthesize_with_voice(const std::string & text, const float * ref_samples, int32_t n_ref_samples, - const tts_params & params, bool regenerate) { + const tts_params & params, std::size_t reuse_hash_val) { tts_result result; if (!models_loaded_) { @@ -232,14 +232,27 @@ tts_result Qwen3TTS::synthesize_with_voice(const std::string & text, } int64_t t_encode_start = get_time_ms(); + std::vector<float> speaker_embedding; - if(speaker_embedding.size()==0 || regenerate) + if(speaker_embeddings.size()>0 || reuse_hash_val>0) { - speaker_embedding.clear(); + auto it = speaker_embeddings.find(reuse_hash_val); + if (it != speaker_embeddings.end()) { + speaker_embedding = it->second; + } + } + + if(speaker_embedding.size()==0) + { + printf("Creating Voice Embedding ID=%u... (Warning, lengthy sample audio will be very slow. 
Use short clips!)\n",reuse_hash_val); if (!audio_encoder_.encode(ref_samples, n_ref_samples, speaker_embedding)) { result.error_msg = "Failed to extract speaker embedding: " + audio_encoder_.get_error(); return result; } + if(reuse_hash_val!=0) + { + speaker_embeddings[reuse_hash_val] = speaker_embedding; + } } result.t_encode_ms = get_time_ms() - t_encode_start; diff --git a/otherarch/qwen3tts/qwen3_tts.h b/otherarch/qwen3tts/qwen3_tts.h index 7934b1c72..b41e1347e 100644 --- a/otherarch/qwen3tts/qwen3_tts.h +++ b/otherarch/qwen3tts/qwen3_tts.h @@ -109,7 +109,7 @@ public: // params: generation parameters tts_result synthesize_with_voice(const std::string & text, const float * ref_samples, int32_t n_ref_samples, - const tts_params & params = tts_params(), bool regenerate=true); + const tts_params & params = tts_params(), std::size_t reuse_hash_val=0); // Set progress callback void set_progress_callback(tts_progress_callback_t callback); diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp index cf345be15..8ed4c1066 100644 --- a/otherarch/tts_adapter.cpp +++ b/otherarch/tts_adapter.cpp @@ -549,7 +549,6 @@ static std::string detectedarch = ""; //qwen3tts specific static bool is_qwen3tts_file = false; static qwen3_tts::Qwen3TTS qwen3tts_runner; -static std::string last_qwen3tts_reference_audio_str = ""; int total_tts_gens = 0; static std::string tts_executable_path = ""; @@ -1282,19 +1281,11 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp if (custom_reference_audio_pcmf32.empty()) { result = qwen3tts_runner.synthesize(prompt, qwen3tts_params); } else { - bool regenerate = (last_qwen3tts_reference_audio_str=="" || last_qwen3tts_reference_audio_str!=custom_reference_audio_str); - std::string msg = "\nUsing reference voice..."; - if(regenerate) - { - msg += "Regenerating... (Warning, lengthy sample audio will be very slow. 
Use short clips!)"; - } - msg += "\n"; + std::size_t reuse_hash_value = std::hash<std::string>{}(custom_reference_audio_str); + + std::string msg = "\nUsing reference voice...\n"; printf("%s",msg.c_str()); - result = qwen3tts_runner.synthesize_with_voice(prompt, custom_reference_audio_pcmf32.data(),custom_reference_audio_pcmf32.size(), qwen3tts_params, regenerate); - if(regenerate) - { - last_qwen3tts_reference_audio_str = custom_reference_audio_str; - } + result = qwen3tts_runner.synthesize_with_voice(prompt, custom_reference_audio_pcmf32.data(),custom_reference_audio_pcmf32.size(), qwen3tts_params, reuse_hash_value); } if (!result.success) {