From 313d37a6022b2c2ed713f8e7ecaf2fc4f61cc196 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sun, 22 Feb 2026 00:43:57 +0800
Subject: [PATCH] cache used voices

---
 otherarch/qwen3tts/qwen3_tts.cpp | 21 +++++++++++++++++----
 otherarch/qwen3tts/qwen3_tts.h   |  2 +-
 otherarch/tts_adapter.cpp        | 17 ++++-------------
 3 files changed, 22 insertions(+), 18 deletions(-)
diff --git a/otherarch/qwen3tts/qwen3_tts.cpp b/otherarch/qwen3tts/qwen3_tts.cpp
index 86a6ec183..14a96a519 100644
--- a/otherarch/qwen3tts/qwen3_tts.cpp
+++ b/otherarch/qwen3tts/qwen3_tts.cpp
@@ -202,10 +202,10 @@ tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
     return synthesize_with_voice(text, ref_samples.data(), (int32_t)ref_samples.size(), params);
 }
 
-static std::vector<float> speaker_embedding;
+static std::map<std::size_t, std::vector<float>> speaker_embeddings;
 tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
                                             const float * ref_samples, int32_t n_ref_samples,
-                                            const tts_params & params, bool regenerate) {
+                                            const tts_params & params, std::size_t reuse_hash_val) {
     tts_result result;
 
     if (!models_loaded_) {
@@ -232,14 +232,27 @@ tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
     }
 
     int64_t t_encode_start = get_time_ms();
+    std::vector<float> speaker_embedding;
 
-    if(speaker_embedding.size()==0 || regenerate)
+    if(speaker_embeddings.size()>0 || reuse_hash_val>0)
     {
-        speaker_embedding.clear();
+        auto it = speaker_embeddings.find(reuse_hash_val);
+        if (it != speaker_embeddings.end()) {
+            speaker_embedding = it->second;
+        }
+    }
+
+    if(speaker_embedding.size()==0)
+    {
+        printf("Creating Voice Embedding ID=%zu... (Warning, lengthy sample audio will be very slow. Use short clips!)\n",reuse_hash_val); // %zu: reuse_hash_val is std::size_t, %u would mismatch on 64-bit targets
         if (!audio_encoder_.encode(ref_samples, n_ref_samples, speaker_embedding)) {
             result.error_msg = "Failed to extract speaker embedding: " + audio_encoder_.get_error();
             return result;
         }
+        if(reuse_hash_val!=0)
+        {
+            speaker_embeddings[reuse_hash_val] = speaker_embedding;
+        }
     }
     result.t_encode_ms = get_time_ms() - t_encode_start;
 
diff --git a/otherarch/qwen3tts/qwen3_tts.h b/otherarch/qwen3tts/qwen3_tts.h
index 7934b1c72..b41e1347e 100644
--- a/otherarch/qwen3tts/qwen3_tts.h
+++ b/otherarch/qwen3tts/qwen3_tts.h
@@ -109,7 +109,7 @@ public:
     // params: generation parameters
     tts_result synthesize_with_voice(const std::string & text,
                                       const float * ref_samples, int32_t n_ref_samples,
-                                      const tts_params & params = tts_params(), bool regenerate=true);
+                                      const tts_params & params = tts_params(), std::size_t reuse_hash_val=0);
 
     // Set progress callback
     void set_progress_callback(tts_progress_callback_t callback);
diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp
index cf345be15..8ed4c1066 100644
--- a/otherarch/tts_adapter.cpp
+++ b/otherarch/tts_adapter.cpp
@@ -549,7 +549,6 @@ static std::string detectedarch = "";
 //qwen3tts specific
 static bool is_qwen3tts_file = false;
 static qwen3_tts::Qwen3TTS qwen3tts_runner;
-static std::string last_qwen3tts_reference_audio_str = "";
 
 int total_tts_gens = 0;
 static std::string tts_executable_path = "";
@@ -1282,19 +1281,11 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
         if (custom_reference_audio_pcmf32.empty()) {
             result = qwen3tts_runner.synthesize(prompt, qwen3tts_params);
         } else {
-            bool regenerate = (last_qwen3tts_reference_audio_str=="" || last_qwen3tts_reference_audio_str!=custom_reference_audio_str);
-            std::string msg = "\nUsing reference voice...";
-            if(regenerate)
-            {
-                msg += "Regenerating... (Warning, lengthy sample audio will be very slow. Use short clips!)";
-            }
-            msg += "\n";
+            std::size_t reuse_hash_value = std::hash<std::string>{}(custom_reference_audio_str);
+
+            std::string msg = "\nUsing reference voice...\n";
             printf("%s",msg.c_str());
-            result = qwen3tts_runner.synthesize_with_voice(prompt, custom_reference_audio_pcmf32.data(),custom_reference_audio_pcmf32.size(), qwen3tts_params, regenerate);
-            if(regenerate)
-            {
-                last_qwen3tts_reference_audio_str = custom_reference_audio_str;
-            }
+            result = qwen3tts_runner.synthesize_with_voice(prompt, custom_reference_audio_pcmf32.data(),custom_reference_audio_pcmf32.size(), qwen3tts_params, reuse_hash_value);
         }
 
         if (!result.success) {