cache used voices

2026-05-30 03:43:40 +00:00 · 2026-02-22 00:43:57 +08:00 · 2026-02-22 00:43:57 +08:00 · 313d37a602
commit 313d37a602
parent 5536fb29f2
3 changed files with 22 additions and 18 deletions
--- a/otherarch/qwen3tts/qwen3_tts.cpp
+++ b/otherarch/qwen3tts/qwen3_tts.cpp
@ -202,10 +202,10 @@ tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
    return synthesize_with_voice(text, ref_samples.data(), (int32_t)ref_samples.size(), params);
 }

-static std::vector<float> speaker_embedding;
+static std::map<std::size_t, std::vector<float>> speaker_embeddings; // speaker embeddings cached by reuse_hash_val; only nonzero keys are ever inserted. NOTE(review): no eviction is visible in this diff - confirm growth is acceptable
 tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
                                            const float * ref_samples, int32_t n_ref_samples,
-                                            const tts_params & params, bool regenerate) {
+                                            const tts_params & params, std::size_t reuse_hash_val) {
    tts_result result;

    if (!models_loaded_) {
@ -232,14 +232,27 @@ tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
    }

    int64_t t_encode_start = get_time_ms();
+    std::vector<float> speaker_embedding; // per-call embedding; stays empty until a cache hit or a fresh encode fills it

-    if(speaker_embedding.size()==0 || regenerate)
+    if(speaker_embeddings.size()>0 && reuse_hash_val!=0) // key 0 is never stored, so a lookup is only useful for a nonzero key on a non-empty cache
    {
-        speaker_embedding.clear();
+        auto it = speaker_embeddings.find(reuse_hash_val);
+        if (it != speaker_embeddings.end()) {
+            speaker_embedding = it->second; // cache hit: reuse the stored embedding and skip the encoder
+        }
+    }
+
+    if(speaker_embedding.size()==0) // cache miss (or caching disabled): encode the reference audio now
+    {
+        printf("Creating Voice Embedding ID=%zu... (Warning, lengthy sample audio will be very slow. Use short clips!)\n",reuse_hash_val); // %zu is the conversion for std::size_t; %u mismatched the argument type
        if (!audio_encoder_.encode(ref_samples, n_ref_samples, speaker_embedding)) {
            result.error_msg = "Failed to extract speaker embedding: " + audio_encoder_.get_error();
            return result;
        }
+        if(reuse_hash_val!=0) // 0 means "do not cache this voice"
+        {
+            speaker_embeddings[reuse_hash_val] = speaker_embedding;
+        }
    }
    result.t_encode_ms = get_time_ms() - t_encode_start;

--- a/otherarch/qwen3tts/qwen3_tts.h
+++ b/otherarch/qwen3tts/qwen3_tts.h
@ -109,7 +109,7 @@ public:
    // params: generation parameters
    tts_result synthesize_with_voice(const std::string & text,
                                      const float * ref_samples, int32_t n_ref_samples,
-                                      const tts_params & params = tts_params(), bool regenerate=true);
+                                      const tts_params & params = tts_params(), std::size_t reuse_hash_val=0); // reuse_hash_val: nonzero key under which the speaker embedding is cached and reused; 0 (default) never caches

    // Set progress callback
    void set_progress_callback(tts_progress_callback_t callback);
--- a/otherarch/tts_adapter.cpp
+++ b/otherarch/tts_adapter.cpp
@ -549,7 +549,6 @@ static std::string detectedarch = "";
 //qwen3tts specific
 static bool is_qwen3tts_file = false;
 static qwen3_tts::Qwen3TTS qwen3tts_runner;
-static std::string last_qwen3tts_reference_audio_str = "";

 int total_tts_gens = 0;
 static std::string tts_executable_path = "";
@ -1282,19 +1281,11 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
        if (custom_reference_audio_pcmf32.empty()) {
            result = qwen3tts_runner.synthesize(prompt, qwen3tts_params);
        } else {
-            bool regenerate = (last_qwen3tts_reference_audio_str=="" || last_qwen3tts_reference_audio_str!=custom_reference_audio_str);
-            std::string msg = "\nUsing reference voice...";
-            if(regenerate)
-            {
-                msg += "Regenerating... (Warning, lengthy sample audio will be very slow. Use short clips!)";
-            }
-            msg += "\n";
+            std::size_t reuse_hash_value = std::hash<std::string>{}(custom_reference_audio_str); // passed below as the embedding cache key. NOTE(review): hash collisions would reuse another voice's embedding, and a hash of exactly 0 disables caching - neither is handled here
+
+            std::string msg = "\nUsing reference voice...\n";
            printf("%s",msg.c_str());
-            result = qwen3tts_runner.synthesize_with_voice(prompt, custom_reference_audio_pcmf32.data(),custom_reference_audio_pcmf32.size(), qwen3tts_params, regenerate);
-            if(regenerate)
-            {
-                last_qwen3tts_reference_audio_str = custom_reference_audio_str;
-            }
+            result = qwen3tts_runner.synthesize_with_voice(prompt, custom_reference_audio_pcmf32.data(),custom_reference_audio_pcmf32.size(), qwen3tts_params, reuse_hash_value);
        }

        if (!result.success) {