diff --git a/otherarch/qwen3tts/q3ttsmain.cpp b/otherarch/qwen3tts/q3ttsmain.cpp
index dd6f72838..db46ec529 100644
--- a/otherarch/qwen3tts/q3ttsmain.cpp
+++ b/otherarch/qwen3tts/q3ttsmain.cpp
@@ -139,7 +139,7 @@ int main(int argc, char ** argv) {
 
     if (reference_audio.empty()) {
         fprintf(stderr, "Synthesizing: \"%s\"\n", text.c_str());
-        result = tts.synthesize(text,"", params);
+        result = tts.synthesize(text,"", -1, params);
     } else {
         fprintf(stderr, "Synthesizing with voice cloning: \"%s\"\n", text.c_str());
         fprintf(stderr, "Reference audio: %s\n", reference_audio.c_str());
diff --git a/otherarch/qwen3tts/qwen3_tts.cpp b/otherarch/qwen3tts/qwen3_tts.cpp
index fcec1052d..c3c92a155 100644
--- a/otherarch/qwen3tts/qwen3_tts.cpp
+++ b/otherarch/qwen3tts/qwen3_tts.cpp
@@ -163,7 +163,26 @@ bool Qwen3TTS::load_models(const std::string & tts_model_path, const std::string
     return true;
 }
 
-tts_result Qwen3TTS::synthesize(const std::string & text, const std::string & instruction,
+// Load the speaker encoder from tts_model_path_ unless an earlier call already did.
+// Returns true when the encoder is loaded (now or previously), false otherwise.
+bool Qwen3TTS::load_speaker_enc()
+{
+    if (encoder_loaded_) { return true; } // already loaded, nothing to do
+    if (tts_model_path_.empty()) {
+        fprintf(stderr, "Failed to load speaker encoder, model path is empty\n");
+        return false;
+    }
+    const int64_t t_encoder_load_start = get_time_ms();
+    if (!audio_encoder_.load_model(tts_model_path_)) {
+        return false; // no encoder; encoder_loaded_ stays false so a later call tries again
+    }
+    encoder_loaded_ = true;
+    fprintf(stderr, "  Speaker encoder lazy-loaded in %lld ms\n",
+            (long long)(get_time_ms() - t_encoder_load_start));
+    return true;
+}
+
+tts_result Qwen3TTS::synthesize(const std::string & text, const std::string & instruction, const int speaker_id,
                                  const tts_params & params) {
     tts_result result;
 
@@ -176,7 +195,7 @@ tts_result Qwen3TTS::synthesize(const std::string & text, const std::string & in
     // This will use the model's default voice characteristics
     std::vector<float> zero_embedding(transformer_.get_config().hidden_size, 0.0f);
 
-    return synthesize_internal(text, instruction, zero_embedding.data(), params, result);
+    return synthesize_internal(text, instruction, speaker_id, zero_embedding.data(), params, result);
 }
 
 tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
@@ -260,10 +279,10 @@ tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
         fprintf(stderr, "Speaker embedding extracted: %zu floats\n", speaker_embedding.size());
     }
 
-    return synthesize_internal(text, "", speaker_embedding.data(), params, result);
+    return synthesize_internal(text, "", -1, speaker_embedding.data(), params, result);
 }
 
-tts_result Qwen3TTS::synthesize_internal(const std::string & text, const std::string & instruction,
+tts_result Qwen3TTS::synthesize_internal(const std::string & text, const std::string & instruction, const int speakerid,
                                           const float * speaker_embedding,
                                           const tts_params & params,
                                           tts_result & result) {
@@ -325,7 +344,7 @@ tts_result Qwen3TTS::synthesize_internal(const std::string & text, const std::st
     if (!transformer_.generate(text_tokens.data(), (int32_t)text_tokens.size(),
                                speaker_embedding, params.max_audio_tokens, speech_codes,
                                2050, params.repetition_penalty,
-                               params.temperature, params.top_k, -1, instruct_tok_data, instruct_tok_count)) {
+                               params.temperature, params.top_k, speakerid, instruct_tok_data, instruct_tok_count)) {
         result.error_msg = "Failed to generate speech codes: " + transformer_.get_error();
         return result;
     }
diff --git a/otherarch/qwen3tts/qwen3_tts.h b/otherarch/qwen3tts/qwen3_tts.h
index 54879f234..f8b535ed0 100644
--- a/otherarch/qwen3tts/qwen3_tts.h
+++ b/otherarch/qwen3tts/qwen3_tts.h
@@ -91,7 +91,7 @@ public:
     // Generate speech from text
     // text: input text to synthesize
     // params: generation parameters
-    tts_result synthesize(const std::string & text, const std::string & instruction,
+    tts_result synthesize(const std::string & text, const std::string & instruction, const int speaker_id,
                           const tts_params & params = tts_params());
 
     // Generate speech with voice cloning
@@ -114,6 +114,8 @@ public:
     // Set progress callback
     void set_progress_callback(tts_progress_callback_t callback);
 
+    bool load_speaker_enc(); // loads the speaker encoder if not already loaded; returns false if it could not be loaded
+
     // Get error message
     const std::string & get_error() const { return error_msg_; }
 
@@ -121,7 +123,7 @@ public:
     bool is_loaded() const { return models_loaded_; }
 
 private:
-    tts_result synthesize_internal(const std::string & text, const std::string & instruction,
+    tts_result synthesize_internal(const std::string & text, const std::string & instruction, const int speakerid,
                                    const float * speaker_embedding,
                                    const tts_params & params,
                                    tts_result & result);
diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp
index e61634db4..7e761adbd 100644
--- a/otherarch/tts_adapter.cpp
+++ b/otherarch/tts_adapter.cpp
@@ -1186,6 +1186,15 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
         std::vector<float> custom_reference_audio_pcmf32;
         std::string speaker_instruction = inputs.speaker_instruction;
 
+        int speakerID = inputs.speaker_seed;
+        int speakermap[] = {2861,3066,2873,3061,2864,2875,2878,3065,3010};
+
+        if (speakerID > 0 && speakerID <= 5) {
+            speakerID = speakermap[speakerID-1];
+        } else {
+            speakerID = -1;
+        }
+
         int audio_seed = inputs.audio_seed;
         if (audio_seed <= 0 || audio_seed==0xFFFFFFFF)
         {
@@ -1194,7 +1203,7 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
 
         if(ttsdebugmode==1 && !tts_is_quiet)
         {
-            printf("\nUsing Audio Seed: %d", audio_seed);
+            printf("\nUsing Audio Seed: %d, SpeakerID: %d", audio_seed, speakerID);
         }
         qwen3tts_runner.set_seed(audio_seed);
 
@@ -1221,11 +1230,13 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
             qwen3tts_params.print_progress = true;
         }
 
-        if (speaker_instruction!="" || custom_reference_audio_pcmf32.empty()) {
+        bool has_speaker_enc = qwen3tts_runner.load_speaker_enc();
+
+        if (speaker_instruction!="" || custom_reference_audio_pcmf32.empty() || !has_speaker_enc) {
             if (speaker_instruction != "" && !tts_is_quiet) {
                 printf("\nApply VoiceDesign Instruction: %s", speaker_instruction.c_str());
             }
-            result = qwen3tts_runner.synthesize(prompt, speaker_instruction, qwen3tts_params);
+            result = qwen3tts_runner.synthesize(prompt, speaker_instruction, speakerID, qwen3tts_params);
         } else {
             std::size_t reuse_hash_value = std::hash<std::string>{}(custom_reference_audio_str);