diff --git a/expose.h b/expose.h
index 266ca09a4..5ac536e60 100644
--- a/expose.h
+++ b/expose.h
@@ -298,6 +298,7 @@ struct tts_generation_inputs
     const char * custom_speaker_text = "";
     const char * custom_speaker_data = "";
     const char * reference_audio = "";
+    const char * speaker_instruction = "";
 };
 struct tts_generation_outputs
 {
diff --git a/koboldcpp.py b/koboldcpp.py
index c6aa0a675..86943b9e3 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -424,7 +424,8 @@ class tts_generation_inputs(ctypes.Structure):
                 ("custom_speaker_voice", ctypes.c_char_p),
                 ("custom_speaker_text", ctypes.c_char_p),
                 ("custom_speaker_data", ctypes.c_char_p),
-                ("reference_audio", ctypes.c_char_p)]
+                ("reference_audio", ctypes.c_char_p),
+                ("speaker_instruction", ctypes.c_char_p)]
 
 class tts_generation_outputs(ctypes.Structure):
     _fields_ = [("status", ctypes.c_int),
@@ -2538,6 +2539,14 @@ def tts_prepare_voice_json(jsonstr):
     except Exception:
         return None
 
+def tts_extract_instruction(x):
+    match = re.match(r'^\[([^\]]+)\]\s*(.+)$', x)
+    if match:
+        instruction = match.group(1)
+        x1 = match.group(2)
+        return x1, instruction
+    return x, ""
+
 def tts_generate(genparams):
     global args, voicebank, voicelist
     prompt = genparams.get("input", genparams.get("text", ""))
@@ -2558,6 +2567,11 @@ def tts_generate(genparams):
         voice = simple_lcg_hash(voicestr.strip()) if voicestr else 1
     inputs = tts_generation_inputs()
     inputs.custom_speaker_voice = normalized_voice.encode("UTF-8")
+    ttsinstruction = genparams.get("instruction", "")
+    # if no instruction provided, extract from text
+    if not genparams.get("instruction", ""):
+        prompt, ttsinstruction = tts_extract_instruction(prompt)
+    inputs.speaker_instruction = ttsinstruction.encode("UTF-8")
     inputs.prompt = prompt.encode("UTF-8")
     inputs.speaker_seed = voice
     aseed = -1
@@ -9738,6 +9752,8 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
 
             voicelist.append("random")
             voicebank["random"] = ""
+            voicelist.append("instruct")
+            voicebank["instruct"] = ""
 
             if args.ttsdir and os.path.isdir(args.ttsdir):
                 for filename in os.listdir(args.ttsdir):
diff --git a/otherarch/qwen3tts/q3ttsmain.cpp b/otherarch/qwen3tts/q3ttsmain.cpp
index 77f81ce0e..dd6f72838 100644
--- a/otherarch/qwen3tts/q3ttsmain.cpp
+++ b/otherarch/qwen3tts/q3ttsmain.cpp
@@ -30,13 +30,13 @@ int main(int argc, char ** argv) {
     std::string text;
     std::string output_file = "output.wav";
     std::string reference_audio;
-    
+
     qwen3_tts::tts_params params;
-    
+
     // Parse arguments
     for (int i = 1; i < argc; i++) {
         std::string arg = argv[i];
-        
+
         if (arg == "-h" || arg == "--help") {
             print_usage(argv[0]);
             return 0;
@@ -106,63 +106,63 @@ int main(int argc, char ** argv) {
             return 1;
         }
     }
-    
+
     // Validate required arguments
     if (model_dir.empty()) {
         fprintf(stderr, "Error: model directory is required\n");
         print_usage(argv[0]);
         return 1;
     }
-    
+
     if (text.empty()) {
         fprintf(stderr, "Error: text is required\n");
         print_usage(argv[0]);
         return 1;
     }
-    
+
     // Initialize TTS
     qwen3_tts::Qwen3TTS tts;
-    
+
     fprintf(stderr, "Loading models from: %s\n", model_dir.c_str());
     if (!tts.load_models(model_dir)) {
         fprintf(stderr, "Error: %s\n", tts.get_error().c_str());
         return 1;
     }
-    
+
     // Set progress callback
     tts.set_progress_callback([](int tokens, int max_tokens) {
         fprintf(stderr, "\rGenerating: %d/%d tokens", tokens, max_tokens);
     });
-    
+
     // Generate speech
     qwen3_tts::tts_result result;
-    
+
     if (reference_audio.empty()) {
         fprintf(stderr, "Synthesizing: \"%s\"\n", text.c_str());
-        result = tts.synthesize(text, params);
+        result = tts.synthesize(text,"", params);
     } else {
         fprintf(stderr, "Synthesizing with voice cloning: \"%s\"\n", text.c_str());
         fprintf(stderr, "Reference audio: %s\n", reference_audio.c_str());
         result = tts.synthesize_with_voice(text, reference_audio, params);
     }
-    
+
     if (!result.success) {
         fprintf(stderr, "\nError: %s\n", result.error_msg.c_str());
         return 1;
     }
-    
+
     fprintf(stderr, "\n");
-    
+
     // Save output
     if (!qwen3_tts::save_audio_file(output_file, result.audio, result.sample_rate)) {
         fprintf(stderr, "Error: failed to save output file: %s\n", output_file.c_str());
         return 1;
     }
-    
+
     fprintf(stderr, "Output saved to: %s\n", output_file.c_str());
-    fprintf(stderr, "Audio duration: %.2f seconds\n", 
+    fprintf(stderr, "Audio duration: %.2f seconds\n",
             (float)result.audio.size() / result.sample_rate);
-    
+
     // Print timing
     if (params.print_timing) {
         fprintf(stderr, "\nTiming:\n");
@@ -173,6 +173,6 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "  Decode:    %6lld ms\n", (long long)result.t_decode_ms);
         fprintf(stderr, "  Total:     %6lld ms\n", (long long)result.t_total_ms);
     }
-    
+
     return 0;
 }
diff --git a/otherarch/qwen3tts/qwen3_tts.cpp b/otherarch/qwen3tts/qwen3_tts.cpp
index 3eb9332ab..fcec1052d 100644
--- a/otherarch/qwen3tts/qwen3_tts.cpp
+++ b/otherarch/qwen3tts/qwen3_tts.cpp
@@ -163,7 +163,7 @@ bool Qwen3TTS::load_models(const std::string & tts_model_path, const std::string
     return true;
 }
 
-tts_result Qwen3TTS::synthesize(const std::string & text,
+tts_result Qwen3TTS::synthesize(const std::string & text, const std::string & instruction,
                                  const tts_params & params) {
     tts_result result;
 
@@ -176,7 +176,7 @@ tts_result Qwen3TTS::synthesize(const std::string & text,
     // This will use the model's default voice characteristics
     std::vector<float> zero_embedding(transformer_.get_config().hidden_size, 0.0f);
 
-    return synthesize_internal(text, zero_embedding.data(), params, result);
+    return synthesize_internal(text, instruction, zero_embedding.data(), params, result);
 }
 
 tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
@@ -260,10 +260,10 @@ tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
         fprintf(stderr, "Speaker embedding extracted: %zu floats\n", speaker_embedding.size());
     }
 
-    return synthesize_internal(text, speaker_embedding.data(), params, result);
+    return synthesize_internal(text, "", speaker_embedding.data(), params, result);
 }
 
-tts_result Qwen3TTS::synthesize_internal(const std::string & text,
+tts_result Qwen3TTS::synthesize_internal(const std::string & text, const std::string & instruction,
                                           const float * speaker_embedding,
                                           const tts_params & params,
                                           tts_result & result) {
@@ -311,11 +311,21 @@ tts_result Qwen3TTS::synthesize_internal(const std::string & text,
     }
     transformer_.clear_kv_cache();
 
+    std::vector<int32_t> alignment_instruct_tokens;
+    int instruct_tok_count = 0;
+    int32_t * instruct_tok_data = nullptr;
+    if(instruction!="")
+    {
+        alignment_instruct_tokens = tokenizer_.encode_instruct(instruction);
+        instruct_tok_data = alignment_instruct_tokens.data();
+        instruct_tok_count = alignment_instruct_tokens.size();
+    }
+
     std::vector<int32_t> speech_codes;
     if (!transformer_.generate(text_tokens.data(), (int32_t)text_tokens.size(),
                                speaker_embedding, params.max_audio_tokens, speech_codes,
                                2050, params.repetition_penalty,
-                               params.temperature, params.top_k)) {
+                               params.temperature, params.top_k, -1, instruct_tok_data, instruct_tok_count)) {
         result.error_msg = "Failed to generate speech codes: " + transformer_.get_error();
         return result;
     }
diff --git a/otherarch/qwen3tts/qwen3_tts.h b/otherarch/qwen3tts/qwen3_tts.h
index b41e1347e..54879f234 100644
--- a/otherarch/qwen3tts/qwen3_tts.h
+++ b/otherarch/qwen3tts/qwen3_tts.h
@@ -91,7 +91,7 @@ public:
     // Generate speech from text
     // text: input text to synthesize
     // params: generation parameters
-    tts_result synthesize(const std::string & text,
+    tts_result synthesize(const std::string & text, const std::string & instruction,
                           const tts_params & params = tts_params());
 
     // Generate speech with voice cloning
@@ -121,7 +121,7 @@ public:
     bool is_loaded() const { return models_loaded_; }
 
 private:
-    tts_result synthesize_internal(const std::string & text,
+    tts_result synthesize_internal(const std::string & text, const std::string & instruction,
                                    const float * speaker_embedding,
                                    const tts_params & params,
                                    tts_result & result);
diff --git a/otherarch/qwen3tts/text_tokenizer.cpp b/otherarch/qwen3tts/text_tokenizer.cpp
index 072cccdb2..ffa275e6e 100644
--- a/otherarch/qwen3tts/text_tokenizer.cpp
+++ b/otherarch/qwen3tts/text_tokenizer.cpp
@@ -290,6 +290,37 @@ std::vector<int32_t> TextTokenizer::encode(const std::string & text) const {
     return tokens;
 }
 
+std::vector<int32_t> TextTokenizer::encode_instruct(const std::string & instruct) const {
+    if (!loaded_ || instruct.empty()) {
+        return {};
+    }
+
+    // Format: <|im_start|>user\n{instruct}<|im_end|>\n
+    std::vector<int32_t> tokens;
+
+    // <|im_start|>
+    tokens.push_back(config_.bos_token_id);
+
+    // user
+    int user_token_id_ = 872;
+    tokens.push_back(user_token_id_);
+
+    // \n
+    tokens.push_back(newline_token_id_);
+
+    // Encode the instruct
+    auto text_tokens = encode(instruct);
+    tokens.insert(tokens.end(), text_tokens.begin(), text_tokens.end());
+
+    // <|im_end|>
+    tokens.push_back(config_.eos_token_id);
+
+    // \n
+    tokens.push_back(newline_token_id_);
+
+    return tokens;
+}
+
 std::vector<int32_t> TextTokenizer::encode_for_tts(const std::string & text) const {
     if (!loaded_) {
         return {};
diff --git a/otherarch/qwen3tts/text_tokenizer.h b/otherarch/qwen3tts/text_tokenizer.h
index 4944f4966..68830ff7b 100644
--- a/otherarch/qwen3tts/text_tokenizer.h
+++ b/otherarch/qwen3tts/text_tokenizer.h
@@ -22,64 +22,66 @@ class TextTokenizer {
 public:
     TextTokenizer();
     ~TextTokenizer();
-    
+
     // Load tokenizer from GGUF file
     bool load_from_gguf(struct gguf_context * ctx);
-    
+
     // Encode text to token IDs
     std::vector<int32_t> encode(const std::string & text) const;
-    
+
     // Encode with TTS format: <|im_start|>assistant\n{text}<|im_end|>\n<|im_start|>assistant\n
     std::vector<int32_t> encode_for_tts(const std::string & text) const;
-    
+
+    std::vector<int32_t> encode_instruct(const std::string & instruct) const;
+
     // Decode token IDs to text
     std::string decode(const std::vector<int32_t> & tokens) const;
-    
+
     // Decode single token
     std::string decode_token(int32_t token_id) const;
-    
+
     // Get configuration
     const tokenizer_config & get_config() const { return config_; }
-    
+
     // Get error message
     const std::string & get_error() const { return error_msg_; }
-    
+
     // Check if loaded
     bool is_loaded() const { return loaded_; }
-    
+
     // Get special token IDs
     int32_t bos_token_id() const { return config_.bos_token_id; }
     int32_t eos_token_id() const { return config_.eos_token_id; }
     int32_t pad_token_id() const { return config_.pad_token_id; }
-    
+
 private:
     tokenizer_config config_;
     std::string error_msg_;
     bool loaded_ = false;
-    
+
     // Vocabulary: token string -> token ID
     std::unordered_map<std::string, int32_t> vocab_;
-    
+
     // Reverse vocabulary: token ID -> token string
     std::vector<std::string> id_to_token_;
-    
+
     // BPE merges: pair -> rank (lower rank = higher priority)
     std::map<std::pair<std::string, std::string>, int32_t> bpe_ranks_;
-    
+
     // Special token for "assistant" and newline
     int32_t assistant_token_id_ = 77091;
     int32_t newline_token_id_ = 198;  // '\n' encoded
-    
+
     // Helper: convert bytes to unicode (GPT-2 style byte encoding)
     static std::string bytes_to_unicode(const std::string & text);
     static std::string unicode_to_bytes(const std::string & text);
-    
+
     // Helper: get UTF-8 character length
     static size_t utf8_len(char c);
-    
+
     // BPE encoding for a single word
     std::vector<std::string> bpe(const std::string & token) const;
-    
+
     // Find the pair with lowest rank in a sequence
     std::pair<std::string, std::string> get_min_pair(
         const std::vector<std::string> & word) const;
diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp
index e0df7b6e6..e61634db4 100644
--- a/otherarch/tts_adapter.cpp
+++ b/otherarch/tts_adapter.cpp
@@ -1184,7 +1184,7 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
         qwen3_tts::tts_params qwen3tts_params;
         std::string custom_reference_audio_str = inputs.reference_audio;
         std::vector<float> custom_reference_audio_pcmf32;
-        std::string speakerstr = inputs.custom_speaker_voice;
+        std::string speaker_instruction = inputs.speaker_instruction;
 
         int audio_seed = inputs.audio_seed;
         if (audio_seed <= 0 || audio_seed==0xFFFFFFFF)
@@ -1194,7 +1194,7 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
 
         if(ttsdebugmode==1 && !tts_is_quiet)
         {
-            printf("\nUsing Audio Seed: %d, Speaker: %s", audio_seed, speakerstr.c_str());
+            printf("\nUsing Audio Seed: %d", audio_seed);
         }
         qwen3tts_runner.set_seed(audio_seed);
 
@@ -1221,8 +1221,11 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
             qwen3tts_params.print_progress = true;
         }
 
-        if (custom_reference_audio_pcmf32.empty()) {
-            result = qwen3tts_runner.synthesize(prompt, qwen3tts_params);
+        if (speaker_instruction!="" || custom_reference_audio_pcmf32.empty()) {
+            if (speaker_instruction != "" && !tts_is_quiet) {
+                printf("\nApply VoiceDesign Instruction: %s", speaker_instruction.c_str());
+            }
+            result = qwen3tts_runner.synthesize(prompt, speaker_instruction, qwen3tts_params);
         } else {
             std::size_t reuse_hash_value = std::hash<std::string>{}(custom_reference_audio_str);