diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index ff614b052..5bb1e7a11 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -112,6 +112,7 @@ static std::vector last_media_mem; //for storing dummy tokens that will be
 static std::string media_composite_image_signature = ""; //for identifying when the llava images change, we need to invalidate the cache
 static int current_media_identifier = MEDIA_TOKEN_IDENTIFIER_A;
 static int vision_max_res = 2048;
+static bool use_mrope = false;
 static kcpp_params * kcpp_data = nullptr;
 static int max_context_limit_at_load = 0;
@@ -693,7 +694,7 @@ static speculative_draft_result speculative_decoding_eval_chunk(llama_context *
     std::vector real_embd = drafted_ids;
     real_embd.pop_back();
-    bool use_mrope = (file_format==FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
+
     kcpp_embd_batch batch2 = kcpp_embd_batch(real_embd, actual_npast, use_mrope, true);
     auto draftok = (llama_decode(main_ctx, batch2.batch)==0); //actual eval for big model
     if(!draftok)
@@ -1801,7 +1802,6 @@ static void load_grammar(const std::string & gammarstr)
 static bool kcpp_eval_image(llama_context * ctx_llama, float * img_embd, int num_img_tokens, int n_batch, int * n_past)
 {
     int n_embd = llama_n_embd(llama_get_model(ctx_llama));
-    bool use_mrope = (file_format==FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
     for (int i = 0; i < num_img_tokens; i += n_batch)
     {
         int n_eval = num_img_tokens - i;
@@ -1987,6 +1987,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     guidance_ctx = nullptr;
     audio_multimodal_supported = false;
     vision_multimodal_supported = false;
+    use_mrope = false;
 
     auto clamped_max_context_length = inputs.max_context_length;
@@ -2338,6 +2339,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         {
            printf("\nMRope is used, context shift will be disabled!\n");
            kcpp_data->use_contextshift = false;
+           use_mrope = true;
         }
 
         if(overwriteRope)
@@ -3321,8 +3323,25 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 media_object lv;
                 lv.b64data = item;
                 lv.is_audio = true;
-                TokenizeString("\n\n", lv.chunk_end_seq, file_format, false);
+                std::string aud_start = "\n\n";
+                std::string aud_end = "\n\n";
+                if(clp_ctx_a)
+                {
+                    int ptype = clip_get_projector_type_ext(clp_ctx_a);
+                    if(ptype==14) //qwen omni
+                    {
+                        aud_start = "<|audio_bos|>";
+                        aud_end = "<|audio_eos|>\n";
+                    }
+                    else if(ptype==16) //voxtral
+                    {
+                        aud_start = "[INST][BEGIN_AUDIO]";
+                        aud_end = "[/INST]\n";
+                    }
+                }
+
+                TokenizeString(aud_start, lv.chunk_start_seq, file_format, false);
+                TokenizeString(aud_end, lv.chunk_end_seq, file_format, false);
                 media_objects.push_back(lv);
                 new_media_composite += item;
             }
@@ -3502,7 +3521,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     int32_t nctx = kcpp_data->n_ctx;
     TokenizeString(kcpp_data->prompt, embd_inp, file_format, add_bos_token);
-    bool use_mrope = (file_format == FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
     TokenizeString("\nAttached Media:\n", media_intro, file_format, false);
 
     if(media_composite_image_signature=="")
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 1ff914253..27355a9d9 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -4552,6 +4552,11 @@ projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
     return ctx->proj_type();
 }
 
+int clip_get_projector_type_ext(clip_ctx * ctx) {
+    return ctx->proj_type();
+}
+
+
 void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {
     clip_image_f32 * audio = new clip_image_f32;
     audio->nx = n_frames;
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 22ca53bba..0c6e778fc 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -102,6 +102,7 @@ bool clip_is_llava(const struct clip_ctx * ctx);
 bool clip_is_gemma3(const struct clip_ctx * ctx);
 bool clip_is_pixtral(const struct clip_ctx * ctx);
 void set_clip_uses_gpu(bool usegpu);
+int clip_get_projector_type_ext(clip_ctx * ctx);
 
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);