From 4629b49afbef6a75a124d7024d16c5c8e05475b4 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Tue, 21 Apr 2026 19:34:32 +0800
Subject: [PATCH] replace removed clip_is_mrope() with a decoder rope type check

---
 gpttype_adapter.cpp  | 24 +++++++++++++++++++++++-
 tools/mtmd/clip.cpp  | 13 -------------
 tools/mtmd/clip.h    |  1 -
 tools/mtmd/llava.cpp |  4 ++--
 tools/mtmd/llava.h   |  2 +-
 5 files changed, 26 insertions(+), 18 deletions(-)
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index eabe8daa8..522dd39dd 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -3391,6 +3391,28 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_int
         int outrosize = media_outro.size();
         last_media_mem.clear();
 
+        bool clip_is_mrope = false;
+        if(file_format == FileFormat::GGUF_GENERIC)
+        {
+            //kcpp: added after https://github.com/ggml-org/llama.cpp/pull/22161 - mrope is now derived from the decoder model's rope type, replacing the removed clip_is_mrope function
+            auto decoder_rope_type = llama_model_rope_type(llama_get_model(llama_ctx_v4));
+            switch (decoder_rope_type) {
+                case LLAMA_ROPE_TYPE_NORM:
+                case LLAMA_ROPE_TYPE_NEOX:
+                    {
+                        clip_is_mrope = false;
+                    } break;
+                case LLAMA_ROPE_TYPE_MROPE:
+                case LLAMA_ROPE_TYPE_IMROPE:
+                    {
+                        clip_is_mrope = true;
+                    }
+                    break;
+                default:
+                    printf("\nWARNING: clip unsupported decoder rope type: %d\n", decoder_rope_type);
+            }
+        }
+
         for(int i=0;i<media_objects.size();++i)
         {
             std::string media_obj = media_objects[i].b64data;
@@ -3410,7 +3432,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_int
                         printf("\nCreating clip image embed...");
                     }
                     media_chunk chunk;
-                    if (!llava_image_embed_make_with_clip_img(clp_ctx_v, kcpp_data->n_threads, clp_img_data, &chunk.clp_img_embd, &chunk.clp_image_tokens, &chunk.nx, &chunk.ny)) {
+                    if (!llava_image_embed_make_with_clip_img(clp_ctx_v, kcpp_data->n_threads, clp_img_data, &chunk.clp_img_embd, &chunk.clp_image_tokens, &chunk.nx, &chunk.ny, clip_is_mrope)) {
                         printf("\nError: Clip image %d failed to create embd!",i);
                     }
                     if(debugmode==1 && !is_quiet)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index bac28a6e9..a0b163a5b 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -4407,19 +4407,6 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
     return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
 }
 
-bool clip_is_mrope(const struct clip_ctx * ctx) { //kcpp: this was removed in https://github.com/ggml-org/llama.cpp/pull/18793 and moved to mtmd_decode_use_mrope
-    switch (ctx->proj_type()) {
-        case PROJECTOR_TYPE_QWEN2VL:
-        case PROJECTOR_TYPE_QWEN25VL:
-        case PROJECTOR_TYPE_QWEN3VL:
-        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_PADDLEOCR:
-            return true;
-        default:
-            return false;
-    }
-}
-
 bool clip_is_llava(const struct clip_ctx * ctx) {
     return ctx->model.hparams.has_llava_projector;
 }
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 7440152a3..33512f2ba 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -113,7 +113,6 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct
 int clip_is_minicpmv(const struct clip_ctx * ctx);
 bool clip_is_glm(const struct clip_ctx * ctx);
 bool clip_is_gemma4(const struct clip_ctx * ctx);
-bool clip_is_mrope(const struct clip_ctx * ctx);
 bool clip_is_llava(const struct clip_ctx * ctx);
 void set_clip_uses_gpu(bool usegpu);
 int clip_get_projector_type_ext(clip_ctx * ctx);
diff --git a/tools/mtmd/llava.cpp b/tools/mtmd/llava.cpp
index e30cb2465..2fe04d9cd 100644
--- a/tools/mtmd/llava.cpp
+++ b/tools/mtmd/llava.cpp
@@ -49,7 +49,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, struct cl
     return true;
 }
 
-bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, int * nx_out, int * ny_out) {
+bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, int * nx_out, int * ny_out, bool clip_is_mrope) {
     // Granite vision uses up to 10 patches + base patch
     int num_max_patches = 11;
     if (clip_is_minicpmv(ctx_clip)) {
@@ -65,7 +65,7 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
         return false;
     }
 
-    if (clip_is_mrope(ctx_clip)) {
+    if (clip_is_mrope) {
         // qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
         //sometimes they resize the image LARGER than before (padding up), so we must account for that
         int max_nx = img->nx;
diff --git a/tools/mtmd/llava.h b/tools/mtmd/llava.h
index 29e516aa5..18cc485f6 100644
--- a/tools/mtmd/llava.h
+++ b/tools/mtmd/llava.h
@@ -30,7 +30,7 @@ struct llava_image_embed {
 struct mtmd_audio_mel;
 
 
-LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, int * nx_out, int * ny_out);
+LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, int * nx_out, int * ny_out, bool clip_is_mrope);
 
 LLAVA_API bool audio_embd_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const mtmd_audio_mel & mel_spec, float ** image_embd_out, int * n_img_pos_out);