From 4629b49afbef6a75a124d7024d16c5c8e05475b4 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Tue, 21 Apr 2026 19:34:32 +0800 Subject: [PATCH] updated to handle changes for clip_is_mrope --- gpttype_adapter.cpp | 24 +++++++++++++++++++++++- tools/mtmd/clip.cpp | 13 ------------- tools/mtmd/clip.h | 1 - tools/mtmd/llava.cpp | 4 ++-- tools/mtmd/llava.h | 2 +- 5 files changed, 26 insertions(+), 18 deletions(-) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index eabe8daa8..522dd39dd 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -3391,6 +3391,28 @@ static void PrepareMediaEmbds(const int nctx, const std::vector & media_int int outrosize = media_outro.size(); last_media_mem.clear(); + bool clip_is_mrope = false; + if(file_format == FileFormat::GGUF_GENERIC) + { + //added after https://github.com/ggml-org/llama.cpp/pull/22161, replacing clip_is_mrope function + auto decoder_rope_type = llama_model_rope_type(llama_get_model(llama_ctx_v4)); + switch (decoder_rope_type) { + case LLAMA_ROPE_TYPE_NORM: + case LLAMA_ROPE_TYPE_NEOX: + { + clip_is_mrope = false; + } break; + case LLAMA_ROPE_TYPE_MROPE: + case LLAMA_ROPE_TYPE_IMROPE: + { + clip_is_mrope = true; + } + break; + default: + printf("\nWARNING: clip unsupported decoder rope type: %d\n", decoder_rope_type); + } + } + for(int i=0;i & media_int printf("\nCreating clip image embed..."); } media_chunk chunk; - if (!llava_image_embed_make_with_clip_img(clp_ctx_v, kcpp_data->n_threads, clp_img_data, &chunk.clp_img_embd, &chunk.clp_image_tokens, &chunk.nx, &chunk.ny)) { + if (!llava_image_embed_make_with_clip_img(clp_ctx_v, kcpp_data->n_threads, clp_img_data, &chunk.clp_img_embd, &chunk.clp_image_tokens, &chunk.nx, &chunk.ny, clip_is_mrope)) { printf("\nError: Clip image %d failed to create embd!",i); } if(debugmode==1 && !is_quiet) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index bac28a6e9..a0b163a5b 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -4407,19 +4407,6 @@ bool clip_is_glm(const struct clip_ctx * ctx) { return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE; } -bool clip_is_mrope(const struct clip_ctx * ctx) { //kcpp: this was removed in https://github.com/ggml-org/llama.cpp/pull/18793 and moved to mtmd_decode_use_mrope - switch (ctx->proj_type()) { - case PROJECTOR_TYPE_QWEN2VL: - case PROJECTOR_TYPE_QWEN25VL: - case PROJECTOR_TYPE_QWEN3VL: - case PROJECTOR_TYPE_GLM4V: - case PROJECTOR_TYPE_PADDLEOCR: - return true; - default: - return false; - } -} - bool clip_is_llava(const struct clip_ctx * ctx) { return ctx->model.hparams.has_llava_projector; } diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 7440152a3..33512f2ba 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -113,7 +113,6 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct int clip_is_minicpmv(const struct clip_ctx * ctx); bool clip_is_glm(const struct clip_ctx * ctx); bool clip_is_gemma4(const struct clip_ctx * ctx); -bool clip_is_mrope(const struct clip_ctx * ctx); bool clip_is_llava(const struct clip_ctx * ctx); void set_clip_uses_gpu(bool usegpu); int clip_get_projector_type_ext(clip_ctx * ctx); diff --git a/tools/mtmd/llava.cpp b/tools/mtmd/llava.cpp index e30cb2465..2fe04d9cd 100644 --- a/tools/mtmd/llava.cpp +++ b/tools/mtmd/llava.cpp @@ -49,7 +49,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, struct cl return true; } -bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, int * nx_out, int * ny_out) { +bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, int * nx_out, int * ny_out, bool clip_is_mrope) { // Granite vision uses up to 10 patches + base patch int num_max_patches = 11; if (clip_is_minicpmv(ctx_clip)) { @@ -65,7 +65,7 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co return false; } - if (clip_is_mrope(ctx_clip)) { + if (clip_is_mrope) { // qwen2vl don't split image into chunks, so `num_max_patches` is not needed. //sometimes they resize the image LARGER than before (padding up), so we must account for that int max_nx = img->nx; diff --git a/tools/mtmd/llava.h b/tools/mtmd/llava.h index 29e516aa5..18cc485f6 100644 --- a/tools/mtmd/llava.h +++ b/tools/mtmd/llava.h @@ -30,7 +30,7 @@ struct llava_image_embed { struct mtmd_audio_mel; -LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, int * nx_out, int * ny_out); +LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, int * nx_out, int * ny_out, bool clip_is_mrope); LLAVA_API bool audio_embd_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const mtmd_audio_mel & mel_spec, float ** image_embd_out, int * n_img_pos_out);