From cefb32df1952057ab9c495de3ee94c73a05a8058 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Thu, 18 Dec 2025 22:58:10 +0800 Subject: [PATCH] track clip img patch nx and ny --- gpttype_adapter.cpp | 2 +- otherarch/otherarch.h | 2 ++ tools/mtmd/llava.cpp | 15 +++++++++------ tools/mtmd/llava.h | 2 +- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 0230fd6b9..53bfba794 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -3207,7 +3207,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector & media_int printf("\nCreating clip image embed..."); } media_chunk chunk; - if (!llava_image_embed_make_with_clip_img(clp_ctx_v, kcpp_data->n_threads, clp_img_data, &chunk.clp_img_embd, &chunk.clp_image_tokens)) { + if (!llava_image_embed_make_with_clip_img(clp_ctx_v, kcpp_data->n_threads, clp_img_data, &chunk.clp_img_embd, &chunk.clp_image_tokens, &chunk.nx, &chunk.ny)) { printf("\nError: Clip image %d failed to create embd!",i); } if(debugmode==1 && !is_quiet) diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h index 55c0847b7..3855f9f7d 100644 --- a/otherarch/otherarch.h +++ b/otherarch/otherarch.h @@ -508,6 +508,8 @@ struct media_chunk { int32_t clp_image_tokens = 0; //holds number of tokens llava used in this chunk float * clp_img_embd = nullptr; //this holds dynamic memory and must be freed each use! + int32_t nx = 0; //only used for 2d roped images + int32_t ny = 0; }; struct media_object { diff --git a/tools/mtmd/llava.cpp b/tools/mtmd/llava.cpp index 10058ea10..e05f295f3 100644 --- a/tools/mtmd/llava.cpp +++ b/tools/mtmd/llava.cpp @@ -23,22 +23,22 @@ struct clip_image_f32_batch_deleter { typedef std::unique_ptr clip_image_f32_batch_ptr; -static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, struct clip_image_f32_batch * preprocessed_img, float * image_embd, int * n_img_pos) { +static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, struct clip_image_f32_batch * preprocessed_img, float * image_embd, int * n_img_pos, int *nx, int *ny) { const int64_t t_img_enc_start_us = ggml_time_us(); - const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); - const size_t n_imgs = clip_image_f32_batch_n_images(preprocessed_img); - clip_image_f32 * img_res = clip_image_f32_get_img(preprocessed_img, 0); *n_img_pos = clip_n_output_tokens(ctx_clip, img_res); + *nx = clip_n_output_tokens_x(ctx_clip,img_res); + *ny = clip_n_output_tokens_y(ctx_clip,img_res); bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096 if (!encoded) { LOG_ERR("Unable to encode image\n"); return false; } + LOG_INF("%s: CLIP output tokens nx:%d, ny:%d\n", __func__, *nx,*ny); LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); const int64_t t_img_enc_end_us = ggml_time_us(); @@ -49,7 +49,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, struct cl return true; } -bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { +bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, int * nx_out, int * ny_out) { // Granite vision uses up to 10 patches + base patch int num_max_patches = 11; if (clip_is_minicpmv(ctx_clip)) { @@ -87,13 +87,16 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co } int n_img_pos; - if (!encode_image_with_clip(ctx_clip, n_threads, preprocessed_img.get(), image_embd, &n_img_pos)) { + int nx = 0, ny = 0; + if (!encode_image_with_clip(ctx_clip, n_threads, preprocessed_img.get(), image_embd, &n_img_pos, &nx, &ny)) { LOG_ERR("%s: cannot encode image, aborting\n", __func__); free(image_embd); return false; } *image_embd_out = image_embd; *n_img_pos_out = n_img_pos; + *nx_out = nx; + *ny_out = ny; return true; } diff --git a/tools/mtmd/llava.h b/tools/mtmd/llava.h index 9cdc7b68a..29e516aa5 100644 --- a/tools/mtmd/llava.h +++ b/tools/mtmd/llava.h @@ -30,7 +30,7 @@ struct llava_image_embed { struct mtmd_audio_mel; -LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); +LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, int * nx_out, int * ny_out); LLAVA_API bool audio_embd_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const mtmd_audio_mel & mel_spec, float ** image_embd_out, int * n_img_pos_out);