Add a Pixtral code path to the llava example.

- clip.cpp / clip.h: new predicate clip_is_pixtral(), true when
  ctx->proj_type == PROJECTOR_TYPE_PIXTRAL.
- llava.cpp: encode_image_with_clip() gains a Pixtral branch that takes the
  image at index 0 of img_res_v, sets *n_img_pos from clip_n_patches_by_img(),
  and encodes that image into image_embd.

NOTE(review): line breaks and indentation were restored from a copy that had
been collapsed onto a single line; 4-space indentation is assumed. Confirm
against the target tree (e.g. `git apply --check`) before applying. The
comment on the clip_image_encode() line was reworded, so the post-image blob
hash on the llava.cpp index line may no longer match.

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 92f3550be..dbd77b6c1 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -3560,6 +3560,10 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {
     return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
 }
 
+bool clip_is_pixtral(const struct clip_ctx * ctx) {
+    return ctx->proj_type == PROJECTOR_TYPE_PIXTRAL;
+}
+
 // Determine the number of encoder layers to iterate over
 int get_deepest_feature_layer(const struct clip_ctx * ctx) {
     // Get the index of the second to last layer; this is the
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index 3c3a37db2..f80e7a30c 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -113,6 +113,7 @@ CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_pixtral(const struct clip_ctx * ctx);
 
 CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
 
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index fcb7749ad..080a438f0 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -340,6 +340,16 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
             return false;
         }
     }
+    else if (clip_is_pixtral(ctx_clip)){
+        clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
+        *n_img_pos = clip_n_patches_by_img(ctx_clip, img_res);
+        bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // encode this single image into image_embd
+        if (!encoded) {
+            LOG_ERR("Unable to encode image\n");
+
+            return false;
+        }
+    }
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
         *n_img_pos = clip_n_patches(ctx_clip);