mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 08:00:25 +00:00
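Updated to handle the upstream removal of clip_is_mrope: the M-RoPE flag is now derived from the decoder's rope type and passed as a parameter to llava_image_embed_make_with_clip_img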
This commit is contained in:
parent
19a12bb080
commit
4629b49afb
5 changed files with 26 additions and 18 deletions
|
|
@ -3391,6 +3391,28 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_int
|
|||
int outrosize = media_outro.size();
|
||||
last_media_mem.clear();
|
||||
|
||||
bool clip_is_mrope = false;
|
||||
if(file_format == FileFormat::GGUF_GENERIC)
|
||||
{
|
||||
//added after https://github.com/ggml-org/llama.cpp/pull/22161, replacing clip_is_mrope function
|
||||
auto decoder_rope_type = llama_model_rope_type(llama_get_model(llama_ctx_v4));
|
||||
switch (decoder_rope_type) {
|
||||
case LLAMA_ROPE_TYPE_NORM:
|
||||
case LLAMA_ROPE_TYPE_NEOX:
|
||||
{
|
||||
clip_is_mrope = false;
|
||||
} break;
|
||||
case LLAMA_ROPE_TYPE_MROPE:
|
||||
case LLAMA_ROPE_TYPE_IMROPE:
|
||||
{
|
||||
clip_is_mrope = true;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
printf("\nWARNING: clip unsupported decoder rope type: %d\n", decoder_rope_type);
|
||||
}
|
||||
}
|
||||
|
||||
for(int i=0;i<media_objects.size();++i)
|
||||
{
|
||||
std::string media_obj = media_objects[i].b64data;
|
||||
|
|
@ -3410,7 +3432,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_int
|
|||
printf("\nCreating clip image embed...");
|
||||
}
|
||||
media_chunk chunk;
|
||||
if (!llava_image_embed_make_with_clip_img(clp_ctx_v, kcpp_data->n_threads, clp_img_data, &chunk.clp_img_embd, &chunk.clp_image_tokens, &chunk.nx, &chunk.ny)) {
|
||||
if (!llava_image_embed_make_with_clip_img(clp_ctx_v, kcpp_data->n_threads, clp_img_data, &chunk.clp_img_embd, &chunk.clp_image_tokens, &chunk.nx, &chunk.ny, clip_is_mrope)) {
|
||||
printf("\nError: Clip image %d failed to create embd!",i);
|
||||
}
|
||||
if(debugmode==1 && !is_quiet)
|
||||
|
|
|
|||
|
|
@ -4407,19 +4407,6 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
|
|||
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
|
||||
}
|
||||
|
||||
bool clip_is_mrope(const struct clip_ctx * ctx) { //kcpp: this was removed in https://github.com/ggml-org/llama.cpp/pull/18793 and moved to mtmd_decode_use_mrope
|
||||
switch (ctx->proj_type()) {
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool clip_is_llava(const struct clip_ctx * ctx) {
|
||||
return ctx->model.hparams.has_llava_projector;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -113,7 +113,6 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct
|
|||
int clip_is_minicpmv(const struct clip_ctx * ctx);
|
||||
bool clip_is_glm(const struct clip_ctx * ctx);
|
||||
bool clip_is_gemma4(const struct clip_ctx * ctx);
|
||||
bool clip_is_mrope(const struct clip_ctx * ctx);
|
||||
bool clip_is_llava(const struct clip_ctx * ctx);
|
||||
void set_clip_uses_gpu(bool usegpu);
|
||||
int clip_get_projector_type_ext(clip_ctx * ctx);
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, struct cl
|
|||
return true;
|
||||
}
|
||||
|
||||
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, int * nx_out, int * ny_out) {
|
||||
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, int * nx_out, int * ny_out, bool clip_is_mrope) {
|
||||
// Granite vision uses up to 10 patches + base patch
|
||||
int num_max_patches = 11;
|
||||
if (clip_is_minicpmv(ctx_clip)) {
|
||||
|
|
@ -65,7 +65,7 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
|
|||
return false;
|
||||
}
|
||||
|
||||
if (clip_is_mrope(ctx_clip)) {
|
||||
if (clip_is_mrope) {
|
||||
// qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
|
||||
//sometimes they resize the image LARGER than before (padding up), so we must account for that
|
||||
int max_nx = img->nx;
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ struct llava_image_embed {
|
|||
struct mtmd_audio_mel;
|
||||
|
||||
|
||||
LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, int * nx_out, int * ny_out);
|
||||
LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, int * nx_out, int * ny_out, bool clip_is_mrope);
|
||||
|
||||
LLAVA_API bool audio_embd_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const mtmd_audio_mel & mel_spec, float ** image_embd_out, int * n_img_pos_out);
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue