diff --git a/tools/mtmd/llava.cpp b/tools/mtmd/llava.cpp index 01cd4ba02..a50552d48 100644 --- a/tools/mtmd/llava.cpp +++ b/tools/mtmd/llava.cpp @@ -275,140 +275,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli const size_t n_imgs = clip_image_f32_batch_n_images(img_res_v.get()); - if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) { - std::vector image_embd_v; - image_embd_v.resize(n_imgs); - clip_image_size load_image_size; - - for (size_t i = 0; i < n_imgs; i++) { - const int64_t t_img_enc_step_start_us = ggml_time_us(); - int nx = clip_image_f32_batch_nx(img_res_v.get(), i); - int ny = clip_image_f32_batch_ny(img_res_v.get(), i); - image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, nx, ny)); - int patch_size = 14; - load_image_size.width = nx; - load_image_size.height = ny; - clip_add_load_image_size(ctx_clip, &load_image_size); - - bool encoded = false; - clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i); - if (clip_is_qwen2vl(ctx_clip)) { - encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]); - } - else { - encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(img_res, patch_size), image_embd_v[i]); - } - - if (!encoded) { - LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs); - return false; - } - const int64_t t_img_enc_steop_batch_us = ggml_time_us(); - LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)n_imgs, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); - } - const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); - - int n_img_pos_out = 0; - for (size_t i = 0; i < image_embd_v.size(); i++) { - int nx = clip_image_f32_batch_nx(img_res_v.get(), i); - int ny = clip_image_f32_batch_ny(img_res_v.get(), i); - clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i); - std::memcpy( - image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), - image_embd_v[i], - clip_embd_nbytes_by_img(ctx_clip, nx, ny)); - n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res); - } - *n_img_pos = n_img_pos_out; - for (size_t i = 0; i < image_embd_v.size(); i++) { - free(image_embd_v[i]); - } - image_embd_v.clear(); - load_image_size.width = img->nx; - load_image_size.height = img->ny; - clip_add_load_image_size(ctx_clip, &load_image_size); - LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size.width, load_image_size.height); - } - else if (clip_is_glm(ctx_clip)){ - struct clip_image_size * load_image_size = clip_image_size_init(); - load_image_size->width = clip_image_f32_batch_nx(img_res_v.get(), 0); - load_image_size->height = clip_image_f32_batch_ny(img_res_v.get(), 0); - clip_add_load_image_size(ctx_clip, load_image_size); - - clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0); - bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); - int pos = int(load_image_size->width/clip_get_patch_size(ctx_clip)/2); - *n_img_pos = (pos * pos + 2); - if (!encoded){ - LOG_ERR("Unable to encode image \n"); - return false; - } - } - else if (clip_is_pixtral(ctx_clip)){ - clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0); - *n_img_pos = clip_n_output_tokens(ctx_clip, img_res); - bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096 - if (!encoded) { - LOG_ERR("Unable to encode image\n"); - - return false; - } - } - else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { - // flat / default llava-1.5 type embedding - clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0); - *n_img_pos = clip_n_output_tokens(ctx_clip, img_res); - bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096 - if (!encoded) { - LOG_ERR("Unable to encode image\n"); - - return false; - } - } - else { - // spatial_unpad llava-1.6 type embedding - // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working - std::vector image_embd_v; - image_embd_v.resize(n_imgs); - for (size_t i = 0; i < n_imgs; i++) { - clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i); - image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 - const bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside - if (!encoded) { - LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs); - return false; - } - } - const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); - - const int32_t * image_grid = clip_image_grid(ctx_clip); - const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip); - - std::vector> grid_pinpoints; - for (size_t i = 0; i < num_gridpoints; i += 2) { - grid_pinpoints.push_back({image_grid[i], image_grid[i+1]}); - } - - const int32_t image_size = clip_get_image_size(ctx_clip); - - struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); - - int n_img_pos_out; - clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0); - clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input); - *n_img_pos = n_img_pos_out; - - for (size_t i = 0; i < image_embd_v.size(); i++) { - free(image_embd_v[i]); - } - image_embd_v.clear(); - - // debug image/segment/normalization content: - // clip_image_u8 * tmp = clip_image_u8_init(); - // clip_image_convert_f32_to_u8(*image_feature, *tmp); - // clip_image_save_to_bmp(*tmp, "image_feature.bmp"); + clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0); + *n_img_pos = clip_n_output_tokens(ctx_clip, img_res); + bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096 + if (!encoded) { + LOG_ERR("Unable to encode image\n"); + return false; } LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);