mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-09 19:46:11 +00:00
hey what do you know, it worked
This commit is contained in:
parent
d04b4eeb04
commit
c0edde61c5
1 changed files with 6 additions and 134 deletions
|
|
@ -275,140 +275,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||
|
||||
const size_t n_imgs = clip_image_f32_batch_n_images(img_res_v.get());
|
||||
|
||||
if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
|
||||
std::vector<float *> image_embd_v;
|
||||
image_embd_v.resize(n_imgs);
|
||||
clip_image_size load_image_size;
|
||||
|
||||
for (size_t i = 0; i < n_imgs; i++) {
|
||||
const int64_t t_img_enc_step_start_us = ggml_time_us();
|
||||
int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
|
||||
int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
|
||||
image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, nx, ny));
|
||||
int patch_size = 14;
|
||||
load_image_size.width = nx;
|
||||
load_image_size.height = ny;
|
||||
clip_add_load_image_size(ctx_clip, &load_image_size);
|
||||
|
||||
bool encoded = false;
|
||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
|
||||
if (clip_is_qwen2vl(ctx_clip)) {
|
||||
encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]);
|
||||
}
|
||||
else {
|
||||
encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(img_res, patch_size), image_embd_v[i]);
|
||||
}
|
||||
|
||||
if (!encoded) {
|
||||
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
|
||||
return false;
|
||||
}
|
||||
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
|
||||
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)n_imgs, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
|
||||
}
|
||||
const int64_t t_img_enc_batch_us = ggml_time_us();
|
||||
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||
|
||||
int n_img_pos_out = 0;
|
||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
||||
int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
|
||||
int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
|
||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
|
||||
std::memcpy(
|
||||
image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
|
||||
image_embd_v[i],
|
||||
clip_embd_nbytes_by_img(ctx_clip, nx, ny));
|
||||
n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
|
||||
}
|
||||
*n_img_pos = n_img_pos_out;
|
||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
||||
free(image_embd_v[i]);
|
||||
}
|
||||
image_embd_v.clear();
|
||||
load_image_size.width = img->nx;
|
||||
load_image_size.height = img->ny;
|
||||
clip_add_load_image_size(ctx_clip, &load_image_size);
|
||||
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size.width, load_image_size.height);
|
||||
}
|
||||
else if (clip_is_glm(ctx_clip)){
|
||||
struct clip_image_size * load_image_size = clip_image_size_init();
|
||||
load_image_size->width = clip_image_f32_batch_nx(img_res_v.get(), 0);
|
||||
load_image_size->height = clip_image_f32_batch_ny(img_res_v.get(), 0);
|
||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
||||
|
||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
|
||||
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
|
||||
int pos = int(load_image_size->width/clip_get_patch_size(ctx_clip)/2);
|
||||
*n_img_pos = (pos * pos + 2);
|
||||
if (!encoded){
|
||||
LOG_ERR("Unable to encode image \n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else if (clip_is_pixtral(ctx_clip)){
|
||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
|
||||
*n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
|
||||
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
|
||||
if (!encoded) {
|
||||
LOG_ERR("Unable to encode image\n");
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
|
||||
// flat / default llava-1.5 type embedding
|
||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
|
||||
*n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
|
||||
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
|
||||
if (!encoded) {
|
||||
LOG_ERR("Unable to encode image\n");
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// spatial_unpad llava-1.6 type embedding
|
||||
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
|
||||
std::vector<float *> image_embd_v;
|
||||
image_embd_v.resize(n_imgs);
|
||||
for (size_t i = 0; i < n_imgs; i++) {
|
||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
|
||||
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
|
||||
const bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
|
||||
if (!encoded) {
|
||||
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
const int64_t t_img_enc_batch_us = ggml_time_us();
|
||||
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||
|
||||
const int32_t * image_grid = clip_image_grid(ctx_clip);
|
||||
const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
|
||||
|
||||
std::vector<std::pair<int, int>> grid_pinpoints;
|
||||
for (size_t i = 0; i < num_gridpoints; i += 2) {
|
||||
grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
|
||||
}
|
||||
|
||||
const int32_t image_size = clip_get_image_size(ctx_clip);
|
||||
|
||||
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
|
||||
|
||||
int n_img_pos_out;
|
||||
clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
|
||||
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
|
||||
*n_img_pos = n_img_pos_out;
|
||||
|
||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
||||
free(image_embd_v[i]);
|
||||
}
|
||||
image_embd_v.clear();
|
||||
|
||||
// debug image/segment/normalization content:
|
||||
// clip_image_u8 * tmp = clip_image_u8_init();
|
||||
// clip_image_convert_f32_to_u8(*image_feature, *tmp);
|
||||
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
|
||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
|
||||
*n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
|
||||
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
|
||||
if (!encoded) {
|
||||
LOG_ERR("Unable to encode image\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue