diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 106d3bd48..074d16e31 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -852,10 +852,6 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
         inp = ggml_add(ctx0, inp, inp_1);
 
-        // ggml_build_forward_expand(gf, inp);
-        // ggml_free(ctx0);
-        // return gf;
-
         inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3));  // [w, h, c, b] -> [c, w, h, b]
         inp = ggml_reshape_4d(
             ctx0, inp,
@@ -867,10 +863,6 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         inp = ggml_reshape_3d(
             ctx0, inp,
             hidden_size, patches_w * patches_h, batch_size);
-
-        // ggml_build_forward_expand(gf, inp);
-        // ggml_free(ctx0);
-        // return gf;
     }
     else {
         inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
@@ -959,18 +951,6 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         embeddings = ggml_reshape_2d(ctx0, embeddings, hidden_size * 4, patches_w * patches_h * batch_size / 4);
         embeddings = ggml_get_rows(ctx0, embeddings, inv_window_idx);
         embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, patches_w * patches_h, batch_size);
-
-        // positions = ggml_reshape_2d(ctx0, positions, num_position_ids / 4, 4);
-        // positions = ggml_cont(ctx0, ggml_permute(ctx0, positions, 1, 0, 2, 3));
-        // positions = ggml_reshape_2d(ctx0, positions, 16, num_position_ids / 16);
-        // positions = ggml_get_rows(ctx0, positions, inv_window_idx);
-        // positions = ggml_reshape_2d(ctx0, positions, 4, num_position_ids / 4);
-        // positions = ggml_cont(ctx0, ggml_permute(ctx0, positions, 1, 0, 2, 3));
-        // positions = ggml_reshape_1d(ctx0, positions, num_position_ids);
-
-        // ggml_build_forward_expand(gf, embeddings);
-        // ggml_free(ctx0);
-        // return gf;
     }
 
     for (int il = 0; il < ctx->max_feature_layer; il++) {
@@ -994,12 +974,6 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
             cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w),
                            model.layers[il].ln_1_b);
         }
-        // if ( il == 0) {
-        //     // build the graph
-        //     ggml_build_forward_expand(gf, cur);
-        //     ggml_free(ctx0);
-        //     return gf;
-        // }
 
         // self-attention
        {
@@ -1042,17 +1016,10 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
                 KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
             } else {
                 KQ = ggml_soft_max_ext(ctx0, KQ, window_mask, 1.0f, 0.0f);
-                // KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrt((float)d_head));
                 // KQ = ggml_add(ctx0, KQ, window_mask);
                 // KQ = ggml_soft_max_inplace(ctx0, KQ);
             }
-
-            // if ( il == 0) {
-            //     // build the graph
-            //     ggml_build_forward_expand(gf, KQ);
-            //     ggml_free(ctx0);
-            //     return gf;
-            // }
 
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
             KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
@@ -1068,12 +1035,6 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         cur = ggml_add(ctx0, cur, embeddings);
         embeddings = cur; // embeddings = residual, cur = hidden_states
 
-        // if ( il == 0) {
-        //     // build the graph
-        //     ggml_build_forward_expand(gf, cur);
-        //     ggml_free(ctx0);
-        //     return gf;
-        // }
 
         // layernorm2
         if (ctx->use_rms_norm) {
@@ -1125,19 +1086,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         cur = ggml_add(ctx0, embeddings, cur);
 
         embeddings = cur;
-
-        // if ( il == 0) {
-        //     // build the graph
-        //     ggml_build_forward_expand(gf, embeddings);
-        //     ggml_free(ctx0);
-        //     return gf;
-        // }
     }
 
-    // ggml_build_forward_expand(gf, embeddings);
-    // ggml_free(ctx0);
-    // return gf;
-
     // post-layernorm
     if (model.post_ln_w) {
         if (ctx->use_rms_norm) {
@@ -3142,9 +3092,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         }
 
         struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
-
-        // const int pw = image_size_width / patch_size;
-        // const int ph = image_size_height / patch_size;
         const int mpow = (merge_ratio * merge_ratio);
         int* positions_data = (int*)malloc(ggml_nbytes(positions));
@@ -3157,6 +3104,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 for (int dx = 0; dx < 2; dx++) {
                     auto remap = idx[ptr / mpow];
                     remap = remap * mpow + (ptr % mpow);
+                    // auto remap = ptr;
 
                     positions_data[remap] = y + dy;
                     positions_data[num_patches + remap] = x + dx;
@@ -3168,7 +3116,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
            }
        }
 
-        if (positions) ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+        ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
        free(positions_data);
    }
    else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
diff --git a/examples/llava/qwen2_vl_surgery.py b/examples/llava/qwen2_vl_surgery.py
index 8f7a94e5c..9d4ad8932 100644
--- a/examples/llava/qwen2_vl_surgery.py
+++ b/examples/llava/qwen2_vl_surgery.py
@@ -102,7 +102,7 @@ def main(args):
         np_dtype = np.float32
         ftype = 0
     elif args.data_type == 'fp16':
-        dtype = torch.float32
+        dtype = torch.float16
         np_dtype = np.float16
         ftype = 1
     else:
diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp
index f95677eef..4598fab25 100644
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -771,10 +771,10 @@ enum model_output_type {
 };
 
 static void debug_dump_img_embed(struct llava_context * ctx_llava, model_output_type output_type) {
-    int ih = 140;
-    int iw = 196;
-    // int ih = 56;
-    // int iw = 56;
+    constexpr int ih = 140;
+    constexpr int iw = 196;
+    // constexpr int ih = 56;
+    // constexpr int iw = 56;
     // int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
     int n_embd = 1280;
     int merge = 1;
@@ -954,7 +954,7 @@ int main(int argc, char ** argv) {
 
        // debug_test_mrope_2d();
        debug_dump_img_embed(ctx_llava, model_output_type::final_layer);
-        // debug_dump_img_embed(ctx_llava, model_output_type::conv3d);
+        // debug_dump_img_embed(ctx_llava, model_output_type::last_attn_layer);
        // debug_test_get_rows();
        // dump_win_attn_mask();
        // debug_patch_layout();