diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp
index eca7b7f10..a870bc4be 100644
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -23,6 +23,9 @@
 #include <algorithm>
 #include <iostream>
 #include <fstream>
+#include <cassert>
+#include <cmath>
+#include <limits>
 
 static bool qwen2vl_eval_image_embed(llama_context * ctx_llama,
                                      const struct llava_image_embed * image_embed,
@@ -483,33 +486,431 @@ static void debug_test_mrope_2d() {
     ggml_backend_free(backend);
 }
 
-static void debug_dump_img_embed(struct llava_context * ctx_llava) {
-    int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
-    int ne = n_embd * 4;
-    float vals[56 * 56 * 3];
+static void debug_patch_layout() {
+    // 1. Initialize backend
+    ggml_backend_t backend = NULL;
+    std::string backend_name = "";
+#ifdef GGML_USE_CUDA
+    fprintf(stderr, "%s: using CUDA backend\n", __func__);
+    backend = ggml_backend_cuda_init(0); // init device 0
+    backend_name = "cuda";
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+    }
+#endif
+    // if there is no GPU backend, fall back to the CPU backend
+    if (!backend) {
+        backend = ggml_backend_cpu_init();
+        backend_name = "cpu";
+    }
+
+    // Calculate the size needed to allocate
+    size_t ctx_size = 0;
+    ctx_size += 2 * ggml_tensor_overhead(); // tensors
+    // no need to allocate anything else!
+
+    // 2. Allocate `ggml_context` to store tensor data
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
+    };
+    struct ggml_context * ctx = ggml_init(params);
+
+    const int patches_w  = 14;
+    const int patches_h  = 10;
+    const int c          = 2;
+    const int batch_size = 1;
+    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, patches_w, patches_h, c, batch_size);
+    ggml_set_name(inp_raw, "inp_raw");
+    ggml_set_input(inp_raw);
+
+    std::vector<float> dummy_q;
+    dummy_q.resize(patches_w * patches_h * c * batch_size);
+    for (int i = 0; i < patches_h * patches_w * c; i++) {
+        dummy_q[i] = i;
+    }
+    // std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
+    // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
+
+    // 4. Allocate a `ggml_backend_buffer` to store all tensors
+    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+
+    // 5. Copy tensor data from main memory (RAM) to backend buffer
+    ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
+
+    // 6. Create a `ggml_cgraph` for the permute/reshape computation
+    struct ggml_cgraph  * gf   = NULL;
+    struct ggml_context * ctx0 = NULL;
+
+    // create a temporary context to build the graph
+    struct ggml_init_params params0 = {
+        /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
+    };
+    ctx0 = ggml_init(params0);
+    gf   = ggml_new_graph(ctx0);
+
+    /*
+        Compute graph
+    */
+    struct ggml_tensor * inp = ggml_cont(ctx0, ggml_permute(ctx0, inp_raw, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
+
+    inp = ggml_reshape_4d(
+        ctx0, inp,
+        c * 2, patches_w / 2, patches_h, batch_size);
+    inp = ggml_reshape_4d(
+        ctx0, inp,
+        c * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
+    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
+    inp = ggml_reshape_3d(
+        ctx0, inp,
+        c, patches_w * patches_h, batch_size);
+
+    // Add "result" tensor and all of its dependencies to the cgraph
+    ggml_build_forward_expand(gf, inp);
+
+    // 7. Create a `ggml_gallocr` for cgraph computation
+    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+    ggml_gallocr_alloc_graph(allocr, gf);
+
+    // 9. Run the computation
+    int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
+    if (ggml_backend_is_cpu(backend)) {
+        ggml_backend_cpu_set_n_threads(backend, n_threads);
+    }
+    ggml_backend_graph_compute(backend, gf);
+
+    // 10. Retrieve results (output tensors)
+    // in this example, the output tensor is always the last tensor in the graph
+    struct ggml_tensor * result = inp;
+    // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
+    float * result_data = (float *)malloc(ggml_nbytes(result));
+    // because the tensor data is stored in a device buffer, we need to copy it back to RAM
+    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
+    const std::string bin_file = "patch_layout_" + backend_name + ".bin";
+    std::ofstream outFile(bin_file, std::ios::binary);
+
+    if (outFile.is_open()) {
+        outFile.write(reinterpret_cast<char *>(result_data), ggml_nbytes(result));
+        outFile.close();
+        std::cout << "Data successfully written to " + bin_file << std::endl;
+    } else {
+        std::cerr << "Error opening file!" << std::endl;
+    }
+
+    free(result_data);
+    // 11. Free memory and exit
+    ggml_free(ctx0);
+    ggml_gallocr_free(allocr);
+    ggml_free(ctx);
+    ggml_backend_buffer_free(buffer);
+    ggml_backend_free(backend);
+}
+
+static void debug_test_get_rows() {
+    // 1. Initialize backend
+    ggml_backend_t backend = NULL;
+    std::string backend_name = "";
+#ifdef GGML_USE_CUDA
+    fprintf(stderr, "%s: using CUDA backend\n", __func__);
+    backend = ggml_backend_cuda_init(0); // init device 0
+    backend_name = "cuda";
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+    }
+#endif
+    // if there is no GPU backend, fall back to the CPU backend
+    if (!backend) {
+        backend = ggml_backend_cpu_init();
+        backend_name = "cpu";
+    }
+
+    // Calculate the size needed to allocate
+    size_t ctx_size = 0;
+    ctx_size += 128 * ggml_tensor_overhead(); // tensors
+    // no need to allocate anything else!
+
+    // 2. Allocate `ggml_context` to store tensor data
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
+    };
+    struct ggml_context * ctx = ggml_init(params);
+
+    const int tokens = 30;
+    struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 3, tokens * 2);
+    ggml_set_name(inp_raw, "inp_raw");
+    ggml_set_input(inp_raw);
+
+    struct ggml_tensor * pos = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 4, tokens);
+    // struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens * 4);
+    ggml_set_name(pos, "pos");
+    ggml_set_input(pos);
+
+    struct ggml_tensor * ind = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens);
+    ggml_set_name(ind, "ind");
+    ggml_set_input(ind);
+
+    struct ggml_tensor * ind_2d = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 1, tokens);
+    ggml_set_name(ind_2d, "ind_2d");
+    ggml_set_input(ind_2d);
+
+    std::vector<float> dummy_q;
+    dummy_q.resize(128 * 3 * inp_raw->ne[2]);
+    for (int i = 0; i < inp_raw->ne[2]; i++) {
+        for (int j = 0; j < 3; j++) {
+            int offset = i * 128 * 3 + j * 128;
+            std::fill(dummy_q.begin() + offset, dummy_q.begin() + offset + 128, 0.1 * i);
+        }
+    }
+    // std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
+    // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
+
+    std::vector<int> pos_id;
+    pos_id.resize(tokens * 4);
+    for (int i = 0; i < tokens; i++) {
+        pos_id[i]              = i;
+        pos_id[i + tokens * 1] = i + 10;
+        pos_id[i + tokens * 2] = i + 20;
+        pos_id[i + tokens * 3] = i + 30;
+    }
+
+    std::vector<int> remap_ind;
+    remap_ind.resize(tokens * 4);
+    for (int i = 0; i < tokens; i++) {
+        remap_ind[i] = tokens - i - 1;
+    }
+
+    // 4. Allocate a `ggml_backend_buffer` to store all tensors
+    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+
+    // 5. Copy tensor data from main memory (RAM) to backend buffer
+    ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
+    ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos));
+    ggml_backend_tensor_set(ind, remap_ind.data(), 0, ggml_nbytes(ind));
+    ggml_backend_tensor_set(ind_2d, remap_ind.data(), 0, ggml_nbytes(ind_2d));
+
+    // 6. Create a `ggml_cgraph` for the get_rows computation
+    struct ggml_cgraph  * gf         = NULL;
+    struct ggml_context * ctx_cgraph = NULL;
+
+    // create a temporary context to build the graph
+    struct ggml_init_params params0 = {
+        /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
+    };
+    ctx_cgraph = ggml_init(params0);
+    gf = ggml_new_graph(ctx_cgraph);
+
+    // ne = [128, 1, 30, 1]
+    auto x = ggml_reshape_2d(ctx_cgraph, inp_raw, 128 * 3 * 2, tokens);
+    struct ggml_tensor * result0 = ggml_get_rows(ctx_cgraph, x, ind);
+    result0 = ggml_reshape_3d(ctx_cgraph, result0, 128, 3, tokens * 2);
+
+    struct ggml_tensor * result1 = ggml_get_rows(ctx_cgraph, pos, ind);
+
+    // Add "result" tensors and all of their dependencies to the cgraph
+    ggml_build_forward_expand(gf, result0);
+    ggml_build_forward_expand(gf, result1);
+
+    // 7. Create a `ggml_gallocr` for cgraph computation
+    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+    ggml_gallocr_alloc_graph(allocr, gf);
+
+    // 9. Run the computation
+    int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
+    if (ggml_backend_is_cpu(backend)) {
+        ggml_backend_cpu_set_n_threads(backend, n_threads);
+    }
+    ggml_backend_graph_compute(backend, gf);
+
+    // 10. Retrieve results (output tensors)
+    // in this example, the output tensor is always the last tensor in the graph
+    struct ggml_tensor * result = result0;
+    // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
+    float * result_data = (float *)malloc(ggml_nbytes(result));
+    // because the tensor data is stored in a device buffer, we need to copy it back to RAM
+    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
+    const std::string bin_file = "getrows_" + backend_name + "_0.bin";
+    std::ofstream outFile(bin_file, std::ios::binary);
+
+    if (outFile.is_open()) {
+        outFile.write(reinterpret_cast<char *>(result_data), ggml_nbytes(result));
+        outFile.close();
+        std::cout << "Data successfully written to " + bin_file << std::endl;
+    } else {
+        std::cerr << "Error opening file!" << std::endl;
+    }
+
+    free(result_data);
+    // 11. Free memory and exit
+    ggml_free(ctx_cgraph);
+    ggml_gallocr_free(allocr);
+    ggml_free(ctx);
+    ggml_backend_buffer_free(buffer);
+    ggml_backend_free(backend);
+}
+
+
+enum model_output_type {
+    conv3d,
+    patch_embed,
+    patch_win_attn_scatter,
+    first_attn_layer,
+    last_attn_layer,
+    attn_softmax,
+    final_layer,
+};
+
+static void debug_dump_img_embed(struct llava_context * ctx_llava, model_output_type output_type) {
+    const int ih = 140;
+    const int iw = 196;
+    // const int ih = 56;
+    // const int iw = 56;
+    // int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
+    int n_embd = 1280;
+    int merge  = 1;
+    if (output_type == model_output_type::final_layer) {
+        n_embd = 2048;
+        merge  = 2;
+    } else if (output_type == model_output_type::attn_softmax) {
+        merge  = 1;
+        n_embd = (ih/14/merge) * (iw/14/merge) * 16;
+    }
+
+    int ne = (ih/14/merge) * (iw/14/merge) * n_embd;
+    float vals[iw * ih * 3];
     // float embd[ne];
     std::vector<float> embd;
     embd.resize(ne);
 
-    for (int i = 0; i < 56*56; i++)
+    for (int i = 0; i < iw*ih; i++)
     {
         for (int c = 0; c < 3; c++)
-            vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56);
+            vals[i * 3 + c] = (float)i / (iw*ih);
     }
 
-    clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data());
+    clip_encode_float_image(ctx_llava->ctx_clip, 8, vals, ih, iw, embd.data());
 
-    std::ofstream outFile("img_embed.bin", std::ios::binary);
+    std::string file_postfix = "";
+    switch (output_type) {
+        case model_output_type::conv3d:
+            file_postfix = "conv3d";
+            break;
+        case model_output_type::patch_embed:
+            file_postfix = "patch_embed";
+            break;
+        case model_output_type::patch_win_attn_scatter:
+            file_postfix = "scatter";
+            break;
+        case model_output_type::first_attn_layer:
+            file_postfix = "first_attn";
+            break;
+        case model_output_type::last_attn_layer:
+            file_postfix = "last_attn";
+            break;
+        case model_output_type::attn_softmax:
+            file_postfix = "attn_softmax";
+            break;
+        case model_output_type::final_layer:
+            file_postfix = "final";
+            break;
+        default:
+            break;
+    }
+    auto output_path = "img_embed_" + file_postfix + ".bin";
+
+    std::ofstream outFile(output_path, std::ios::binary);
     if (outFile.is_open()) {
         outFile.write(reinterpret_cast<char*>(embd.data()), ne * sizeof(float));
         outFile.close();
-        std::cout << "Data successfully written to mrope.bin" << std::endl;
+        std::cout << "Data successfully written to " << output_path << std::endl;
     } else {
         std::cerr << "Error opening file!" << std::endl;
     }
 }
+
+static void dump_win_attn_mask() {
+    const int image_size_width  = 196;
+    const int image_size_height = 140;
+    const int patch_size        = 14;
+    const int attn_window_size  = 112;
+
+    const int merge_ratio = 2;
+    const int ipw = image_size_width  / patch_size;
+    const int iph = image_size_height / patch_size;
+    const int pw  = image_size_width  / patch_size / merge_ratio;
+    const int ph  = image_size_height / patch_size / merge_ratio;
+    const int grid_window = attn_window_size / patch_size / merge_ratio;
+    /*
+        pw * ph   = number of tokens output by the ViT after applying the patch merger
+        ipw * iph = number of vision tokens processed inside the ViT
+    */
+
+    std::vector<int> idx(ph * pw);
+    std::vector<int> inv_idx(ph * pw);
+    int dst = 0;
+    // [num_vision_tokens, num_vision_tokens] attention mask tensor
+    int ne = pow(ipw * iph, 2);
+    std::vector<float> mask(ne, std::numeric_limits<float>::lowest());
+    int mask_row = 0;
+
+    for (int y = 0; y < ph; y += grid_window) {
+        for (int x = 0; x < pw; x += grid_window) {
+            const int win_h = std::min(grid_window, ph - y);
+            const int win_w = std::min(grid_window, pw - x);
+            const int dst_0 = dst;
+            // group all tokens belonging to the same window together (into a contiguous range)
+            for (int dy = 0; dy < win_h; dy++) {
+                for (int dx = 0; dx < win_w; dx++) {
+                    const int src = (y + dy) * pw + (x + dx);
+                    assert(src < (int)idx.size());
+                    assert(dst < (int)inv_idx.size());
+                    idx[src]     = dst;
+                    inv_idx[dst] = src;
+                    dst++;
+                }
+            }
+
+            for (int r = 0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
+                int row_offset = mask_row * (ipw * iph);
+                std::fill(
+                    mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
+                    mask.begin() + row_offset + (dst   * merge_ratio * merge_ratio),
+                    0.0);
+                mask_row++;
+            }
+        }
+    }
+
+    auto output_path = "win_attn_mask_fp32.bin";
+
+    std::ofstream outFile(output_path, std::ios::binary);
+    if (outFile.is_open()) {
+        outFile.write(reinterpret_cast<char *>(mask.data()), ne * sizeof(float));
+        outFile.close();
+        std::cout << "Data successfully written to " << output_path << std::endl;
+    } else {
+        std::cerr << "Error opening file!" << std::endl;
+    }
+}
+
+
 #endif
@@ -551,8 +952,12 @@ int main(int argc, char ** argv) {
     } else if (params.image[0].empty()) {
         auto ctx_llava = llava_init_context(&params, model);
 
-        debug_test_mrope_2d();
-        debug_dump_img_embed(ctx_llava);
+        // debug_test_mrope_2d();
+        debug_dump_img_embed(ctx_llava, model_output_type::final_layer);
+        // debug_dump_img_embed(ctx_llava, model_output_type::conv3d);
+        // debug_test_get_rows();
+        // dump_win_attn_mask();
+        // debug_patch_layout();
         llama_perf_context_print(ctx_llava->ctx_llama);
 
         ctx_llava->model = NULL;
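
Note: the debug helpers in this patch all dump raw float32 buffers with no header. A minimal standalone reader along the lines of the sketch below (not part of the diff; the file name, the `read_dump.cpp` name, and the `main()` wrapper are only illustrative) can be used to sanity-check a dump or to compare the CPU and CUDA outputs, assuming the file contains plain little-endian floats exactly as written by the helpers above.

    // read_dump.cpp -- hypothetical helper for inspecting the raw float32 dumps; not part of this patch
    #include <cstdio>
    #include <fstream>
    #include <vector>

    int main(int argc, char ** argv) {
        const char * path = argc > 1 ? argv[1] : "img_embed_final.bin"; // example file name
        std::ifstream in(path, std::ios::binary | std::ios::ate);
        if (!in.is_open()) {
            std::fprintf(stderr, "failed to open %s\n", path);
            return 1;
        }
        const std::streamsize nbytes = in.tellg();   // file size in bytes
        in.seekg(0);
        std::vector<float> vals(nbytes / sizeof(float));
        in.read(reinterpret_cast<char *>(vals.data()), nbytes);

        double sum = 0.0;
        for (float v : vals) {
            sum += v;
        }
        std::printf("%s: %zu floats, first = %f, mean = %f\n",
                    path, vals.size(),
                    vals.empty() ? 0.0 : (double) vals[0],
                    vals.empty() ? 0.0 : sum / (double) vals.size());
        return 0;
    }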