Mirror of https://github.com/Lizonghang/prima.cpp.git, synced 2025-09-06 18:09:50 +00:00
add llama_model_n_flops
commit 477ecf2084 (parent 10f6f92c7e)
4 changed files with 445 additions and 107 deletions
@@ -841,6 +841,8 @@ static void llama_assign_n_layer_window(
         return;
     }
 
+    (void)my_rank;
+
     std::fill_n(n_layer_window, n_world, DEFAULT_N_LAYER_WINDOW);
 }
 
@@ -894,7 +896,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
     device_info dev_info;
     dev_info.rank = params.rank;
-    llama_profile_device(&dev_info, model, params.model.c_str(), params.cpuparams.n_threads);
+    llama_profile_device(&dev_info, model, ml, params.model.c_str(), params.cpuparams.n_threads);
 
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
@@ -67,6 +67,21 @@ struct device_info {
         : rank(0), device_name(""), disk_read_bandwidth(0.0f), cpu_props(), memory(), gpu_support(), gpu_props() {}
 };
 
+struct flops_info {
+    // model flops
+    int64_t input_flops;
+    int64_t output_flops;
+    int64_t layer_flops;
+
+    // model params
+    int64_t input_params;
+    int64_t output_params;
+    int64_t layer_params;
+
+    flops_info()
+        : input_flops(0), output_flops(0), layer_flops(0), input_params(0), output_params(0), layer_params(0) {}
+};
+
 enum profiler_backend_type {
     PROFILER_BACKEND_TYPE_CPU   = 0,
     PROFILER_BACKEND_TYPE_METAL = 1,
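For reference, the three *_flops / *_params pairs describe the input embedding, a single transformer layer, and the output head separately. A minimal sketch of how a caller could combine them into whole-model totals, assuming the per-layer counts are uniform across all layers (the helper functions below are illustrative and are not part of this commit):

    // Illustrative only: aggregate flops_info into whole-model totals.
    // layer_flops/layer_params are measured for one transformer layer,
    // so they are scaled by the number of layers.
    static int64_t flops_info_total_flops(const struct flops_info & ffo, int64_t n_layer) {
        return ffo.input_flops + ffo.output_flops + n_layer * ffo.layer_flops;
    }

    static int64_t flops_info_total_params(const struct flops_info & ffo, int64_t n_layer) {
        return ffo.input_params + ffo.output_params + n_layer * ffo.layer_params;
    }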
@@ -410,7 +410,13 @@ extern "C" {
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(void);
 
-    LLAMA_API void llama_profile_device (struct device_info * dev_info, struct llama_model * model, const char * test_file, int n_threads);
+    LLAMA_API void llama_profile_device(
+                        struct device_info * dev_info,
+                        struct llama_model * model,
+                        struct llama_model_loader * ml,
+                        const char * test_file,
+                        int n_threads);
 
     LLAMA_API ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
 
     //optional:
@@ -518,6 +524,14 @@ extern "C" {
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+    // Return the total number of float operations in the model
+    LLAMA_API void llama_model_n_flops(
+                        struct llama_model * model,
+                        struct llama_model_loader * ml,
+                        struct flops_info * ffo,
+                        const int64_t n_input,
+                        const int64_t n_history);
+
     // Get a llama model tensor
     LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
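A minimal usage sketch of the new API, mirroring the call made from llama_profile_device later in this diff; model and ml are assumed to come from the regular model-loading path:

    struct flops_info ffo = flops_info{};
    // count FLOPs and parameters for one new token (n_input = 1)
    // with 10 tokens already in the KV cache (n_history = 10)
    llama_model_n_flops(model, ml, &ffo, 1, 10);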
src/llama.cpp (517 changes)
@@ -2880,21 +2880,21 @@ struct llama_model {
     llama_vocab vocab;
 
     // TODO: should init all tensors to nullptr
-    struct ggml_tensor * tok_embd;
-    struct ggml_tensor * type_embd;
-    struct ggml_tensor * pos_embd;
-    struct ggml_tensor * tok_norm;
-    struct ggml_tensor * tok_norm_b;
+    struct ggml_tensor * tok_embd = nullptr;
+    struct ggml_tensor * type_embd = nullptr;
+    struct ggml_tensor * pos_embd = nullptr;
+    struct ggml_tensor * tok_norm = nullptr;
+    struct ggml_tensor * tok_norm_b = nullptr;
 
-    struct ggml_tensor * output_norm;
-    struct ggml_tensor * output_norm_b;
-    struct ggml_tensor * output;
-    struct ggml_tensor * output_b;
-    struct ggml_tensor * output_norm_enc;
+    struct ggml_tensor * output_norm = nullptr;
+    struct ggml_tensor * output_norm_b = nullptr;
+    struct ggml_tensor * output = nullptr;
+    struct ggml_tensor * output_b = nullptr;
+    struct ggml_tensor * output_norm_enc = nullptr;
 
     // classifier
-    struct ggml_tensor * cls;
-    struct ggml_tensor * cls_b;
+    struct ggml_tensor * cls = nullptr;
+    struct ggml_tensor * cls_b = nullptr;
     struct ggml_tensor * cls_out   = nullptr;
     struct ggml_tensor * cls_out_b = nullptr;
 
@@ -3546,7 +3546,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     GGML_UNUSED(model);
 }
 
-void llama_profile_device(device_info * dev_info, struct llama_model * model, const char * test_file, int n_threads) {
+void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, const char * test_file, int n_threads) {
     dev_info->device_name = device_name();
     dev_info->cpu_props.cores = device_cpu_cores();
     dev_info->cpu_props.flops_f32 = device_cpu_flops(model, GGML_TYPE_F32, n_threads);
@@ -3568,7 +3568,6 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, co
     dev_info->gpu_support.blas = device_has_blas();
     dev_info->gpu_support.sycl = device_has_sycl();
 
-
     ggml_backend_dev_props cpu_props;
     ggml_backend_dev_props gpu_props;
     device_get_props(model, -1, &cpu_props); // -1 for cpu
@@ -3582,10 +3581,21 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, co
     dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.metal_flops = device_metal_flops(model, GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_f32 = device_cuda_flops(model, GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_f16 = device_cuda_flops(model, GGML_TYPE_F16);
     dev_info->gpu_props.cuda_flops_q8 = device_cuda_flops(model, GGML_TYPE_Q8_0);
     dev_info->gpu_props.cuda_flops_q4k = device_cuda_flops(model, GGML_TYPE_Q4_K);
+
+    if (dev_info->rank == 0) {
+        struct flops_info ffo = flops_info{};
+        llama_model_n_flops(model, ml, &ffo, 1, 10);
+        LLAMA_LOG_INFO("input_flops: %llu\n", ffo.input_flops);
+        LLAMA_LOG_INFO("output_flops: %llu\n", ffo.output_flops);
+        LLAMA_LOG_INFO("layer_flops: %llu\n", ffo.layer_flops);
+        LLAMA_LOG_INFO("input_params: %llu\n", ffo.input_params);
+        LLAMA_LOG_INFO("output_params: %llu\n", ffo.output_params);
+        LLAMA_LOG_INFO("layer_params: %llu\n", ffo.layer_params);
+    }
 }
 
 ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
@@ -7141,6 +7151,124 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     }
 }
 
+static void llm_load_llama_tensors(
+        llama_model_loader & ml,
+        llama_model & model,
+        std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
+        uint32_t n_world,
+        uint32_t my_rank,
+        const uint32_t * n_layer_window,
+        bool * use_mmap_buffer) {
+    const auto tn = LLM_TN(model.arch);
+
+    ggml_context * ctx_input = nullptr;
+    ggml_context * ctx_output = nullptr;
+    ggml_context * ctx_output_split = nullptr;
+
+    if (my_rank == 0) {
+        ctx_input = ctx_map.at(model.buft_input.buft);
+        ctx_output = ctx_map.at(model.buft_output.buft);
+        ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
+    }
+
+    auto ctx_for_layer = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); };
+    auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };
+
+    const llama_hparams hparams = model.hparams;
+    const int64_t n_head = hparams.n_head();
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_ff = hparams.n_ff();
+    const int64_t n_embd_gqa = n_embd_v_gqa;
+    const int64_t n_vocab = hparams.n_vocab;
+    const int64_t n_rot = hparams.n_rot;
+    const int64_t n_expert = hparams.n_expert;
+    const int64_t n_layer = hparams.n_layer;
+
+    if (my_rank == 0) {
+        // token embedding
+        model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+        // output
+        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        // if output is NULL, init from the input tok embed
+        if (model.output == NULL) {
+            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+        }
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        if (!this_layer_is_mine(i, n_world, my_rank, n_layer_window)) {
+            continue;
+        }
+
+        int local_i = map_layer_to_local_id(i, n_world, my_rank, n_layer_window);
+        ggml_context * ctx_layer = ctx_for_layer(local_i);
+        ggml_context * ctx_split = ctx_for_layer_split(local_i);
+
+        auto & layer = model.layers[local_i];
+
+        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+
+        // optional bias tensors
+        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
+        if (n_expert == 0) {
+            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+            layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+            layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+            // optional MLP bias
+            layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        } else {
+            layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+            layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            if (layer.ffn_gate_exps) {
+                layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
+                layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+            } else {
+                // merge split expert into a single tensor for compatibility with older models
+                // requires disabling mmap
+                *use_mmap_buffer = false;
+
+                ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
+                ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
+                ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
+
+                layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
+                layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
+                layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
+
+                ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
+                ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
+                ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
+
+                for (uint32_t x = 0; x < n_expert; ++x) {
+                    // the individual experts are loaded into a view of the merged tensor
+                    ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, layer.ffn_gate_exps->nb[2] * x);
+                    ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {n_ff, n_embd}, layer.ffn_down_exps->nb[2] * x);
+                    ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, layer.ffn_up_exps->nb[2] * x);
+                }
+            }
+        }
+    }
+}
+
 // Returns false if cancelled by progress_callback
 static bool llm_load_tensors_impl(
         llama_model_loader & ml,
@@ -7195,7 +7323,7 @@ static bool llm_load_tensors_impl(
 
     // assign the input and output layers on CPU by default
     if (my_rank == 0) {
         model.buft_input = llama_default_buffer_type_cpu(model, true);
         model.buft_output = llama_default_buffer_type_cpu(model, true);
         LLAMA_LOG_INFO("Layer input assigned to cpu\n");
         LLAMA_LOG_INFO("Layer output assigned to cpu\n");
@@ -7280,91 +7408,8 @@ static bool llm_load_tensors_impl(
             case LLM_ARCH_MINICPM:
             case LLM_ARCH_GRANITE:
             case LLM_ARCH_GRANITE_MOE:
-                {
-                    if (my_rank == 0) {
-                        model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                        // output
-                        {
-                            model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                            // if output is NULL, init from the input tok embed
-                            if (model.output == NULL) {
-                                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                            }
-                        }
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        if (!this_layer_is_mine(i, n_world, my_rank, n_layer_window)) {
-                            continue;
-                        }
-
-                        int local_i = map_layer_to_local_id(i, n_world, my_rank, n_layer_window);
-                        ggml_context * ctx_layer = ctx_for_layer(local_i);
-                        ggml_context * ctx_split = ctx_for_layer_split(local_i);
-
-                        auto & layer = model.layers[local_i];
-
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
-
-                        // optional bias tensors
-                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-
-                        if (n_expert == 0) {
-                            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
-                            layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                            layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-
-                            // optional MLP bias
-                            layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                            layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                            layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        } else {
-                            layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
-                            layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                            if (layer.ffn_gate_exps) {
-                                layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
-                                layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
-                            } else {
-                                // merge split expert into a single tensor for compatibility with older models
-                                // requires disabling mmap
-                                use_mmap_buffer = false;
-
-                                ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
-                                ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
-                                ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
-
-                                layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
-                                layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
-                                layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
-
-                                ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
-                                ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
-                                ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
-
-                                for (uint32_t x = 0; x < n_expert; ++x) {
-                                    // the individual experts are loaded into a view of the merged tensor
-                                    ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, layer.ffn_gate_exps->nb[2] * x);
-                                    ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {n_ff, n_embd}, layer.ffn_down_exps->nb[2] * x);
-                                    ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, layer.ffn_up_exps->nb[2] * x);
-                                }
-                            }
-                        }
-                    }
-                } break;
+                llm_load_llama_tensors(ml, model, ctx_map, n_world, my_rank, n_layer_window, &use_mmap_buffer);
+                break;
             case LLM_ARCH_MINICPM3:
                 {
                     const int64_t n_embd_head_qk_rope = hparams.n_rot;
@@ -9285,7 +9330,7 @@ static struct ggml_tensor * llm_build_inp_embd(
 
         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
     } else {
         lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
         inpL = lctx.inp_embd;
         ggml_set_input(lctx.inp_embd);
     }
@@ -19841,6 +19886,8 @@ int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_
         LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
         return -1;
     }
+
+    return 0;
 }
 
 int llama_broadcast_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window) {
@@ -20586,6 +20633,266 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
     return nparams;
 }
 
+static void llama_model_reset_tensors(struct llama_model * model) {
+    model->buft_input.buft = nullptr;
+    model->buft_input.buft_matrix = nullptr;
+    model->buft_output.buft = nullptr;
+    model->buft_output.buft_matrix = nullptr;
+    for (int i = 0; i < (int)model->hparams.n_layer; ++i) {
+        model->buft_layer[i].buft = nullptr;
+        model->buft_layer[i].buft_matrix = nullptr;
+    }
+
+    // layers
+    model->buft_layer.resize(0);
+    model->layers.resize(0);
+
+    // input
+    model->tok_embd = nullptr;
+    model->type_embd = nullptr;
+    model->pos_embd = nullptr;
+    model->tok_norm = nullptr;
+    model->tok_norm_b = nullptr;
+
+    // output
+    model->output_norm = nullptr;
+    model->output_norm_b = nullptr;
+    model->output = nullptr;
+    model->output_b = nullptr;
+    model->output_norm_enc = nullptr;
+
+    // classifier
+    model->cls = nullptr;
+    model->cls_b = nullptr;
+    model->cls_out = nullptr;
+    model->cls_out_b = nullptr;
+}
+
+void llama_model_n_flops(struct llama_model * model, struct llama_model_loader * ml, struct flops_info * ffo, const int64_t n_input, const int64_t n_history) {
+    const llama_hparams hparams = model->hparams;
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_vocab = hparams.n_vocab;
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_head = hparams.n_head();
+    const int64_t n_ff = hparams.n_ff();
+    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_expert = hparams.n_expert;
+
+    // assign all the tensors on CPU by default
+    model->buft_input = llama_default_buffer_type_cpu(*model, true);
+    model->buft_output = llama_default_buffer_type_cpu(*model, true);
+    model->buft_layer.resize(n_layer);
+    for (int i = 0; i < (int)n_layer; ++i) {
+        model->buft_layer[i] = llama_default_buffer_type_cpu(*model, true);
+    }
+
+    // count used buffer types
+    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
+    buft_layer_count[model->buft_input.buft]++;
+    buft_layer_count[model->buft_input.buft_matrix]++;
+    buft_layer_count[model->buft_output.buft]++;
+    buft_layer_count[model->buft_output.buft_matrix]++;
+    for (int i = 0; i < (int)n_layer; ++i) {
+        buft_layer_count[model->buft_layer[i].buft]++;
+        buft_layer_count[model->buft_layer[i].buft_matrix]++;
+    }
+
+    // create one context per buffer type
+    size_t ctx_size = ggml_tensor_overhead() * (ml->n_tensors + 1);
+
+    // for moe merged tensors
+    ctx_size += ggml_tensor_overhead() * n_layer * 3;
+
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    std::vector<struct ggml_context *> ctxs;
+    for (auto & it : buft_layer_count) {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ ctx_size,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ggml_context * ctx = ggml_init(params);
+        if (!ctx) {
+            throw std::runtime_error(format("failed to create context\n"));
+        }
+        ctx_map[it.first] = ctx;
+        ctxs.push_back(ctx);
+    }
+
+    const uint32_t n_layer_window[32] = {(uint32_t)n_layer};
+    bool use_mmap_buffer = false;
+
+    model->layers.resize(n_layer);
+
+    switch (model->arch) {
+        case LLM_ARCH_LLAMA:
+        case LLM_ARCH_REFACT:
+        case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            llm_load_llama_tensors(*ml, *model, ctx_map, 1, 0, n_layer_window, &use_mmap_buffer);
+            break;
+        default:
+            throw std::runtime_error("unsupported architecture\n");
+    }
+
+    std::unordered_map<std::string, int> tensor_name_map = {
+        {"token_embd.weight", 1},
+        {"output_norm.weight", 2},
+        {"output.weight", 3},
+        {"blk.0.attn_norm.weight", 4},
+        {"blk.0.attn_q.weight", 5},
+        {"blk.0.attn_k.weight", 6},
+        {"blk.0.attn_v.weight", 7},
+        {"blk.0.attn_output.weight", 8},
+        {"blk.0.ffn_gate.weight", 9},
+        {"blk.0.ffn_down.weight", 10},
+        {"blk.0.ffn_up.weight", 11},
+        {"blk.0.ffn_norm.weight", 12},
+        {"rope_freqs.weight", 13},
+        // optional: bias tensors
+        {"blk.0.attn_q.bias", 14},
+        {"blk.0.attn_k.bias", 15},
+        {"blk.0.attn_v.bias", 16},
+        {"blk.0.attn_output.bias", 17},
+        {"blk.0.ffn_gate.bias", 18},
+        {"blk.0.ffn_down.bias", 19},
+        {"blk.0.ffn_up.bias", 20},
+        // optional: expert tensors
+        {"blk.0.ffn_gate_inp.weight", 21},
+        {"blk.0.ffn_gate_exps.weight", 22},
+        {"blk.0.ffn_down_exps.weight", 23},
+        {"blk.0.ffn_up_exps.weight", 24},
+    };
+
+    for (ggml_context * ctx : ctxs) {
+        for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+            auto it = tensor_name_map.find(ggml_get_name(cur));
+            if (it != tensor_name_map.end()) {
+                switch (it->second) {
+                    case 1: { // "token_embd.weight"
+                        ffo->input_flops += (2 * n_input * n_embd * n_vocab - n_input * n_embd);
+                        ffo->input_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 2: { // "output_norm.weight"
+                        ffo->output_flops += n_input * (8 * n_embd + 1);
+                        ffo->output_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 3: { // "output.weight"
+                        ffo->output_flops += 2 * n_input * n_embd * n_vocab;
+                        ffo->output_flops += 5 * n_input * n_vocab;
+                        ffo->output_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 4:  // "blk.0.attn_norm.weight"
+                    case 12: // "blk.0.ffn_norm.weight"
+                    {
+                        ffo->layer_flops += n_input * (8 * n_embd + 1);
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 5: { // "blk.0.attn_q.weight"
+                        ffo->layer_flops += 2 * n_input * n_embd * (n_head * n_embd_head_k);
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 6: { // "blk.0.attn_k.weight"
+                        ffo->layer_flops += 2 * n_input * n_embd * (n_head * n_embd_k_gqa);
+                        ffo->layer_flops += 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head; // Q*K with KVCache
+                        ffo->layer_flops += 7 * n_input * (n_input + n_history) * n_head; // scale, mask, and softmax
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 7: { // "blk.0.attn_v.weight"
+                        ffo->layer_flops += 2 * n_input * n_embd * (n_head * n_embd_v_gqa);
+                        ffo->layer_flops += n_input * (n_input + n_history) * n_embd_head_k * n_head; // QKV with KVCache
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 8: { // "blk.0.attn_output.weight"
+                        ffo->layer_flops += 2 * n_input * (n_head * n_embd_head_k) * n_embd;
+                        ffo->layer_flops += n_input * n_embd; // shortcut
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 9: { // "blk.0.ffn_gate.weight"
+                        ffo->layer_flops += 2 * n_input * n_embd * n_ff;
+                        ffo->layer_flops += 5 * n_input * n_ff; // SiLU
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 10: { // "blk.0.ffn_down.weight"
+                        ffo->layer_flops += 2 * n_input * n_embd * n_ff;
+                        ffo->layer_flops += n_input * n_embd; // shortcut
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 11: { // "blk.0.ffn_up.weight"
+                        ffo->layer_flops += 2 * n_input * n_embd * n_ff;
+                        ffo->layer_flops += n_input * n_ff; // silu(gate(x)) * up(x)
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 13: { // rope_freqs.weight, for Q and K
+                        ffo->layer_flops += 8 * n_input * n_head * n_embd_head_k;
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    // optional: bias tensors
+                    case 14: // "blk.0.attn_q.bias"
+                    case 15: // "blk.0.attn_k.bias"
+                    case 16: // "blk.0.attn_v.bias"
+                    case 17: // "blk.0.attn_output.bias"
+                    case 19: // "blk.0.ffn_down.bias"
+                    {
+                        ffo->layer_flops += n_input * n_embd;
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 18: // "blk.0.ffn_gate.bias"
+                    case 20: // "blk.0.ffn_up.bias"
+                    {
+                        ffo->layer_flops += n_input * n_ff;
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    // optional: expert tensors
+                    case 21: { // "blk.0.ffn_gate_inp.weight"
+                        ffo->layer_flops += 2 * n_input * n_embd * n_expert;
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    case 22: // "blk.0.ffn_gate_exps.weight"
+                    case 23: // "blk.0.ffn_down_exps.weight"
+                    case 24: // "blk.0.ffn_up_exps.weight"
+                    {
+                        ffo->layer_flops += 2 * n_input * n_embd * n_ff * n_expert;
+                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        break;
+                    }
+                    default:
+                        LLAMA_LOG_INFO("Uncaught tensor\n");
+                        return;
+                }
+            }
+        }
+    }
+
+    // reset ml, model, and clear contexts
+    ml->n_created = 0;
+    ml->size_data = 0;
+    llama_model_reset_tensors(model);
+    for (ggml_context * ctx : ctxs) {
+        ggml_free(ctx);
+    }
+    ctxs.clear();
+    ctx_map.clear();
+}
+
 struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
     auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
         [name](const std::pair<std::string, struct ggml_tensor *> & it) {
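The per-tensor counts in the switch above amount to the usual 2*m*n FLOPs per matrix product plus small elementwise terms. A rough closed-form sketch of the per-layer total that those cases accumulate for a dense (non-MoE) block, assembled from the same terms and under the same assumptions (a batch of n_input tokens, n_history tokens already in the KV cache); this helper is illustrative only and not part of the commit:

    // Illustrative only: per-layer FLOPs estimate for a dense LLaMA-style block,
    // summing the same terms as llama_model_n_flops() (expert/bias terms omitted).
    static int64_t llama_layer_flops_estimate(
            int64_t n_input, int64_t n_history,
            int64_t n_embd, int64_t n_head, int64_t n_embd_head_k,
            int64_t n_embd_k_gqa, int64_t n_embd_v_gqa, int64_t n_ff) {
        const int64_t n_kv = n_input + n_history;
        int64_t flops = 0;
        flops += 2 * n_input * (8 * n_embd + 1);                  // attn_norm + ffn_norm
        flops += 2 * n_input * n_embd * (n_head * n_embd_head_k); // Q projection
        flops += 2 * n_input * n_embd * (n_head * n_embd_k_gqa);  // K projection (as counted above)
        flops += 2 * n_input * n_embd * (n_head * n_embd_v_gqa);  // V projection (as counted above)
        flops += 2 * n_input * n_kv * n_embd_head_k * n_head;     // Q*K^T against the KV cache
        flops += 7 * n_input * n_kv * n_head;                     // scale, mask, softmax
        flops += n_input * n_kv * n_embd_head_k * n_head;         // softmax(Q*K^T)*V
        flops += 2 * n_input * (n_head * n_embd_head_k) * n_embd; // attention output projection
        flops += n_input * n_embd;                                // attention shortcut
        flops += 8 * n_input * n_head * n_embd_head_k;            // RoPE on Q and K
        flops += 3 * (2 * n_input * n_embd * n_ff);               // ffn_gate, ffn_up, ffn_down
        flops += 5 * n_input * n_ff;                              // SiLU
        flops += n_input * n_ff;                                  // silu(gate(x)) * up(x)
        flops += n_input * n_embd;                                // FFN shortcut
        return flops;
    }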