fix model bytes counter

Lizonghang 2024-12-10 14:57:48 +04:00
parent 2d79554694
commit 8e9ab45458
4 changed files with 152 additions and 190 deletions


@@ -1285,33 +1285,17 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
// estimate the memory access delay, except for the input embedding because it has been considered in n_flops.inp_embd_ms
static float device_memory_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams, int n_layers) {
struct model_params n_params = dev_info.model_params;
auto n_bytes = dev_info.model_bytes;
int n_gpu_layers = std::min(static_cast<int>(cparams.n_gpu_layers), n_layers);
int64_t layer_bytes =
n_params.layer_f32 * 4 +
n_params.layer_f16 * 2 +
n_params.layer_q4k * 4 / 8 +
n_params.layer_q5k * 5 / 8 +
n_params.layer_q6k * 6 / 8 +
n_params.layer_q80;
int64_t output_bytes =
n_params.output_f32 * 4 +
n_params.output_f16 * 2 +
n_params.output_q4k * 4 / 8 +
n_params.output_q5k * 5 / 8 +
n_params.output_q6k * 6 / 8 +
n_params.output_q80;
uint64_t cpu_kv_size;
uint64_t gpu_kv_size;
#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
int64_t vram_bytes = layer_bytes * n_gpu_layers + gpu_kv_size;
int64_t ram_bytes = layer_bytes * (n_layers - n_gpu_layers) + output_bytes + cpu_kv_size;
int64_t vram_bytes = n_bytes.nb_layer * n_gpu_layers + gpu_kv_size;
int64_t ram_bytes = n_bytes.nb_layer * (n_layers - n_gpu_layers) + n_bytes.nb_output + cpu_kv_size;
#ifdef GGML_USE_CUDA
double vram_access_delay = (double)(vram_bytes) / 1e6 / dev_info.gpu_props.cuda_read_vram_bw;
@@ -1327,53 +1311,33 @@ static float device_memory_access_delay(struct device_info & dev_info, struct ll
(void)n_gpu_layers;
(void)gpu_kv_size;
int64_t ram_bytes = layer_bytes * n_layers + output_bytes + cpu_kv_size;
int64_t ram_bytes = n_bytes.nb_layer * n_layers + n_bytes.nb_output + cpu_kv_size;
double ram_access_delay = (double)(ram_bytes) / 1e6 / dev_info.memory.cpu_read_ram_bw;
return static_cast<float>(ram_access_delay); // ms
#endif
}
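The delay formulas above convert bytes to milliseconds by dividing by 1e6 and then by a bandwidth in GB/s: a bandwidth of bw GB/s moves bw * 1e6 bytes per millisecond. A minimal standalone sketch, assuming bandwidths are reported in GB/s (as disk_props declares); the helper name and sample numbers are illustrative, not from the patch:
#include <cstdint>
#include <cstdio>
// Mirrors the conversion used in device_memory_access_delay:
// bytes / 1e6 / (GB/s) == milliseconds.
static double access_delay_ms(int64_t bytes, double bw_gb_per_s) {
    return (double) bytes / 1e6 / bw_gb_per_s;
}
int main() {
    // e.g. reading 4 GiB of weights at 50 GB/s takes roughly 85.9 ms
    printf("%.1f ms\n", access_delay_ms(4LL * 1024 * 1024 * 1024, 50.0));
    return 0;
}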
static float device_disk_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams) {
auto n_params = dev_info.model_params;
auto n_bytes = dev_info.model_bytes;
int n_layers = llama_model_n_layers(model);
int n_gpu_layers = std::min(static_cast<int>(cparams.n_gpu_layers), n_layers);
int n_vocab = llama_n_vocab(model);
int64_t input_bytes = (
n_params.input_f32 * 4 +
n_params.input_f16 * 2 +
n_params.input_q4k * 4 / 8 +
n_params.input_q5k * 5 / 8 +
n_params.input_q6k * 6 / 8 +
n_params.input_q80) / n_vocab; // lookup table, retrieve only n_embd elements
int64_t cpu_total_bytes = input_bytes;
int64_t layer_bytes =
n_params.layer_f32 * 4 +
n_params.layer_f16 * 2 +
n_params.layer_q4k * 4 / 8 +
n_params.layer_q5k * 5 / 8 +
n_params.layer_q6k * 6 / 8 +
n_params.layer_q80;
int64_t cpu_total_bytes = 0;
int64_t input_bytes = n_bytes.nb_input / n_vocab; // lookup table, retrieve only n_embd elements
cpu_total_bytes += input_bytes;
#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
cpu_total_bytes += layer_bytes * (n_layers - n_gpu_layers);
cpu_total_bytes += n_bytes.nb_layer * (n_layers - n_gpu_layers);
#if defined(GGML_USE_METAL)
int64_t gpu_total_bytes = layer_bytes * n_gpu_layers;
int64_t gpu_total_bytes = n_bytes.nb_layer * n_gpu_layers;
#endif
#else
(void)n_gpu_layers;
cpu_total_bytes += layer_bytes * n_layers;
cpu_total_bytes += n_bytes.nb_layer * n_layers;
#endif
cpu_total_bytes += (
n_params.output_f32 * 4 +
n_params.output_f16 * 2 +
n_params.output_q4k * 4 / 8 +
n_params.output_q5k * 5 / 8 +
n_params.output_q6k * 6 / 8 +
n_params.output_q80);
cpu_total_bytes += n_bytes.nb_output;
uint64_t cpu_kv_size;
uint64_t gpu_kv_size;
@@ -1850,6 +1814,18 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80);
LOG_INF("\n");
LOG_INF("| Model bytes (input) ");
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_input);
LOG_INF("\n");
LOG_INF("| Model bytes (layer) ");
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_layer);
LOG_INF("\n");
LOG_INF("| Model bytes (output) ");
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_output);
LOG_INF("\n");
// todo: calculate for each device, not only master
float latency = 0.0f;
int n_layers = llama_model_n_layers (model);


@@ -181,6 +181,17 @@ struct model_params {
layer_q80 (0) {}
};
struct model_bytes {
int64_t nb_input;
int64_t nb_layer;
int64_t nb_output;
model_bytes() :
nb_input (0),
nb_layer (0),
nb_output(0) {}
};
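A sketch of how these counters compose, not part of the patch: assuming nb_layer has already been normalized to a per-layer average (as llama_model_n_flops does below), the weight bytes held by a device follow from its layer assignment. device_weight_bytes is a hypothetical helper:
// Hypothetical helper: weight bytes for a device hosting n_layers transformer
// blocks plus the input embedding and output tensors.
static int64_t device_weight_bytes(const struct model_bytes & nb, int n_layers) {
    return nb.nb_input + nb.nb_layer * (int64_t) n_layers + nb.nb_output;
}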
struct disk_props {
float read_seq_bw; // in GB/s
float read_rnd_bw; // in GB/s
@@ -204,6 +215,7 @@ struct device_info {
struct gpu_props gpu_props;
struct model_flops model_flops;
struct model_params model_params;
struct model_bytes model_bytes;
device_info() :
rank(0),
@@ -214,7 +226,8 @@ struct device_info {
gpu_support(),
gpu_props(),
model_flops(),
model_params() {}
model_params(),
model_bytes() {}
};
enum profiler_backend_type {


@@ -563,6 +563,7 @@ extern "C" {
struct llama_model_loader * ml,
struct model_flops * n_flops,
struct model_params * n_params,
struct model_bytes * n_bytes,
const int64_t n_history,
const int64_t n_ctx,
enum ggml_type * inp_embd_dtype,


@@ -3589,9 +3589,10 @@ void llama_profile_device(
struct model_flops * n_flops = &dev_info->model_flops;
struct model_params * n_params = &dev_info->model_params;
struct model_bytes * n_bytes = &dev_info->model_bytes;
if (dev_info->rank == 0) {
enum ggml_type inp_embd_dtype = GGML_TYPE_F32;
llama_model_n_flops(model, ml, n_flops, n_params, n_predict, n_ctx, &inp_embd_dtype, flash_attn);
llama_model_n_flops(model, ml, n_flops, n_params, n_bytes, n_predict, n_ctx, &inp_embd_dtype, flash_attn);
n_flops->inp_embd_ms = device_inp_embd_delay(model, inp_embd_dtype, 1, n_threads);
}
@@ -20881,6 +20882,26 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
}
}
static void count_n_bytes(struct model_bytes * n_bytes, enum profiler_layer_type ltype, size_t n) {
int64_t n_i64t = static_cast<int64_t>(n);
switch (ltype) {
case PROFILER_LAYER_INPUT:
n_bytes->nb_input += n_i64t;
break;
case PROFILER_LAYER_OUTPUT:
n_bytes->nb_output += n_i64t;
break;
case PROFILER_LAYER_BACKEND:
n_bytes->nb_layer += n_i64t;
break;
default:
throw std::runtime_error("Unrecognized profiler layer type\n");
}
}
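The reason for tracking bytes directly (a rough illustration, not from the patch): the old counters multiplied parameter counts by nominal bits per weight, which ignores per-block metadata such as the scales stored by the k-quants, whereas ggml_row_size / ggml_nbytes report the exact stored size. The tensor shape below is illustrative:
#include "ggml.h"
#include <cstdio>
int main() {
    const int64_t n_embd = 4096;
    const int64_t ne = n_embd * n_embd;                             // one square weight matrix
    size_t exact = ggml_row_size(GGML_TYPE_Q4_K, n_embd) * n_embd;  // exact, incl. block scales
    size_t rough = (size_t) ne * 4 / 8;                             // old "4 bits per weight" estimate
    printf("exact: %zu bytes, old estimate: %zu bytes\n", exact, rough);
    return 0;
}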
void llama_model_compute_buf_size(
uint64_t * cpu_buf,
uint64_t * gpu_buf,
@@ -20977,6 +20998,7 @@ void llama_model_n_flops(
struct llama_model_loader * ml,
struct model_flops * n_flops,
struct model_params * n_params,
struct model_bytes * n_bytes,
const int64_t n_history,
const int64_t n_ctx,
enum ggml_type * inp_embd_dtype,
@@ -21052,151 +21074,101 @@ void llama_model_n_flops(
throw std::runtime_error("unsupported architecture\n");
}
std::unordered_map<std::string, int> tensor_name_map = {
{"token_embd.weight", 1},
{"output_norm.weight", 2},
{"output.weight", 3},
{"blk.0.attn_norm.weight", 4},
{"blk.0.attn_q.weight", 5},
{"blk.0.attn_k.weight", 6},
{"blk.0.attn_v.weight", 7},
{"blk.0.attn_output.weight", 8},
{"blk.0.ffn_gate.weight", 9},
{"blk.0.ffn_down.weight", 10},
{"blk.0.ffn_up.weight", 11},
{"blk.0.ffn_norm.weight", 12},
{"rope_freqs.weight", 13},
// optional: bias tensors
{"blk.0.attn_q.bias", 14},
{"blk.0.attn_k.bias", 15},
{"blk.0.attn_v.bias", 16},
{"blk.0.attn_output.bias", 17},
{"blk.0.ffn_gate.bias", 18},
{"blk.0.ffn_down.bias", 19},
{"blk.0.ffn_up.bias", 20},
// optional: expert tensors
{"blk.0.ffn_gate_inp.weight", 21},
{"blk.0.ffn_gate_exps.weight", 22},
{"blk.0.ffn_down_exps.weight", 23},
{"blk.0.ffn_up_exps.weight", 24},
};
bool rope_used = false;
for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
auto it = tensor_name_map.find(ggml_get_name(cur));
if (it != tensor_name_map.end()) {
switch (it->second) {
case 1: { // "token_embd.weight"
count_n_params(n_params, cur->type, PROFILER_LAYER_INPUT, ggml_nelements(cur));
*inp_embd_dtype = cur->type;
break;
}
case 2: { // "output_norm.weight"
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 4 * n_embd + 1); // rms_norm
count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_embd); // norm weights
count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
break;
}
case 3: { // "output.weight"
count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_embd * n_vocab);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_vocab); // softmax
count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
break;
}
case 4: // "blk.0.attn_norm.weight"
case 12: // "blk.0.ffn_norm.weight"
{
std::string tensor_name(ggml_get_name(cur));
std::regex blk_regex("blk\\.\\d+\\.(.+)");
std::smatch match;
if (std::regex_match(tensor_name, match, blk_regex) && match.size() > 1) {
std::string blk_suffix = match[1].str();
if (blk_suffix == "attn_norm.weight" || blk_suffix == "ffn_norm.weight") {
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 4 * n_embd + 1); // rms norm
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd); // norm weights
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 5: { // "blk.0.attn_q.weight"
} else if (blk_suffix == "attn_q.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd); // rope
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 6: { // "blk.0.attn_k.weight"
} else if (blk_suffix == "attn_k.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_head_kv * n_embd_head_k);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd_head_k * n_head_kv); // rope
count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_embd_head_k * n_head * n_kv * n_head_kv); // compute kq with kvcache
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_head * n_kv); // scale, mask, and softmax
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 7: { // "blk.0.attn_v.weight"
} else if (blk_suffix == "attn_v.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_head_kv * n_embd_head_v);
count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_embd_head_v * n_head * n_kv * n_head_kv); // compute kqv with kvcache
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 8: { // "blk.0.attn_output.weight"
} else if (blk_suffix == "attn_output.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 9: { // "blk.0.ffn_gate.weight"
} else if (blk_suffix == "ffn_gate.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 8 * n_ff); // SiLU
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 10: { // "blk.0.ffn_down.weight"
} else if (blk_suffix == "ffn_down.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 11: { // "blk.0.ffn_up.weight"
} else if (blk_suffix == "ffn_up.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_ff); // silu(gate(x)) * up(x)
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 13: { // rope_freqs.weight, has been counted in q and k
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
// optional: bias tensors
case 14: // "blk.0.attn_q.bias"
case 15: // "blk.0.attn_k.bias"
case 16: // "blk.0.attn_v.bias"
case 17: // "blk.0.attn_output.bias"
case 19: // "blk.0.ffn_down.bias"
{
} else if (blk_suffix == "attn_q.bias" || blk_suffix == "attn_k.bias" || blk_suffix == "attn_v.bias" || blk_suffix == "blk.0.attn_output.bias" || blk_suffix == "ffn_down.bias") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd);
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 18: // "blk.0.ffn_gate.bias"
case 20: // "blk.0.ffn_up.bias"
{
} else if (blk_suffix == "ffn_gate.bias" || blk_suffix == "ffn_up.bias") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_ff);
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
// optional: expert tensors
case 21: { // "blk.0.ffn_gate_inp.weight"
} else if (blk_suffix == "ffn_gate_inp.weight") { // optional: expert tensors
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_expert);
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
}
case 22: // "blk.0.ffn_gate_exps.weight"
case 23: // "blk.0.ffn_down_exps.weight"
case 24: // "blk.0.ffn_up_exps.weight"
{
} else if (blk_suffix == "ffn_gate_exps.weight" || blk_suffix == "ffn_down_exps.weight" || blk_suffix == "ffn_up_exps.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff * n_expert);
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
break;
} else {
LLAMA_LOG_INFO("Uncaught tensor\n");
return;
}
default:
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
count_n_bytes (n_bytes, PROFILER_LAYER_BACKEND, ggml_nbytes(cur));
} else {
if (tensor_name == "token_embd.weight") {
count_n_params(n_params, cur->type, PROFILER_LAYER_INPUT, ggml_nelements(cur));
count_n_bytes (n_bytes, PROFILER_LAYER_INPUT, ggml_nbytes(cur));
*inp_embd_dtype = cur->type;
} else if (tensor_name == "output_norm.weight") {
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 4 * n_embd + 1);
count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_embd);
count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
count_n_bytes (n_bytes, PROFILER_LAYER_OUTPUT, ggml_nbytes(cur));
} else if (tensor_name == "output.weight") {
count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_embd * n_vocab);
count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_vocab); // softmax
count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
count_n_bytes (n_bytes, PROFILER_LAYER_OUTPUT, ggml_nbytes(cur));
} else if (tensor_name == "rope_freqs.weight") {
if (!rope_used) {
count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
count_n_bytes (n_bytes, PROFILER_LAYER_BACKEND, ggml_nbytes(cur));
rope_used = true;
}
} else {
LLAMA_LOG_INFO("Uncaught tensor\n");
return;
}
}
}
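A standalone sketch of the regex dispatch used in the loop above (illustrative only): instead of the old exact-name map that covered blk.0 only, any blk.N tensor is reduced to its per-block suffix, so the layer counters accumulate over all blocks and are averaged afterwards.
#include <cstdio>
#include <regex>
#include <string>
int main() {
    std::regex blk_regex("blk\\.\\d+\\.(.+)");
    std::smatch match;
    std::string name = "blk.17.ffn_up.weight";
    if (std::regex_match(name, match, blk_regex) && match.size() > 1) {
        printf("suffix: %s\n", match[1].str().c_str()); // prints "ffn_up.weight"
    }
    return 0;
}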
// use average values instead of total values
n_flops->layer_f32_f32 = static_cast<int64_t>((double)n_flops->layer_f32_f32 / (double)n_layer);
n_flops->layer_f16_f32 = static_cast<int64_t>((double)n_flops->layer_f16_f32 / (double)n_layer);
n_flops->layer_q4k_f32 = static_cast<int64_t>((double)n_flops->layer_q4k_f32 / (double)n_layer);
n_flops->layer_q5k_f32 = static_cast<int64_t>((double)n_flops->layer_q5k_f32 / (double)n_layer);
n_flops->layer_q6k_f32 = static_cast<int64_t>((double)n_flops->layer_q6k_f32 / (double)n_layer);
n_flops->layer_q80_f32 = static_cast<int64_t>((double)n_flops->layer_q80_f32 / (double)n_layer);
n_params->layer_f32 = static_cast<int64_t>((double)n_params->layer_f32 / (double)n_layer);
n_params->layer_f16 = static_cast<int64_t>((double)n_params->layer_f16 / (double)n_layer);
n_params->layer_q4k = static_cast<int64_t>((double)n_params->layer_q4k / (double)n_layer);
n_params->layer_q5k = static_cast<int64_t>((double)n_params->layer_q5k / (double)n_layer);
n_params->layer_q6k = static_cast<int64_t>((double)n_params->layer_q6k / (double)n_layer);
n_params->layer_q80 = static_cast<int64_t>((double)n_params->layer_q80 / (double)n_layer);
n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);
// reset ml, model, and clear contexts
ml->n_created = 0;
ml->size_data = 0;