Mirror of https://github.com/Lizonghang/prima.cpp.git, synced 2025-09-11 12:14:34 +00:00
llama_model_n_flops: remove ctxs

commit a46d56cc60
parent f1c1d1b929
1 changed file with 122 additions and 127 deletions
@@ -3571,23 +3571,23 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype)
 }
 
 void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, int n_threads) {
-    struct model_flops * n_flops = &dev_info->model_flops;
-    struct model_params * n_params = &dev_info->model_params;
-
-    if (dev_info->rank == 0) {
-        enum ggml_type inp_embd_dtype = GGML_TYPE_F32;
-        llama_model_n_flops(model, ml, n_flops, n_params, 1, 32, &inp_embd_dtype);
-        n_flops->inp_embd_ms = device_inp_embd_delay(model, inp_embd_dtype, 1, n_threads);
-    }
-
     dev_info->device_name = device_name();
     dev_info->cpu_props.cores = device_cpu_cores();
 
     dev_info->memory.total_physical = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100;
     dev_info->memory.available_physical = round(device_physical_memory(true) / (double)(1 << 30) * 100) / 100;
     dev_info->memory.total_swap = round(device_swap_memory(false) / (double)(1 << 30) * 100) / 100;
     dev_info->memory.available_swap = round(device_swap_memory(true) / (double)(1 << 30) * 100) / 100;
     dev_info->memory.cpu_read_ram_bw = device_memory_bw(n_threads);
 
+    struct model_flops * n_flops = &dev_info->model_flops;
+    struct model_params * n_params = &dev_info->model_params;
+
+    if (dev_info->rank == 0) {
+        enum ggml_type inp_embd_dtype = GGML_TYPE_F32;
+        llama_model_n_flops(model, ml, n_flops, n_params, 1, 32, &inp_embd_dtype);
+        n_flops->inp_embd_ms = device_inp_embd_delay(model, inp_embd_dtype, 1, n_threads);
+    }
+
     device_disk_seq_bw(&dev_info->disk.read_seq_bw, &dev_info->disk.write_seq_bw, n_threads);
     device_disk_rnd_bw(&dev_info->disk.read_rnd_bw, &dev_info->disk.write_rnd_bw, n_threads);
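The memory fields above convert bytes to GiB and keep two decimal places via the expression round(x / (double)(1 << 30) * 100) / 100. A small standalone illustration of just that arithmetic (not project code; the helper name is made up):

#include <cmath>
#include <cstdio>

// bytes -> GiB, rounded to two decimal places, mirroring the expression
// used for the dev_info->memory fields in the hunk above.
static double to_gib_2dp(double bytes) {
    return std::round(bytes / (double)(1 << 30) * 100) / 100;
}

int main() {
    std::printf("%.2f\n", to_gib_2dp(17179869184.0)); // exactly 16 GiB -> 16.00
    std::printf("%.2f\n", to_gib_2dp(8000000000.0));  // about 7.45 GiB -> 7.45
    return 0;
}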
@@ -20966,6 +20966,7 @@ void llama_model_n_flops(
         buft_layer_count[model->buft_layer[i].buft]++;
         buft_layer_count[model->buft_layer[i].buft_matrix]++;
     }
+    GGML_ASSERT(buft_layer_count.size() == 1);
 
     // create one context per buffer type
     size_t ctx_size = ggml_tensor_overhead() * (ml->n_tensors + 1);
@@ -20974,19 +20975,18 @@ void llama_model_n_flops(
     ctx_size += ggml_tensor_overhead() * n_layer * 3;
 
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
-    std::vector<struct ggml_context *> ctxs;
+    struct ggml_context * ctx = nullptr;
     for (auto & it : buft_layer_count) {
         struct ggml_init_params params = {
             /*.mem_size =*/ ctx_size,
             /*.mem_buffer =*/ NULL,
             /*.no_alloc =*/ true,
         };
-        ggml_context * ctx = ggml_init(params);
+        ctx = ggml_init(params);
         if (!ctx) {
             throw std::runtime_error(format("failed to create context\n"));
         }
         ctx_map[it.first] = ctx;
-        ctxs.push_back(ctx);
     }
 
     const uint32_t n_layer_window[32] = {(uint32_t)n_layer};
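Together with the GGML_ASSERT added in the previous hunk, the loop above now yields exactly one ggml_context instead of filling a ctxs vector. A minimal, self-contained sketch of that single-context pattern (it assumes only ggml.h and uses invented tensor names, nothing from prima.cpp itself):

#include <cstdio>
#include "ggml.h"

int main() {
    const int n_tensors = 4;

    // no_alloc keeps tensor data unallocated, so the context only has to be
    // large enough for per-tensor metadata, as in llama_model_n_flops.
    struct ggml_init_params params = {
        /*.mem_size =*/ ggml_tensor_overhead() * (n_tensors + 1),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);
    if (!ctx) {
        return 1;
    }

    // Register a few dummy tensors (placeholder names, not real model tensors).
    for (int i = 0; i < n_tensors; i++) {
        char name[64];
        std::snprintf(name, sizeof(name), "blk.%d.dummy.weight", i);
        ggml_set_name(ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 16, 16), name);
    }

    // Walk every tensor in the single context, as the next hunk does with
    // ggml_get_first_tensor / ggml_get_next_tensor.
    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL;
         cur = ggml_get_next_tensor(ctx, cur)) {
        std::printf("%s: %lld elements\n", ggml_get_name(cur), (long long) ggml_nelements(cur));
    }

    // A single ggml_free replaces the old loop over ctxs.
    ggml_free(ctx);
    return 0;
}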
@@ -21035,7 +21035,6 @@ void llama_model_n_flops(
         {"blk.0.ffn_up_exps.weight", 24},
     };
 
-    for (ggml_context * ctx : ctxs) {
         for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             auto it = tensor_name_map.find(ggml_get_name(cur));
             if (it != tensor_name_map.end()) {
@@ -21149,16 +21148,12 @@ void llama_model_n_flops(
                 }
             }
         }
-    }
 
     // reset ml, model, and clear contexts
     ml->n_created = 0;
     ml->size_data = 0;
     llama_model_reset_tensors(model);
-    for (ggml_context * ctx : ctxs) {
         ggml_free(ctx);
-    }
-    ctxs.clear();
     ctx_map.clear();
 }
 
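With a single context, the cleanup path above reduces to one ggml_free plus ctx_map.clear(). If exception safety around the intervening work mattered, a scope guard would be one way to get the same effect automatically; a hedged sketch of that alternative (not what this commit does, and make_meta_ctx is an invented helper):

#include <memory>
#include "ggml.h"

// Bind ggml_free to the context's lifetime with a function-pointer deleter.
using ggml_ctx_ptr = std::unique_ptr<ggml_context, decltype(&ggml_free)>;

static ggml_ctx_ptr make_meta_ctx(size_t n_tensors) {
    struct ggml_init_params params = {
        /*.mem_size =*/ ggml_tensor_overhead() * (n_tensors + 1),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc =*/ true,
    };
    return ggml_ctx_ptr(ggml_init(params), &ggml_free);
}

The raw pointer is still available for calls such as ggml_get_first_tensor(ctx.get()), and the context is released when the guard goes out of scope, including on the throw path.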