improve the compute buffer estimate

Zonghang Li 2025-06-19 08:02:43 +00:00
parent 0b4ffdfde5
commit dd589561b4
8 changed files with 87 additions and 34 deletions


@@ -765,6 +765,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.force = true;
}
).set_env("LLAMA_ARG_FORCE"));
+add_opt(llama_arg(
+{"--master-priority"}, "N",
+format("priority to assign workload to the master (default: %f, set 1.01 to use master first, and 0.99 to offload to other devices)", params.master_priority),
+[](gpt_params & params, const std::string & value) {
+params.master_priority = std::stof(value);
+}
+).set_env("LLAMA_ARG_MASTER_PRIORITY"));
// #ifdef GGML_USE_METAL
// // warn: if the output layer weights are not kept in metal shared memory, its mmap-ed weight data
// // could be released by the OS and reloaded repeatedly, which causes additional disk I/O latency.


@@ -1053,7 +1053,7 @@ static bool assign_layers_to_device(
GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
-llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m] * k, n[m] * k);
+llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, dev_info_set[0].model_bytes, w[m] > n[m]);
int l_m = w[m] * k; // total number of layers assigned to device m
int l_m_gpu = n[m] * k; // number of layers assigned to device m that run on GPU
@@ -1248,10 +1248,8 @@ static bool assign_layers_to_device(
return cost * k;
}
);
-// apply higher priority to the head device, here 0.99 is a heuristic value
-// to ensure that small models in homogeneous clusters result in 32:0 partitioning,
-// rather than 1:31.
-model.lp_.col_cost_[0] *= 0.99;
+// apply priority to the head device
+model.lp_.col_cost_[0] *= 1.0 / cparams.master_priority;
// define the variable bounds
model.lp_.col_lower_ = std::vector<double>(n_world * 2, 0.0);
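
The reciprocal scaling above generalizes the removed 0.99 heuristic: a --master-priority above 1.0 makes the head device's effective per-layer cost in the LP objective (model.lp_.col_cost_[0]) slightly cheaper, so layers stay on the master, while a value below 1.0 makes it more expensive and pushes layers to the workers. A minimal standalone sketch of that effect, not part of the patch, assuming all devices start with the same raw per-layer cost:

#include <cstdio>
#include <initializer_list>
#include <vector>

// Illustration only: device 0 is the master and all raw per-layer costs are
// equal, so the reciprocal scaling alone decides where layers are placed.
int main() {
    std::vector<double> col_cost = {10.0, 10.0, 10.0}; // raw per-layer LP costs
    for (double master_priority : {1.01, 0.99}) {
        double head_cost = col_cost[0] * (1.0 / master_priority); // same rule as the patch
        printf("master_priority=%.2f: head=%.4f worker=%.4f -> %s\n",
               master_priority, head_cost, col_cost[1],
               head_cost < col_cost[1] ? "keep layers on the master"
                                       : "offload layers to the workers");
    }
    return 0;
}

With the default of 1.01 the scale factor is about 0.990, so a small model on a homogeneous cluster still collapses to a 32:0 split on the master, matching the intent of the old hard-coded 0.99; passing --master-priority 0.99 (or setting LLAMA_ARG_MASTER_PRIORITY) flips the factor to about 1.010 and favors offloading.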
@@ -1524,7 +1522,7 @@ static bool assign_layers_to_device(
for (uint32_t m = 0; m < n_world; ++m) {
const device_info & dev = dev_info_set[m];
bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
-llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m], n[m]);
+llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, dev_info_set[0].model_bytes);
if (dev.gpu_support.cuda || dev.gpu_support.metal) {
int64_t required_mem = w[m] * b_prime;
@@ -2024,6 +2022,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.rank = params.rank;
cparams.prefetch = params.prefetch;
cparams.force = params.force;
+cparams.master_priority = params.master_priority;
cparams.keep_out_in_metal = params.keep_out_in_metal;
cparams.n_gpu_layers = params.n_gpu_layers;
cparams.n_cycles = params.n_cycles;


@@ -152,6 +152,7 @@ struct gpt_params {
bool prefetch = false; // prefetch layer weights
bool keep_out_in_metal = true; // whether to keep output weights in metal memory, true by default
bool force = false; // force to start prefetching after computation
+float master_priority = 1.01; // priority to assign workload to the master (set 1.01 to use master first, and 0.99 to offload to other devices)
int32_t gpu_mem = 999.0; // gpu memory to use, in GiB
int32_t n_cycles = 0; // number of cycles to output one token
int32_t n_predict = -1; // new tokens to predict


@@ -1603,10 +1603,10 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
-llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true, true, n_layers, n_gpu_layers);
+llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true, true, n_bytes, n_layers > n_gpu_layers);
#else
llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
-llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false, true, n_layers, n_gpu_layers);
+llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false, true, n_bytes, n_layers > n_gpu_layers);
#endif
double cpu_kv_size_gib = static_cast<double>(cpu_kv_size) / 1024.0 / 1024.0 / 1024.0; // convert to GiB


@@ -293,10 +293,20 @@ struct model_bytes {
int64_t nb_layer;
int64_t nb_output;
+// used to estimate the compute buffer size
+int64_t nb_output_w;
+int64_t nb_attn_norm_w;
+int64_t nb_ffn_gate_w;
+int64_t nb_ffn_down_w;
model_bytes() :
-nb_input (0),
-nb_layer (0),
-nb_output(0) {}
+nb_input (0),
+nb_layer (0),
+nb_output (0),
+nb_output_w (0),
+nb_attn_norm_w(0),
+nb_ffn_gate_w (0),
+nb_ffn_down_w (0) {}
};
struct disk_props {
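
The four new nb_*_w fields record the byte sizes of individual weight tensors (output projection, attention norm, FFN gate, FFN down), and the reworked llama_model_compute_buf_size now takes this model_bytes struct plus a flag for whether any layers remain on the CPU backend (w[m] > n[m] above, n_layers > n_gpu_layers in the profiler) instead of raw layer counts. The estimator itself is not shown in this diff; the sketch below is only a rough illustration, under the assumptions spelled out in the comments and with invented names, of how per-tensor byte counts can be turned back into tensor widths and an activation-buffer bound.

#include <algorithm>
#include <cstdint>

// Rough sketch only -- NOT the estimator implemented by this commit.
// Assumptions: norm weights are stored as f32 (4 bytes per element), the FFN
// gate weight is an n_embd x n_ff matrix stored at bytes_per_weight bytes per
// element, and the dominant compute-buffer tensor is a single f32 activation
// of width max(n_embd, n_ff) per token in the batch.
static int64_t sketch_compute_buf_bytes(int64_t nb_attn_norm_w,   // ~ n_embd * 4
                                        int64_t nb_ffn_gate_w,    // ~ n_embd * n_ff * bytes_per_weight
                                        double  bytes_per_weight, // e.g. ~0.5 for 4-bit quantization
                                        int64_t n_tokens) {
    const int64_t n_embd = nb_attn_norm_w / 4;  // recover the hidden size
    if (n_embd <= 0) return 0;
    const int64_t n_ff = (int64_t)(nb_ffn_gate_w / (bytes_per_weight * (double)n_embd));
    return n_tokens * std::max(n_embd, n_ff) * (int64_t)sizeof(float);
}

Deriving tensor widths from actual weight sizes lets the estimate track the model's real shape, which the old per-layer counts (w[m] * k and n[m] * k) could only approximate.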