From dd589561b42a8a2ab49ef7526bcd942335c7807c Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Thu, 19 Jun 2025 08:02:43 +0000
Subject: [PATCH] improve the computing buffer estimate

---
 README.md           |  1 +
 common/arg.cpp      |  7 +++++
 common/common.cpp   | 11 +++----
 common/common.h     |  1 +
 common/profiler.cpp |  4 +--
 common/profiler.h   | 16 ++++++++--
 include/llama.h     |  5 +--
 src/llama.cpp       | 76 ++++++++++++++++++++++++++++++++-------------
 8 files changed, 87 insertions(+), 34 deletions(-)

diff --git a/README.md b/README.md
index c69194f0..300ffa6a 100644
--- a/README.md
+++ b/README.md
@@ -135,6 +135,7 @@ mkdir build && cd build
 cmake ..
 make -j$(nproc)
 sudo make install
+sudo ldconfig
 ```
 
 **macOS:**
diff --git a/common/arg.cpp b/common/arg.cpp
index e282c80d..f1a33372 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -765,6 +765,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.force = true;
         }
     ).set_env("LLAMA_ARG_FORCE"));
+    add_opt(llama_arg(
+        {"--master-priority"}, "N",
+        format("priority to assign workload to the master (default: %f, set 1.01 to use master first, and 0.99 to offload to other devices)", params.master_priority),
+        [](gpt_params & params, const std::string & value) {
+            params.master_priority = std::stof(value);
+        }
+    ).set_env("LLAMA_ARG_MASTER_PRIORITY"));
 // #ifdef GGML_USE_METAL
 //     // warn: if the output layer weights are not kept in metal shared memory, its mmap-ed weight data
 //     // could be released by the OS and reloaded repeatedly, which causes additional disk I/O latency.
diff --git a/common/common.cpp b/common/common.cpp
index 39b95d32..cc4536b6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1053,7 +1053,7 @@ static bool assign_layers_to_device(
         GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
 
         bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m] * k, n[m] * k);
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, dev_info_set[0].model_bytes, w[m] > n[m]);
         int l_m = w[m] * k; // total number of layers assigned to device m
         int l_m_gpu = n[m] * k; // number of layers assigned to device m that run on GPU
 
@@ -1248,10 +1248,8 @@ static bool assign_layers_to_device(
             return cost * k;
         }
     );
-    // apply higher priority to the head device, here 0.99 is a heuristic value
-    // to ensure that small models in homogeneous clusters result in 32:0 partitioning,
-    // rather than 1:31.
-    model.lp_.col_cost_[0] *= 0.99;
+    // apply priority to the head device
+    model.lp_.col_cost_[0] *= 1.0 / cparams.master_priority;
 
     // define the variable bounds
     model.lp_.col_lower_ = std::vector<double>(n_world * 2, 0.0);
@@ -1524,7 +1522,7 @@ static bool assign_layers_to_device(
     for (uint32_t m = 0; m < n_world; ++m) {
         const device_info & dev = dev_info_set[m];
         bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m], n[m]);
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, dev_info_set[0].model_bytes, w[m] > n[m]);
 
         if (dev.gpu_support.cuda || dev.gpu_support.metal) {
             int64_t required_mem = w[m] * b_prime;
@@ -2024,6 +2022,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.rank = params.rank;
     cparams.prefetch = params.prefetch;
     cparams.force = params.force;
+    cparams.master_priority = params.master_priority;
     cparams.keep_out_in_metal = params.keep_out_in_metal;
     cparams.n_gpu_layers = params.n_gpu_layers;
     cparams.n_cycles = params.n_cycles;
diff --git a/common/common.h b/common/common.h
index c6ffe136..cd78c173 100644
--- a/common/common.h
+++ b/common/common.h
@@ -152,6 +152,7 @@ struct gpt_params {
     bool prefetch = false; // prefetch layer weights
     bool keep_out_in_metal = true; // whether to keep output weights in metal memory, true by default
     bool force = false; // force to start prefetching after computation
+    float master_priority = 1.01; // priority to assign workload to the master (set 1.01 to use master first, and 0.99 to offload to other devices)
     int32_t gpu_mem = 999.0; // gpu memory to use, in GiB
     int32_t n_cycles = 0; // number of cycles to output one token
     int32_t n_predict = -1; // new tokens to predict
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 18fe795d..292dc026 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -1603,10 +1603,10 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
 
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
     llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
-    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true, true, n_layers, n_gpu_layers);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true, true, n_bytes, n_layers > n_gpu_layers);
 #else
     llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
-    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false, true, n_layers, n_gpu_layers);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false, true, n_bytes, n_layers > n_gpu_layers);
 #endif
 
     double cpu_kv_size_gib = static_cast<double>(cpu_kv_size) / 1024.0 / 1024.0 / 1024.0; // convert to GiB
diff --git a/common/profiler.h b/common/profiler.h
index ff69a454..c904ef98 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -293,10 +293,20 @@ struct model_bytes {
     int64_t nb_layer;
     int64_t nb_output;
 
+    // used to estimate the compute buffer size
+    int64_t nb_output_w;
+    int64_t nb_attn_norm_w;
+    int64_t nb_ffn_gate_w;
+    int64_t nb_ffn_down_w;
+
     model_bytes() :
-        nb_input (0),
-        nb_layer (0),
-        nb_output(0) {}
+        nb_input      (0),
+        nb_layer      (0),
+        nb_output     (0),
+        nb_output_w   (0),
+        nb_attn_norm_w(0),
+        nb_ffn_gate_w (0),
+        nb_ffn_down_w (0) {}
 };
 
 struct disk_props {
diff --git a/include/llama.h b/include/llama.h
index 3c220562..c61dd851 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -327,6 +327,7 @@ extern "C" {
         uint32_t n_cycles; // number of cycles to output one token
         bool prefetch; // whether to prefetch layer weights
         bool force; // force to start prefetching after computation
+        float master_priority; // priority to assign workload to the master (set 1.01 to use master first, and 0.99 to offload to other devices)
         bool keep_out_in_metal; // whether to keep output weights in metal memory
         char * master_ip; // ip address of the master node
         char * next_node_ip; // ip address of the next node
@@ -575,8 +576,8 @@ extern "C" {
             const struct llama_context_params cparams,
             bool use_gpu,
             bool is_master,
-            int n_layers,
-            int n_gpu_layers);
+            struct model_bytes n_bytes,
+            bool offload);
 
     // Return the size of KV cache in the model
     LLAMA_API void llama_total_kv_size(
diff --git a/src/llama.cpp b/src/llama.cpp
index 8b5af567..15255ea0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3679,6 +3679,8 @@ void llama_profile_device(
     dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw();
     dev_info->gpu_props.metal_mem_cpy_delay = device_metal_mem_copy(model);
     dev_info->gpu_props.cuda_mem_cpy_delay = device_cuda_mem_copy(model);
+#else
+    (void)gpu_mem;
 #endif
 
     if (is_dtype_exist(n_params, GGML_TYPE_F32)) {
@@ -20263,6 +20265,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_cycles =*/ 0,
         /*.prefetch =*/ false,
         /*.force =*/ false,
+        /*.master_priority =*/ 1.01,
         /*.keep_out_in_metal =*/ true,
         /*.master_ip =*/ nullptr,
         /*.next_node_ip =*/ nullptr,
@@ -21860,8 +21863,8 @@ void llama_model_compute_buf_size(
         const struct llama_context_params cparams,
         bool use_gpu,
         bool is_master,
-        int n_layers,
-        int n_gpu_layers) {
+        struct model_bytes n_bytes,
+        bool offload) {
     const llama_hparams hparams = model->hparams;
 
     // input tensors
@@ -21872,34 +21875,61 @@ void llama_model_compute_buf_size(
     const int64_t n_bak_embd = hparams.n_embd * cparams.n_ubatch;
     const int64_t n_inp_pos = cparams.n_ubatch;
     const int64_t n_kq_mask = cparams.n_ctx * cparams.n_ubatch;
-    const int64_t n_inp_out_ids = cparams.n_ubatch;
     const int64_t n_norm = hparams.n_embd * cparams.n_ubatch;
-    const int64_t n_qcur = hparams.n_embd * cparams.n_ubatch * 2;
-    const int64_t n_kq = cparams.n_ctx * cparams.n_ubatch * hparams.n_head();
+    const int64_t n_qcur = hparams.n_embd * cparams.n_ubatch;
+    const int64_t n_ffn_gate = hparams.n_ff() * cparams.n_ubatch;
+    const int64_t n_ffn_up = hparams.n_ff() * cparams.n_ubatch;
+    const int64_t n_inp_out_ids = cparams.n_ubatch;
 
     // outputs
     const int64_t n_out_embd = hparams.n_embd * cparams.n_ubatch;
-    const int64_t n_output = hparams.n_vocab * cparams.n_ubatch;
+    const int64_t n_result = hparams.n_vocab * cparams.n_ubatch;
 
-    // compute buffer size for input, each layer, and output
-    const int64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32);
-    const int64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask +
-                               n_inp_out_ids + n_norm + n_qcur + n_kq
-                              ) * ggml_type_size(GGML_TYPE_F32);
-    const int64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32);
-
-    *cpu_buf = 0;
-    *gpu_buf = 0;
-    if (is_master) *cpu_buf = n_buf_inp + n_buf_out;
+    // weights
+    const int64_t nb_output_w = n_bytes.nb_output_w;
+    const int64_t nb_attn_norm_w = n_bytes.nb_attn_norm_w;
+    const int64_t nb_ffn_gate_w = n_bytes.nb_ffn_gate_w;
+    const int64_t nb_ffn_down_w = n_bytes.nb_ffn_down_w;
+
+    const int64_t nb_act_buf_base = (n_bak_embd + n_norm + n_inp_pos + n_ffn_gate) * ggml_type_size(GGML_TYPE_F32);
+    *gpu_buf = use_gpu ? nb_act_buf_base : 0;
+    *cpu_buf = nb_act_buf_base;
+    int64_t gpu_host_buf = 0;
+
+    // estimate GPU computing buffer and GPU-host computing buffer
     if (use_gpu) {
-        *gpu_buf += n_buf_act;
-        if (n_layers > n_gpu_layers) {
-            *cpu_buf += n_buf_act;
+        if (is_master) {
+            if (offload) {
+                *gpu_buf += (n_ffn_up + n_qcur) * ggml_type_size(GGML_TYPE_F32) + nb_attn_norm_w + nb_ffn_gate_w;
+            } else {
+                *gpu_buf += (n_ffn_up + n_qcur + n_kq_mask + n_inp_out_ids) * ggml_type_size(GGML_TYPE_F32);
+            }
+            *gpu_buf += (n_out_embd + n_result) * ggml_type_size(GGML_TYPE_F32) + nb_output_w;
+            gpu_host_buf = (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_out_embd) * ggml_type_size(GGML_TYPE_F32);
+        } else {
+            if (offload) {
+                *gpu_buf += n_qcur * ggml_type_size(GGML_TYPE_F32) + nb_attn_norm_w + nb_ffn_gate_w + nb_ffn_down_w;
+            } else {
+                *gpu_buf += (n_ffn_up + n_kq_mask) * ggml_type_size(GGML_TYPE_F32);
+            }
+            gpu_host_buf = (n_bak_embd + n_inp_pos + n_kq_mask) * ggml_type_size(GGML_TYPE_F32);
         }
-    } else {
-        *cpu_buf += n_buf_act;
+    }
+
+    // estimate CPU computing buffer
+    {
+        if (is_master) {
+            *cpu_buf += (n_ffn_up + n_kq_mask + n_inp_out_ids + n_qcur + n_inp_toks + n_inp_embd + n_out_embd + n_result) * ggml_type_size(GGML_TYPE_F32);
+        } else {
+            *cpu_buf += (n_ffn_up + n_kq_mask) * ggml_type_size(GGML_TYPE_F32);
+        }
+        *cpu_buf += gpu_host_buf;
     }
+
+    LLAMA_LOG_INFO("%s: compute buffer size = %7.2f MiB (GPU) + %7.2f MiB (GPU-Host)\n", __func__,
+        *gpu_buf / (1024.0 * 1024.0), gpu_host_buf / (1024.0 * 1024.0));
+    LLAMA_LOG_INFO("%s: compute buffer size = %7.2f MiB (CPU)\n", __func__,
+        *cpu_buf / (1024.0 * 1024.0));
 }
 
 void llama_total_kv_size(
@@ -22045,6 +22075,7 @@ void llama_model_n_flops(
         if (blk_suffix == "attn_norm.weight" || blk_suffix == "ffn_norm.weight") {
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 4 * n_embd + 1); // rms norm
             count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd); // norm weights
+            n_bytes->nb_attn_norm_w = std::max(n_bytes->nb_attn_norm_w, (int64_t)ggml_nbytes(cur));
         } else if (blk_suffix == "attn_q.weight") {
             count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd); // rope
@@ -22062,9 +22093,11 @@ void llama_model_n_flops(
         } else if (blk_suffix == "ffn_gate.weight") {
             count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 8 * n_ff); // SiLU
+            n_bytes->nb_ffn_gate_w = std::max(n_bytes->nb_ffn_gate_w, (int64_t)ggml_nbytes(cur));
         } else if (blk_suffix == "ffn_down.weight") {
             count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
+            n_bytes->nb_ffn_down_w = std::max(n_bytes->nb_ffn_down_w, (int64_t)ggml_nbytes(cur));
         } else if (blk_suffix == "ffn_up.weight") {
             count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_ff); // silu(gate(x)) * up(x)
@@ -22097,6 +22130,7 @@ void llama_model_n_flops(
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_vocab); // softmax
             count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
             count_n_bytes (n_bytes, PROFILER_LAYER_OUTPUT, ggml_nbytes(cur));
+            n_bytes->nb_output_w = std::max(n_bytes->nb_output_w, (int64_t)ggml_nbytes(cur));
         } else if (tensor_name == "rope_freqs.weight") {
             if (!rope_used) {
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));