Mirror of https://github.com/Lizonghang/prima.cpp.git

Commit dd589561b4 (parent 0b4ffdfde5): improve the computing buffer estimate

8 changed files with 87 additions and 34 deletions
@@ -135,6 +135,7 @@ mkdir build && cd build
 cmake ..
 make -j$(nproc)
 sudo make install
 sudo ldconfig
 ```
 
 **macOS:**
@@ -765,6 +765,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.force = true;
         }
     ).set_env("LLAMA_ARG_FORCE"));
+    add_opt(llama_arg(
+        {"--master-priority"}, "N",
+        format("priority to assign workload to the master (default: %f, set 1.01 to use master first, and 0.99 to offload to other devices)", params.master_priority),
+        [](gpt_params & params, const std::string & value) {
+            params.master_priority = std::stof(value);
+        }
+    ).set_env("LLAMA_ARG_MASTER_PRIORITY"));
     // #ifdef GGML_USE_METAL
     // // warn: if the output layer weights are not kept in metal shared memory, its mmap-ed weight data
     // // could be released by the OS and reloaded repeatedly, which causes additional disk I/O latency.
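The new option is parsed with `std::stof` and can also be supplied through the `LLAMA_ARG_MASTER_PRIORITY` environment variable registered by `.set_env(...)`. As a rough, self-contained illustration of that flag-then-env lookup (the helper `read_master_priority` below is hypothetical, not part of prima.cpp's parser):

```cpp
#include <cstdlib>
#include <iostream>
#include <string>

// Hypothetical helper: resolves a "--master-priority N" flag with an
// LLAMA_ARG_MASTER_PRIORITY environment-variable fallback to a float.
static float read_master_priority(int argc, char ** argv, float def = 1.01f) {
    for (int i = 1; i + 1 < argc; ++i) {
        if (std::string(argv[i]) == "--master-priority") {
            return std::stof(argv[i + 1]);   // same conversion the lambda uses
        }
    }
    if (const char * env = std::getenv("LLAMA_ARG_MASTER_PRIORITY")) {
        return std::stof(env);               // env var when the flag is absent
    }
    return def;                              // default slightly favors the master
}

int main(int argc, char ** argv) {
    std::cout << "master_priority = " << read_master_priority(argc, argv) << "\n";
    return 0;
}
```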
@@ -1053,7 +1053,7 @@ static bool assign_layers_to_device(
         GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
 
         bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m] * k, n[m] * k);
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, dev_info_set[0].model_bytes, w[m] > n[m]);
 
         int l_m = w[m] * k; // total number of layers assigned to device m
         int l_m_gpu = n[m] * k; // number of layers assigned to device m that run on GPU
@@ -1248,10 +1248,8 @@ static bool assign_layers_to_device(
             return cost * k;
         }
     );
-    // apply higher priority to the head device, here 0.99 is a heuristic value
-    // to ensure that small models in homogeneous clusters result in 32:0 partitioning,
-    // rather than 1:31.
-    model.lp_.col_cost_[0] *= 0.99;
+    // apply priority to the head device
+    model.lp_.col_cost_[0] *= 1.0 / cparams.master_priority;
 
     // define the variable bounds
     model.lp_.col_lower_ = std::vector<double>(n_world * 2, 0.0);
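Scaling the head device's objective coefficient by `1.0 / cparams.master_priority` replaces the fixed 0.99 heuristic: a priority above 1 makes the master look cheaper to the solver, a priority below 1 makes it look more expensive. A minimal sketch of that effect on a toy cost-minimizing assignment (illustrative costs only, with a greedy argmin standing in for the actual HiGHS LP):

```cpp
#include <algorithm>
#include <iostream>
#include <vector>

int main() {
    // Per-layer costs for three devices; device 0 is the master.
    // These numbers are made up for illustration.
    std::vector<double> col_cost = {1.00, 1.00, 1.05};

    for (double master_priority : {1.01, 1.00, 0.99}) {
        std::vector<double> cost = col_cost;
        cost[0] *= 1.0 / master_priority;   // same scaling the commit applies

        // A greedy stand-in for the LP: place the layer on the cheapest device.
        auto it = std::min_element(cost.begin(), cost.end());
        std::cout << "master_priority=" << master_priority
                  << " -> scaled master cost=" << cost[0]
                  << ", cheapest device=" << (it - cost.begin()) << "\n";
    }
    return 0;
}
```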
@@ -1524,7 +1522,7 @@ static bool assign_layers_to_device(
     for (uint32_t m = 0; m < n_world; ++m) {
         const device_info & dev = dev_info_set[m];
         bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m], n[m]);
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, dev_info_set[0].model_bytes);
 
         if (dev.gpu_support.cuda || dev.gpu_support.metal) {
             int64_t required_mem = w[m] * b_prime;
@@ -2024,6 +2022,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.rank = params.rank;
     cparams.prefetch = params.prefetch;
     cparams.force = params.force;
+    cparams.master_priority = params.master_priority;
     cparams.keep_out_in_metal = params.keep_out_in_metal;
     cparams.n_gpu_layers = params.n_gpu_layers;
     cparams.n_cycles = params.n_cycles;
@@ -152,6 +152,7 @@ struct gpt_params {
     bool prefetch = false; // prefetch layer weights
     bool keep_out_in_metal = true; // whether to keep output weights in metal memory, true by default
     bool force = false; // force to start prefetching after computation
+    float master_priority = 1.01; // priority to assign workload to the master (set 1.01 to use master first, and 0.99 to offload to other devices)
     int32_t gpu_mem = 999.0; // gpu memory to use, in GiB
     int32_t n_cycles = 0; // number of cycles to output one token
     int32_t n_predict = -1; // new tokens to predict
@@ -1603,10 +1603,10 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
 
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
     llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
-    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true, true, n_layers, n_gpu_layers);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true, true, n_bytes, n_layers > n_gpu_layers);
 #else
     llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
-    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false, true, n_layers, n_gpu_layers);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false, true, n_bytes, n_layers > n_gpu_layers);
 #endif
 
     double cpu_kv_size_gib = static_cast<double>(cpu_kv_size) / 1024.0 / 1024.0 / 1024.0; // convert to GiB
@@ -293,10 +293,20 @@ struct model_bytes {
     int64_t nb_layer;
     int64_t nb_output;
 
+    // used to estimate the compute buffer size
+    int64_t nb_output_w;
+    int64_t nb_attn_norm_w;
+    int64_t nb_ffn_gate_w;
+    int64_t nb_ffn_down_w;
+
     model_bytes() :
-        nb_input (0),
-        nb_layer (0),
-        nb_output(0) {}
+        nb_input      (0),
+        nb_layer      (0),
+        nb_output     (0),
+        nb_output_w   (0),
+        nb_attn_norm_w(0),
+        nb_ffn_gate_w (0),
+        nb_ffn_down_w (0) {}
 };
 
 struct disk_props {
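The four new `nb_*_w` fields hold the byte size of the largest weight tensor of each kind seen while profiling; the `llama_model_n_flops` hunks further down update them with `std::max(..., ggml_nbytes(cur))`. A stripped-down, self-contained sketch of that max-tracking pattern, using made-up byte counts in place of `ggml_nbytes`:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

struct model_bytes_sketch {            // mirrors only the new fields
    int64_t nb_attn_norm_w = 0;
    int64_t nb_ffn_gate_w  = 0;
};

int main() {
    // Pretend byte sizes of blk.*.attn_norm.weight and blk.*.ffn_gate.weight
    // across layers (illustrative numbers, not read from a real model).
    std::vector<int64_t> attn_norm_bytes = {16384, 16384, 16384};
    std::vector<int64_t> ffn_gate_bytes  = {58720256, 58720256, 58720256};

    model_bytes_sketch nb;
    for (size_t i = 0; i < attn_norm_bytes.size(); ++i) {
        nb.nb_attn_norm_w = std::max(nb.nb_attn_norm_w, attn_norm_bytes[i]);
        nb.nb_ffn_gate_w  = std::max(nb.nb_ffn_gate_w,  ffn_gate_bytes[i]);
    }
    std::cout << "largest attn_norm weight: " << nb.nb_attn_norm_w << " bytes\n"
              << "largest ffn_gate  weight: " << nb.nb_ffn_gate_w  << " bytes\n";
    return 0;
}
```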
@@ -327,6 +327,7 @@ extern "C" {
         uint32_t n_cycles; // number of cycles to output one token
         bool prefetch; // whether to prefetch layer weights
         bool force; // force to start prefetching after computation
+        float master_priority; // priority to assign workload to the master (set 1.01 to use master first, and 0.99 to offload to other devices)
         bool keep_out_in_metal; // whether to keep output weights in metal memory
         char * master_ip; // ip address of the master node
         char * next_node_ip; // ip address of the next node

@@ -575,8 +576,8 @@ extern "C" {
             const struct llama_context_params cparams,
             bool use_gpu,
             bool is_master,
-            int n_layers,
-            int n_gpu_layers);
+            struct model_bytes n_bytes,
+            bool offload);
 
     // Return the size of KV cache in the model
     LLAMA_API void llama_total_kv_size(
@@ -3679,6 +3679,8 @@ void llama_profile_device(
     dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw();
     dev_info->gpu_props.metal_mem_cpy_delay = device_metal_mem_copy(model);
     dev_info->gpu_props.cuda_mem_cpy_delay = device_cuda_mem_copy(model);
+#else
+    (void)gpu_mem;
 #endif
 
     if (is_dtype_exist(n_params, GGML_TYPE_F32)) {
@@ -20263,6 +20265,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_cycles =*/ 0,
         /*.prefetch =*/ false,
         /*.force =*/ false,
+        /*.master_priority =*/ 1.01,
         /*.keep_out_in_metal =*/ true,
         /*.master_ip =*/ nullptr,
        /*.next_node_ip =*/ nullptr,
@@ -21860,8 +21863,8 @@ void llama_model_compute_buf_size(
         const struct llama_context_params cparams,
         bool use_gpu,
         bool is_master,
-        int n_layers,
-        int n_gpu_layers) {
+        struct model_bytes n_bytes,
+        bool offload) {
     const llama_hparams hparams = model->hparams;
 
     // input tensors
@@ -21872,34 +21875,61 @@ void llama_model_compute_buf_size(
     const int64_t n_bak_embd = hparams.n_embd * cparams.n_ubatch;
     const int64_t n_inp_pos = cparams.n_ubatch;
     const int64_t n_kq_mask = cparams.n_ctx * cparams.n_ubatch;
-    const int64_t n_inp_out_ids = cparams.n_ubatch;
     const int64_t n_norm = hparams.n_embd * cparams.n_ubatch;
-    const int64_t n_qcur = hparams.n_embd * cparams.n_ubatch * 2;
-    const int64_t n_kq = cparams.n_ctx * cparams.n_ubatch * hparams.n_head();
+    const int64_t n_qcur = hparams.n_embd * cparams.n_ubatch;
+    const int64_t n_ffn_gate = hparams.n_ff() * cparams.n_ubatch;
+    const int64_t n_ffn_up = hparams.n_ff() * cparams.n_ubatch;
+    const int64_t n_inp_out_ids = cparams.n_ubatch;
 
     // outputs
     const int64_t n_out_embd = hparams.n_embd * cparams.n_ubatch;
-    const int64_t n_output = hparams.n_vocab * cparams.n_ubatch;
+    const int64_t n_result = hparams.n_vocab * cparams.n_ubatch;
 
-    // compute buffer size for input, each layer, and output
-    const int64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32);
-    const int64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask +
-                               n_inp_out_ids + n_norm + n_qcur + n_kq
-                              ) * ggml_type_size(GGML_TYPE_F32);
-    const int64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32);
-
-    *cpu_buf = 0;
-    *gpu_buf = 0;
-    if (is_master) *cpu_buf = n_buf_inp + n_buf_out;
+    // weights
+    const int64_t nb_output_w = n_bytes.nb_output_w;
+    const int64_t nb_attn_norm_w = n_bytes.nb_attn_norm_w;
+    const int64_t nb_ffn_gate_w = n_bytes.nb_ffn_gate_w;
+    const int64_t nb_ffn_down_w = n_bytes.nb_ffn_down_w;
+
+    const int64_t nb_act_buf_base = (n_bak_embd + n_norm + n_inp_pos + n_ffn_gate) * ggml_type_size(GGML_TYPE_F32);
+    *gpu_buf = use_gpu ? nb_act_buf_base : 0;
+    *cpu_buf = nb_act_buf_base;
+    int64_t gpu_host_buf = 0;
+
+    // estimate GPU computing buffer and GPU-host computing buffer
     if (use_gpu) {
-        *gpu_buf += n_buf_act;
-        if (n_layers > n_gpu_layers) {
-            *cpu_buf += n_buf_act;
+        if (is_master) {
+            if (offload) {
+                *gpu_buf += (n_ffn_up + n_qcur) * ggml_type_size(GGML_TYPE_F32) + nb_attn_norm_w + nb_ffn_gate_w;
+            } else {
+                *gpu_buf += (n_ffn_up + n_qcur + n_kq_mask + n_inp_out_ids) * ggml_type_size(GGML_TYPE_F32);
+            }
+            *gpu_buf += (n_out_embd + n_result) * ggml_type_size(GGML_TYPE_F32) + nb_output_w;
+            gpu_host_buf = (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_out_embd) * ggml_type_size(GGML_TYPE_F32);
+        } else {
+            if (offload) {
+                *gpu_buf += n_qcur * ggml_type_size(GGML_TYPE_F32) + nb_attn_norm_w + nb_ffn_gate_w + nb_ffn_down_w;
+            } else {
+                *gpu_buf += (n_ffn_up + n_kq_mask) * ggml_type_size(GGML_TYPE_F32);
+            }
+            gpu_host_buf = (n_bak_embd + n_inp_pos + n_kq_mask) * ggml_type_size(GGML_TYPE_F32);
         }
-    } else {
-        *cpu_buf += n_buf_act;
     }
 
+    // estimate CPU computing buffer
+    {
+        if (is_master) {
+            *cpu_buf += (n_ffn_up + n_kq_mask + n_inp_out_ids + n_qcur + n_inp_toks + n_inp_embd + n_out_embd + n_result) * ggml_type_size(GGML_TYPE_F32);
+        } else {
+            *cpu_buf += (n_ffn_up + n_kq_mask) * ggml_type_size(GGML_TYPE_F32);
+        }
+        *cpu_buf += gpu_host_buf;
+    }
+
+    LLAMA_LOG_INFO("%s: compute buffer size = %7.2f MiB (GPU) + %7.2f MiB (GPU-Host)\n", __func__,
+        *gpu_buf / (1024.0 * 1024.0), gpu_host_buf / (1024.0 * 1024.0));
+    LLAMA_LOG_INFO("%s: compute buffer size = %7.2f MiB (CPU)\n", __func__,
+        *cpu_buf / (1024.0 * 1024.0));
 }
 
 void llama_total_kv_size(
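To get a feel for the magnitudes the new estimate produces, here is a self-contained re-implementation of the CPU-only path (`use_gpu == false`) with hypothetical 7B-like shapes. Note that `n_inp_toks` and `n_inp_embd` are defined above this hunk in the real code, so the values assumed below (`n_ubatch` and `n_embd * n_ubatch`) are a guess made only for this illustration:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical 7B-like shapes -- illustrative only, not read from a GGUF.
    const int64_t n_embd   = 4096;
    const int64_t n_ff     = 11008;
    const int64_t n_vocab  = 32000;
    const int64_t n_ubatch = 512;
    const int64_t n_ctx    = 4096;
    const int64_t ts_f32   = 4;     // ggml_type_size(GGML_TYPE_F32)

    // Activation element counts, mirroring the updated estimate.
    const int64_t n_bak_embd    = n_embd  * n_ubatch;
    const int64_t n_inp_pos     = n_ubatch;
    const int64_t n_kq_mask     = n_ctx   * n_ubatch;
    const int64_t n_norm        = n_embd  * n_ubatch;
    const int64_t n_qcur        = n_embd  * n_ubatch;
    const int64_t n_ffn_gate    = n_ff    * n_ubatch;
    const int64_t n_ffn_up      = n_ff    * n_ubatch;
    const int64_t n_inp_out_ids = n_ubatch;
    const int64_t n_out_embd    = n_embd  * n_ubatch;
    const int64_t n_result      = n_vocab * n_ubatch;

    // Assumed input shapes (defined above the hunk in the real code).
    const int64_t n_inp_toks = n_ubatch;
    const int64_t n_inp_embd = n_embd * n_ubatch;

    // CPU-only path: base activation buffer plus the master's extra buffers.
    const int64_t base = (n_bak_embd + n_norm + n_inp_pos + n_ffn_gate) * ts_f32;
    const int64_t cpu_buf_master = base + (n_ffn_up + n_kq_mask + n_inp_out_ids + n_qcur +
                                           n_inp_toks + n_inp_embd + n_out_embd + n_result) * ts_f32;
    const int64_t cpu_buf_worker = base + (n_ffn_up + n_kq_mask) * ts_f32;

    printf("master CPU compute buffer ~ %.2f MiB\n", cpu_buf_master / (1024.0 * 1024.0));
    printf("worker CPU compute buffer ~ %.2f MiB\n", cpu_buf_worker / (1024.0 * 1024.0));
    return 0;
}
```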
@@ -22045,6 +22075,7 @@ void llama_model_n_flops(
         if (blk_suffix == "attn_norm.weight" || blk_suffix == "ffn_norm.weight") {
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 4 * n_embd + 1); // rms norm
             count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd); // norm weights
+            n_bytes->nb_attn_norm_w = std::max(n_bytes->nb_attn_norm_w, (int64_t)ggml_nbytes(cur));
         } else if (blk_suffix == "attn_q.weight") {
             count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd); // rope
@@ -22062,9 +22093,11 @@ void llama_model_n_flops(
         } else if (blk_suffix == "ffn_gate.weight") {
             count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 8 * n_ff); // SiLU
+            n_bytes->nb_ffn_gate_w = std::max(n_bytes->nb_ffn_gate_w, (int64_t)ggml_nbytes(cur));
         } else if (blk_suffix == "ffn_down.weight") {
             count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
+            n_bytes->nb_ffn_down_w = std::max(n_bytes->nb_ffn_down_w, (int64_t)ggml_nbytes(cur));
         } else if (blk_suffix == "ffn_up.weight") {
             count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_ff); // silu(gate(x)) * up(x)
@@ -22097,6 +22130,7 @@ void llama_model_n_flops(
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_vocab); // softmax
             count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
             count_n_bytes (n_bytes, PROFILER_LAYER_OUTPUT, ggml_nbytes(cur));
+            n_bytes->nb_output_w = std::max(n_bytes->nb_output_w, (int64_t)ggml_nbytes(cur));
         } else if (tensor_name == "rope_freqs.weight") {
             if (!rope_used) {
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));