fix compute buffer estimate: tested on cuda

Zonghang Li 2025-06-22 08:10:57 +00:00
parent 80e5b71b48
commit 45e8b0420c
5 changed files with 68 additions and 30 deletions

View file

@@ -1081,7 +1081,7 @@ static bool assign_layers_to_device(
             bool is_windows = strcmp(dev.device_os, "Windows") == 0;
             GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
-            llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, get_backend_type(dev.gpu_support), m == 0, dev_info_set[0].model_bytes, w[m] > n[m], n[m] > 0);
+            llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, get_backend_type(dev.gpu_support), m, dev_info_set[0].model_bytes, w[m] > n[m], n[m] > 0);
             int l_m     = w[m] * k; // total number of layers assigned to device m
             int l_m_gpu = n[m] * k; // number of layers assigned to device m that run on GPU
@@ -1242,6 +1242,7 @@ static bool assign_layers_to_device(
             if (dev.gpu_support.metal && m == 0 && cparams.keep_out_in_metal) {
                 vec_z_gpu[m] -= (double)bo / (double)(n_layer * b_prime);
             }
+            vec_z_gpu[m] = std::max(vec_z_gpu[m], 0.0f);
         }
     }
@@ -1554,7 +1555,7 @@ static bool assign_layers_to_device(
     for (uint32_t m = 0; m < n_world; ++m) {
         const device_info & dev = dev_info_set[m];
         bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, get_backend_type(dev.gpu_support), m == 0, dev_info_set[0].model_bytes, true);
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, get_backend_type(dev.gpu_support), m, dev_info_set[0].model_bytes, true);
         if (dev.gpu_support.cuda || dev.gpu_support.metal) {
             int64_t required_mem = w[m] * b_prime;
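The std::max clamp added to vec_z_gpu[m] above keeps the GPU layer share from going negative once the output-weight share bo / (n_layer * b_prime) is subtracted on the rank-0 Metal device. A minimal sketch of that failure mode with made-up numbers (the values of bo, n_layer, and b_prime below are illustrative stand-ins, not taken from the profiler):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    // illustrative values: a small model whose output weights are large
    // relative to the per-layer weight size
    double  z_gpu   = 0.02;                 // GPU share before the correction
    int64_t bo      = 512LL * 1024 * 1024;  // output-weight bytes (assumed)
    int     n_layer = 32;
    int64_t b_prime = 256LL * 1024 * 1024;  // per-layer bytes (assumed)

    z_gpu -= (double)bo / (double)(n_layer * b_prime);  // 0.02 - 0.0625 = -0.0425
    z_gpu  = std::max(z_gpu, 0.0);                      // the new clamp restores a valid share
    printf("z_gpu = %.4f\n", z_gpu);                    // prints 0.0000
    return 0;
}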

View file

@@ -1610,13 +1610,13 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
 #elif GGML_USE_CUDA
     backend = BACKEND_CUDA;
 #endif
-    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, backend, true, n_bytes, n_layers > n_gpu_layers, n_gpu_layers > 0);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, backend, 0, n_bytes, n_layers > n_gpu_layers, n_gpu_layers > 0);
 #else
     llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
     enum backend_type backend = BACKEND_CPU;
-    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, backend, true, n_bytes, n_layers > n_gpu_layers, n_gpu_layers > 0);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, backend, 0, n_bytes, n_layers > n_gpu_layers, n_gpu_layers > 0);
 #endif
     double cpu_kv_size_gib = static_cast<double>(cpu_kv_size) / 1024.0 / 1024.0 / 1024.0; // convert to GiB
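For reference, the trailing context line is the usual bytes-to-GiB conversion used throughout this estimator, and the log lines changed later in this commit use the analogous MiB divisor. A trivial standalone illustration, with a made-up 2 GiB KV size:

#include <cstdint>
#include <cstdio>

int main() {
    int64_t cpu_kv_size = 2LL * 1024 * 1024 * 1024;  // pretend the CPU KV cache needs 2 GiB
    double gib = static_cast<double>(cpu_kv_size) / 1024.0 / 1024.0 / 1024.0;  // same expression as above
    double mib = static_cast<double>(cpu_kv_size) / (1024.0 * 1024.0);         // MiB, as in the log output
    printf("cpu_kv_size = %.2f GiB (%.2f MiB)\n", gib, mib);                   // 2.00 GiB (2048.00 MiB)
    return 0;
}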

View file

@@ -295,7 +295,9 @@ struct model_bytes {
     // used to estimate the compute buffer size
     int64_t nb_output_w;
+    int64_t nb_output_norm_w;
     int64_t nb_attn_norm_w;
+    int64_t nb_attn_q_w;
     int64_t nb_ffn_gate_w;
     int64_t nb_ffn_down_w;
@@ -304,7 +306,9 @@ struct model_bytes {
         nb_layer        (0),
         nb_output       (0),
         nb_output_w     (0),
-        nb_attn_norm_w  (0),
+        nb_output_norm_w(0),
+        nb_attn_norm_w  (0),
+        nb_attn_q_w     (0),
         nb_ffn_gate_w   (0),
         nb_ffn_down_w   (0) {}
 };
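Taken together, the two hunks above leave the struct looking roughly like the sketch below. Only members visible in this diff are shown, and the default-constructor form is assumed from the initializer list:

struct model_bytes {
    int64_t nb_layer;
    int64_t nb_output;

    // used to estimate the compute buffer size
    int64_t nb_output_w;
    int64_t nb_output_norm_w;  // new in this commit
    int64_t nb_attn_norm_w;
    int64_t nb_attn_q_w;       // new in this commit
    int64_t nb_ffn_gate_w;
    int64_t nb_ffn_down_w;

    model_bytes() :
        nb_layer        (0),
        nb_output       (0),
        nb_output_w     (0),
        nb_output_norm_w(0),
        nb_attn_norm_w  (0),
        nb_attn_q_w     (0),
        nb_ffn_gate_w   (0),
        nb_ffn_down_w   (0) {}
};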

View file

@ -585,7 +585,7 @@ extern "C" {
const struct llama_model * model, const struct llama_model * model,
const struct llama_context_params cparams, const struct llama_context_params cparams,
enum backend_type backend, enum backend_type backend,
bool is_master, int my_rank,
struct model_bytes n_bytes, struct model_bytes n_bytes,
bool offload, bool offload,
bool has_gpu_layers); bool has_gpu_layers);
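A hypothetical call site for the updated declaration, mirroring the profiler calls earlier in this commit; the int64_t out-parameters and the helper itself are assumptions for illustration, not code from the repository:

// sketch: estimate compute buffers for one device, given its rank
static void estimate_buffers_for_rank(const struct llama_model * model,
                                      const struct llama_context_params cparams,
                                      struct model_bytes n_bytes,
                                      int my_rank) {
    int64_t cpu_buf = 0;
    int64_t gpu_buf = 0;
    llama_model_compute_buf_size(
        &cpu_buf, &gpu_buf,        // out: estimated CPU / GPU compute-buffer bytes
        model, cparams,
        BACKEND_CUDA,              // backend_type of this device
        my_rank,                   // was: bool is_master; rank 0 now implies master
        n_bytes,                   // per-tensor byte counts gathered by the profiler
        /*offload=*/true,          // some layers still run on the CPU
        /*has_gpu_layers=*/true);  // at least one layer runs on the GPU
}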

View file

@@ -21863,11 +21863,12 @@ void llama_model_compute_buf_size(
         const struct llama_model * model,
         const struct llama_context_params cparams,
         enum backend_type backend,
-        bool is_master,
+        int my_rank,
         struct model_bytes n_bytes,
         bool offload,
         bool has_gpu_layers) {
     const llama_hparams hparams = model->hparams;
+    bool is_master = my_rank == 0;
 
     // input tensors
     const int64_t n_inp_toks = cparams.n_ubatch;
@@ -21884,6 +21885,8 @@ void llama_model_compute_buf_size(
     const int64_t n_ffn_out     = hparams.n_embd * cparams.n_ubatch;
     const int64_t n_ffn_inp     = hparams.n_embd * cparams.n_ubatch;
     const int64_t n_kq          = cparams.n_ctx * cparams.n_ubatch * hparams.n_head();
+    const int64_t n_k           = cparams.n_ctx * hparams.n_embd_head_k * hparams.n_head_kv();
+    const int64_t n_v           = cparams.n_ctx * hparams.n_embd_head_v * hparams.n_head_kv();
     const int64_t n_inp_out_ids = cparams.n_ubatch;
 
     // outputs
@@ -21893,34 +21896,60 @@ void llama_model_compute_buf_size(
     // weights
     const int64_t nb_output_w    = n_bytes.nb_output_w;
     const int64_t nb_attn_norm_w = n_bytes.nb_attn_norm_w;
-    const int64_t nb_ffn_gate_w  = n_bytes.nb_ffn_gate_w;
-    const int64_t nb_ffn_down_w  = n_bytes.nb_ffn_down_w;
+    const int64_t nb_attn_q_w    = n_bytes.nb_attn_q_w;
 
+    // format bytes
     const int64_t type_size_f32 = ggml_type_size(GGML_TYPE_F32);
+    const int64_t type_size_f16 = ggml_type_size(GGML_TYPE_F16);
 
     bool use_gpu = backend != BACKEND_CPU && has_gpu_layers;
     *gpu_buf = 0;
     *cpu_buf = 0;
     int64_t gpu_host_buf = 0;
 
-    // GPU compute buffer
+    // estimate the GPU compute buffer, here we can only estimate the upper bound of various models,
+    // but cannot estimate the exact value.
     if (backend == BACKEND_CUDA) {
-        const int64_t nb_act_buf_base = (n_bak_embd + n_norm + n_inp_pos + n_ffn_gate) * type_size_f32;
-        *gpu_buf = use_gpu ? nb_act_buf_base : 0;
+        *gpu_buf = (n_bak_embd + n_norm) * type_size_f32;
 
-        // CUDA computing buffer and CUDA-host buffer
         if (is_master) {
+            if (has_gpu_layers) {
                 if (offload) {
-                    *gpu_buf += (n_ffn_up + n_qcur) * type_size_f32 + nb_attn_norm_w + nb_ffn_gate_w;
+                    *gpu_buf += std::max<int64_t>({
+                        (n_qcur + n_inp_pos + n_kq_mask + n_inp_out_ids) * type_size_f32 + nb_attn_norm_w,
+                        (n_qcur + n_inp_pos + n_norm) * type_size_f32 + nb_attn_norm_w,
+                        (n_qcur + n_qcur + n_kq_mask + n_inp_pos) * type_size_f32,
+                        (n_qcur + n_qcur + n_inp_pos) * type_size_f32 + nb_attn_q_w,
+                        n_inp_pos * type_size_f32 + (n_k + n_v) * type_size_f16 + nb_attn_q_w
+                    });
                 } else {
-                    *gpu_buf += (n_ffn_up + n_qcur + n_kq_mask + n_inp_out_ids) * type_size_f32;
+                    *gpu_buf += (n_qcur + n_inp_pos + n_kq_mask + n_inp_out_ids) * type_size_f32;
+                }
+                *gpu_buf += (n_qcur + n_kq) * type_size_f32;
+            } else {
+                *gpu_buf += (n_qcur + n_kq) * type_size_f32;
+                *gpu_buf += std::max<int64_t>({
+                    (n_kq_mask + n_qcur + n_inp_pos) * type_size_f32 + nb_attn_norm_w,
+                    (n_inp_pos + n_kq_mask) * type_size_f32 + n_v * type_size_f16 + nb_attn_norm_w,
+                });
             }
             *gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
             gpu_host_buf = (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_out_embd) * type_size_f32;
         } else {
+            if (has_gpu_layers) {
                 if (offload) {
-                    *gpu_buf += n_qcur * type_size_f32 + nb_attn_norm_w + nb_ffn_gate_w + nb_ffn_down_w;
+                    *gpu_buf += (n_kq + n_qcur) * type_size_f32;
+                    *gpu_buf += std::max<int64_t>({
+                        (n_inp_pos + n_norm + n_kq_mask) * type_size_f32 + nb_attn_norm_w,
+                        (n_inp_pos + n_norm + n_qcur) * type_size_f32 + nb_attn_norm_w,
+                        n_inp_pos * type_size_f32 + (n_k + n_v) * type_size_f16 + nb_attn_q_w,
+                    });
                 } else {
-                    *gpu_buf += (n_ffn_up + n_kq_mask) * type_size_f32;
+                    *gpu_buf += (n_inp_pos + n_kq_mask + n_qcur + n_qcur + n_kq) * type_size_f32;
+                }
+            } else {
+                *gpu_buf += (n_qcur + n_kq + n_kq_mask + n_qcur + n_inp_pos) * type_size_f32 + nb_attn_norm_w;
             }
             gpu_host_buf = (n_bak_embd + n_inp_pos + n_kq_mask) * type_size_f32;
         }
@@ -21964,19 +21993,22 @@ void llama_model_compute_buf_size(
         GGML_ASSERT(false && "Unsupported backend type for compute buffer estimation.\n");
     }
 
-    // CPU computing buffer
+    // CPU compute buffer
     if (*cpu_buf == 0) {
         *cpu_buf = (n_inp_pos + n_kq_mask + n_bak_embd + n_norm) * type_size_f32;
         if (is_master) {
             *cpu_buf += (n_inp_toks + n_inp_embd + n_inp_out_ids + n_out_embd + n_result) * type_size_f32;
         }
         *cpu_buf += std::max(n_ffn_gate + n_ffn_up, n_qcur + n_qcur + n_kq) * type_size_f32;
+        *cpu_buf += gpu_host_buf;
     }
 
-    LLAMA_LOG_INFO("%s: compute buffer size = %7.2f MiB (GPU) + %7.2f MiB (GPU-Host)\n", __func__,
-                    *gpu_buf / (1024.0 * 1024.0), gpu_host_buf / (1024.0 * 1024.0));
-    LLAMA_LOG_INFO("%s: compute buffer size = %7.2f MiB (CPU)\n", __func__,
-                    *cpu_buf / (1024.0 * 1024.0));
+    LLAMA_LOG_INFO("\n");
+    LLAMA_LOG_INFO("%s: here the compute buffer size is a predicted upper bound, not an exact value\n", __func__);
+    LLAMA_LOG_INFO("%s: (rank %d) compute buffer size = %7.2f MiB (GPU) + %7.2f MiB (GPU-Host)\n", __func__,
+                    my_rank, *gpu_buf / (1024.0 * 1024.0), gpu_host_buf / (1024.0 * 1024.0));
+    LLAMA_LOG_INFO("%s: (rank %d) compute buffer size = %7.2f MiB (CPU and GPU-Host buffer)\n", __func__,
+                    my_rank, *cpu_buf / (1024.0 * 1024.0));
 }
 
 void llama_total_kv_size(
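To put the new terms in perspective, here is a small worked example of the element counts the estimator scales by the element size. The hyperparameters are made-up 7B-class values, and the hard-coded 4-byte/2-byte sizes stand in for ggml_type_size(GGML_TYPE_F32) and ggml_type_size(GGML_TYPE_F16):

#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical hparams/cparams, roughly 7B-class; not taken from the commit
    const int64_t n_head        = 32;
    const int64_t n_head_kv     = 32;
    const int64_t n_embd_head_k = 128;
    const int64_t n_ctx         = 4096;
    const int64_t n_ubatch      = 512;

    // element counts mirroring the estimator's definitions
    const int64_t n_kq = n_ctx * n_ubatch * n_head;          // attention-score activations
    const int64_t n_k  = n_ctx * n_embd_head_k * n_head_kv;  // cached K view
    const int64_t sizeof_f32 = 4, sizeof_f16 = 2;

    printf("n_kq buffer: %.2f MiB (f32)\n", n_kq * sizeof_f32 / (1024.0 * 1024.0));  // 256.00 MiB
    printf("n_k  buffer: %.2f MiB (f16)\n", n_k  * sizeof_f16 / (1024.0 * 1024.0));  //  32.00 MiB
    return 0;
}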
@@ -22126,6 +22158,7 @@ void llama_model_n_flops(
                 } else if (blk_suffix == "attn_q.weight") {
                     count_n_flops (n_flops, cur->type,     PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
                     count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd); // rope
+                    n_bytes->nb_attn_q_w = std::max(n_bytes->nb_attn_q_w, (int64_t)ggml_nbytes(cur));
                 } else if (blk_suffix == "attn_k.weight") {
                     count_n_flops (n_flops, cur->type,     PROFILER_LAYER_BACKEND, 2 * n_embd * n_head_kv * n_embd_head_k);
                     count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd_head_k * n_head_kv); // rope
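The added line records the largest attn_q.weight tensor with the same running-max pattern used for the other nb_* fields. A stripped-down sketch of that pattern, with made-up byte counts standing in for ggml_nbytes(cur):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    // pretend byte sizes of blk.*.attn_q.weight across layers (made up)
    const int64_t layer_bytes[] = {9437184, 9437184, 4718592};

    int64_t nb_attn_q_w = 0;  // same role as model_bytes::nb_attn_q_w
    for (int64_t nb : layer_bytes) {
        nb_attn_q_w = std::max(nb_attn_q_w, nb);  // keep the largest tensor seen so far
    }
    printf("nb_attn_q_w = %lld bytes\n", (long long)nb_attn_q_w);
    return 0;
}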