diff --git a/common/arg.cpp b/common/arg.cpp
index f1a33372..4039b19c 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -749,6 +749,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         format("maximum GPU memory to use (default: %d)", params.gpu_mem),
         [](gpt_params & params, int value) {
             params.gpu_mem = value; // in GiB
+            if (value == 0) {
+                LOG_WRN("WARN: Setting --gpu-mem to 0 may lead to errors during workload distribution.\n");
+            }
         }
     ).set_env("LLAMA_ARG_CUDA_MEM"));
     add_opt(llama_arg(
diff --git a/common/common.cpp b/common/common.cpp
index cc4536b6..4f1ddff3 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -847,6 +847,16 @@ static std::string vec_to_str(const std::vector & vec) {
     return oss.str();
 }
 
+static backend_type get_backend_type(const gpu_support & support) {
+    if (support.cuda)    return BACKEND_CUDA;
+    if (support.metal)   return BACKEND_METAL;
+    if (support.vulkan)  return BACKEND_VULKAN;
+    if (support.kompute) return BACKEND_KOMPUTE;
+    if (support.gpublas) return BACKEND_GPUBLAS;
+    if (support.sycl)    return BACKEND_SYCL;
+    return BACKEND_CPU;
+}
+
 static bool assign_layers_to_device(
     uint32_t n_world,
     const device_info * dev_info_set,
@@ -972,7 +982,7 @@ static bool assign_layers_to_device(
         bool is_android = strcmp(dev.device_os, "Android") == 0;
         bool is_windows = strcmp(dev.device_os, "Windows") == 0;
         GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
-        
+
         if ((is_macos && !dev.gpu_support.metal) || is_linux) {
             mem_budget[m] = dev.memory.available_physical;
         } else if (is_macos && dev.gpu_support.metal) {
@@ -985,11 +995,21 @@ static bool assign_layers_to_device(
         }
     }
 
-    // initialize w_m proportionally to memory budget and n_m to 0
+    // initialize w_m proportionally to memory budget
    float total_mem_budget = std::accumulate(mem_budget.begin(), mem_budget.end(), 0.0f);
    for (uint32_t m = 0; m < n_world; ++m) {
        w[m] = std::round(mem_budget[m] / total_mem_budget * n_layer);
-        n[m] = 0;
+    }
+    // no 0 is allowed in w, it must be at least 1
+    for (uint32_t m = 0; m < n_world; ++m) {
+        if (w[m] == 0) {
+            w[m] = 1;
+            // find the maximum and decrease it by 1
+            auto max_it = std::max_element(w.begin(), w.end());
+            if (max_it != w.end() && *max_it > 1) {
+                *max_it -= 1;
+            }
+        }
    }
    // adjust w[m] to ensure L mod W = 0
    int diff = n_layer - std::accumulate(w.begin(), w.end(), 0);
@@ -997,6 +1017,15 @@ static bool assign_layers_to_device(
                                 : std::min_element(mem_budget.begin(), mem_budget.end());
    w[std::distance(mem_budget.begin(), device)] += diff;
 
+    // initialize n_m to w_m (if there is GPU), assume all layers can run on GPU
+    for (uint32_t m = 0; m < n_world; ++m) {
+        if (dev_info_set[m].gpu_support.metal || dev_info_set[m].gpu_support.cuda) {
+            n[m] = w[m];
+        } else {
+            n[m] = 0;
+        }
+    }
+
    // stores the actual read bandwidth (GB/s) for each device
    std::vector<float> disk_speed(n_world, 0.0f);
    for (uint32_t m = 0; m < n_world; ++m) {
@@ -1052,8 +1081,7 @@ static bool assign_layers_to_device(
         bool is_windows = strcmp(dev.device_os, "Windows") == 0;
         GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
 
-        bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, dev_info_set[0].model_bytes, w[m] > n[m]);
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, get_backend_type(dev.gpu_support), m == 0, dev_info_set[0].model_bytes, w[m] > n[m], n[m] > 0);
 
         int l_m     = w[m] * k; // total number of layers assigned to device m
         int l_m_gpu = n[m] * k; // number of layers assigned to device m that run on GPU
@@ -1424,14 +1452,18 @@ static bool assign_layers_to_device(
             if (n_m < static_cast<int>(std::floor(W * vec_z_gpu[m]))) {
                 // if there is still free GPU memory
                 has_free_gpu_memory = true;
+                LOG_INF("Device %d still has free GPU memory: w_m = %d, n_m = %d, W * vec_z_gpu[m] = %d\n",
+                    m, w_m, n_m, static_cast<int>(std::floor(W * vec_z_gpu[m])));
             }
             if (w_m > n_m) {
                 // if layers are offloaded to CPU
                 has_gpu_overload = true;
+                LOG_INF("Device %d has GPU overload: w_m = %d, n_m = %d\n", m, w_m, n_m);
             }
         } else if (!in_set(m, M4)) {
             // if the CPU is overloaded
             has_cpu_overload = true;
+            LOG_INF("Device %d has CPU overload.\n", m);
         }
     }
 
@@ -1522,7 +1554,7 @@ static bool assign_layers_to_device(
    for (uint32_t m = 0; m < n_world; ++m) {
        const device_info & dev = dev_info_set[m];
        bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, dev_info_set[0].model_bytes);
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, get_backend_type(dev.gpu_support), m == 0, dev_info_set[0].model_bytes, true);
 
        if (dev.gpu_support.cuda || dev.gpu_support.metal) {
            int64_t required_mem = w[m] * b_prime;
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 292dc026..c788a3e6 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -1603,10 +1603,20 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
 
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
    llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
-    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true, true, n_bytes, n_layers > n_gpu_layers);
+
+    enum backend_type backend;
+#if defined(GGML_USE_METAL)
+    backend = BACKEND_METAL;
+#elif defined(GGML_USE_CUDA)
+    backend = BACKEND_CUDA;
+#endif
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, backend, true, n_bytes, n_layers > n_gpu_layers, n_gpu_layers > 0);
+
 #else
    llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
-    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false, true, n_bytes, n_layers > n_gpu_layers);
+
+    enum backend_type backend = BACKEND_CPU;
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, backend, true, n_bytes, n_layers > n_gpu_layers, n_gpu_layers > 0);
 #endif
 
    double cpu_kv_size_gib = static_cast<double>(cpu_kv_size) / 1024.0 / 1024.0 / 1024.0; // convert to GiB
diff --git a/include/llama.h b/include/llama.h
index c61dd851..fc42856c 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -67,6 +67,16 @@ extern "C" {
    typedef int32_t llama_token;
    typedef int32_t llama_seq_id;
 
+    enum backend_type {
+        BACKEND_CPU     = 0,
+        BACKEND_CUDA    = 1,
+        BACKEND_METAL   = 2,
+        BACKEND_VULKAN  = 3,
+        BACKEND_KOMPUTE = 4,
+        BACKEND_GPUBLAS = 5,
+        BACKEND_SYCL    = 6
+    };
+
    enum llama_vocab_type {
        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
@@ -574,10 +584,11 @@ extern "C" {
             int64_t * gpu_buf,
             const struct llama_model * model,
             const struct llama_context_params cparams,
-             bool use_gpu,
+             enum backend_type backend,
             bool is_master,
             struct model_bytes n_bytes,
-             bool offload);
+             bool offload,
+             bool has_gpu_layers);
 
    // Return the size of KV cache in the model
    LLAMA_API void llama_total_kv_size(
diff --git a/src/llama.cpp b/src/llama.cpp
index 15255ea0..eb551280 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3673,6 +3673,7 @@ void llama_profile_device(
    // reserved/limit memory to avoid potential OOM, default to 300 MiB
    dev_info->gpu_props.memory_free  = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
    dev_info->gpu_props.memory_free  = std::min((float)gpu_mem, dev_info->gpu_props.memory_free) - 0.3;
+    dev_info->gpu_props.memory_free  = std::max(dev_info->gpu_props.memory_free, 0.0f);
    dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
 
    dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw();
@@ -21861,10 +21862,11 @@ void llama_model_compute_buf_size(
        int64_t * gpu_buf,
        const struct llama_model * model,
        const struct llama_context_params cparams,
-        bool use_gpu,
+        enum backend_type backend,
        bool is_master,
        struct model_bytes n_bytes,
-        bool offload) {
+        bool offload,
+        bool has_gpu_layers) {
    const llama_hparams hparams = model->hparams;
 
    // input tensors
@@ -21879,6 +21881,9 @@ void llama_model_compute_buf_size(
    const int64_t n_qcur        = hparams.n_embd * cparams.n_ubatch;
    const int64_t n_ffn_gate    = hparams.n_ff() * cparams.n_ubatch;
    const int64_t n_ffn_up      = hparams.n_ff() * cparams.n_ubatch;
+    const int64_t n_ffn_out     = hparams.n_embd * cparams.n_ubatch;
+    const int64_t n_ffn_inp     = hparams.n_embd * cparams.n_ubatch;
+    const int64_t n_kq          = cparams.n_ctx * cparams.n_ubatch * hparams.n_head();
    const int64_t n_inp_out_ids = cparams.n_ubatch;
 
    // outputs
@@ -21890,40 +21895,82 @@ void llama_model_compute_buf_size(
    const int64_t nb_attn_norm_w = n_bytes.nb_attn_norm_w;
    const int64_t nb_ffn_gate_w  = n_bytes.nb_ffn_gate_w;
    const int64_t nb_ffn_down_w  = n_bytes.nb_ffn_down_w;
-    
-    const int64_t nb_act_buf_base = (n_bak_embd + n_norm + n_inp_pos + n_ffn_gate) * ggml_type_size(GGML_TYPE_F32);
-    *gpu_buf = use_gpu ? nb_act_buf_base : 0;
-    *cpu_buf = nb_act_buf_base;
+
+    const int64_t type_size_f32 = ggml_type_size(GGML_TYPE_F32);
+
+    bool use_gpu = backend != BACKEND_CPU && has_gpu_layers;
+    *gpu_buf = 0;
+    *cpu_buf = 0;
    int64_t gpu_host_buf = 0;
 
-    // estimate GPU computing buffer and GPU-host computing buffer
-    if (use_gpu) {
+    if (backend == BACKEND_CUDA) {
+        const int64_t nb_act_buf_base = (n_bak_embd + n_norm + n_inp_pos + n_ffn_gate) * type_size_f32;
+        *gpu_buf = use_gpu ? nb_act_buf_base : 0;
+
+        // CUDA computing buffer and CUDA-host buffer
        if (is_master) {
            if (offload) {
-                *gpu_buf += (n_ffn_up + n_qcur) * ggml_type_size(GGML_TYPE_F32) + nb_attn_norm_w + nb_ffn_gate_w;
+                *gpu_buf += (n_ffn_up + n_qcur) * type_size_f32 + nb_attn_norm_w + nb_ffn_gate_w;
            } else {
-                *gpu_buf += (n_ffn_up + n_qcur + n_kq_mask + n_inp_out_ids) * ggml_type_size(GGML_TYPE_F32);
+                *gpu_buf += (n_ffn_up + n_qcur + n_kq_mask + n_inp_out_ids) * type_size_f32;
            }
-            *gpu_buf += (n_out_embd + n_result) * ggml_type_size(GGML_TYPE_F32) + nb_output_w;
-            gpu_host_buf = (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_out_embd) * ggml_type_size(GGML_TYPE_F32);
+            *gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
+            gpu_host_buf = (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_out_embd) * type_size_f32;
        } else {
            if (offload) {
-                *gpu_buf += n_qcur * ggml_type_size(GGML_TYPE_F32) + nb_attn_norm_w + nb_ffn_gate_w + nb_ffn_down_w;
+                *gpu_buf += n_qcur * type_size_f32 + nb_attn_norm_w + nb_ffn_gate_w + nb_ffn_down_w;
            } else {
-                *gpu_buf += (n_ffn_up + n_kq_mask) * ggml_type_size(GGML_TYPE_F32);
+                *gpu_buf += (n_ffn_up + n_kq_mask) * type_size_f32;
            }
-            gpu_host_buf = (n_bak_embd + n_inp_pos + n_kq_mask) * ggml_type_size(GGML_TYPE_F32);
+            gpu_host_buf = (n_bak_embd + n_inp_pos + n_kq_mask) * type_size_f32;
        }
    }
 
-    // estimate CPU computing buffer
-    {
-        if (is_master) {
-            *cpu_buf += (n_ffn_up + n_kq_mask + n_inp_out_ids + n_qcur + n_inp_toks + n_inp_embd + n_out_embd + n_result) * ggml_type_size(GGML_TYPE_F32);
+    else if (backend == BACKEND_METAL) {
+        const int64_t nb_act_buf_base = (n_inp_pos + n_kq_mask) * type_size_f32;
+        *gpu_buf = nb_act_buf_base;
+        *cpu_buf = nb_act_buf_base;
+
+        if (use_gpu) {
+            if (is_master) {
+                *cpu_buf += (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_out_ids + n_out_embd + n_result) * type_size_f32;
+
+                if (offload) {
+                    *gpu_buf += (n_ffn_out + n_ffn_inp + n_inp_out_ids) * type_size_f32;
+                    *gpu_buf += std::max(n_ffn_up + n_ffn_gate, n_qcur + n_qcur + n_kq) * type_size_f32;
+                    *cpu_buf += n_norm * type_size_f32;
+                    *cpu_buf += std::max(n_ffn_up + n_ffn_gate, n_qcur + n_qcur + n_kq) * type_size_f32;
+                } else {
+                    *gpu_buf += (n_bak_embd + n_inp_out_ids + n_norm) * type_size_f32;
+                    *gpu_buf += std::max(n_ffn_up + n_ffn_gate, n_qcur + n_qcur + n_kq) * type_size_f32;
+                }
+            } else {
+                *gpu_buf += (n_ffn_out + n_ffn_inp) * type_size_f32;
+                *gpu_buf += std::max(n_ffn_up + n_ffn_gate, n_qcur + n_qcur + n_kq) * type_size_f32;
+
+                *cpu_buf += n_bak_embd * type_size_f32;
+                if (offload) {
+                    *cpu_buf += n_norm * type_size_f32;
+                    *cpu_buf += std::max(n_ffn_up + n_ffn_gate, n_qcur + n_qcur + n_kq) * type_size_f32;
+                }
+            }
        } else {
-            *cpu_buf += (n_ffn_up + n_kq_mask) * ggml_type_size(GGML_TYPE_F32);
+            *gpu_buf = 0;
+            *cpu_buf = 0;
        }
-        *cpu_buf += gpu_host_buf;
+    }
+
+    else if (backend != BACKEND_CPU) {
+        GGML_ASSERT(false && "Unsupported backend type for compute buffer estimation.\n");
+    }
+
+    // CPU computing buffer
+    if (*cpu_buf == 0) {
+        *cpu_buf = (n_inp_pos + n_kq_mask + n_bak_embd + n_norm) * type_size_f32;
+        if (is_master) {
+            *cpu_buf += (n_inp_toks + n_inp_embd + n_inp_out_ids + n_out_embd + n_result) * type_size_f32;
+        }
+        *cpu_buf += std::max(n_ffn_gate + n_ffn_up, n_qcur + n_qcur + n_kq) * type_size_f32;
    }
 
    LLAMA_LOG_INFO("%s: compute buffer size = %7.2f MiB (GPU) + %7.2f MiB (GPU-Host)\n", __func__,