diff --git a/common/profiler.cpp b/common/profiler.cpp
index 9020b5e9..261c5fb2 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -880,7 +880,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
     total_latency += gpu_latency_per_layer * n_gpu_layers;
     total_latency += cpu_latency_per_layer * (n_layers - n_gpu_layers);
 #else
-    total_latency *= cpu_latency_per_layer * n_layers;
+    total_latency += cpu_latency_per_layer * n_layers;
 #endif
 
     total_latency += (double)n_flops.output_f32_f32 / (double)cpu.flops_f32_f32 / 1e9;
@@ -900,8 +900,8 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
 static float device_memory_access_delay(struct device_info & dev_info, int n_layers) {
     struct model_params n_params = dev_info.model_params;
 
-    int64_t total_bytes = 0;
-    total_bytes += n_params.layer_f32 * 4 +
+    int64_t total_bytes =
+        n_params.layer_f32 * 4 +
         n_params.layer_f16 * 2 +
         n_params.layer_q4k / 2 +
         n_params.layer_q6k * 3 / 8 +
@@ -929,14 +929,22 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     int n_layers = llama_model_n_layers(model);
     int n_gpu_layers = cparams.n_gpu_layers;
 
+    uint64_t cpu_kv_size;
+    uint64_t gpu_kv_size;
+    uint64_t cpu_compute_buf;
+    uint64_t gpu_compute_buf;
+
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
-    double cpu_kv_size_gb = 0.0f;
-    double cpu_compute_buf_gb = 0.0f;
+    llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true);
 #else
-    double cpu_kv_size_gb = static_cast<double>(llama_model_kvcache_size(model, cparams)) / 1e9; // convert to GB
-    double cpu_compute_buf_gb = static_cast<double>(llama_model_compute_buf_size(model, cparams, false)) / 1e9; // convert to GB
+    llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false);
 #endif
 
+    double cpu_kv_size_gb = static_cast<double>(cpu_kv_size) / 1e9; // convert to GB
+    double cpu_compute_buf_gb = static_cast<double>(cpu_compute_buf) / 1e9; // convert to GB
+
     int64_t cpu_total_bytes =
         n_params.layer_f32 * 4 +
         n_params.layer_f16 * 2 +
@@ -947,6 +955,7 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
     cpu_total_bytes *= (n_layers - n_gpu_layers);
 #else
+    (void)n_gpu_layers;
     cpu_total_bytes *= n_layers;
 #endif
 
@@ -960,13 +969,7 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     float cpu_total_gbytes = (double)cpu_total_bytes / 1e9; // convert to GB
     float cpu_mem_avail = dev_info.memory.available_physical * 1024.0f * 1024.0f * 1024.0f / 1e9; // convert to GB
     cpu_mem_avail -= static_cast<float>(cpu_kv_size_gb);
-
-    if (cpu_mem_avail - static_cast<float>(cpu_compute_buf_gb) < cpu_total_gbytes) {
-        double compressed_compute_buf_gb = static_cast<double>(llama_model_compute_buf_size(model, cparams, true)) / 1e9; // convert to GB
-        cpu_mem_avail -= static_cast<float>(compressed_compute_buf_gb);
-    } else {
-        cpu_mem_avail -= static_cast<float>(cpu_compute_buf_gb);
-    }
+    cpu_mem_avail -= static_cast<float>(cpu_compute_buf_gb);
 
 #ifdef __linux__
     float disk_read_bw = dev_info.disk.read_seq_bw; // GB/s
diff --git a/include/llama.h b/include/llama.h
index c8eb58cd..e21b056a 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -525,10 +525,20 @@ extern "C" {
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
     // Return the size of compute buffer size, including input tensors and activations
-    LLAMA_API uint64_t llama_model_compute_buf_size(const struct llama_model * model, const struct llama_context_params cparams, bool compress_memory);
+    LLAMA_API void llama_model_compute_buf_size(
+                            uint64_t * cpu_buf,
+                            uint64_t * gpu_buf,
+                            const struct llama_model * model,
+                            const struct llama_context_params cparams,
+                            bool use_gpu);
 
     // Return the size of KV cache in the model
-    LLAMA_API uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams);
+    LLAMA_API void llama_model_kvcache_size(
+                            uint64_t * cpu_cache,
+                            uint64_t * gpu_cache,
+                            const struct llama_model * model,
+                            const struct llama_context_params cparams,
+                            bool use_gpu);
 
     // Return the total number of float operations in the model
     LLAMA_API void llama_model_n_flops(
diff --git a/src/llama.cpp b/src/llama.cpp
index 80aa07fe..489ff29f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20810,7 +20810,12 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
     }
 }
 
-uint64_t llama_model_compute_buf_size(const struct llama_model * model, const struct llama_context_params cparams, bool compress_memory) {
+void llama_model_compute_buf_size(
+    uint64_t * cpu_buf,
+    uint64_t * gpu_buf,
+    const struct llama_model * model,
+    const struct llama_context_params cparams,
+    bool use_gpu) {
     const llama_hparams hparams = model->hparams;
 
     // input tensors
@@ -20831,30 +20836,42 @@ uint64_t llama_model_compute_buf_size(const struct llama_model * model, const st
     const uint64_t n_output = hparams.n_vocab * cparams.n_ubatch;
 
     // compute buffer size for input, each layer, and output
-    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
+    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32);
     const uint64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_norm + n_qcur + n_kq
                                ) * ggml_type_size(GGML_TYPE_F32);
-    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
+    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32);
 
-    uint64_t n_buf_total = 0;
-    if (cparams.rank == 0) {
-        if (compress_memory) {
-            n_buf_total = n_buf_inp / 2 + n_buf_act + n_buf_out / 2; // consider compressed memory with ratio 2:1
+    if (use_gpu) {
+        *gpu_buf = n_buf_act;
+        if (llama_model_n_layers(model) > cparams.n_gpu_layers) {
+            *cpu_buf = n_buf_inp + n_buf_act + n_buf_out;
         } else {
-            n_buf_total = n_buf_inp + n_buf_act + n_buf_out;
+            *cpu_buf = n_buf_inp + n_buf_out;
         }
     } else {
-        n_buf_total = n_buf_act;
+        *gpu_buf = 0;
+        *cpu_buf = n_buf_inp + n_buf_act + n_buf_out;
     }
-    return n_buf_total;
 }
 
-uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams) {
+void llama_model_kvcache_size(
+    uint64_t * cpu_cache,
+    uint64_t * gpu_cache,
+    const struct llama_model * model,
+    const struct llama_context_params cparams,
+    bool use_gpu) {
     const llama_hparams hparams = model->hparams;
 
     uint64_t ne_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
     uint64_t ne_v = static_cast<uint64_t>(hparams.n_embd_v_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_v);
-    return (ne_k + ne_v) * llama_model_n_layers(model);
+    if (use_gpu) {
+        int n_gpu_layers = cparams.n_gpu_layers;
+        *gpu_cache = (ne_k + ne_v) * n_gpu_layers;
+        *cpu_cache = (ne_k + ne_v) * (llama_model_n_layers(model) - n_gpu_layers);
+    } else {
+        *gpu_cache = 0;
+        *cpu_cache = (ne_k + ne_v) * llama_model_n_layers(model);
+    }
 }
 
 void llama_model_n_flops(
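
Caller-side sketch (not part of the patch): the reworked API reports CPU and GPU byte counts through out-parameters instead of returning a single total, mirroring the updated call sites in common/profiler.cpp above. The helper name print_split_sizes and the printf reporting below are illustrative only; the sketch assumes llama.h and <cstdio> are included and that model and cparams were set up elsewhere.

// Minimal sketch of using the new split-size API; only the two calls come from this change.
static void print_split_sizes(const struct llama_model * model, const struct llama_context_params cparams) {
    uint64_t cpu_kv_size = 0, gpu_kv_size = 0;
    uint64_t cpu_compute_buf = 0, gpu_compute_buf = 0;

#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
    const bool use_gpu = true;   // attribute cparams.n_gpu_layers layers to the GPU, the rest to the CPU
#else
    const bool use_gpu = false;  // attribute everything to the CPU
#endif

    // KV-cache bytes, split between host and device according to cparams.n_gpu_layers
    llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, use_gpu);

    // compute-buffer bytes (input tensors, activations, output), split the same way
    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, use_gpu);

    printf("CPU: %.3f GB KV cache, %.3f GB compute buffer\n", cpu_kv_size / 1e9, cpu_compute_buf / 1e9);
    printf("GPU: %.3f GB KV cache, %.3f GB compute buffer\n", gpu_kv_size / 1e9, gpu_compute_buf / 1e9);
}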