From 0f73d1224760221d3e6f6772e3aca6bba26af7e7 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Fri, 29 Nov 2024 11:15:54 +0400
Subject: [PATCH] decrease compute buf from available memory

---
 common/profiler.cpp | 16 ++++++++++++----
 include/llama.h     |  3 +++
 src/llama.cpp       | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/common/profiler.cpp b/common/profiler.cpp
index 642f793d..1dd52618 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -877,9 +877,10 @@ static float device_memory_access_delay(struct device_info & dev_info, int n_lay
 }
 
 static float device_disk_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams) {
-    auto n_params = dev_info.model_params;
-    int n_layers = llama_model_n_layers(model);
-    double kv_size_gb = static_cast<double>(llama_model_kvcache_size(model, cparams)) / 1e9; // convert to GB
+    auto   n_params       = dev_info.model_params;
+    int    n_layers       = llama_model_n_layers(model);
+    double kv_size_gb     = static_cast<double>(llama_model_kvcache_size(model, cparams)) / 1e9;            // convert to GB
+    double compute_buf_gb = static_cast<double>(llama_model_compute_buf_size(model, cparams, false)) / 1e9; // convert to GB
 
     int64_t total_bytes = 0;
     total_bytes += n_params.layer_f32 * 4 +
@@ -899,7 +900,14 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     float total_gbytes = (double)total_bytes / 1e9; // convert to GB
     float mem_avail = dev_info.memory.available_physical * 1024.0f * 1024.0f * 1024.0f / 1e9; // convert to GB
     mem_avail -= static_cast<float>(kv_size_gb);
-    // todo: consider activations which also consumes the available memory
+
+    if (mem_avail - static_cast<float>(compute_buf_gb) < total_gbytes) {
+        double compressed_compute_buf_gb = static_cast<double>(llama_model_compute_buf_size(model, cparams, true)) / 1e9; // convert to GB
+        mem_avail -= static_cast<float>(compressed_compute_buf_gb);
+    } else {
+        mem_avail -= static_cast<float>(compute_buf_gb);
+    }
+
 #ifdef __linux__
     float disk_read_bw = dev_info.disk.read_seq_bw; // GB/s
 #else
diff --git a/include/llama.h b/include/llama.h
index 27c322f3..3c3191b0 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -523,6 +523,9 @@ extern "C" {
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+    // Returns the size of the compute buffer, including input tensors and activations
+    LLAMA_API uint64_t llama_model_compute_buf_size(const struct llama_model * model, const struct llama_context_params cparams, bool compress_memory);
+
     // Return the size of KV cache in the model
     LLAMA_API uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams);
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 851d1d69..690e65fd 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20808,6 +20808,44 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
     }
 }
 
+uint64_t llama_model_compute_buf_size(const struct llama_model * model, const struct llama_context_params cparams, bool compress_memory) {
+    const llama_hparams hparams = model->hparams;
+
+    // input tensors
+    const uint64_t n_inp_toks = cparams.n_ubatch;
+    const uint64_t n_inp_embd = hparams.n_embd * cparams.n_ubatch;
+
+    // activations (see figures/memory-allocation-map-for-activations.png for detailed allocation)
+    const uint64_t n_bak_embd    = hparams.n_embd * cparams.n_ubatch;
+    const uint64_t n_inp_pos     = cparams.n_ubatch;
+    const uint64_t n_kq_mask     = cparams.n_ctx * cparams.n_ubatch;
+    const uint64_t n_inp_out_ids = cparams.n_ubatch;
+    const uint64_t n_norm        = hparams.n_embd * cparams.n_ubatch;
+    const uint64_t n_qcur        = hparams.n_embd * cparams.n_ubatch * 2;
+    const uint64_t n_kq          = cparams.n_ctx * cparams.n_ubatch * hparams.n_head();
+
+    // outputs
+    const uint64_t n_out_embd = hparams.n_embd * cparams.n_ubatch;
+    const uint64_t n_output   = hparams.n_vocab * cparams.n_ubatch;
+
+    // compute buffer size for input, each layer, and output
+    // const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
+    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32) / (compress_memory ? 2 : 1); // compress input buffer at a 2:1 ratio when requested
+    const uint64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask +
+                                n_inp_out_ids + n_norm + n_qcur + n_kq
+                               ) * ggml_type_size(GGML_TYPE_F32);
+    // const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
+    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32) / (compress_memory ? 2 : 1); // compress output buffer at a 2:1 ratio when requested
+
+    uint64_t n_buf_total = 0;
+    if (cparams.rank == 0) {
+        n_buf_total = n_buf_inp + n_buf_act + n_buf_out;
+    } else {
+        n_buf_total = n_buf_act;
+    }
+    return n_buf_total;
+}
+
 uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams) {
     const llama_hparams hparams = model->hparams;
     uint64_t ne_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
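
Note (not part of the patch): the magnitude reported by llama_model_compute_buf_size can be sanity-checked with a standalone sketch of the same arithmetic. The hyperparameters below (n_embd, n_head, n_vocab) and context settings (n_ctx, n_ubatch) are illustrative assumptions for a 7B-class model rather than values read from a loaded model, and the division by 2 mirrors the compress_memory == true case above.

// sanity_check_compute_buf.cpp -- illustrative sketch only, not part of the patch
#include <cstdint>
#include <cstdio>

int main() {
    // assumed hyperparameters and context settings
    const uint64_t n_embd = 4096, n_head = 32, n_vocab = 32000;
    const uint64_t n_ctx  = 512,  n_ubatch = 512;
    const uint64_t f32    = 4; // bytes per element, i.e. ggml_type_size(GGML_TYPE_F32)

    // activations, mirroring the terms summed in llama_model_compute_buf_size
    const uint64_t n_act = n_embd * n_ubatch            // n_bak_embd
                         + n_ubatch                     // n_inp_pos
                         + n_ctx  * n_ubatch            // n_kq_mask
                         + n_ubatch                     // n_inp_out_ids
                         + n_embd * n_ubatch            // n_norm
                         + n_embd * n_ubatch * 2        // n_qcur
                         + n_ctx  * n_ubatch * n_head;  // n_kq

    const uint64_t n_inp = n_ubatch + n_embd * n_ubatch;            // n_inp_toks + n_inp_embd
    const uint64_t n_out = n_embd * n_ubatch + n_vocab * n_ubatch;  // n_out_embd + n_output

    const uint64_t buf_act = n_act * f32;
    const uint64_t buf_inp = n_inp * f32 / 2; // 2:1 compression (compress_memory == true)
    const uint64_t buf_out = n_out * f32 / 2;

    printf("head device (rank 0): %.3f GB\n", (buf_inp + buf_act + buf_out) / 1e9);
    printf("other devices:        %.3f GB\n", buf_act / 1e9);
    return 0;
}

As in the patched function, only the head device (cparams.rank == 0) pays for the input and output buffers on top of the activations, which is why it needs more free memory deducted than the other ranks.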