From 0f73d1224760221d3e6f6772e3aca6bba26af7e7 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Fri, 29 Nov 2024 11:15:54 +0400
Subject: [PATCH] decrease compute buf from available memory

---
 common/profiler.cpp | 16 ++++++++++++----
 include/llama.h     |  3 +++
 src/llama.cpp       | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/common/profiler.cpp b/common/profiler.cpp
index 642f793d..1dd52618 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -877,9 +877,10 @@ static float device_memory_access_delay(struct device_info & dev_info, int n_lay
 }
 
 static float device_disk_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams) {
-    auto n_params = dev_info.model_params;
-    int n_layers = llama_model_n_layers(model);
-    double kv_size_gb = static_cast<double>(llama_model_kvcache_size(model, cparams)) / 1e9; // convert to GB
+    auto   n_params       = dev_info.model_params;
+    int    n_layers       = llama_model_n_layers(model);
+    double kv_size_gb     = static_cast<double>(llama_model_kvcache_size(model, cparams)) / 1e9;            // convert to GB
+    double compute_buf_gb = static_cast<double>(llama_model_compute_buf_size(model, cparams, false)) / 1e9; // convert to GB
 
     int64_t total_bytes = 0;
     total_bytes += n_params.layer_f32 * 4 +
@@ -899,7 +900,14 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     float total_gbytes = (double)total_bytes / 1e9; // convert to GB
     float mem_avail = dev_info.memory.available_physical * 1024.0f * 1024.0f * 1024.0f / 1e9; // convert to GB
     mem_avail -= static_cast<float>(kv_size_gb);
-    // todo: consider activations which also consumes the available memory
+
+    if (mem_avail - static_cast<float>(compute_buf_gb) < total_gbytes) {
+        double compressed_compute_buf_gb = static_cast<double>(llama_model_compute_buf_size(model, cparams, true)) / 1e9; // convert to GB
+        mem_avail -= static_cast<float>(compressed_compute_buf_gb);
+    } else {
+        mem_avail -= static_cast<float>(compute_buf_gb);
+    }
+
 #ifdef __linux__
     float disk_read_bw = dev_info.disk.read_seq_bw; // GB/s
 #else
diff --git a/include/llama.h b/include/llama.h
index 27c322f3..3c3191b0 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -523,6 +523,9 @@ extern "C" {
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+    // Returns the size of the compute buffer, including input tensors and activations
+    LLAMA_API uint64_t llama_model_compute_buf_size(const struct llama_model * model, const struct llama_context_params cparams, bool compress_memory);
+
     // Return the size of KV cache in the model
     LLAMA_API uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams);
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 851d1d69..690e65fd 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20808,6 +20808,44 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
     }
 }
 
+uint64_t llama_model_compute_buf_size(const struct llama_model * model, const struct llama_context_params cparams, bool compress_memory) {
+    const llama_hparams hparams = model->hparams;
+
+    // input tensors
+    const uint64_t n_inp_toks = cparams.n_ubatch;
+    const uint64_t n_inp_embd = hparams.n_embd * cparams.n_ubatch;
+
+    // activations (see figures/memory-allocation-map-for-activations.png for detailed allocation)
+    const uint64_t n_bak_embd    = hparams.n_embd * cparams.n_ubatch;
+    const uint64_t n_inp_pos     = cparams.n_ubatch;
+    const uint64_t n_kq_mask     = cparams.n_ctx * cparams.n_ubatch;
+    const uint64_t n_inp_out_ids = cparams.n_ubatch;
+    const uint64_t n_norm        = hparams.n_embd * cparams.n_ubatch;
+    const uint64_t n_qcur        = hparams.n_embd * cparams.n_ubatch * 2;
+    const uint64_t n_kq          = cparams.n_ctx * cparams.n_ubatch * hparams.n_head();
+
+    // outputs
+    const uint64_t n_out_embd = hparams.n_embd * cparams.n_ubatch;
+    const uint64_t n_output   = hparams.n_vocab * cparams.n_ubatch;
+
+    // compute buffer size for input, each layer, and output
+    // const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
+    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32) / (compress_memory ? 2 : 1); // compress input buffer at a 2:1 ratio when requested
+    const uint64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask +
+                                n_inp_out_ids + n_norm + n_qcur + n_kq
+                               ) * ggml_type_size(GGML_TYPE_F32);
+    // const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
+    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32) / (compress_memory ? 2 : 1); // compress output buffer at a 2:1 ratio when requested
+
+    uint64_t n_buf_total = 0;
+    if (cparams.rank == 0) {
+        n_buf_total = n_buf_inp + n_buf_act + n_buf_out;
+    } else {
+        n_buf_total = n_buf_act;
+    }
+    return n_buf_total;
+}
+
 uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams) {
     const llama_hparams hparams = model->hparams;
     uint64_t ne_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
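
Note (not part of the patch): the magnitude reported by llama_model_compute_buf_size can be sanity-checked with a standalone sketch of the same arithmetic. The hyperparameters below (n_embd, n_head, n_vocab) and context settings (n_ctx, n_ubatch) are illustrative assumptions for a 7B-class model rather than values read from a loaded model, and the division by 2 mirrors the compress_memory == true case above.

// sanity_check_compute_buf.cpp -- illustrative sketch only, not part of the patch
#include <cstdint>
#include <cstdio>

int main() {
    // assumed hyperparameters and context settings
    const uint64_t n_embd = 4096, n_head = 32, n_vocab = 32000;
    const uint64_t n_ctx  = 512,  n_ubatch = 512;
    const uint64_t f32    = 4; // bytes per element, i.e. ggml_type_size(GGML_TYPE_F32)

    // activations, mirroring the terms summed in llama_model_compute_buf_size
    const uint64_t n_act = n_embd * n_ubatch            // n_bak_embd
                         + n_ubatch                     // n_inp_pos
                         + n_ctx  * n_ubatch            // n_kq_mask
                         + n_ubatch                     // n_inp_out_ids
                         + n_embd * n_ubatch            // n_norm
                         + n_embd * n_ubatch * 2        // n_qcur
                         + n_ctx  * n_ubatch * n_head;  // n_kq

    const uint64_t n_inp = n_ubatch + n_embd * n_ubatch;            // n_inp_toks + n_inp_embd
    const uint64_t n_out = n_embd * n_ubatch + n_vocab * n_ubatch;  // n_out_embd + n_output

    const uint64_t buf_act = n_act * f32;
    const uint64_t buf_inp = n_inp * f32 / 2; // 2:1 compression (compress_memory == true)
    const uint64_t buf_out = n_out * f32 / 2;

    printf("head device (rank 0): %.3f GB\n", (buf_inp + buf_act + buf_out) / 1e9);
    printf("other devices:        %.3f GB\n", buf_act / 1e9);
    return 0;
}

As in the patched function, only the head device (cparams.rank == 0) pays for the input and output buffers on top of the activations, which is why it needs more free memory deducted than the other ranks.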