mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-10 06:54:33 +00:00)
add gpu support in llama_model_kvcache_size and llama_model_compute_buf_size
This commit is contained in:
parent f8e9dc2713
commit 6f54a12c7d
3 changed files with 58 additions and 28 deletions
@@ -20810,7 +20810,12 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
     }
 }
 
-uint64_t llama_model_compute_buf_size(const struct llama_model * model, const struct llama_context_params cparams, bool compress_memory) {
+void llama_model_compute_buf_size(
+        uint64_t * cpu_buf,
+        uint64_t * gpu_buf,
+        const struct llama_model * model,
+        const struct llama_context_params cparams,
+        bool use_gpu) {
     const llama_hparams hparams = model->hparams;
 
     // input tensors
@@ -20831,30 +20836,42 @@ uint64_t llama_model_compute_buf_size(const struct llama_model * model, const st
     const uint64_t n_output = hparams.n_vocab * cparams.n_ubatch;
 
     // compute buffer size for input, each layer, and output
-    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
+    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32);
     const uint64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask +
                                 n_inp_out_ids + n_norm + n_qcur + n_kq
                                ) * ggml_type_size(GGML_TYPE_F32);
-    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
+    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32);
 
-    uint64_t n_buf_total = 0;
-    if (cparams.rank == 0) {
-        if (compress_memory) {
-            n_buf_total = n_buf_inp / 2 + n_buf_act + n_buf_out / 2; // consider compressed memory with ratio 2:1
+    if (use_gpu) {
+        *gpu_buf = n_buf_act;
+        if (llama_model_n_layers(model) > cparams.n_gpu_layers) {
+            *cpu_buf = n_buf_inp + n_buf_act + n_buf_out;
         } else {
-            n_buf_total = n_buf_inp + n_buf_act + n_buf_out;
+            *cpu_buf = n_buf_inp + n_buf_out;
         }
     } else {
-        n_buf_total = n_buf_act;
+        *gpu_buf = 0;
+        *cpu_buf = n_buf_inp + n_buf_act + n_buf_out;
     }
-    return n_buf_total;
 }
 
-uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams) {
+void llama_model_kvcache_size(
+        uint64_t * cpu_cache,
+        uint64_t * gpu_cache,
+        const struct llama_model * model,
+        const struct llama_context_params cparams,
+        bool use_gpu) {
     const llama_hparams hparams = model->hparams;
     uint64_t ne_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
     uint64_t ne_v = static_cast<uint64_t>(hparams.n_embd_v_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_v);
-    return (ne_k + ne_v) * llama_model_n_layers(model);
+    if (use_gpu) {
+        int n_gpu_layers = cparams.n_gpu_layers;
+        *gpu_cache = (ne_k + ne_v) * n_gpu_layers;
+        *cpu_cache = (ne_k + ne_v) * (llama_model_n_layers(model) - n_gpu_layers);
+    } else {
+        *gpu_cache = 0;
+        *cpu_cache = (ne_k + ne_v) * llama_model_n_layers(model);
+    }
 }
 
 void llama_model_n_flops(
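
For context, a caller-side sketch of how the new out-parameter signatures might be used (not part of this commit): it assumes the two helpers are declared in a header visible to the caller, that "llama.h" is that header, and that a loaded model and a populated llama_context_params are already available.

// Hypothetical usage sketch: query the CPU/GPU split for the compute buffers
// and the KV cache via the signatures introduced in this commit.
#include "llama.h"   // assumed to declare llama_model_compute_buf_size / llama_model_kvcache_size
#include <cstdint>
#include <cstdio>

static void print_memory_plan(const struct llama_model * model,
                              struct llama_context_params cparams,
                              bool use_gpu) {
    uint64_t cpu_buf   = 0, gpu_buf   = 0;
    uint64_t cpu_cache = 0, gpu_cache = 0;

    // Per the diff: with use_gpu, the per-layer activation buffer is charged to
    // the GPU; the input/output buffers stay on the CPU, and the CPU also keeps
    // a full activation buffer when some layers are not offloaded
    // (llama_model_n_layers(model) > cparams.n_gpu_layers).
    llama_model_compute_buf_size(&cpu_buf, &gpu_buf, model, cparams, use_gpu);

    // Per the diff: the KV cache is split by layer count, n_gpu_layers layers'
    // worth of K/V on the GPU and the remaining layers on the CPU.
    llama_model_kvcache_size(&cpu_cache, &gpu_cache, model, cparams, use_gpu);

    fprintf(stderr, "compute buf: cpu=%llu B, gpu=%llu B\n",
            (unsigned long long) cpu_buf, (unsigned long long) gpu_buf);
    fprintf(stderr, "kv cache:    cpu=%llu B, gpu=%llu B\n",
            (unsigned long long) cpu_cache, (unsigned long long) gpu_cache);
}

Compared with the old single-return versions, reporting the CPU and GPU shares separately lets a caller budget host and device memory independently when only part of the model is offloaded.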