diff --git a/src/llama.cpp b/src/llama.cpp
index dd232fe0..45b15223 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21994,11 +21994,13 @@ void llama_model_compute_buf_size(
 
     // CPU compute buffer for NUMA system or Metal with ngl=0
     if (*cpu_buf == 0) {
-        *cpu_buf = (n_inp_pos + n_kq_mask + n_bak_embd + n_norm) * type_size_f32;
         if (is_master) {
             *cpu_buf += (n_inp_toks + n_inp_embd + n_inp_out_ids + n_out_embd + n_result) * type_size_f32;   
         }
-        *cpu_buf += std::max(n_ffn_gate + n_ffn_up, n_qcur + n_qcur + n_kq) * type_size_f32;
+        if (offload) {
+            *cpu_buf += (n_inp_pos + n_kq_mask + n_bak_embd + n_norm) * type_size_f32; // += (not =): keep the is_master term accumulated just above
+            *cpu_buf += std::max(n_ffn_gate + n_ffn_up, n_qcur + n_qcur + n_kq) * type_size_f32;
+        }
         *cpu_buf += gpu_host_buf;
     }