From e543fcd55f98dcd2545a046488e132c172af3b39 Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Sat, 4 Jan 2025 19:16:36 +0400
Subject: [PATCH] fix buf_compute_meta size

---
 src/llama.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 27e2e88c..aa497843 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20423,10 +20423,12 @@ void * llama_context_setup_backend(
         }
     }
 
-    const size_t max_nodes = llama_model_max_nodes(*model);
+    const size_t max_nodes    = llama_model_max_nodes(*model);
+    const size_t n_window_sum = std::accumulate(cparams.n_layer_window, cparams.n_layer_window + 32, 0u);
+    const size_t n_graphs     = static_cast<size_t>(std::ceil((double) llama_n_layer(model) / (double) n_window_sum));
 
     // buffer used to store the computation graph and the tensor meta data
-    ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
+    ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)*n_graphs);
 
     // TODO: move these checks to ggml_backend_sched
     // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
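
A minimal standalone sketch of the new sizing arithmetic, assuming placeholder values: ggml_tensor_overhead() and ggml_graph_overhead_custom(max_nodes, false) are the actual ggml calls used in the patch, but the numeric constants, the example layer-window layout, and the layer count below are made up for illustration only.

    // sketch only, not part of the patch: derive n_graphs and the meta buffer size
    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <numeric>

    int main() {
        // assumed example inputs (hypothetical, not from the patch)
        const uint32_t n_layer_window[32] = {16, 16};  // layers handled per pass across devices
        const size_t   n_layer    = 80;                // total model layers (llama_n_layer)
        const size_t   max_nodes  = 8192;              // llama_model_max_nodes(*model)
        const size_t   tensor_ovh = 368;               // stand-in for ggml_tensor_overhead()
        const size_t   graph_ovh  = 164000;            // stand-in for ggml_graph_overhead_custom(max_nodes, false)

        // the model is traversed in ceil(n_layer / n_window_sum) passes, and the
        // meta buffer must hold one graph header per pass plus the tensor metadata
        const size_t n_window_sum = std::accumulate(n_layer_window, n_layer_window + 32, (size_t) 0);
        const size_t n_graphs     = (size_t) std::ceil((double) n_layer / (double) n_window_sum);

        const size_t buf_size = tensor_ovh * max_nodes + graph_ovh * n_graphs;
        std::printf("n_graphs = %zu, buf_compute_meta size = %zu bytes\n", n_graphs, buf_size);
        return 0;
    }

With these example numbers the window sum is 32, so three graphs are needed for 80 layers, and the buffer grows by two extra graph overheads compared to the old sizing, which reserved room for only one graph.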