From e543fcd55f98dcd2545a046488e132c172af3b39 Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Sat, 4 Jan 2025 19:16:36 +0400
Subject: [PATCH] fix buf_compute_meta size

---
 src/llama.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 27e2e88c..aa497843 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20423,10 +20423,12 @@ void * llama_context_setup_backend(
         }
     }
 
-    const size_t max_nodes = llama_model_max_nodes(*model);
+    const size_t max_nodes    = llama_model_max_nodes(*model);
+    const size_t n_window_sum = std::accumulate(cparams.n_layer_window, cparams.n_layer_window + 32, 0u);
+    const size_t n_graphs     = static_cast<size_t>(std::ceil((double) llama_n_layer(model) / (double) n_window_sum));
 
     // buffer used to store the computation graph and the tensor meta data
-    ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
+    ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)*n_graphs);
 
     // TODO: move these checks to ggml_backend_sched
     // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
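
A minimal standalone sketch of the new sizing arithmetic, assuming placeholder values: ggml_tensor_overhead() and ggml_graph_overhead_custom(max_nodes, false) are the actual ggml calls used in the patch, but the numeric constants, the example layer-window layout, and the layer count below are made up for illustration only.

    // sketch only, not part of the patch: derive n_graphs and the meta buffer size
    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <numeric>

    int main() {
        // assumed example inputs (hypothetical, not from the patch)
        const uint32_t n_layer_window[32] = {16, 16};  // layers handled per pass across devices
        const size_t   n_layer    = 80;                // total model layers (llama_n_layer)
        const size_t   max_nodes  = 8192;              // llama_model_max_nodes(*model)
        const size_t   tensor_ovh = 368;               // stand-in for ggml_tensor_overhead()
        const size_t   graph_ovh  = 164000;            // stand-in for ggml_graph_overhead_custom(max_nodes, false)

        // the model is traversed in ceil(n_layer / n_window_sum) passes, and the
        // meta buffer must hold one graph header per pass plus the tensor metadata
        const size_t n_window_sum = std::accumulate(n_layer_window, n_layer_window + 32, (size_t) 0);
        const size_t n_graphs     = (size_t) std::ceil((double) n_layer / (double) n_window_sum);

        const size_t buf_size = tensor_ovh * max_nodes + graph_ovh * n_graphs;
        std::printf("n_graphs = %zu, buf_compute_meta size = %zu bytes\n", n_graphs, buf_size);
        return 0;
    }

With these example numbers the window sum is 32, so three graphs are needed for 80 layers, and the buffer grows by two extra graph overheads compared to the old sizing, which reserved room for only one graph.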