fix buf_compute_meta size

This commit is contained in:
Zonghang Li 2025-01-04 19:16:36 +04:00
parent b2f806a572
commit e543fcd55f

View file

@@ -20423,10 +20423,12 @@ void * llama_context_setup_backend(
}
}
     const size_t max_nodes = llama_model_max_nodes(*model);
+    const size_t n_window_sum = std::accumulate(cparams.n_layer_window, cparams.n_layer_window + 32, 0u);
+    const size_t n_graphs = static_cast<size_t>(std::ceil(llama_n_layer(model) / n_window_sum));
     // buffer used to store the computation graph and the tensor meta data
-    ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
+    ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)*n_graphs);
// TODO: move these checks to ggml_backend_sched
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary