fix buf_compute_meta size

This commit is contained in:
Zonghang Li 2025-01-04 19:16:36 +04:00
parent b2f806a572
commit e543fcd55f

View file

@@ -20423,10 +20423,12 @@ void * llama_context_setup_backend(
}
}
     const size_t max_nodes = llama_model_max_nodes(*model);
+    const size_t n_window_sum = std::accumulate(cparams.n_layer_window, cparams.n_layer_window + 32, 0u);
+    const size_t n_graphs = static_cast<size_t>(std::ceil(llama_n_layer(model) / n_window_sum));
     // buffer used to store the computation graph and the tensor meta data
-    ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
+    ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)*n_graphs);
// TODO: move these checks to ggml_backend_sched
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary