Mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-07 18:59:08 +00:00)
fix buf_compute_meta size
commit e543fcd55f
parent b2f806a572
1 changed file with 4 additions and 2 deletions
@@ -20423,10 +20423,12 @@ void * llama_context_setup_backend(
             }
         }
 
         const size_t max_nodes = llama_model_max_nodes(*model);
+        const size_t n_window_sum = std::accumulate(cparams.n_layer_window, cparams.n_layer_window + 32, 0u);
+        const size_t n_graphs = static_cast<size_t>(std::ceil(llama_n_layer(model) / n_window_sum));
 
         // buffer used to store the computation graph and the tensor meta data
-        ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
+        ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)*n_graphs);
 
         // TODO: move these checks to ggml_backend_sched
         // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary