llama : reuse compute graphs (#14482)
* llama : reuse compute graphs ggml-ci
* llama-bench : add graph reuse parameter ggml-ci
* cont : remove the parameter and the sched resets ggml-ci
* graph : rename update() to can_reuse() ggml-ci
* params : remove is_same() ggml-ci
* graph : set res->params in llm_graph_context constructor ggml-ci
* graph : avoid set_max_nodes in llm_graph_result ggml-ci
* kv-cache : reuse llama_context's graph result instance ggml-ci
* context : reset the previous graph result upon memory updates ggml-ci
* batch : llama_ubatch now carries its data instead of pointing to balloc ggml-ci
* merge : fix build ggml-ci
* graph : fix can_reuse() checks when flash-attention is disabled
* graph : move llm_graph_result impl in source file + debug env ggml-ci
Parent: 086cf81e88
Commit: 01612b7409

12 changed files with 548 additions and 289 deletions
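The central idea of the change, caching the previous graph result and rebuilding only when the new parameters would produce a different graph, can be illustrated with a small sketch. Everything below (type names, fields, the comparison) is a hypothetical stand-in for the llm_graph_params / llm_graph_result / can_reuse() machinery named in the commit messages, not the actual llama.cpp implementation:

// Hedged sketch, not llama.cpp code: all names below are illustrative.
#include <memory>

struct graph_params_sketch {
    int n_tokens  = 0;
    int n_outputs = 0;

    bool operator==(const graph_params_sketch & other) const {
        return n_tokens == other.n_tokens && n_outputs == other.n_outputs;
    }
};

struct graph_result_sketch {
    graph_params_sketch params; // parameters the cached graph was built with

    // reuse is only allowed when building with `p` would give the same topology
    bool can_reuse(const graph_params_sketch & p) const {
        return params == p;
    }
};

struct context_sketch {
    std::unique_ptr<graph_result_sketch> prev; // result of the previous build

    graph_result_sketch * get_graph(const graph_params_sketch & p) {
        if (prev && prev->can_reuse(p)) {
            return prev.get();                 // fast path: reuse, skip the rebuild
        }
        prev = std::make_unique<graph_result_sketch>();
        prev->params = p;                      // slow path: (re)build and remember params
        return prev.get();
    }

    // mirrors "context : reset the previous graph result upon memory updates":
    // a memory (KV cache) update invalidates the cached graph
    void on_memory_update() {
        prev.reset();
    }
};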
@@ -10463,7 +10463,7 @@ struct llm_graph_context_mamba : public llm_graph_context {
         const int64_t n_seq_tokens = ubatch.n_seq_tokens;
 
         GGML_ASSERT(n_seqs != 0);
-        GGML_ASSERT(ubatch.equal_seqs);
+        GGML_ASSERT(ubatch.equal_seqs());
         GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
 
         ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
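This hunk (and the identical ones below) changes `ubatch.equal_seqs`, a data member, into a call to the member function `ubatch.equal_seqs()`, consistent with the commit note that `llama_ubatch` now carries its own data instead of pointing to balloc. The sketch below shows the general accessor-over-owned-data pattern; apart from the `equal_seqs` name, the fields and the function body are assumptions for illustration, not the actual llama.cpp definition:

// Illustrative only: a batch that owns its per-sequence layout and exposes
// equal_seqs() as an accessor instead of a raw flag filled in externally.
#include <cstdint>
#include <vector>

struct ubatch_sketch {
    uint32_t n_seq_tokens = 0;               // tokens per sequence (when equal)
    uint32_t n_seqs       = 0;               // number of sequences in the ubatch

    std::vector<uint32_t> tokens_per_seq;    // owned copy, one entry per sequence

    // true when every sequence contributes the same number of tokens,
    // which is what the GGML_ASSERT checks in the hunks rely on
    bool equal_seqs() const {
        for (size_t i = 1; i < tokens_per_seq.size(); ++i) {
            if (tokens_per_seq[i] != tokens_per_seq[0]) {
                return false;
            }
        }
        return true;
    }
};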
@@ -10598,7 +10598,7 @@ struct llm_graph_context_mamba : public llm_graph_context {
         const int64_t n_seq_tokens = ubatch.n_seq_tokens;
 
         GGML_ASSERT(n_seqs != 0);
-        GGML_ASSERT(ubatch.equal_seqs);
+        GGML_ASSERT(ubatch.equal_seqs());
         GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
 
         ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
@@ -15870,7 +15870,7 @@ private:
         const int64_t n_seq_tokens = ubatch.n_seq_tokens;
 
         GGML_ASSERT(n_seqs != 0);
-        GGML_ASSERT(ubatch.equal_seqs);
+        GGML_ASSERT(ubatch.equal_seqs());
         GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
 
         ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
@@ -16559,7 +16559,7 @@ struct llm_build_lfm2 : public llm_graph_context {
         const int64_t n_seq_tokens = ubatch.n_seq_tokens;
         const int64_t n_seqs = ubatch.n_seqs;
         GGML_ASSERT(n_seqs != 0);
-        GGML_ASSERT(ubatch.equal_seqs);
+        GGML_ASSERT(ubatch.equal_seqs());
         GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
 
         GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
@@ -16728,10 +16728,10 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
     return res;
 }
 
-llm_graph_result_ptr llama_model::build_graph(
-        const llm_graph_params & params,
-                   ggml_cgraph * gf,
-                llm_graph_type   type) const {
+ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
+    // TODO: temporary - will refactor this to keep the "gf" instance in the llm_graph_context and avoid passing it everywhere
+    auto * gf = params.res->get_gf();
+
     std::unique_ptr<llm_graph_context> llm;
 
     switch (arch) {
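With the new signature, a caller no longer passes the graph or the graph type separately; both travel inside `params`, and the built `ggml_cgraph` is returned directly, with ownership staying in the result object reachable through `params.res` (per the `get_gf()` calls in this diff). A minimal caller-side sketch, assuming the internal headers and an already-prepared `params` object:

// Sketch of the call site implied by the new signature. The include paths and
// the way `params` is prepared are assumptions; only build_graph(), get_gf()
// and the llm_graph_params type are taken from the diff itself.
#include "llama-model.h"   // assumed internal header for llama_model
#include "llama-graph.h"   // assumed internal header for llm_graph_params

static ggml_cgraph * build_decode_graph(const llama_model & model,
                                        const llm_graph_params & params) {
    // the graph instance lives in the result object referenced by params.res;
    // build_graph() fills it in and returns a pointer to it
    ggml_cgraph * gf = model.build_graph(params);
    return gf;
}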
@@ -16951,7 +16951,7 @@ llm_graph_result_ptr llama_model::build_graph(
             } break;
         case LLM_ARCH_T5:
             {
-                switch (type) {
+                switch (params.gtype) {
                     case LLM_GRAPH_TYPE_ENCODER:
                         llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
                         break;
@@ -17057,7 +17057,7 @@ llm_graph_result_ptr llama_model::build_graph(
     // add on pooling layer
     llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b);
 
-    return std::move(llm->res);
+    return llm->res->get_gf();
 }
 
 //
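The last two hunks complete the interface change: the graph type is read from `params.gtype` instead of a separate argument, and the function hands back `llm->res->get_gf()` rather than moving the whole result out. For an encoder-decoder model such as T5, the caller-side effect might look like the sketch below; only `LLM_GRAPH_TYPE_ENCODER` and `params.gtype` appear in the diff, the decoder enum value and the preparation of the two params objects are assumptions:

// Sketch only (assumes the same internal headers as the sketch above):
// building encoder and decoder graphs now differs only in the params passed,
// since the graph type travels in params.gtype.
static ggml_cgraph * build_t5_graphs(const llama_model & model,
                                     const llm_graph_params & params_enc,   // gtype == LLM_GRAPH_TYPE_ENCODER
                                     const llm_graph_params & params_dec) { // gtype assumed LLM_GRAPH_TYPE_DECODER
    ggml_cgraph * gf_enc = model.build_graph(params_enc);
    ggml_cgraph * gf_dec = model.build_graph(params_dec);
    (void) gf_enc; // a real caller would schedule/compute both graphs
    return gf_dec;
}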
|
Loading…
Add table
Add a link
Reference in a new issue