Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/nix/package.nix
#	ggml/src/ggml-sycl/ggml-sycl.cpp
This commit is contained in:
Concedo 2025-07-18 13:46:32 +08:00
commit b8e3280432
18 changed files with 926 additions and 317 deletions


@@ -2861,7 +2861,8 @@ class Ernie4_5Model(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
num_heads = self.hparams["num_attention_heads"]
num_kv_heads = self.hparams["num_key_value_heads"]
- head_dim = self.hparams["head_dim"]
+ if (head_dim := self.hparams.get("head_dim")) is None:
+ head_dim = self.hparams["hidden_size"] // num_heads
if "ernie." in name:
name = name.replace("ernie.", "model.")
@@ -2894,6 +2895,93 @@ class Ernie4_5Model(TextModel):
return [(self.map_tensor_name(name), data_torch)]
@ModelBase.register("Ernie4_5_MoeForCausalLM")
class Ernie4_5MoeModel(Ernie4_5Model):
model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE
_experts: list[dict[str, Tensor]] | None = None
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._experts = [{} for _ in range(self.block_count)]
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
self.gguf_writer.add_expert_used_count(self.hparams["moe_k"])
self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"])
self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"])
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
if (shared_expert_count := self.hparams.get('moe_num_shared_experts')) is not None:
self.gguf_writer.add_expert_shared_count(shared_expert_count)
if shared_expert_count > 0 and (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None:
self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Modify correction bias name as in DeepseekV2
if name.endswith("e_score_correction_bias"):
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
# skip Multi-Token Prediction (MTP) layers (again, same as DeepseekV2)
match = re.match(r"model.mtp_block.(\d+)", name)
if match:
return []
# skip all other MTP tensors for now
match = re.match(r"model.mtp_emb_norm.(\d+)", name)
if match:
return []
match = re.match(r"model.mtp_hidden_norm.(\d+)", name)
if match:
return []
match = re.match(r"model.mtp_linear_proj.(\d+)", name)
if match:
return []
# process the experts separately
if name.find("mlp.experts") != -1:
n_experts = self.hparams["moe_num_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for w_name in ["gate_proj", "up_proj", "down_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename_to_retrieve])
del self._experts[bid][ename_to_retrieve]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return [(self.map_tensor_name(name), data_torch)]
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register(
"Qwen2VLModel",
"Qwen2VLForConditionalGeneration",


@@ -364,6 +364,7 @@ class MODEL_ARCH(IntEnum):
DOTS1 = auto()
ARCEE = auto()
ERNIE4_5 = auto()
+ ERNIE4_5_MOE = auto()
HUNYUAN_MOE = auto()
SMOLLM3 = auto()
LFM2 = auto()
@@ -680,6 +681,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.DOTS1: "dots1",
MODEL_ARCH.ARCEE: "arcee",
MODEL_ARCH.ERNIE4_5: "ernie4_5",
+ MODEL_ARCH.ERNIE4_5_MOE: "ernie4_5-moe",
MODEL_ARCH.FALCON_H1: "falcon-h1",
MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
MODEL_ARCH.SMOLLM3: "smollm3",
@@ -2022,6 +2024,28 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
],
MODEL_ARCH.ERNIE4_5_MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
],
MODEL_ARCH.PLM: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT,


@@ -324,7 +324,8 @@ class TensorNameMap:
),
MODEL_TENSOR.FFN_EXP_PROBS_B: (
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1
+ "model.layers.{bid}.mlp.moe_statics.e_score_correction", # ernie4.5-moe
),
# Feed-forward up
@@ -364,13 +365,13 @@ class TensorNameMap:
),
MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
- "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
+ "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ernie4.5-moe
"model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
"model.layers.{bid}.feed_forward.experts.up_proj", # llama4
"encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
),
MODEL_TENSOR.FFN_UP_SHEXP: (
@@ -403,12 +404,12 @@
),
MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
- "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
+ "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) ernie4.5-moe
"model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
"model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
),
MODEL_TENSOR.FFN_GATE_SHEXP: (
@@ -450,14 +451,14 @@
),
MODEL_TENSOR.FFN_DOWN_EXP: (
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
- "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
+ "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) ernie4.5-moe
"model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
"model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
"model.layers.{bid}.feed_forward.experts.down_proj", # llama4
"encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
),
MODEL_TENSOR.FFN_DOWN_SHEXP: (


@@ -1397,6 +1397,7 @@ extern "C" {
int32_t n_p_eval;
int32_t n_eval;
+ int32_t n_reused; // number of times a ggml compute graph had been reused
};
struct llama_perf_sampler_data {
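
Note: the new n_reused counter above is readable through the existing llama_perf_context() API. A minimal usage sketch, not part of this commit; it assumes a valid llama_context obtained elsewhere:

#include <cstdio>
#include "llama.h"

// print how often the compute graph could be reused instead of rebuilt
static void print_graph_reuse_stats(const llama_context * ctx) {
    const llama_perf_context_data data = llama_perf_context(ctx);
    printf("eval calls: %d, graphs reused: %d\n", data.n_eval, data.n_reused);
}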


@@ -7679,7 +7679,7 @@ Current version indicated by LITEVER below.
true
);
} else {
- msgbox("Could not load selected file. Is it valid?");
+ msgbox("Could not load selected file. Is it valid?\n\nIf you are trying to attach files to the current session, please drop them into the input box instead.");
}
}


@@ -82,6 +82,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_DOTS1, "dots1" },
{ LLM_ARCH_ARCEE, "arcee" },
{ LLM_ARCH_ERNIE4_5, "ernie4_5" },
+ { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
{ LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
{ LLM_ARCH_SMOLLM3, "smollm3" },
{ LLM_ARCH_LFM2, "lfm2" },
@@ -1825,6 +1826,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_ERNIE4_5_MOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
},
},
{
LLM_ARCH_HUNYUAN_MOE,
{


@@ -86,6 +86,7 @@ enum llm_arch {
LLM_ARCH_DOTS1,
LLM_ARCH_ARCEE,
LLM_ARCH_ERNIE4_5,
+ LLM_ARCH_ERNIE4_5_MOE,
LLM_ARCH_HUNYUAN_MOE,
LLM_ARCH_SMOLLM3,
LLM_ARCH_LFM2,


@@ -210,7 +210,7 @@ bool llama_batch_allocr::init(
LLAMA_LOG_DEBUG("%s: input batch info:\n", __func__);
llama_ubatch ubatch {
- /*.equal_seqs =*/ false,
+ /*.b_equal_seqs =*/ false,
/*.n_tokens =*/ (uint32_t) batch.n_tokens,
/*.n_seq_tokens =*/ (uint32_t) 1,
/*.n_seqs =*/ (uint32_t) batch.n_tokens,
@@ -223,6 +223,7 @@ bool llama_batch_allocr::init(
/*.seq_id_unq =*/ this->seq_id_unq.data(),
/*.seq_idx =*/ this->seq_idx.data(),
/*.output =*/ batch.logits,
+ /*.data =*/ {},
};
ubatch_print(ubatch, debug);
@@ -365,39 +366,38 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
clear();
split_reset();
- ubatches.emplace_back();
- auto & ubatch = ubatches.back();
- ubatch.token .resize(n_tokens);
- ubatch.embd .clear();
- ubatch.pos .resize(n_tokens);
- ubatch.n_seq_id .resize(n_tokens);
- ubatch.seq_id .resize(n_tokens);
- ubatch.seq_id_unq.resize(0);
- ubatch.seq_idx .resize(LLAMA_MAX_SEQ, -1);
- ubatch.output .resize(n_tokens);
+ auto udata = std::make_shared<llama_ubatch::data_t>();
+ udata->token .resize(n_tokens);
+ udata->embd .clear();
+ udata->pos .resize(n_tokens);
+ udata->n_seq_id .resize(n_tokens);
+ udata->seq_id .resize(n_tokens);
+ udata->seq_id_unq.resize(0);
+ udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
+ udata->output .resize(n_tokens);
for (uint32_t s = 0; s < n_seqs; ++s) {
- ubatch.seq_idx[s] = s;
- ubatch.seq_id_unq.push_back(s);
+ udata->seq_idx[s] = s;
+ udata->seq_id_unq.push_back(s);
}
llama_ubatch res {
- /*.equal_seqs =*/ true,
+ /*.b_equal_seqs =*/ true,
/*.n_tokens =*/ n_tokens,
/*.n_seq_tokens =*/ n_seq_tokens,
/*.n_seqs =*/ n_seqs,
/*.n_seqs_unq =*/ n_seqs,
- /*.token =*/ ubatch.token.data(),
+ /*.token =*/ udata->token.data(),
/*.embd =*/ nullptr,
- /*.pos =*/ ubatch.pos.data(),
- /*.n_seq_id =*/ ubatch.n_seq_id.data(),
- /*.seq_id =*/ ubatch.seq_id.data(),
- /*.seq_id_unq =*/ ubatch.seq_id_unq.data(),
- /*.seq_idx =*/ ubatch.seq_idx.data(),
- /*.output =*/ ubatch.output.data(),
+ /*.pos =*/ udata->pos.data(),
+ /*.n_seq_id =*/ udata->n_seq_id.data(),
+ /*.seq_id =*/ udata->seq_id.data(),
+ /*.seq_id_unq =*/ udata->seq_id_unq.data(),
+ /*.seq_idx =*/ udata->seq_idx.data(),
+ /*.output =*/ udata->output.data(),
+ /*.data =*/ std::move(udata),
};
return res;
@@ -438,8 +438,6 @@ void llama_batch_allocr::split_reset() {
used.clear();
used.resize(get_n_tokens(), false);
- ubatches.clear();
}
llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
@@ -654,78 +652,77 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
assert(n_tokens%n_seqs == 0);
- ubatches.emplace_back();
- auto & ubatch = ubatches.back();
+ auto udata = std::make_shared<llama_ubatch::data_t>();
const int32_t n_pos_cur = batch.embd ? n_pos_per_embd : 1;
const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
const int64_t n_pos_all = (int64_t) n_tokens*n_pos_cur;
- ubatch.token .resize(n_tokens);
- ubatch.embd .resize(n_embd_all);
- ubatch.pos .resize(n_pos_all);
- ubatch.n_seq_id .resize(n_tokens);
- ubatch.seq_id .resize(n_tokens);
- ubatch.seq_id_unq.resize(0);
- ubatch.seq_idx .resize(LLAMA_MAX_SEQ, -1);
- ubatch.output .resize(n_tokens);
+ udata->token .resize(n_tokens);
+ udata->embd .resize(n_embd_all);
+ udata->pos .resize(n_pos_all);
+ udata->n_seq_id .resize(n_tokens);
+ udata->seq_id .resize(n_tokens);
+ udata->seq_id_unq.resize(0);
+ udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
+ udata->output .resize(n_tokens);
seq_set_t seq_set_unq;
for (size_t i = 0; i < idxs.size(); ++i) {
if (batch.token) {
- ubatch.token[i] = batch.token[idxs[i]];
+ udata->token[i] = batch.token[idxs[i]];
}
if (batch.embd) {
- memcpy(ubatch.embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float));
+ memcpy(udata->embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float));
}
for (int j = 0; j < n_pos_cur; ++j) {
- ubatch.pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]];
+ udata->pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]];
}
- ubatch.n_seq_id[i] = batch.n_seq_id[idxs[i]];
- ubatch.seq_id[i] = batch.seq_id[idxs[i]];
- ubatch.output[i] = batch.logits[idxs[i]];
+ udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
+ udata->seq_id[i] = batch.seq_id[idxs[i]];
+ udata->output[i] = batch.logits[idxs[i]];
- for (int s = 0; s < ubatch.n_seq_id[i]; ++s) {
- seq_set_unq.set(ubatch.seq_id[i][s]);
+ for (int s = 0; s < udata->n_seq_id[i]; ++s) {
+ seq_set_unq.set(udata->seq_id[i][s]);
}
- if (ubatch.output[i]) {
+ if (udata->output[i]) {
out_ids.push_back(idxs[i]);
}
}
for (uint32_t s = 0; s < n_seq_max; ++s) {
if (seq_set_unq.test(s)) {
- ubatch.seq_idx[s] = ubatch.seq_id_unq.size();
- ubatch.seq_id_unq.push_back(s);
+ udata->seq_idx[s] = udata->seq_id_unq.size();
+ udata->seq_id_unq.push_back(s);
}
}
llama_ubatch res {
- /*.equal_seqs =*/ equal_seqs,
+ /*.b_equal_seqs =*/ equal_seqs,
/*.n_tokens =*/ n_tokens,
/*.n_seq_tokens =*/ n_tokens/n_seqs,
/*.n_seqs =*/ n_seqs,
- /*.n_seqs_unq =*/ (uint32_t) ubatch.seq_id_unq.size(),
- /*.token =*/ batch.token ? ubatch.token.data() : nullptr,
- /*.embd =*/ batch.embd ? ubatch.embd.data() : nullptr,
- /*.pos =*/ ubatch.pos.data(),
- /*.n_seq_id =*/ ubatch.n_seq_id.data(),
- /*.seq_id =*/ ubatch.seq_id.data(),
- /*.seq_id_unq =*/ ubatch.seq_id_unq.data(),
- /*.seq_idx =*/ ubatch.seq_idx.data(),
- /*.output =*/ ubatch.output.data(),
+ /*.n_seqs_unq =*/ (uint32_t) udata->seq_id_unq.size(),
+ /*.token =*/ batch.token ? udata->token.data() : nullptr,
+ /*.embd =*/ batch.embd ? udata->embd.data() : nullptr,
+ /*.pos =*/ udata->pos.data(),
+ /*.n_seq_id =*/ udata->n_seq_id.data(),
+ /*.seq_id =*/ udata->seq_id.data(),
+ /*.seq_id_unq =*/ udata->seq_id_unq.data(),
+ /*.seq_idx =*/ udata->seq_idx.data(),
+ /*.output =*/ udata->output.data(),
+ /*.data =*/ std::move(udata),
};
if (debug > 0) {
- LLAMA_LOG_DEBUG("%s: added ubatch %d to split:\n", __func__, (int) ubatches.size() - 1);
+ LLAMA_LOG_DEBUG("%s: added ubatch to split:\n", __func__);
ubatch_print(res, debug);
}
@@ -735,7 +732,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
if (debug > 0) {
- LLAMA_LOG_DEBUG("%s: equal_seqs = %d\n", __func__, ubatch.equal_seqs);
+ LLAMA_LOG_DEBUG("%s: equal_seqs = %d\n", __func__, ubatch.equal_seqs());
LLAMA_LOG_DEBUG("%s: n_tokens = %d\n", __func__, ubatch.n_tokens);
LLAMA_LOG_DEBUG("%s: n_seq_tokens = %d\n", __func__, ubatch.n_seq_tokens);
LLAMA_LOG_DEBUG("%s: n_seqs = %d\n", __func__, ubatch.n_seqs);


@@ -8,12 +8,17 @@
#include <vector>
#include <set>
#include <bitset>
+ #include <memory>
#include <unordered_map>
// keep this struct lightweight
+ // it points to data in `llama_batch_allocr`
struct llama_ubatch {
- bool equal_seqs;
+ bool equal_seqs() const {
+ return b_equal_seqs != 0;
+ }
+ uint32_t b_equal_seqs; // note: this is a boolean, but we use an int32_t for alignment
+ // otherwise address sanitizer complains
// TODO: whole_seqs for embeddings?
uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
@@ -34,6 +39,20 @@ struct llama_ubatch {
llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id
int32_t * seq_idx; // [LLAMA_MAX_SEQ] | - | seq_idx
int8_t * output; // [n_tokens] | i | -
struct data_t {
std::vector<llama_token> token;
std::vector<float> embd;
std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id *> seq_id;
std::vector<llama_seq_id> seq_id_unq;
std::vector<int32_t> seq_idx;
std::vector<int8_t> output;
};
// the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
std::shared_ptr<data_t> data;
};
// a helper for sanitizing, fulfilling and splitting a batch
@@ -137,20 +156,5 @@ private:
// used[i] indicates if token i has already been used in a previous ubatch
std::vector<bool> used;
- // llama_ubatch points to this data:
- struct ubatch {
- std::vector<llama_token> token;
- std::vector<float> embd;
- std::vector<llama_pos> pos;
- std::vector<int32_t> n_seq_id;
- std::vector<llama_seq_id *> seq_id;
- std::vector<llama_seq_id> seq_id_unq;
- std::vector<int32_t> seq_idx;
- std::vector<int8_t> output;
- };
- // current splitting state:
- std::vector<ubatch> ubatches;
int debug;
};
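
The data_t/shared_ptr change above means a llama_ubatch keeps its raw pointers valid by sharing ownership of the backing storage. An illustrative sketch of the pattern with stand-in names (view_t, data_t), not the actual llama.cpp types:

#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>

// lightweight view: raw pointers for the hot path, shared ownership for lifetime
struct view_t {
    uint32_t n_tokens;
    const int32_t * token;

    struct data_t {
        std::vector<int32_t> token;
    };
    std::shared_ptr<data_t> data; // owns the storage behind `token`
};

static view_t make_view(std::vector<int32_t> tokens) {
    auto data = std::make_shared<view_t::data_t>();
    data->token = std::move(tokens);
    return {
        /*.n_tokens =*/ (uint32_t) data->token.size(),
        /*.token    =*/ data->token.data(),
        /*.data     =*/ std::move(data), // copies of the view share ownership
    };
}

int main() {
    view_t v = make_view({1, 2, 3});
    view_t copy = v; // cheap copy; the backing vector stays alive
    printf("%u %d\n", copy.n_tokens, copy.token[2]);
    return 0;
}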


@@ -105,7 +105,7 @@ llama_context::llama_context(
{
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
- const bool supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) : 0;
+ const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
if (!supports_set_rows && !cparams.kv_unified) {
LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@@ -238,8 +238,8 @@ llama_context::llama_context(
LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
- // buffer used to store the computation graph and the tensor meta data
- buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
+ gf_res_prev.reset(new llm_graph_result(max_nodes));
+ gf_res_reserve.reset(new llm_graph_result(max_nodes));
// TODO: move these checks to ggml_backend_sched
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
@@ -403,10 +403,6 @@ ggml_backend_sched_t llama_context::get_sched() const {
return sched.get();
}
- ggml_context * llama_context::get_ctx_compute() const {
- return ctx_compute.get();
- }
uint32_t llama_context::n_ctx() const {
return cparams.n_ctx;
}
@@ -478,6 +474,11 @@ bool llama_context::kv_self_update(bool optimize) {
}
}
+ // reset the previous graph result to make sure that it won't be reused
+ // TODO: change the mctx->apply() to return information if a graph reserve is needed
+ // reset the graph result only if the memory module did reset the scheduler
+ gf_res_prev->reset();
if (!mctx->apply()) {
LLAMA_LOG_ERROR("%s: failed to apply memory update\n", __func__);
}
@ -693,38 +694,59 @@ bool llama_context::apply_adapter_cvec(
return cvec.apply(model, data, len, n_embd, il_start, il_end); return cvec.apply(model, data, len, n_embd, il_start, il_end);
} }
llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
if (mctx && !mctx->apply()) { if (mctx && !mctx->apply()) {
LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__); LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
ret = GGML_STATUS_FAILED; ret = GGML_STATUS_FAILED;
return nullptr; return nullptr;
} }
auto * gf = graph_init(); auto * res = gf_res_prev.get();
if (!gf) { auto * gf = res->get_gf();
LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
ret = GGML_STATUS_FAILED; // the new graph parameters
return nullptr; // in order to correctly reuse a graph, its full topology has to be uniquely determined by these parameters
const auto gparams = graph_params(res, ubatch, mctx, gtype);
if (res->can_reuse(gparams)) {
//LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
n_reused++;
} else {
res->reset();
ggml_backend_sched_reset(sched.get());
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
//const auto t_start_us = ggml_time_us();
gf = model.build_graph(gparams);
//LLAMA_LOG_INFO("graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
if (!gf) {
LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
ret = GGML_STATUS_FAILED;
return nullptr;
}
if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
ret = GGML_STATUS_ALLOC_FAILED;
return nullptr;
}
} }
auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mctx); // set the input data for the input tensors
if (!res) { {
LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__); //const auto t_start_us = ggml_time_us();
ret = GGML_STATUS_FAILED;
return nullptr; res->set_inputs(&ubatch);
//LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
} }
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1);
if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
ret = GGML_STATUS_ALLOC_FAILED;
return nullptr;
}
res->set_inputs(&ubatch);
const auto status = graph_compute(gf, ubatch.n_tokens > 1);
if (status != GGML_STATUS_SUCCESS) { if (status != GGML_STATUS_SUCCESS) {
LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status); LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status);
ret = status; ret = status;
@ -785,9 +807,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
n_outputs = n_tokens; n_outputs = n_tokens;
ggml_backend_sched_reset(sched.get());
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
const auto causal_attn_org = cparams.causal_attn; const auto causal_attn_org = cparams.causal_attn;
// always use non-causal attention for encoder graphs // always use non-causal attention for encoder graphs
@ -796,7 +815,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
cparams.causal_attn = false; cparams.causal_attn = false;
ggml_status status; ggml_status status;
const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status); const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
cparams.causal_attn = causal_attn_org; cparams.causal_attn = causal_attn_org;
@ -872,10 +891,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
} }
} }
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(sched.get());
// TODO: hacky solution // TODO: hacky solution
if (model.arch == LLM_ARCH_T5 && t_embd) { if (model.arch == LLM_ARCH_T5 && t_embd) {
//cross.t_embd = t_embd; //cross.t_embd = t_embd;
@ -1033,11 +1048,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
n_outputs = n_outputs_new; n_outputs = n_outputs_new;
} }
ggml_backend_sched_reset(sched.get());
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
ggml_status status; ggml_status status;
const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status); const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
if (!res) { if (!res) {
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
@ -1218,10 +1230,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
// wait for the computation to finish (automatically done when obtaining the model output) // wait for the computation to finish (automatically done when obtaining the model output)
//synchronize(); //synchronize();
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(sched.get());
return 0; return 0;
} }
@ -1303,20 +1311,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
// graph // graph
// //
int32_t llama_context::graph_max_nodes() const { uint32_t llama_context::graph_max_nodes() const {
return std::max<int32_t>(65536, 5*model.n_tensors()); return std::max<uint32_t>(65536u, 5u*model.n_tensors());
} }
ggml_cgraph * llama_context::graph_init() { llm_graph_result * llama_context::get_gf_res_reserve() const {
ggml_init_params params = { return static_cast<llm_graph_result *>(gf_res_reserve.get());
/*.mem_size =*/ buf_compute_meta.size(),
/*.mem_buffer =*/ buf_compute_meta.data(),
/*.no_alloc =*/ true,
};
ctx_compute.reset(ggml_init(params));
return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false);
} }
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) { ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
@ -1329,6 +1329,11 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
//LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs); //LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
} }
ggml_backend_sched_reset(sched.get());
// when the scheduler is reset, we cannot reuse the old graph, so we reset the previous graph result to prevent that
gf_res_prev->reset();
// store the n_outputs as it is, and restore it afterwards // store the n_outputs as it is, and restore it afterwards
// TODO: not sure if needed, might simplify in the future by removing this // TODO: not sure if needed, might simplify in the future by removing this
const auto save_n_outputs = this->n_outputs; const auto save_n_outputs = this->n_outputs;
@ -1338,18 +1343,16 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
llama_batch_allocr balloc(model.hparams.n_pos_per_embd()); llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs); llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
auto * gf = graph_init(); auto * res = gf_res_reserve.get();
auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx);
const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
res->reset();
auto * gf = model.build_graph(gparams);
this->n_outputs = save_n_outputs; this->n_outputs = save_n_outputs;
if (!res) {
LLAMA_LOG_ERROR("%s: failed to build worst-case graph\n", __func__);
return nullptr;
}
ggml_backend_sched_reset(sched.get());
// initialize scheduler with the specified graph // initialize scheduler with the specified graph
if (!ggml_backend_sched_reserve(sched.get(), gf)) { if (!ggml_backend_sched_reserve(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
@ -1359,28 +1362,27 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
return gf; return gf;
} }
llm_graph_result_ptr llama_context::graph_build( llm_graph_params llama_context::graph_params(
ggml_context * ctx, llm_graph_result_i * res,
ggml_cgraph * gf, const llama_ubatch & ubatch,
const llama_ubatch & ubatch, const llama_memory_context_i * mctx,
llm_graph_type gtype, llm_graph_type gtype) const {
const llama_memory_context_i * mctx) { return {
return model.build_graph( /*.arch =*/ model.arch,
{ /*.hparams =*/ model.hparams,
/*.ctx =*/ ctx, /*.cparams =*/ cparams,
/*.arch =*/ model.arch, /*.ubatch =*/ ubatch,
/*.hparams =*/ model.hparams, /*.gtype =*/ gtype,
/*.cparams =*/ cparams, /*.sched =*/ sched.get(),
/*.ubatch =*/ ubatch, /*.backend_cpu =*/ backend_cpu,
/*.sched =*/ sched.get(), /*.cvec =*/ &cvec,
/*.backend_cpu =*/ backend_cpu, /*.loras =*/ &loras,
/*.cvec =*/ &cvec, /*.mctx =*/ mctx,
/*.loras =*/ &loras, /*.cross =*/ &cross,
/*.mctx =*/ mctx, /*.n_outputs =*/ n_outputs,
/*.cross =*/ &cross, /*.cb =*/ graph_get_cb(),
/*.n_outputs =*/ n_outputs, /*.res =*/ res,
/*.cb =*/ graph_get_cb(), };
}, gf, gtype);
} }
ggml_status llama_context::graph_compute( ggml_status llama_context::graph_compute(
@@ -1958,6 +1960,7 @@ llama_perf_context_data llama_context::perf_get_data() const {
data.t_eval_ms = 1e-3 * t_eval_us;
data.n_p_eval = std::max(1, n_p_eval);
data.n_eval = std::max(1, n_eval);
+ data.n_reused = std::max(0, n_reused);
return data;
}
@@ -1966,6 +1969,7 @@ void llama_context::perf_reset() {
t_start_us = ggml_time_us();
t_eval_us = n_eval = 0;
t_p_eval_us = n_p_eval = 0;
+ n_reused = 0;
}
//
@@ -2092,8 +2096,13 @@ void llama_context::opt_epoch_iter(
break;
}
- auto * gf = graph_init();
- auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get());
+ auto * res = gf_res_prev.get();
+ const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT);
+ res->reset();
+ auto * gf = model.build_graph(gparams);
struct ggml_context * ctx_compute_opt;
{
@@ -2836,6 +2845,7 @@ void llama_perf_context_print(const llama_context * ctx) {
LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+ LLAMA_LOG_INFO("%s: graphs reused = %10d\n", __func__, data.n_reused);
}
void llama_perf_context_reset(llama_context * ctx) {
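
For reference, a simplified, self-contained sketch of the reuse flow that process_ubatch() now follows — compare the new parameters against those of the previously built graph, reuse on match, rebuild otherwise. All types here (graph_params, graph_result) are stand-ins, not the llama.cpp ones:

#include <cstdio>
#include <cstdint>

// stand-in for the parameters that determine the graph topology
struct graph_params {
    uint32_t n_tokens    = 0;
    bool     causal_attn = false;

    bool allow_reuse(const graph_params & other) const {
        return n_tokens == other.n_tokens && causal_attn == other.causal_attn;
    }
};

// stand-in for llm_graph_result: remembers what it was last built with
struct graph_result {
    graph_params params;
    int          built = 0;

    bool can_reuse(const graph_params & p) const {
        // the real code additionally asks every graph input whether its tensors still fit
        return params.allow_reuse(p);
    }

    void build(const graph_params & p) {
        params = p;
        built++;
    }
};

int main() {
    graph_result res;
    int n_reused = 0;

    const graph_params calls[] = { {1, true}, {1, true}, {8, true}, {1, true} };
    for (const auto & p : calls) {
        if (res.can_reuse(p)) {
            n_reused++;   // same topology: keep the graph, only refresh its inputs
        } else {
            res.build(p); // different topology: rebuild and re-allocate
        }
    }
    printf("builds: %d, reused: %d\n", res.built, n_reused); // builds: 3, reused: 1
    return 0;
}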


@@ -35,8 +35,6 @@ struct llama_context {
ggml_backend_sched_t get_sched() const;
- ggml_context * get_ctx_compute() const;
uint32_t n_ctx() const;
uint32_t n_ctx_per_seq() const;
uint32_t n_batch() const;
@@ -96,7 +94,7 @@ struct llama_context {
// if memory_context is provided, it will be applied first to the context's memory
// ret contains the status of the graph computation
// returns nullptr only if ret != GGML_STATUS_SUCCESS
- llm_graph_result_ptr process_ubatch(
+ llm_graph_result_i * process_ubatch(
const llama_ubatch & ubatch,
llm_graph_type gtype,
llama_memory_context_i * mctx,
@@ -188,10 +186,10 @@ private:
//
public:
- int32_t graph_max_nodes() const;
+ uint32_t graph_max_nodes() const;
- // zero-out inputs and create the ctx_compute for the compute graph
- ggml_cgraph * graph_init();
+ // can reuse the llm_graph_result instance of the context (for example to update a memory module)
+ llm_graph_result * get_gf_res_reserve() const;
// returns the result of ggml_backend_sched_graph_compute_async execution
ggml_status graph_compute(ggml_cgraph * gf, bool batched);
@@ -200,12 +198,11 @@ public:
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
private:
- llm_graph_result_ptr graph_build(
- ggml_context * ctx,
- ggml_cgraph * gf,
- const llama_ubatch & ubatch,
- llm_graph_type gtype,
- const llama_memory_context_i * mctx);
+ llm_graph_params graph_params(
+ llm_graph_result_i * res,
+ const llama_ubatch & ubatch,
+ const llama_memory_context_i * mctx,
+ llm_graph_type gtype) const;
llm_graph_cb graph_get_cb() const;
@@ -258,8 +255,6 @@ private:
ggml_backend_t backend_cpu = nullptr;
std::vector<ggml_backend_ptr> backends;
- ggml_context_ptr ctx_compute;
// training
ggml_opt_context_t opt_ctx = nullptr;
@@ -275,8 +270,8 @@ private:
std::vector<ggml_backend_t> backend_ptrs;
std::vector<ggml_backend_buffer_type_t> backend_buft;
- // memory buffers used to evaluate the model
- std::vector<uint8_t> buf_compute_meta;
+ llm_graph_result_ptr gf_res_prev;
+ llm_graph_result_ptr gf_res_reserve;
// host buffer for the model output (logits and embeddings)
ggml_backend_buffer_ptr buf_output;
@@ -294,4 +289,6 @@ private:
mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
mutable int32_t n_eval = 0; // number of eval calls
+ mutable int32_t n_reused = 0; // number of times the previous graph was reused
};


@@ -28,6 +28,15 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
}
}
bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
bool res = true;
res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens);
return res;
}
void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && pos) {
const int64_t n_tokens = ubatch->n_tokens;
@@ -50,6 +59,14 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
}
}
bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
bool res = true;
res &= pos->ne[0] == params.ubatch.n_tokens;
return res;
}
void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && attn_scale) {
const int64_t n_tokens = ubatch->n_tokens;
@@ -71,7 +88,7 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
const int64_t n_tokens = ubatch->n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer));
- GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
+ GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing
int32_t * data = (int32_t *) pos_bucket->data;
@@ -118,6 +135,14 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
}
}
bool llm_graph_input_out_ids::can_reuse(const llm_graph_params & params) {
bool res = true;
res &= n_outputs == params.n_outputs;
return res;
}
void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
const int64_t n_tokens = ubatch->n_tokens;
@@ -287,6 +312,24 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}
bool llm_graph_input_attn_kv_unified::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_kv_cache_unified_context *>(params.mctx);
this->mctx = mctx;
bool res = true;
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= self_kq_mask->ne[0] == mctx->get_n_kv();
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
res &= mctx->get_supports_set_rows(); // TODO: tmp
return res;
}
void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
@@ -299,6 +342,30 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
}
bool llm_graph_input_attn_kv_unified_iswa::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_kv_cache_unified_iswa_context *>(params.mctx);
this->mctx = mctx;
bool res = true;
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp
return res;
}
void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
GGML_ASSERT(cross_kq_mask);
@@ -306,7 +373,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
const int64_t n_tokens = ubatch->n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(cross_kq_mask->buffer));
- GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
+ GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing
float * data = (float *) cross_kq_mask->data;
@@ -340,6 +407,85 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
inp_rs->set_input(ubatch);
}
//
// llm_graph_result
//
llm_graph_result::llm_graph_result(int64_t max_nodes) : max_nodes(max_nodes) {
reset();
const char * LLAMA_GRAPH_RESULT_DEBUG = getenv("LLAMA_GRAPH_RESULT_DEBUG");
debug = LLAMA_GRAPH_RESULT_DEBUG ? atoi(LLAMA_GRAPH_RESULT_DEBUG) : 0;
}
int64_t llm_graph_result::get_max_nodes() const {
return max_nodes;
}
void llm_graph_result::reset() {
t_tokens = nullptr;
t_logits = nullptr;
t_embd = nullptr;
t_embd_pooled = nullptr;
inputs.clear();
buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
ggml_init_params params = {
/*.mem_size =*/ buf_compute_meta.size(),
/*.mem_buffer =*/ buf_compute_meta.data(),
/*.no_alloc =*/ true,
};
ctx_compute.reset(ggml_init(params));
gf = ggml_new_graph_custom(ctx_compute.get(), max_nodes, false);
}
void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
for (auto & input : inputs) {
input->set_input(ubatch);
}
}
bool llm_graph_result::can_reuse(const llm_graph_params & params) {
if (!this->params.allow_reuse(params)) {
if (debug > 1) {
LLAMA_LOG_DEBUG("%s: cannot reuse graph due to incompatible graph parameters\n", __func__);
}
return false;
}
if (debug > 1) {
LLAMA_LOG_DEBUG("%s: checking compatibility of %d inputs:\n", __func__, (int) inputs.size());
}
bool res = true;
for (auto & input : inputs) {
const bool cur = input->can_reuse(params);
if (debug > 1) {
LLAMA_LOG_DEBUG("%s: can_reuse = %d\n", "placeholder", cur);
}
res = res && cur;
}
if (debug > 0) {
LLAMA_LOG_DEBUG("%s: can reuse graph = %d\n", __func__, res);
}
return res;
}
llm_graph_input_i * llm_graph_result::add_input(llm_graph_input_ptr input) {
inputs.emplace_back(std::move(input));
return inputs.back().get();
}
//
// llm_graph_context
//
@@ -374,7 +520,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
n_ctx_orig (cparams.n_ctx_orig_yarn),
pooling_type (cparams.pooling_type),
rope_type (hparams.rope_type),
- ctx0 (params.ctx),
sched (params.sched),
backend_cpu (params.backend_cpu),
cvec (params.cvec),
@@ -382,7 +527,9 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
mctx (params.mctx),
cross (params.cross),
cb_func (params.cb),
- res (std::make_unique<llm_graph_result>()) {
+ res (static_cast<llm_graph_result *>(params.res)),
+ ctx0 (res->get_ctx()) {
+ res->params = params;
}
void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
@@ -1127,8 +1274,8 @@ ggml_tensor * llm_graph_context::build_attn(
const auto & kq_mask = inp->get_kq_mask();
// [TAG_NO_CACHE_PAD]
- // TODO: if ubatch.equal_seqs == true, we can split the three tensors below into ubatch.n_seqs_unq streams
- assert(ubatch.equal_seqs == false);
+ // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
+ assert(!ubatch.equal_seqs());
ggml_tensor * q = q_cur;
ggml_tensor * k = k_cur;
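
A compact sketch of the per-input reuse check that llm_graph_result::can_reuse() aggregates, using stand-in types; the conservative default (return false) mirrors the base-class behaviour declared in llama-graph.h:

#include <cstdio>
#include <memory>
#include <vector>

struct params_t {
    int n_tokens;
};

struct input_i {
    virtual ~input_i() = default;
    // conservative default: unknown input types prevent graph reuse
    virtual bool can_reuse(const params_t &) { return false; }
};

struct input_pos : input_i {
    int n_tokens_built; // size the tensor was created with
    explicit input_pos(int n) : n_tokens_built(n) {}
    bool can_reuse(const params_t & p) override {
        return n_tokens_built == p.n_tokens; // shape must match exactly
    }
};

struct input_unknown : input_i {}; // no override -> never reusable

static bool can_reuse_all(const std::vector<std::unique_ptr<input_i>> & inputs, const params_t & p) {
    bool res = true;
    for (const auto & inp : inputs) {
        res = res && inp->can_reuse(p); // a single failing input is enough to rebuild
    }
    return res;
}

int main() {
    std::vector<std::unique_ptr<input_i>> inputs;
    inputs.push_back(std::make_unique<input_pos>(32));

    printf("reuse: %d\n", can_reuse_all(inputs, {32})); // 1
    inputs.push_back(std::make_unique<input_unknown>());
    printf("reuse: %d\n", can_reuse_all(inputs, {32})); // 0
    return 0;
}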


@@ -1,6 +1,7 @@
#pragma once
#include "llama-arch.h"
+ #include "llama-batch.h"
#include "llama-hparams.h"
#include "llama-adapter.h"
@@ -14,7 +15,6 @@ struct ggml_cgraph;
struct ggml_context;
struct ggml_tensor;
- struct llama_ubatch;
struct llama_cparams;
struct llama_memory_context_i;
@@ -69,6 +69,8 @@ struct llama_cross {
std::vector<std::set<llama_seq_id>> seq_ids_enc;
};
+ struct llm_graph_params;
//
// llm_graph_input
//
@@ -78,11 +80,19 @@ public:
virtual ~llm_graph_input_i() = default;
virtual void set_input(const llama_ubatch * ubatch) = 0;
// return true if the resulting input tensors using the provided graph parameters would be
// the same as the previous input tensors that we have currently stored in the object
virtual bool can_reuse(const llm_graph_params & params) {
// returning false here by default will prevent from reusing the graph if the check
// for the input type has not been implemented yet
GGML_UNUSED(params);
return false;
}
}; };
using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>; using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
class llm_graph_input_embd : public llm_graph_input_i {
public:
llm_graph_input_embd() = default;
@@ -90,6 +100,8 @@ public:
void set_input(const llama_ubatch * ubatch) override;
+ bool can_reuse(const llm_graph_params & params) override;
ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
};
@@ -101,6 +113,8 @@ public:
void set_input(const llama_ubatch * ubatch) override;
+ bool can_reuse(const llm_graph_params & params) override;
ggml_tensor * pos = nullptr; // I32 [n_batch]
const uint32_t n_pos_per_embd = 1;
@@ -154,17 +168,19 @@ public:
llm_graph_input_out_ids(
const llama_hparams & hparams,
const llama_cparams & cparams,
- int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
+ uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
virtual ~llm_graph_input_out_ids() = default;
void set_input(const llama_ubatch * ubatch) override;
+ bool can_reuse(const llm_graph_params & params) override;
ggml_tensor * out_ids; // I32 [n_outputs]
const llama_hparams & hparams;
const llama_cparams & cparams;
- const int32_t n_outputs;
+ const uint32_t n_outputs;
};
class llm_graph_input_mean : public llm_graph_input_i {
@@ -249,6 +265,8 @@ public:
void set_input(const llama_ubatch * ubatch) override;
+ bool can_reuse(const llm_graph_params & params) override;
ggml_tensor * get_k_idxs() const { return self_k_idxs; }
ggml_tensor * get_v_idxs() const { return self_v_idxs; }
@@ -280,6 +298,8 @@ public:
void set_input(const llama_ubatch * ubatch) override;
+ bool can_reuse(const llm_graph_params & params) override;
ggml_tensor * get_k_idxs() const { return self_k_idxs; }
ggml_tensor * get_v_idxs() const { return self_v_idxs; }
ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
@@ -351,65 +371,40 @@ public:
// along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc.
// these are used by the llama_context to extract the relevant data, based on the compute parameters
+ // TODO: this interface seems redundant - remove it
class llm_graph_result_i {
public:
virtual ~llm_graph_result_i() = default;
- virtual ggml_tensor * get_tokens() = 0;
- virtual ggml_tensor * get_logits() = 0;
- virtual ggml_tensor * get_embd() = 0;
- virtual ggml_tensor * get_embd_pooled() = 0;
+ virtual ggml_tensor * get_tokens() const = 0;
+ virtual ggml_tensor * get_logits() const = 0;
+ virtual ggml_tensor * get_embd() const = 0;
+ virtual ggml_tensor * get_embd_pooled() const = 0;
+ virtual ggml_cgraph * get_gf() = 0;
+ virtual ggml_context * get_ctx() = 0;
+ virtual void reset() = 0;
virtual void set_inputs(const llama_ubatch * ubatch) = 0;
+ virtual bool can_reuse(const llm_graph_params & params) = 0;
};
using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>;
class llm_graph_result : public llm_graph_result_i {
public:
virtual ~llm_graph_result() = default;
ggml_tensor * get_tokens() override { return t_tokens; }
ggml_tensor * get_logits() override { return t_logits; }
ggml_tensor * get_embd() override { return t_embd; }
ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
void set_inputs(const llama_ubatch * ubatch) override {
for (auto & input : inputs) {
input->set_input(ubatch);
}
}
llm_graph_input_i * add_input(llm_graph_input_ptr input) {
inputs.emplace_back(std::move(input));
return inputs.back().get();
}
// important graph nodes
ggml_tensor * t_tokens = nullptr;
ggml_tensor * t_logits = nullptr;
ggml_tensor * t_embd = nullptr;
ggml_tensor * t_embd_pooled = nullptr;
std::vector<llm_graph_input_ptr> inputs;
};
//
// llm_graph_context
//
// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
struct llm_graph_params {
- ggml_context * ctx;
- const llm_arch arch;
- const llama_hparams & hparams;
- const llama_cparams & cparams;
- const llama_ubatch & ubatch;
+ llm_arch arch = LLM_ARCH_UNKNOWN;
+ llama_hparams hparams;
+ llama_cparams cparams;
+ llama_ubatch ubatch; // note: intentionally make a copy
+ llm_graph_type gtype;
ggml_backend_sched_t sched;
ggml_backend_t backend_cpu;
@@ -421,9 +416,113 @@ struct llm_graph_params {
uint32_t n_outputs;
- const llm_graph_cb & cb;
+ llm_graph_cb cb;
// TODO: temporary
llm_graph_result_i * res;
// return true if the "other" params would result in a graph with the same topology as the current params
// having the same topology allows us to reuse the graph in some cases
bool allow_reuse(const llm_graph_params & other) const {
// first check the ubatch
bool can_reuse_ubatch =
ubatch.equal_seqs() == other.ubatch.equal_seqs() &&
ubatch.n_tokens == other.ubatch.n_tokens &&
ubatch.n_seq_tokens == other.ubatch.n_seq_tokens &&
ubatch.n_seqs == other.ubatch.n_seqs &&
ubatch.n_seqs_unq == other.ubatch.n_seqs_unq &&
(
(!ubatch.token && !other.ubatch.token) ||
(!ubatch.embd && !other.ubatch.embd)
);
if (can_reuse_ubatch && !ubatch.equal_seqs()) {
if (!ubatch.data) {
// if the old ubatch does not own its data, then we cannot guarantee that it is still alive, and
// therefore we cannot perform the sequence id check. normally this should never happen
can_reuse_ubatch = false;
} else {
for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
can_reuse_ubatch &= ubatch.seq_id_unq[s] == other.ubatch.seq_id_unq[s];
}
}
}
if (!can_reuse_ubatch) {
return false;
}
return
cparams.embeddings == other.cparams.embeddings &&
cparams.causal_attn == other.cparams.causal_attn &&
arch == other.arch &&
gtype == other.gtype &&
cvec == other.cvec &&
loras == other.loras &&
cross == other.cross &&
n_outputs == other.n_outputs;
}
}; };
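
The reuse check above only compares fields that affect the graph topology: the batch shape, the sequence layout, and a handful of context and model parameters. When sequences are ragged it additionally requires the recorded unique sequence ids to match, which is only safe if the old ubatch still owns its data. A minimal standalone sketch of that idea, using simplified stand-in types rather than the real llama.cpp structs:

// Toy illustration of the "same topology" test performed by allow_reuse() above.
// The types and field names are simplified stand-ins, not the real llama.cpp structs.
#include <cstddef>
#include <cstdint>
#include <vector>

struct toy_ubatch {
    uint32_t n_tokens   = 0;
    uint32_t n_seqs     = 0;
    bool     equal_seqs = false;
    bool     owns_data  = false;             // ~ ubatch.data in the real check
    std::vector<int32_t> seq_id_unq;         // unique sequence ids
};

// two batches are graph-compatible if they have the same shape and, for ragged
// (non-equal) sequences, the same unique sequence ids in the same order
static bool same_topology(const toy_ubatch & a, const toy_ubatch & b) {
    if (a.n_tokens != b.n_tokens || a.n_seqs != b.n_seqs || a.equal_seqs != b.equal_seqs) {
        return false;
    }
    if (!a.equal_seqs) {
        if (!a.owns_data || a.seq_id_unq.size() != b.seq_id_unq.size()) {
            return false;                    // cannot (or must not) compare the ids
        }
        for (std::size_t s = 0; s < a.seq_id_unq.size(); ++s) {
            if (a.seq_id_unq[s] != b.seq_id_unq[s]) {
                return false;
            }
        }
    }
    return true;                             // note: the token contents never matter here
}

int main() {
    toy_ubatch prev{8, 2, false, true, {0, 1}};
    toy_ubatch next{8, 2, false, true, {0, 1}};   // same shape, possibly different tokens
    return same_topology(prev, next) ? 0 : 1;     // same topology -> the graph is reusable
}
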
class llm_graph_result : public llm_graph_result_i {
public:
llm_graph_result(int64_t max_nodes);
virtual ~llm_graph_result() = default;
ggml_tensor * get_tokens() const override { return t_tokens; }
ggml_tensor * get_logits() const override { return t_logits; }
ggml_tensor * get_embd() const override { return t_embd; }
ggml_tensor * get_embd_pooled() const override { return t_embd_pooled; }
ggml_cgraph * get_gf() override { return gf; }
ggml_context * get_ctx() override { return ctx_compute.get(); }
int64_t get_max_nodes() const;
void reset() override;
void set_inputs(const llama_ubatch * ubatch) override;
// try to update the existing graph result using the new graph parameters in order to reuse it
// this can only be done if we determine that the resulting graph using the new graph parameters
// would be identical to the existing graph. in that case, we simply have to update the memory
// contexts of the input tensors of the graph and we can reuse it for another computation
// return true if the graph was updated and can be reused
bool can_reuse(const llm_graph_params & params) override;
llm_graph_input_i * add_input(llm_graph_input_ptr input);
// important graph nodes
ggml_tensor * t_tokens = nullptr;
ggml_tensor * t_logits = nullptr;
ggml_tensor * t_embd = nullptr;
ggml_tensor * t_embd_pooled = nullptr;
std::vector<llm_graph_input_ptr> inputs;
ggml_context_ptr ctx_compute;
// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;
ggml_cgraph * gf;
int64_t max_nodes;
// keep a copy of the previous graph parameters
// we will use this to determine whether the graph can be reused by comparing them with the new parameters
// note: these are updated after constructing the new graph
llm_graph_params params;
// env: LLAMA_GRAPH_RESULT_DEBUG
int debug = 0;
};
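
Because the previous parameters are stored in the result, a caller can ask an existing llm_graph_result whether it can serve a new ubatch before paying for a rebuild; on success only the inputs need to be refreshed, otherwise the result is reset and a fresh graph is constructed. A hedged sketch of that control flow with toy types (only the method names can_reuse, reset, and set_inputs mirror the declarations above):

// Schematic of the reuse-or-rebuild flow enabled by can_reuse(), with toy types.
#include <cstdio>

struct toy_params { int n_tokens = 0; bool embeddings = false; };

struct toy_result {
    toy_params prev{};
    bool built = false;

    bool can_reuse(const toy_params & p) const {
        // would the new params produce the same topology as the stored ones?
        return built && p.n_tokens == prev.n_tokens && p.embeddings == prev.embeddings;
    }
    void reset()                        { built = false; }
    void build(const toy_params & p)    { prev = p; built = true; std::puts("built a new graph"); }
    void set_inputs(const toy_params &) { std::puts("refreshed the inputs only"); }
};

static void process(toy_result & res, const toy_params & p) {
    if (res.can_reuse(p)) {
        res.set_inputs(p);      // cheap path: same topology, only the input data changes
    } else {
        res.reset();            // expensive path: rebuild the graph from scratch
        res.build(p);
        res.set_inputs(p);
    }
}

int main() {
    toy_result res;
    process(res, {8, false});   // builds
    process(res, {8, false});   // reuses
    process(res, {16, false});  // rebuilds (different token count)
}
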
//
// llm_graph_context
//
// used in build_rs to properly order writes and avoid unnecessary copies // used in build_rs to properly order writes and avoid unnecessary copies
using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>; using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
@ -463,8 +562,6 @@ struct llm_graph_context {
const enum llama_pooling_type pooling_type; const enum llama_pooling_type pooling_type;
const enum llama_rope_type rope_type; const enum llama_rope_type rope_type;
ggml_context * ctx0 = nullptr;
ggml_backend_sched_t sched; ggml_backend_sched_t sched;
ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove? ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
@ -476,7 +573,9 @@ struct llm_graph_context {
const llm_graph_cb & cb_func; const llm_graph_cb & cb_func;
std::unique_ptr<llm_graph_result> res; llm_graph_result * res;
ggml_context * ctx0 = nullptr;
llm_graph_context(const llm_graph_params & params); llm_graph_context(const llm_graph_params & params);
virtual ~llm_graph_context() = default; virtual ~llm_graph_context() = default;

View file

@ -193,7 +193,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0; debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS"); const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) : 0; supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : 0;
if (!supports_set_rows) { if (!supports_set_rows) {
// ref: https://github.com/ggml-org/llama.cpp/pull/14363 // ref: https://github.com/ggml-org/llama.cpp/pull/14363
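
The LLAMA_SET_ROWS toggle above is the usual getenv/atoi pattern for a boolean environment flag. A self-contained sketch of the same pattern; env_flag() is an illustrative helper name, not part of llama.cpp:

// Generic boolean environment flag, same getenv/atoi pattern as LLAMA_SET_ROWS above.
#include <cstdio>
#include <cstdlib>

static bool env_flag(const char * name, bool def = false) {
    const char * val = std::getenv(name);
    return val ? std::atoi(val) != 0 : def;   // unset -> default, "0" or junk -> false
}

int main() {
    const bool supports_set_rows = env_flag("LLAMA_SET_ROWS");
    std::printf("LLAMA_SET_ROWS enabled: %d\n", supports_set_rows);
}
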
@ -656,14 +656,11 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
ggml_backend_sched_reset(sched); ggml_backend_sched_reset(sched);
auto * gf = lctx->graph_init(); auto * res = lctx->get_gf_res_reserve();
auto res = build_graph_shift(lctx->get_cparams(), lctx->get_ctx_compute(), gf); res->reset();
if (!res) {
LLAMA_LOG_ERROR("%s: failed to build graph for K-shift\n", __func__);
return updated;
}
auto * gf = build_graph_shift(res, lctx);
if (!ggml_backend_sched_alloc_graph(sched, gf)) { if (!ggml_backend_sched_alloc_graph(sched, gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute graph for K-shift\n", __func__); LLAMA_LOG_ERROR("%s: failed to allocate compute graph for K-shift\n", __func__);
return updated; return updated;
@ -713,14 +710,11 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
ggml_backend_sched_reset(sched); ggml_backend_sched_reset(sched);
auto * gf = lctx->graph_init(); auto * res = lctx->get_gf_res_reserve();
auto res = build_graph_defrag(lctx->get_cparams(), lctx->get_ctx_compute(), gf, dinfo); res->reset();
if (!res) {
LLAMA_LOG_ERROR("%s: failed to build graph for defrag\n", __func__);
return updated;
}
auto * gf = build_graph_defrag(res, lctx, dinfo);
if (!ggml_backend_sched_alloc_graph(sched, gf)) { if (!ggml_backend_sched_alloc_graph(sched, gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__); LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__);
return updated; return updated;
@ -1035,6 +1029,10 @@ uint32_t llama_kv_cache_unified::get_n_kv() const {
return result; return result;
} }
bool llama_kv_cache_unified::get_supports_set_rows() const {
return supports_set_rows;
}
ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const { ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
const int32_t ikv = map_layer_ids.at(il); const int32_t ikv = map_layer_ids.at(il);
@ -1263,7 +1261,7 @@ void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const {
const auto & cells = v_cells[s]; const auto & cells = v_cells[s];
for (uint32_t i = 0; i < cells.size(); ++i) { for (uint32_t i = 0; i < cells.size(); ++i) {
data[i] = cells.is_empty(i) ? 0 : cells.get_shift(i); data[s*cells.size() + i] = cells.is_empty(i) ? 0 : cells.get_shift(i);
} }
} }
} }
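
With multiple streams the K-shift input is now a single flat buffer holding n_stream blocks of cells.size() entries each, indexed as s*cells.size() + i. A small self-contained sketch of that row-major fill, using a toy cell type instead of the real kv-cell bookkeeping:

// Row-major fill of a flat per-stream shift buffer, mirroring set_input_k_shift above.
#include <cstddef>
#include <cstdint>
#include <vector>

struct toy_cell { bool empty; int32_t shift; };

static std::vector<int32_t> fill_k_shift(const std::vector<std::vector<toy_cell>> & streams) {
    const std::size_t n_stream = streams.size();
    const std::size_t n_cells  = streams.empty() ? 0 : streams[0].size();

    std::vector<int32_t> data(n_stream*n_cells, 0);
    for (std::size_t s = 0; s < n_stream; ++s) {
        for (std::size_t i = 0; i < n_cells; ++i) {
            // stream s occupies the contiguous block [s*n_cells, (s + 1)*n_cells)
            data[s*n_cells + i] = streams[s][i].empty ? 0 : streams[s][i].shift;
        }
    }
    return data;
}

int main() {
    const std::vector<std::vector<toy_cell>> streams = {
        {{false, 3}, {true, 0}},   // stream 0
        {{false, 1}, {false, 2}},  // stream 1
    };
    const auto data = fill_k_shift(streams);   // -> {3, 0, 1, 2}
    return data[2] == 1 ? 0 : 1;
}
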
@ -1297,6 +1295,7 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
// xxxxx----- // xxxxx-----
// xxxxx----- // xxxxx-----
// To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615 // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
// TODO: optimize this section
for (uint32_t h = 0; h < 1; ++h) { for (uint32_t h = 0; h < 1; ++h) {
for (uint32_t s = 0; s < n_stream; ++s) { for (uint32_t s = 0; s < n_stream; ++s) {
for (uint32_t ii = 0; ii < n_tps; ++ii) { for (uint32_t ii = 0; ii < n_tps; ++ii) {
@ -1346,7 +1345,7 @@ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama
const auto & cells = v_cells[0]; const auto & cells = v_cells[0];
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing
int32_t * data = (int32_t *) dst->data; int32_t * data = (int32_t *) dst->data;
@ -1464,11 +1463,9 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
} }
} }
llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
const llama_cparams & cparams, auto * ctx = res->get_ctx();
ggml_context * ctx, auto * gf = res->get_gf();
ggml_cgraph * gf) const {
auto res = std::make_unique<llm_graph_result>();
const auto & n_embd_head_k = hparams.n_embd_head_k; const auto & n_embd_head_k = hparams.n_embd_head_k;
//const auto & n_embd_head_v = hparams.n_embd_head_v; //const auto & n_embd_head_v = hparams.n_embd_head_v;
@ -1478,6 +1475,8 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream); inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
ggml_set_input(inp->k_shift); ggml_set_input(inp->k_shift);
const auto & cparams = lctx->get_cparams();
for (const auto & layer : layers) { for (const auto & layer : layers) {
const uint32_t il = layer.il; const uint32_t il = layer.il;
@ -1503,15 +1502,15 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
res->add_input(std::move(inp)); res->add_input(std::move(inp));
return res; return gf;
} }
llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( ggml_cgraph * llama_kv_cache_unified::build_graph_defrag(
const llama_cparams & cparams, llm_graph_result * res,
ggml_context * ctx, llama_context * lctx,
ggml_cgraph * gf, const defrag_info & dinfo) const {
const defrag_info & dinfo) const { auto * ctx = res->get_ctx();
auto res = std::make_unique<llm_graph_result>(); auto * gf = res->get_gf();
GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag"); GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag");
@ -1519,6 +1518,8 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
const auto & ids = dinfo.ids; const auto & ids = dinfo.ids;
const auto & cparams = lctx->get_cparams();
#if 0 #if 0
// CPU defrag // CPU defrag
// //
@ -1655,7 +1656,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
#endif #endif
return res; return gf;
} }
llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const { llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const {
@ -2331,6 +2332,10 @@ uint32_t llama_kv_cache_unified_context::get_n_kv() const {
return n_kv; return n_kv;
} }
bool llama_kv_cache_unified_context::get_supports_set_rows() const {
return kv->get_supports_set_rows();
}
ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const { ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const {
return kv->get_k(ctx, il, n_kv, sinfos[i_cur]); return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
} }

View file

@ -154,6 +154,9 @@ public:
uint32_t get_n_kv() const; uint32_t get_n_kv() const;
// TODO: temporary
bool get_supports_set_rows() const;
// get views of the current state of the cache // get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const; ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const; ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
@ -227,7 +230,7 @@ private:
// env: LLAMA_SET_ROWS (temporary) // env: LLAMA_SET_ROWS (temporary)
// ref: https://github.com/ggml-org/llama.cpp/pull/14285 // ref: https://github.com/ggml-org/llama.cpp/pull/14285
int supports_set_rows = false; bool supports_set_rows = false;
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
@ -270,15 +273,13 @@ private:
float freq_base, float freq_base,
float freq_scale) const; float freq_scale) const;
llm_graph_result_ptr build_graph_shift( ggml_cgraph * build_graph_shift(
const llama_cparams & cparams, llm_graph_result * res,
ggml_context * ctx, llama_context * lctx) const;
ggml_cgraph * gf) const;
llm_graph_result_ptr build_graph_defrag( ggml_cgraph * build_graph_defrag(
const llama_cparams & cparams, llm_graph_result * res,
ggml_context * ctx, llama_context * lctx,
ggml_cgraph * gf,
const defrag_info & dinfo) const; const defrag_info & dinfo) const;
struct cell_ranges_t { struct cell_ranges_t {
@ -340,6 +341,9 @@ public:
uint32_t get_n_kv() const; uint32_t get_n_kv() const;
// TODO: temporary
bool get_supports_set_rows() const;
// get views of the current state of the cache // get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const; ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const; ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;

View file

@ -446,7 +446,7 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
// A slot should always be contiguous. // A slot should always be contiguous.
// can only process batches with an equal number of new tokens in each sequence // can only process batches with an equal number of new tokens in each sequence
GGML_ASSERT(ubatch.equal_seqs); GGML_ASSERT(ubatch.equal_seqs());
int32_t min = size - 1; int32_t min = size - 1;
int32_t max = 0; int32_t max = 0;

View file

@ -112,8 +112,10 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)"; case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)"; case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
case LLM_TYPE_A13B: return "A13B"; case LLM_TYPE_A13B: return "A13B";
case LLM_TYPE_21B_A3B: return "21B.A3B";
case LLM_TYPE_30B_A3B: return "30B.A3B"; case LLM_TYPE_30B_A3B: return "30B.A3B";
case LLM_TYPE_235B_A22B: return "235B.A22B"; case LLM_TYPE_235B_A22B: return "235B.A22B";
case LLM_TYPE_300B_A47B: return "300B.A47B";
case LLM_TYPE_E2B: return "E2B"; case LLM_TYPE_E2B: return "E2B";
case LLM_TYPE_E4B: return "E4B"; case LLM_TYPE_E4B: return "E4B";
default: return "?B"; default: return "?B";
@ -1654,10 +1656,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} }
} break; } break;
case LLM_ARCH_ERNIE4_5: case LLM_ARCH_ERNIE4_5:
case LLM_ARCH_ERNIE4_5_MOE:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
if (arch == LLM_ARCH_ERNIE4_5_MOE) {
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
}
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 18: type = LLM_TYPE_0_3B; break; case 18: type = LLM_TYPE_0_3B; break;
case 28: type = LLM_TYPE_21B_A3B; break;
case 54: type = LLM_TYPE_300B_A47B; break;
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
@ -4954,6 +4966,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
} }
} break; } break;
case LLM_ARCH_ERNIE4_5: case LLM_ARCH_ERNIE4_5:
case LLM_ARCH_ERNIE4_5_MOE:
{ {
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -4982,9 +4995,27 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); int n_ff_exp = hparams.n_ff_exp;
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
// Shared expert (if present)
if (hparams.n_ff_shexp > 0) {
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd }, 0);
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
}
} else { // Dense layers
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
} }
} break; } break;
case LLM_ARCH_FALCON_H1: case LLM_ARCH_FALCON_H1:
@ -10563,7 +10594,7 @@ struct llm_graph_context_mamba : public llm_graph_context {
const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens;
GGML_ASSERT(n_seqs != 0); GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs); GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
@ -10698,7 +10729,7 @@ struct llm_graph_context_mamba : public llm_graph_context {
const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens;
GGML_ASSERT(n_seqs != 0); GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs); GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
@ -15669,6 +15700,176 @@ struct llm_build_ernie4_5 : public llm_graph_context {
} }
}; };
struct llm_build_ernie4_5_moe : public llm_graph_context {
llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
ggml_tensor * inp_out_ids = build_inp_out_ids();
GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Ernie 4.5 MoE requires n_moe_layer_step > 0");
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// norm
{
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
}
// self-attention
{
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "attn_out", il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
bool is_moe_layer = static_cast<uint32_t>(il) >= hparams.n_layer_dense_lead && (il + 1) % hparams.n_moe_layer_step == 0;
if (!is_moe_layer) {
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
// MoE branch
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
ggml_tensor * moe_out = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
// Shared expert (if present)
if (hparams.n_ff_shexp > 0) {
ggml_tensor * ffn_shexp = build_ffn(cur,
model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(ffn_shexp, "ffn_shexp", il);
cur = ggml_add(ctx0, moe_out, ffn_shexp);
} else {
cur = moe_out;
}
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
};
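
The builder above runs the MoE branch only for layers past the leading dense blocks that also fall on the configured interleave step, and it adds the always-on shared expert output to the routed expert output. A short standalone sketch of that rule and combination; the numbers in main() are made up for illustration, not real Ernie 4.5 hyperparameters:

// Layer selection and shared-expert combination as in llm_build_ernie4_5_moe above,
// reduced to plain scalars.
#include <cstdint>
#include <cstdio>

static bool is_moe_layer(uint32_t il, uint32_t n_layer_dense_lead, uint32_t n_moe_layer_step) {
    // past the leading dense blocks and on the configured interleave step
    return il >= n_layer_dense_lead && (il + 1) % n_moe_layer_step == 0;
}

// stand-ins for the dense FFN, the routed-expert FFN and the shared-expert FFN
static float dense_ffn(float x)  { return 0.3f*x; }
static float moe_ffn(float x)    { return 0.5f*x; }
static float shared_ffn(float x) { return 0.1f*x; }

int main() {
    const uint32_t n_layer            = 4;  // hypothetical
    const uint32_t n_layer_dense_lead = 1;  // hypothetical
    const uint32_t n_moe_layer_step   = 1;  // hypothetical

    const float x = 2.0f;
    for (uint32_t il = 0; il < n_layer; ++il) {
        const bool moe = is_moe_layer(il, n_layer_dense_lead, n_moe_layer_step);
        // MoE layers add the always-on shared expert to the routed expert output
        const float out = moe ? moe_ffn(x) + shared_ffn(x) : dense_ffn(x);
        std::printf("layer %u: %s, out = %.2f\n", il, moe ? "moe" : "dense", out);
    }
}
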
struct llm_build_falcon_h1 : public llm_graph_context_mamba { struct llm_build_falcon_h1 : public llm_graph_context_mamba {
llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) { llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
@ -15970,7 +16171,7 @@ private:
const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens;
GGML_ASSERT(n_seqs != 0); GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs); GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
@ -16659,7 +16860,7 @@ struct llm_build_lfm2 : public llm_graph_context {
const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens;
const int64_t n_seqs = ubatch.n_seqs; const int64_t n_seqs = ubatch.n_seqs;
GGML_ASSERT(n_seqs != 0); GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs); GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
GGML_ASSERT(hparams.n_shortconv_l_cache > 1); GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
@ -16828,10 +17029,10 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
return res; return res;
} }
llm_graph_result_ptr llama_model::build_graph( ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
const llm_graph_params & params, // TODO: temporary - will refactor this to keep the "gf" instance in the llm_graph_context and avoid passing it everywhere
ggml_cgraph * gf, auto * gf = params.res->get_gf();
llm_graph_type type) const {
std::unique_ptr<llm_graph_context> llm; std::unique_ptr<llm_graph_context> llm;
switch (arch) { switch (arch) {
@ -17051,7 +17252,7 @@ llm_graph_result_ptr llama_model::build_graph(
} break; } break;
case LLM_ARCH_T5: case LLM_ARCH_T5:
{ {
switch (type) { switch (params.gtype) {
case LLM_GRAPH_TYPE_ENCODER: case LLM_GRAPH_TYPE_ENCODER:
llm = std::make_unique<llm_build_t5_enc>(*this, params, gf); llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
break; break;
@ -17134,6 +17335,10 @@ llm_graph_result_ptr llama_model::build_graph(
{ {
llm = std::make_unique<llm_build_ernie4_5>(*this, params, gf); llm = std::make_unique<llm_build_ernie4_5>(*this, params, gf);
} break; } break;
case LLM_ARCH_ERNIE4_5_MOE:
{
llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params, gf);
} break;
case LLM_ARCH_HUNYUAN_MOE: case LLM_ARCH_HUNYUAN_MOE:
{ {
llm = std::make_unique<llm_build_hunyuan_moe>(*this, params, gf); llm = std::make_unique<llm_build_hunyuan_moe>(*this, params, gf);
@ -17157,7 +17362,7 @@ llm_graph_result_ptr llama_model::build_graph(
// add on pooling layer // add on pooling layer
llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b); llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b);
return std::move(llm->res); return llm->res->get_gf();
} }
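
build_graph now takes only the parameter struct: it pulls the target cgraph out of params.res, dispatches on the architecture (and on params.gtype for encoder/decoder models such as T5), and returns the finished graph. A toy sketch of that factory-style dispatch, with illustrative enum values and builder classes that are not the real ones:

// Toy version of the per-architecture dispatch performed by llama_model::build_graph above.
#include <cstdio>
#include <memory>

enum class toy_arch { dense, moe };

struct toy_builder {
    virtual ~toy_builder() = default;
    virtual const char * describe() const = 0;
};
struct toy_build_dense : toy_builder { const char * describe() const override { return "dense graph"; } };
struct toy_build_moe   : toy_builder { const char * describe() const override { return "moe graph"; } };

struct toy_params { toy_arch arch; };

// mirrors the switch (arch) { case ...: llm = std::make_unique<llm_build_...>(...); } pattern
static std::unique_ptr<toy_builder> build_graph(const toy_params & params) {
    std::unique_ptr<toy_builder> llm;
    switch (params.arch) {
        case toy_arch::dense: llm = std::make_unique<toy_build_dense>(); break;
        case toy_arch::moe:   llm = std::make_unique<toy_build_moe>();   break;
    }
    return llm;
}

int main() {
    const auto llm = build_graph({toy_arch::moe});
    std::printf("%s\n", llm->describe());
}
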
// //
@ -17306,6 +17511,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_SMOLLM3: case LLM_ARCH_SMOLLM3:
case LLM_ARCH_ARCEE: case LLM_ARCH_ARCEE:
case LLM_ARCH_ERNIE4_5: case LLM_ARCH_ERNIE4_5:
case LLM_ARCH_ERNIE4_5_MOE:
return LLAMA_ROPE_TYPE_NORM; return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2 // the pairs of head values are offset by n_rot/2

View file

@ -99,8 +99,10 @@ enum llm_type {
LLM_TYPE_17B_16E, // llama4 Scout LLM_TYPE_17B_16E, // llama4 Scout
LLM_TYPE_17B_128E, // llama4 Maverick LLM_TYPE_17B_128E, // llama4 Maverick
LLM_TYPE_A13B, LLM_TYPE_A13B,
LLM_TYPE_21B_A3B, // Ernie MoE small
LLM_TYPE_30B_A3B, LLM_TYPE_30B_A3B,
LLM_TYPE_235B_A22B, LLM_TYPE_235B_A22B,
LLM_TYPE_300B_A47B, // Ernie MoE big
LLM_TYPE_E2B, LLM_TYPE_E2B,
LLM_TYPE_E4B, LLM_TYPE_E4B,
}; };
@ -452,10 +454,7 @@ struct llama_model {
llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const; llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
// TODO: move this to new llm_arch_model_i interface // TODO: move this to new llm_arch_model_i interface
llm_graph_result_ptr build_graph( ggml_cgraph * build_graph(const llm_graph_params & params) const;
const llm_graph_params & params,
ggml_cgraph * gf,
llm_graph_type type) const;
private: private:
struct impl; struct impl;