Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 09:34:37 +00:00)
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	src/llama-model.cpp
Commit be3bba67ff

6 changed files with 79 additions and 78 deletions
src/llama-context.cpp

@@ -537,16 +537,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
         const int64_t n_head_kv    = hparams.n_head_kv(il);
         const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
 
-        float freq_base_l  = cparams.rope_freq_base;
-        float freq_scale_l = cparams.rope_freq_scale;
-
-        // TODO: improve
-        if (model.arch == LLM_ARCH_GEMMA3) {
-            const bool is_sliding = hparams.is_sliding(il);
-
-            freq_base_l  = is_sliding ? 10000.0f : cparams.rope_freq_base;
-            freq_scale_l = is_sliding ? 1.0f     : cparams.rope_freq_scale;
-        }
+        const bool is_swa = hparams.is_swa(il);
+
+        // note: the swa rope params could become part of the cparams in the future
+        //       if we decide to make them configurable, like the non-sliding ones
+        const float freq_base_l  = is_swa ? hparams.rope_freq_base_train_swa  : cparams.rope_freq_base;
+        const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
 
         ggml_tensor * rope_factors = kv_self->cbs.get_rope_factors(n_ctx_per_seq(), il);
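For readers following along, the selection logic introduced here can be exercised in isolation. The sketch below is illustrative only: the struct fields mirror the hparams/cparams members used in the hunk, but the surrounding scaffolding (the structs themselves, main, and the sample values) is not llama.cpp code.

// Standalone sketch of the per-layer RoPE parameter selection above.
#include <cstdio>

struct HParams {
    float rope_freq_base_train      = 1000000.0f; // illustrative: base for full-attention layers
    float rope_freq_base_train_swa  = 10000.0f;   // illustrative: base for sliding-window layers
    float rope_freq_scale_train     = 1.0f;
    float rope_freq_scale_train_swa = 1.0f;
};

struct CParams {
    float rope_freq_base  = 1000000.0f;
    float rope_freq_scale = 1.0f;
};

int main() {
    HParams hparams;
    CParams cparams;

    for (bool is_swa : {false, true}) {
        // same selection expression as the new code in the diff
        const float freq_base_l  = is_swa ? hparams.rope_freq_base_train_swa  : cparams.rope_freq_base;
        const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
        printf("is_swa=%d -> freq_base=%.0f freq_scale=%.1f\n", is_swa, freq_base_l, freq_scale_l);
    }
    return 0;
}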
src/llama-graph.cpp

@@ -1311,29 +1311,23 @@ ggml_tensor * llm_graph_context::build_attn(
     return cur;
 }
 
-llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified(
-        bool causal,
-        bool swa) const {
+llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const {
     const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
 
     auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);
 
     const auto n_kv = kv_self->n;
 
-    inp->self_kq_mask = causal
-        ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,     GGML_PAD(n_tokens, GGML_KQ_MASK_PAD))
-        : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+    inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
     //cb(inp->self_kq_mask, "KQ_mask", -1);
     ggml_set_input(inp->self_kq_mask);
 
     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
 
-    if (swa) {
+    if (hparams.n_swa_pattern > 1) {
         GGML_ASSERT(hparams.n_swa > 0);
 
-        inp->self_kq_mask_swa = causal
-            ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,     GGML_PAD(n_tokens, GGML_KQ_MASK_PAD))
-            : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
         //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
         ggml_set_input(inp->self_kq_mask_swa);
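A note on the mask shape: both masks are now unconditionally n_kv wide (the causal and non-causal cases share one allocation). The standalone sketch below shows how a 2D [n_kv x n_tokens] causal mask of this shape is typically populated with 0 / -INFINITY entries; it is an assumption-level illustration, not the actual fill code, which lives in the graph-input callbacks and also handles sequence ids and SWA windows.

// Minimal sketch of filling an [n_kv x n_tokens] causal attention mask.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int n_kv     = 8;  // cells currently used in the KV cache (illustrative)
    const int n_tokens = 4;  // tokens in the batch (illustrative)
    const int pos0     = 4;  // position of the first batch token (illustrative)

    std::vector<float> mask(n_kv * n_tokens, -INFINITY);
    for (int i = 0; i < n_tokens; ++i) {
        for (int j = 0; j < n_kv; ++j) {
            if (j <= pos0 + i) { // causal: token i sees cache cells up to its own position
                mask[i*n_kv + j] = 0.0f;
            }
        }
    }
    // print the mask: '.' = visible, 'x' = masked
    for (int i = 0; i < n_tokens; ++i) {
        for (int j = 0; j < n_kv; ++j) {
            printf("%c", mask[i*n_kv + j] == 0.0f ? '.' : 'x');
        }
        printf("\n");
    }
    return 0;
}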
@@ -1403,9 +1397,9 @@ ggml_tensor * llm_graph_context::build_attn(
         ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view));
     }
 
-    const bool is_sliding = hparams.is_sliding(il);
+    const bool is_swa = hparams.is_swa(il);
 
-    const auto & kq_mask = is_sliding ? inp->get_kq_mask_swa() : inp->get_kq_mask();
+    const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
 
     const auto n_kv = kv_self->n;
src/llama-graph.h

@@ -509,9 +509,7 @@ struct llm_graph_context {
             float kq_scale,
             int   il) const;
 
-    llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified(
-            bool causal,
-            bool swa) const;
+    llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified() const;
 
     ggml_tensor * build_attn(
             llm_graph_input_attn_kv_unified * inp,
src/llama-hparams.cpp

@@ -70,7 +70,7 @@ uint32_t llama_hparams::n_embd_v_s() const {
     return ssm_d_state * ssm_d_inner;
 }
 
-bool llama_hparams::is_sliding(uint32_t il) const {
+bool llama_hparams::is_swa(uint32_t il) const {
     if (il < n_layer) {
         return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
     }
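The renamed predicate encodes the layer layout: with a pattern of N, the first N-1 layers of every group of N use the sliding window and every N-th layer is a full-attention layer. A quick self-contained check (values are illustrative; the helper simply restates the expression above):

// Which layers are SWA for a given n_swa_pattern?
#include <cstdint>
#include <cstdio>

static bool is_swa(uint32_t il, uint32_t n_swa, uint32_t n_swa_pattern) {
    return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
}

int main() {
    const uint32_t n_swa = 1024, n_swa_pattern = 6; // pattern 6 matches the Gemma 3 case below
    for (uint32_t il = 0; il < 12; ++il) {
        printf("layer %2u: %s\n", il, is_swa(il, n_swa, n_swa_pattern) ? "swa" : "full");
    }
    // prints: layers 0-4 swa, 5 full, 6-10 swa, 11 full
    return 0;
}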
src/llama-hparams.h

@@ -79,7 +79,9 @@ struct llama_hparams {
 
     float    rope_attn_factor = 1.0f;
     float    rope_freq_base_train;
+    float    rope_freq_base_train_swa;
     float    rope_freq_scale_train;
+    float    rope_freq_scale_train_swa;
     uint32_t n_ctx_orig_yarn;
     float    rope_yarn_log_mul;
@@ -135,7 +137,7 @@ struct llama_hparams {
     // dimension of the recurrent state embeddings
     uint32_t n_embd_v_s() const;
 
-    bool is_sliding(uint32_t il) const;
+    bool is_swa(uint32_t il) const;
 };
 
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
src/llama-model.cpp

@@ -480,6 +480,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
+    // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
+    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
+    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+
     ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
 
     // non-transformer models do not have attention heads
@@ -785,9 +789,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.n_swa = 2047;
                 } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
                     // default value for Phi-3-mini-128k-instruct
+                    // note: this seems incorrect because the window is bigger than the train context?
                     hparams.n_swa = 262144;
                 } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
                     // default value for Phi-3-medium-128k-instruct
+                    // note: this seems incorrect because the window is equal to the train context?
                     hparams.n_swa = 131072;
                 }
                 bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -882,6 +888,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 hparams.n_swa_pattern = 6;
 
+                hparams.rope_freq_base_train_swa  = 10000.0f;
+                hparams.rope_freq_scale_train_swa = 1.0f;
+
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
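Taken together with the defaults added in the -480,6 hunk above, the override order is: read the generic rope values, mirror them into the *_swa fields, then let an architecture-specific branch such as this Gemma 3 case overwrite them. A compressed sketch of that flow (types, the arch flag, and sample values are illustrative stand-ins, not the real loader):

// Sketch of the default-then-override ordering for the SWA rope parameters.
#include <cstdio>

struct HParams {
    float rope_freq_base_train      = 0.0f;
    float rope_freq_base_train_swa  = 0.0f;
    float rope_freq_scale_train     = 0.0f;
    float rope_freq_scale_train_swa = 0.0f;
};

int main() {
    HParams hp;
    hp.rope_freq_base_train  = 1000000.0f; // stand-in for the value read from GGUF metadata
    hp.rope_freq_scale_train = 1.0f;

    // default: sliding-window layers scale like the full-attention layers
    hp.rope_freq_base_train_swa  = hp.rope_freq_base_train;
    hp.rope_freq_scale_train_swa = hp.rope_freq_scale_train;

    const bool is_gemma3 = true; // stand-in for the arch switch
    if (is_gemma3) {
        hp.rope_freq_base_train_swa  = 10000.0f;
        hp.rope_freq_scale_train_swa = 1.0f;
    }
    printf("swa base = %.0f\n", hp.rope_freq_base_train_swa); // 10000 for the Gemma 3 branch
    return 0;
}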
@@ -1357,13 +1366,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 #endif
 
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
+        const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
-            // LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(cpu_dev));
+            // LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
             return {cpu_dev, &pimpl->cpu_buft_list};
         }
         const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
         auto * dev = devices.at(layer_gpu);
-        // LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(dev));
+        // LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
         return {dev, &pimpl->gpu_buft_list.at(dev)};
     };
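The split lookup in this lambda is easy to misread, so here is a standalone sketch of the same std::upper_bound computation with concrete numbers (device count and split fractions are made up for the example): splits holds cumulative fractions per device, and upper_bound finds the first device whose fraction exceeds the layer's normalized index.

// Sketch of the split-based layer -> device assignment used in the lambda above.
#include <algorithm>
#include <cstdio>

int main() {
    const float splits[]     = {0.5f, 1.0f}; // two devices, 50/50 split (illustrative)
    const int   n_devices    = 2;
    const int   i_gpu_start  = 0;
    const int   act_gpu_layers = 8;

    for (int il = 0; il < act_gpu_layers; ++il) {
        const int layer_gpu = (int) (std::upper_bound(splits, splits + n_devices,
                float(il - i_gpu_start)/act_gpu_layers) - splits);
        printf("layer %d -> device %d\n", il, layer_gpu);
    }
    // layers 0-3 land on device 0, layers 4-7 on device 1
    return 0;
}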
@@ -3798,6 +3808,7 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_head_kv        = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
         LLAMA_LOG_INFO("%s: n_rot            = %u\n",     __func__, hparams.n_rot);
         LLAMA_LOG_INFO("%s: n_swa            = %u\n",     __func__, hparams.n_swa);
+        LLAMA_LOG_INFO("%s: n_swa_pattern    = %u\n",     __func__, hparams.n_swa_pattern);
         LLAMA_LOG_INFO("%s: n_embd_head_k    = %u\n",     __func__, hparams.n_embd_head_k);
         LLAMA_LOG_INFO("%s: n_embd_head_v    = %u\n",     __func__, hparams.n_embd_head_v);
         LLAMA_LOG_INFO("%s: n_gqa            = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
@@ -3962,7 +3973,7 @@ struct llm_build_llama : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
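From here on the call-site hunks are mechanical: every llm_build_* constructor drops the (causal, swa) arguments. The sketch below shows the shape of the change with stand-in types (nothing here is real llama.cpp code); the point is that the SWA decision now lives with the model's hyperparameters (hparams.n_swa_pattern > 1) rather than being repeated at each call site.

// Sketch of the API simplification repeated in all hunks below.
#include <cstdint>
#include <cstdio>

struct HParams { uint32_t n_swa_pattern = 1; };

struct GraphContext {
    HParams hparams;

    // old shape: build_attn_inp_kv_unified(bool causal, bool swa)
    // new shape: no arguments; the swa flag is derived from the hparams
    void build_attn_inp_kv_unified() const {
        const bool swa = hparams.n_swa_pattern > 1;
        printf("building kv input, swa mask: %s\n", swa ? "yes" : "no");
    }
};

int main() {
    GraphContext llama;                         // e.g. LLaMA: no sliding window
    GraphContext gemma3;
    gemma3.hparams.n_swa_pattern = 6;           // e.g. Gemma 3

    llama.build_attn_inp_kv_unified();          // swa mask: no
    gemma3.build_attn_inp_kv_unified();         // swa mask: yes
    return 0;
}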
@@ -4129,7 +4140,7 @@ struct llm_build_deci : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {

@@ -4287,7 +4298,7 @@ struct llm_build_baichuan : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -4405,7 +4416,7 @@ struct llm_build_xverse : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -4513,7 +4524,7 @@ struct llm_build_falcon : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * attn_norm;

@@ -4638,7 +4649,7 @@ struct llm_build_grok : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -4792,7 +4803,7 @@ struct llm_build_dbrx : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
@@ -4916,7 +4927,7 @@ struct llm_build_starcoder : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
         cb(pos, "pos_embd", -1);

@@ -5019,7 +5030,7 @@ struct llm_build_refact : public llm_graph_context {
 
         inpL = build_inp_embd(model.tok_embd);
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -5282,7 +5293,7 @@ struct llm_build_bloom : public llm_graph_context {
 
         inpL = build_inp_embd(model.tok_embd);
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         inpL = build_norm(inpL,
                 model.tok_norm,

@@ -5387,7 +5398,7 @@ struct llm_build_mpt : public llm_graph_context {
 
         inpL = build_inp_embd(model.tok_embd);
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         if (model.pos_embd) {
             // inp_pos - contains the positions

@@ -5531,7 +5542,7 @@ struct llm_build_stablelm : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             // norm

@@ -5682,7 +5693,7 @@ struct llm_build_qwen : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -5798,7 +5809,7 @@ struct llm_build_qwen2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
@@ -5913,7 +5924,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         int sections[4];
         std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);

@@ -6033,7 +6044,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -6182,7 +6193,7 @@ struct llm_build_phi2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             attn_norm_output = build_norm(inpL,

@@ -6306,7 +6317,7 @@ struct llm_build_phi3 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, true);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             auto * residual = inpL;

@@ -6452,7 +6463,7 @@ struct llm_build_plamo : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
 
@@ -6560,7 +6571,7 @@ struct llm_build_gpt2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
         cb(pos, "pos_embd", -1);

@@ -6668,7 +6679,7 @@ struct llm_build_codeshell : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             cur = build_norm(inpL,

@@ -6781,7 +6792,7 @@ struct llm_build_orion : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -6902,7 +6913,7 @@ struct llm_build_internlm2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -7032,7 +7043,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
@@ -7236,7 +7247,7 @@ struct llm_build_gemma : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             // norm

@@ -7346,7 +7357,7 @@ struct llm_build_gemma2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, true);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             // norm
@@ -7481,13 +7492,13 @@ struct llm_build_gemma3 : public llm_graph_context {
         ggml_tensor * inp_pos = build_inp_pos();
 
         // TODO: is causal == true correct? might need some changes
-        auto * inp_attn = build_attn_inp_kv_unified(true, true);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
-            const bool is_sliding = hparams.is_sliding(il);
+            const bool is_swa = hparams.is_swa(il);
 
-            const float freq_base_l  = is_sliding ? 10000.0f : freq_base;
-            const float freq_scale_l = is_sliding ? 1.0f     : freq_scale;
+            const float freq_base_l  = is_swa ? hparams.rope_freq_base_train_swa  : cparams.rope_freq_base;
+            const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
 
             // norm
             cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
@@ -7610,7 +7621,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -7923,7 +7934,7 @@ struct llm_build_command_r : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
 
@@ -8073,10 +8084,10 @@ struct llm_build_cohere2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, true);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
-            const bool is_sliding = hparams.is_sliding(il);
+            const bool is_swa = hparams.is_swa(il);
 
             // norm
             cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);

@@ -8110,7 +8121,7 @@ struct llm_build_cohere2 : public llm_graph_context {
                 cb(Vcur, "Vcur", il);
             }
 
-            if (is_sliding) {
+            if (is_swa) {
                 Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
                                      n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor,
                                      beta_fast, beta_slow);
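One behavioral detail preserved by this rename: in the Cohere2 builder, RoPE is applied only on the sliding-window layers, while full-attention layers carry no positional encoding. A condensed stand-in for that per-layer branch (the print calls replace the real ggml graph ops; the pattern value is illustrative):

// Condensed sketch of the Cohere2 per-layer branch above.
#include <cstdio>

static bool is_swa_layer(int il, int n_swa_pattern) {
    return il % n_swa_pattern < (n_swa_pattern - 1);
}

int main() {
    const int n_layer = 8, n_swa_pattern = 4;
    for (int il = 0; il < n_layer; ++il) {
        if (is_swa_layer(il, n_swa_pattern)) {
            printf("layer %d: apply rope to Q/K (swa)\n", il); // ggml_rope_ext(...) in the diff
        } else {
            printf("layer %d: no rope (full attention)\n", il);
        }
    }
    return 0;
}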
@@ -8205,7 +8216,7 @@ struct llm_build_olmo : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -8327,7 +8338,7 @@ struct llm_build_olmo2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -8453,7 +8464,7 @@ struct llm_build_olmoe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -8576,7 +8587,7 @@ struct llm_build_openelm : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             const int64_t n_head = hparams.n_head(il);

@@ -8706,7 +8717,7 @@ struct llm_build_gptneox : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             cur = build_norm(inpL,

@@ -8852,7 +8863,7 @@ struct llm_build_arctic : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -8984,7 +8995,7 @@ struct llm_build_deepseek : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -9149,7 +9160,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -9369,7 +9380,7 @@ struct llm_build_bitnet : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -9627,7 +9638,7 @@ struct llm_build_t5_dec : public llm_graph_context {
 
         const int64_t n_outputs_enc = embd_enc->ne[1];
 
-        auto * inp_attn_self = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn_self = build_attn_inp_kv_unified();
         auto * inp_attn_cross = build_attn_inp_cross();
 
         for (int il = 0; il < n_layer; ++il) {

@@ -9793,7 +9804,7 @@ struct llm_build_jais : public llm_graph_context {
 
         inpL = build_inp_embd(model.tok_embd);
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             cur = build_norm(inpL,

@@ -9889,7 +9900,7 @@ struct llm_build_chatglm : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -10021,7 +10032,7 @@ struct llm_build_nemotron : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -10144,7 +10155,7 @@ struct llm_build_exaone : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

@@ -10660,7 +10671,7 @@ struct llm_build_chameleon : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;