mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-14 10:59:41 +00:00
parent
97ae5961a4
commit
3f196be84b
3 changed files with 107 additions and 133 deletions
|
@ -1223,8 +1223,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||||
ggml_tensor * v,
|
ggml_tensor * v,
|
||||||
ggml_tensor * kq_b,
|
ggml_tensor * kq_b,
|
||||||
ggml_tensor * kq_mask,
|
ggml_tensor * kq_mask,
|
||||||
ggml_tensor * v_mla,
|
|
||||||
ggml_tensor * sinks,
|
ggml_tensor * sinks,
|
||||||
|
ggml_tensor * v_mla,
|
||||||
float kq_scale) const {
|
float kq_scale) const {
|
||||||
const bool v_trans = v->nb[1] > v->nb[2];
|
const bool v_trans = v->nb[1] > v->nb[2];
|
||||||
|
|
||||||
|
@ -1360,6 +1360,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
ggml_tensor * k_cur,
|
ggml_tensor * k_cur,
|
||||||
ggml_tensor * v_cur,
|
ggml_tensor * v_cur,
|
||||||
ggml_tensor * kq_b,
|
ggml_tensor * kq_b,
|
||||||
|
ggml_tensor * sinks,
|
||||||
ggml_tensor * v_mla,
|
ggml_tensor * v_mla,
|
||||||
float kq_scale,
|
float kq_scale,
|
||||||
int il) const {
|
int il) const {
|
||||||
|
@ -1381,7 +1382,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
ggml_tensor * k = k_cur;
|
ggml_tensor * k = k_cur;
|
||||||
ggml_tensor * v = v_cur;
|
ggml_tensor * v = v_cur;
|
||||||
|
|
||||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
|
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
|
|
||||||
if (wo) {
|
if (wo) {
|
||||||
|
@ -1443,6 +1444,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
ggml_tensor * k_cur,
|
ggml_tensor * k_cur,
|
||||||
ggml_tensor * v_cur,
|
ggml_tensor * v_cur,
|
||||||
ggml_tensor * kq_b,
|
ggml_tensor * kq_b,
|
||||||
|
ggml_tensor * sinks,
|
||||||
ggml_tensor * v_mla,
|
ggml_tensor * v_mla,
|
||||||
float kq_scale,
|
float kq_scale,
|
||||||
int il) const {
|
int il) const {
|
||||||
|
@ -1469,7 +1471,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
|
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
|
||||||
ggml_tensor * v = mctx_cur->get_v(ctx0, il);
|
ggml_tensor * v = mctx_cur->get_v(ctx0, il);
|
||||||
|
|
||||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
|
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
|
|
||||||
if (wo) {
|
if (wo) {
|
||||||
|
@ -1495,33 +1497,8 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
ggml_tensor * k_cur,
|
ggml_tensor * k_cur,
|
||||||
ggml_tensor * v_cur,
|
ggml_tensor * v_cur,
|
||||||
ggml_tensor * kq_b,
|
ggml_tensor * kq_b,
|
||||||
ggml_tensor * v_mla,
|
|
||||||
float kq_scale,
|
|
||||||
int il) const {
|
|
||||||
return build_attn_with_sinks(
|
|
||||||
inp,
|
|
||||||
wo,
|
|
||||||
wo_b,
|
|
||||||
q_cur,
|
|
||||||
k_cur,
|
|
||||||
v_cur,
|
|
||||||
kq_b,
|
|
||||||
v_mla,
|
|
||||||
nullptr,
|
|
||||||
kq_scale,
|
|
||||||
il);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor * llm_graph_context::build_attn_with_sinks(
|
|
||||||
llm_graph_input_attn_kv_iswa * inp,
|
|
||||||
ggml_tensor * wo,
|
|
||||||
ggml_tensor * wo_b,
|
|
||||||
ggml_tensor * q_cur,
|
|
||||||
ggml_tensor * k_cur,
|
|
||||||
ggml_tensor * v_cur,
|
|
||||||
ggml_tensor * kq_b,
|
|
||||||
ggml_tensor * v_mla,
|
|
||||||
ggml_tensor * sinks,
|
ggml_tensor * sinks,
|
||||||
|
ggml_tensor * v_mla,
|
||||||
float kq_scale,
|
float kq_scale,
|
||||||
int il) const {
|
int il) const {
|
||||||
// these nodes are added to the graph together so that they are not reordered
|
// these nodes are added to the graph together so that they are not reordered
|
||||||
|
@ -1561,7 +1538,7 @@ ggml_tensor * llm_graph_context::build_attn_with_sinks(
|
||||||
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
|
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
|
||||||
ggml_tensor * v = mctx_cur->get_v(ctx0, il);
|
ggml_tensor * v = mctx_cur->get_v(ctx0, il);
|
||||||
|
|
||||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, sinks, kq_scale);
|
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
|
|
||||||
if (wo) {
|
if (wo) {
|
||||||
|
@ -1600,6 +1577,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
ggml_tensor * k_cur,
|
ggml_tensor * k_cur,
|
||||||
ggml_tensor * v_cur,
|
ggml_tensor * v_cur,
|
||||||
ggml_tensor * kq_b,
|
ggml_tensor * kq_b,
|
||||||
|
ggml_tensor * sinks,
|
||||||
ggml_tensor * v_mla,
|
ggml_tensor * v_mla,
|
||||||
float kq_scale,
|
float kq_scale,
|
||||||
int il) const {
|
int il) const {
|
||||||
|
@ -1615,7 +1593,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
ggml_tensor * k = k_cur;
|
ggml_tensor * k = k_cur;
|
||||||
ggml_tensor * v = v_cur;
|
ggml_tensor * v = v_cur;
|
||||||
|
|
||||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
|
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
|
|
||||||
if (wo) {
|
if (wo) {
|
||||||
|
|
|
@ -685,7 +685,7 @@ struct llm_graph_context {
|
||||||
ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
|
ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
|
||||||
ggml_tensor * kq_b,
|
ggml_tensor * kq_b,
|
||||||
ggml_tensor * kq_mask,
|
ggml_tensor * kq_mask,
|
||||||
ggml_tensor * sinks,
|
ggml_tensor * sinks, // [n_head_q]
|
||||||
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
||||||
float kq_scale) const;
|
float kq_scale) const;
|
||||||
|
|
||||||
|
@ -699,6 +699,7 @@ struct llm_graph_context {
|
||||||
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
||||||
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
||||||
ggml_tensor * kq_b,
|
ggml_tensor * kq_b,
|
||||||
|
ggml_tensor * sinks, // [n_head_q]
|
||||||
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
||||||
float kq_scale,
|
float kq_scale,
|
||||||
int il) const;
|
int il) const;
|
||||||
|
@ -713,6 +714,7 @@ struct llm_graph_context {
|
||||||
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
||||||
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
||||||
ggml_tensor * kq_b,
|
ggml_tensor * kq_b,
|
||||||
|
ggml_tensor * sinks, // [n_head_q]
|
||||||
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
||||||
float kq_scale,
|
float kq_scale,
|
||||||
int il) const;
|
int il) const;
|
||||||
|
@ -728,21 +730,8 @@ struct llm_graph_context {
|
||||||
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
|
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
|
||||||
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
|
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
|
||||||
ggml_tensor * kq_b,
|
ggml_tensor * kq_b,
|
||||||
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
|
||||||
float kq_scale,
|
|
||||||
int il) const;
|
|
||||||
|
|
||||||
// TODO: temporary to keep the diff small. after the code is public will refactor to simplify this
|
|
||||||
ggml_tensor * build_attn_with_sinks(
|
|
||||||
llm_graph_input_attn_kv_iswa * inp,
|
|
||||||
ggml_tensor * wo,
|
|
||||||
ggml_tensor * wo_b,
|
|
||||||
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
|
||||||
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
|
|
||||||
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
|
|
||||||
ggml_tensor * kq_b,
|
|
||||||
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
|
||||||
ggml_tensor * sinks, // [n_head_q]
|
ggml_tensor * sinks, // [n_head_q]
|
||||||
|
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
||||||
float kq_scale,
|
float kq_scale,
|
||||||
int il) const;
|
int il) const;
|
||||||
|
|
||||||
|
@ -756,6 +745,7 @@ struct llm_graph_context {
|
||||||
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
||||||
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
||||||
ggml_tensor * kq_b,
|
ggml_tensor * kq_b,
|
||||||
|
ggml_tensor * sinks, // [n_head_q]
|
||||||
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
||||||
float kq_scale,
|
float kq_scale,
|
||||||
int il) const;
|
int il) const;
|
||||||
|
|
|
@ -6050,7 +6050,7 @@ struct llm_build_llama : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6224,7 +6224,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6401,7 +6401,7 @@ struct llm_build_deci : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -6533,7 +6533,7 @@ struct llm_build_baichuan : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -6648,7 +6648,7 @@ struct llm_build_xverse : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -6771,7 +6771,7 @@ struct llm_build_falcon : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -6901,7 +6901,7 @@ struct llm_build_grok : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -7050,7 +7050,7 @@ struct llm_build_dbrx : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -7164,7 +7164,7 @@ struct llm_build_starcoder : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -7263,7 +7263,7 @@ struct llm_build_refact : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -7426,7 +7426,7 @@ struct llm_build_bert : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7571,7 +7571,7 @@ struct llm_build_neo_bert : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, nullptr,
|
model.layers[il].wo, nullptr,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7671,7 +7671,7 @@ struct llm_build_bloom : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -7819,7 +7819,7 @@ struct llm_build_mpt : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -7965,7 +7965,7 @@ struct llm_build_stablelm : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -8086,7 +8086,7 @@ struct llm_build_qwen : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -8206,7 +8206,7 @@ struct llm_build_qwen2 : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -8320,8 +8320,9 @@ struct llm_build_dream : public llm_graph_context {
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr,
|
cur = build_attn(inp_attn,
|
||||||
nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -8420,8 +8421,9 @@ struct llm_build_llada : public llm_graph_context {
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr,
|
cur = build_attn(inp_attn,
|
||||||
1.0f / sqrtf(float(n_embd_head)), il);
|
model.layers[il].wo, NULL,
|
||||||
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -8534,7 +8536,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -8661,7 +8663,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -8814,7 +8816,7 @@ struct llm_build_qwen3 : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -8935,7 +8937,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -9075,7 +9077,7 @@ struct llm_build_phi2 : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -9212,7 +9214,7 @@ struct llm_build_phi3 : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -9346,7 +9348,7 @@ struct llm_build_plamo : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -9454,7 +9456,7 @@ struct llm_build_gpt2 : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -9568,7 +9570,7 @@ struct llm_build_codeshell : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -9697,7 +9699,7 @@ struct llm_build_orion : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -9824,7 +9826,7 @@ struct llm_build_internlm2 : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -10012,7 +10014,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
|
q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -10142,7 +10144,7 @@ struct llm_build_gemma : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -10257,7 +10259,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -10399,7 +10401,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -10580,7 +10582,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
|
||||||
} else {
|
} else {
|
||||||
// no KV layers
|
// no KV layers
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||||
|
@ -10598,7 +10600,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
|
Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
|
||||||
}
|
}
|
||||||
|
|
||||||
cur = build_norm(cur,
|
cur = build_norm(cur,
|
||||||
|
@ -10963,7 +10965,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -11390,7 +11392,9 @@ struct llm_build_jamba : public llm_graph_context_mamba {
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
// No RoPE :)
|
// No RoPE :)
|
||||||
cur = build_attn(inp_hybrid->get_attn(), model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
|
cur = build_attn(inp_hybrid->get_attn(),
|
||||||
|
model.layers[il].wo, NULL,
|
||||||
|
Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -11548,7 +11552,7 @@ struct llm_build_command_r : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -11683,7 +11687,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -11814,7 +11818,7 @@ struct llm_build_olmo : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, nullptr,
|
model.layers[il].wo, nullptr,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -11934,7 +11938,7 @@ struct llm_build_olmo2 : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -12067,7 +12071,7 @@ struct llm_build_olmoe : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -12200,7 +12204,7 @@ struct llm_build_openelm : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -12312,7 +12316,7 @@ struct llm_build_gptneox : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -12462,7 +12466,7 @@ struct llm_build_arctic : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -12617,7 +12621,7 @@ struct llm_build_deepseek : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -12845,7 +12849,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
||||||
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
|
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
|
||||||
} else {
|
} else {
|
||||||
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
|
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
|
||||||
cb(kv, "kv", il);
|
cb(kv, "kv", il);
|
||||||
|
@ -12879,7 +12883,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
||||||
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
|
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13046,7 +13050,7 @@ struct llm_build_bitnet : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
NULL, NULL,
|
NULL, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
|
|
||||||
cur = build_norm(cur,
|
cur = build_norm(cur,
|
||||||
model.layers[il].attn_sub_norm, NULL,
|
model.layers[il].attn_sub_norm, NULL,
|
||||||
|
@ -13169,7 +13173,7 @@ struct llm_build_t5_enc : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo_enc, nullptr,
|
model.layers[il].wo_enc, nullptr,
|
||||||
Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
|
Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13275,7 +13279,7 @@ struct llm_build_t5_dec : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn_self,
|
cur = build_attn(inp_attn_self,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
|
Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13307,7 +13311,7 @@ struct llm_build_t5_dec : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn_cross,
|
cur = build_attn(inp_attn_cross,
|
||||||
model.layers[il].wo_cross, nullptr,
|
model.layers[il].wo_cross, nullptr,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
|
|
||||||
//ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
//ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
||||||
|
@ -13439,7 +13443,7 @@ struct llm_build_jais : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -13571,7 +13575,7 @@ struct llm_build_chatglm : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -13704,7 +13708,7 @@ struct llm_build_glm4 : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -13853,7 +13857,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_transformer_layers - 1 && inp_out_ids) {
|
if (il == n_transformer_layers - 1 && inp_out_ids) {
|
||||||
|
@ -14007,7 +14011,7 @@ struct llm_build_nemotron : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -14138,7 +14142,7 @@ struct llm_build_exaone : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -14269,7 +14273,7 @@ struct llm_build_exaone4 : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -15204,7 +15208,7 @@ struct llm_build_granite : public llm_graph_context {
|
||||||
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
return cur;
|
return cur;
|
||||||
}
|
}
|
||||||
|
@ -15423,7 +15427,7 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
|
||||||
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
return cur;
|
return cur;
|
||||||
}
|
}
|
||||||
|
@ -15608,7 +15612,7 @@ struct llm_build_chameleon : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, nullptr,
|
model.layers[il].wo, nullptr,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -15964,7 +15968,7 @@ struct llm_build_plm : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
|
q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -16087,7 +16091,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -16227,7 +16231,7 @@ struct llm_build_dots1 : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
@ -16382,7 +16386,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1) {
|
if (il == n_layer - 1) {
|
||||||
|
@ -16515,7 +16519,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -16668,7 +16672,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba {
|
||||||
|
|
||||||
ggml_tensor * attn_out = build_attn(inp->get_attn(),
|
ggml_tensor * attn_out = build_attn(inp->get_attn(),
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(attn_out, "attn_out", il);
|
cb(attn_out, "attn_out", il);
|
||||||
|
|
||||||
cur = build_norm(inpL,
|
cur = build_norm(inpL,
|
||||||
|
@ -16878,7 +16882,9 @@ private:
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
|
|
||||||
cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
|
cur = build_attn(inp,
|
||||||
|
model.layers[il].wo, NULL,
|
||||||
|
Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
|
@ -17125,7 +17131,7 @@ struct llm_build_arcee : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17270,7 +17276,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17430,7 +17436,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17560,7 +17566,7 @@ struct llm_build_smollm3 : public llm_graph_context {
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17682,9 +17688,9 @@ struct llm_build_openai_moe_iswa : public llm_graph_context {
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
cur = build_attn_with_sinks(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].attn_sinks, 1.0f/sqrtf(float(n_rot)), il);
|
Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
|
||||||
|
|
||||||
cb(cur, "attn_out", il);
|
cb(cur, "attn_out", il);
|
||||||
}
|
}
|
||||||
|
@ -17847,7 +17853,7 @@ struct llm_build_lfm2 : public llm_graph_context {
|
||||||
);
|
);
|
||||||
|
|
||||||
cur = build_attn(inp_attn, model.layers[il].wo, NULL,
|
cur = build_attn(inp_attn, model.layers[il].wo, NULL,
|
||||||
q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
q, k, v, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||||
|
|
||||||
cb(cur, "model.layers.{}.self_attn.out_proj", il);
|
cb(cur, "model.layers.{}.self_attn.out_proj", il);
|
||||||
|
|
||||||
|
@ -17991,7 +17997,7 @@ struct llm_build_smallthinker : public llm_graph_context{
|
||||||
|
|
||||||
cur = build_attn(inp_attn,
|
cur = build_attn(inp_attn,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue