mirror of https://github.com/LostRuins/koboldcpp.git
kv-cache : avoid modifying recurrent cells when setting inputs (#13834)
* kv-cache : avoid modifying recurrent cells when setting inputs

* kv-cache : remove inp_s_mask

  It was replaced by equivalent and simpler functionality based on rs_z
  (the first zeroed state) and the already-existing inp_s_copy
  (see the first sketch below).

* kv-cache : fix non-consecutive token pos warning for recurrent models

  The problem was apparently caused by how the tail cells were swapped.

* graph : simplify logic for recurrent state copies

* kv-cache : use cell without src refs for rs_z in recurrent cache

* llama-graph : fix recurrent state copy

  The `state_copy` shuffle assumes everything is moved at once, which is not
  true when `states_extra` is copied back to the cache before copying the
  range of states between `head` and `head + n_seqs`.

  This is only a problem if any of the cells in [`head`, `head + n_seqs`)
  have an `src` in [`head + n_seqs`, `head + n_kv`), which does happen when
  `n_ubatch > 1` in the `llama-parallel` example.

  Changing the order of the operations avoids the potential overwrite before
  use, although when copies are avoided (like with Mamba2), this will require
  further changes (see the second sketch below).

* llama-graph : rename n_state to state_size in build_recurrent_state

  This naming should reduce confusion between the state size and the number
  of states.
parent 55f6b9fa65
commit dad5c44398

6 changed files with 117 additions and 180 deletions
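For illustration, a minimal standalone sketch of the idea behind dropping inp_s_mask: if sequences that start fresh have their copy index pointed at a cell that is known to be zeroed (the role rs_z plays), a single gather through the copy indices already yields cleared states, so no separate 0/1 mask input is needed. All names, sizes and values below are invented for this sketch and are not llama.cpp code.

    // Standalone sketch (not llama.cpp code): a gather through copy indices can
    // replace a separate 0/1 mask when cleared sequences point at a zeroed cell.
    #include <cstdio>
    #include <vector>

    int main() {
        const int state_size = 4;                 // elements per recurrent state (illustrative)
        // toy recurrent cache: 3 cells; cell 2 is known to be zeroed (the role of rs_z)
        std::vector<std::vector<float>> cells = {
            {1, 1, 1, 1},
            {2, 2, 2, 2},
            {0, 0, 0, 0},
        };
        const int rs_z = 2;

        // old scheme: copy indices plus a mask that zeroes states of fresh sequences
        // new scheme: a fresh sequence simply gets its copy index pointed at rs_z,
        // so the gather below already yields zeroed states
        std::vector<int> state_copy = {1, rs_z};  // seq 0 continues from cell 1, seq 1 starts fresh

        for (size_t s = 0; s < state_copy.size(); ++s) {
            const auto & src = cells[state_copy[s]];
            std::printf("seq %zu gathers:", s);
            for (int i = 0; i < state_size; ++i) {
                std::printf(" %g", src[i]);
            }
            std::printf("\n");
        }
        return 0;
    }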
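A second standalone sketch of the overwrite-before-use hazard described under "llama-graph : fix recurrent state copy": a shuffle expressed as per-cell source indices is only equivalent to moving everything at once if no source cell is overwritten before it is read. The cell values, indices and the variable names below are invented for illustration only.

    // Standalone sketch (not llama.cpp code) of overwrite-before-use in a
    // source-index shuffle, and why the order of the two copies matters.
    #include <cstdio>
    #include <vector>

    int main() {
        const int head = 0, n_seqs = 2;
        // toy cache of 4 single-element states; cell i initially holds 10*i
        std::vector<int> cache = {0, 10, 20, 30};
        // source index of each cell: cell 0 wants the state currently held by
        // cell 2, i.e. a cell from the "extra" range [head+n_seqs, head+n_kv)
        std::vector<int> src = {2, 1, 2, 3};
        // values to be written back into the extra range [head+n_seqs, head+n_kv)
        std::vector<int> states_extra = {777, 888};

        // hazardous order: write states_extra back first, then resolve the copies
        // for [head, head+n_seqs) -- cell 0 now reads the already-overwritten cell 2
        std::vector<int> buggy = cache;
        buggy[head + n_seqs + 0] = states_extra[0];
        buggy[head + n_seqs + 1] = states_extra[1];
        for (int i = head; i < head + n_seqs; ++i) buggy[i] = buggy[src[i]];

        // safe order: resolve the copies for [head, head+n_seqs) first, then write
        // the extra states back, so every source is read before it can be clobbered
        std::vector<int> fixed = cache;
        for (int i = head; i < head + n_seqs; ++i) fixed[i] = fixed[src[i]];
        fixed[head + n_seqs + 0] = states_extra[0];
        fixed[head + n_seqs + 1] = states_extra[1];

        std::printf("hazardous order: cell 0 = %d (should be 20)\n", buggy[0]);
        std::printf("safe order:      cell 0 = %d\n", fixed[0]);
        return 0;
    }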
@@ -8857,7 +8857,6 @@ struct llm_build_mamba : public llm_graph_context {
         inpL = build_inp_embd(model.tok_embd);
 
         ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
 
         for (int il = 0; il < n_layer; ++il) {
             // norm
@@ -8866,8 +8865,7 @@ struct llm_build_mamba : public llm_graph_context {
                     LLM_NORM_RMS, il);
             cb(cur, "attn_norm", il);
 
-            //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il);
-            cur = build_mamba_layer(gf, cur, state_copy, state_mask, ubatch, il);
+            cur = build_mamba_layer(gf, cur, state_copy, ubatch, il);
 
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
@@ -8908,7 +8906,6 @@ struct llm_build_mamba : public llm_graph_context {
             ggml_cgraph * gf,
             ggml_tensor * cur,
             ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
             int il) const {
         const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
@@ -8935,12 +8932,12 @@ struct llm_build_mamba : public llm_graph_context {
         ggml_tensor * ssm_states_all = kv_state->get_v_l(il);
 
         // (ab)using the KV cache to store the states
-        ggml_tensor * conv = build_copy_mask_state(
-                gf, conv_states_all, state_copy, state_mask,
+        ggml_tensor * conv = build_recurrent_state(
+                gf, conv_states_all, state_copy,
                 hparams.n_embd_k_s(), n_seqs);
         conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
-        ggml_tensor * ssm = build_copy_mask_state(
-                gf, ssm_states_all, state_copy, state_mask,
+        ggml_tensor * ssm = build_recurrent_state(
+                gf, ssm_states_all, state_copy,
                 hparams.n_embd_v_s(), n_seqs);
         ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
 
@@ -11656,7 +11653,6 @@ struct llm_build_rwkv6_base : public llm_graph_context {
             ggml_tensor * cur,
             ggml_tensor * x_prev,
             ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
             int il) const {
         const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
@@ -11780,8 +11776,8 @@ struct llm_build_rwkv6_base : public llm_graph_context {
             k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
         }
 
-        ggml_tensor * wkv_state = build_copy_mask_state(
-                gf, kv_state->get_v_l(il), state_copy, state_mask,
+        ggml_tensor * wkv_state = build_recurrent_state(
+                gf, kv_state->get_v_l(il), state_copy,
                 hparams.n_embd_v_s(), n_seqs);
 
         ggml_tensor * wkv_output;
@@ -11837,7 +11833,6 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
         inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
 
         ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -11848,7 +11843,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
         inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
 
         ggml_tensor * token_shift = build_rwkv_token_shift_load(
-                gf, state_copy, state_mask, ubatch, il
+                gf, state_copy, ubatch, il
         );
 
         ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
@@ -11864,7 +11859,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
             1
         );
 
-        cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
+        cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, ubatch, il);
 
         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
         cb(ffn_inp, "ffn_inp", il);
@@ -11935,7 +11930,6 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
         inpL = build_inp_embd(model.tok_embd);
 
         ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -11946,7 +11940,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
         inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
 
         ggml_tensor * token_shift = build_rwkv_token_shift_load(
-                gf, state_copy, state_mask, ubatch, il
+                gf, state_copy, ubatch, il
         );
 
         ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
@@ -11959,7 +11953,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
             1
         );
 
-        cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
+        cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, ubatch, il);
 
         token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
         ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
@@ -12051,7 +12045,6 @@ struct llm_build_rwkv7_base : public llm_graph_context {
             ggml_tensor * cur,
             ggml_tensor * x_prev,
             ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
             ggml_tensor *& first_layer_value,
             const llama_ubatch & ubatch,
             int il) const {
@@ -12134,8 +12127,8 @@ struct llm_build_rwkv7_base : public llm_graph_context {
         v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
         a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
 
-        ggml_tensor * wkv_state = build_copy_mask_state(
-                gf, kv_state->get_v_l(il), state_copy, state_mask,
+        ggml_tensor * wkv_state = build_recurrent_state(
+                gf, kv_state->get_v_l(il), state_copy,
                 hparams.n_embd_v_s(), n_seqs);
 
         ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
@@ -12193,7 +12186,6 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
         inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
 
         ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -12204,7 +12196,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
         inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
 
         ggml_tensor * token_shift = build_rwkv_token_shift_load(
-                gf, state_copy, state_mask, ubatch, il
+                gf, state_copy, ubatch, il
         );
 
         ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
@@ -12220,7 +12212,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
             1
         );
 
-        cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
+        cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, v_first, ubatch, il);
 
         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
         cb(ffn_inp, "ffn_inp", il);
@@ -12287,7 +12279,6 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
         inpL = build_inp_embd(model.tok_embd);
 
         ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -12298,7 +12289,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
         inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
 
         ggml_tensor * token_shift = build_rwkv_token_shift_load(
-                gf, state_copy, state_mask, ubatch, il
+                gf, state_copy, ubatch, il
         );
 
         ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
@@ -12311,7 +12302,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
             1
         );
 
-        cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
+        cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, v_first, ubatch, il);
 
         token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
         ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));