Merge commit 'd4e0d95cf5' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	common/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-opencl/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-rpc/ggml-rpc.cpp
#	scripts/sync-ggml.last
#	tests/CMakeLists.txt
Concedo · 2025-06-14 01:58:53 +08:00
commit 69e4a32ca2
18 changed files with 870 additions and 538 deletions


@@ -556,6 +556,9 @@ class TextModel(ModelBase):
             logger.info(f"gguf: experts used count = {n_experts_used}")
         if (head_dim := self.hparams.get("head_dim")) is not None:
+            # Workaround for incorrect AutoConfig value for DeepSeekV3 (is set correctly in DeepSeekV2Model class)
+            # https://github.com/huggingface/transformers/blob/19224c3642705c5b6988c9f5f4251f83323d05ae/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py#L210
+            if self.hparams.get("model_type") != "deepseek_v3":
                 self.gguf_writer.add_key_length(head_dim)
                 self.gguf_writer.add_value_length(head_dim)
@@ -4798,25 +4801,6 @@ class OlmoeModel(TextModel):
 class JinaBertV2Model(BertModel):
     model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.intermediate_size = self.hparams["intermediate_size"]
-    def get_tensors(self):
-        for name, data in super().get_tensors():
-            if 'gated_layer' in name:
-                d1 = data[:self.intermediate_size, :]
-                name1 = name.replace('gated_layers', 'gated_layers_w')
-                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
-                d2 = data[self.intermediate_size:, :]
-                name2 = name.replace('gated_layers', 'gated_layers_v')
-                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
-                yield name1, d1
-                yield name2, d2
-                continue
-            yield name, data
     def set_vocab(self):
         tokenizer_class = 'BertTokenizer'
         with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
@@ -4832,14 +4816,6 @@ class JinaBertV2Model(BertModel):
         self.gguf_writer.add_add_bos_token(True)
         self.gguf_writer.add_add_eos_token(True)
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # if name starts with "bert.", remove the prefix
-        # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-        if name.startswith("bert."):
-            name = name[5:]
-        return super().modify_tensors(data_torch, name, bid)
 @ModelBase.register("OpenELMForCausalLM")
 class OpenELMModel(TextModel):


@@ -518,11 +518,14 @@ void ggml_barrier(struct ggml_threadpool * tp);
 #elif defined(__GNUC__)
 // GCC/Clang on *nix
 # define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(weak name = alias) // NOLINT
-#elif defined(_MSC_VER) && defined (_WIN64)
+#elif defined(_MSC_VER) && defined(_WIN64)
 // MSVC
 // Note: C name mangling varies across different calling conventions
 // see https://learn.microsoft.com/en-us/cpp/build/reference/decorated-names?view=msvc-170
 # define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(comment(linker, "/alternatename:" #name "=" #alias))
+#elif defined(_MSC_VER) && defined(WIN32)
+// ref: https://github.com/ggml-org/whisper.cpp/pull/3239#issuecomment-2958224591
+# define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(comment(linker, "/alternatename:_" #name "=_" #alias))
 #else
 # error "Unsupported compiler for GGML_WEAK_ALIAS"
 #endif
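Note (not part of the diff): the macro above lets a generic implementation stand in for a symbol unless a stronger, arch-specific definition is linked in. A minimal, self-contained sketch of the GCC/Clang path, using made-up symbol names (my_kernel, my_kernel_generic) rather than real ggml symbols:

// Hypothetical illustration of the weak-alias pattern (GCC/Clang only).
#include <stdio.h>

#define DO_PRAGMA_(x) _Pragma(#x)
#define DO_PRAGMA(x)  DO_PRAGMA_(x)
// creates a weak definition of `name` that resolves to `alias` unless a
// strong definition of `name` is provided by another translation unit
#define WEAK_ALIAS(name, alias) DO_PRAGMA(weak name = alias)

void my_kernel_generic(void) {
    printf("generic fallback\n");
}

WEAK_ALIAS(my_kernel, my_kernel_generic)

void my_kernel(void); // the aliased symbol can be called like any function

int main(void) {
    my_kernel(); // prints "generic fallback" unless an optimized my_kernel is linked in
    return 0;
}

The MSVC branches emulate the same behavior with the linker's /alternatename option; the 32-bit branch added here only differs in the leading underscore that 32-bit Windows C name decoration requires.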


@@ -3333,8 +3333,6 @@ kernel void kernel_flash_attn_ext(
     threadgroup q_t  * sq  = (threadgroup q_t  *) (shmem_f16 + 0*DK); // holds the query data
     threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*DK); // same as above but in q4_t
-    threadgroup o_t  * so  = (threadgroup o_t  *) (shmem_f16 + 0*DK); // reuse query data for accumulation
-    threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 0*DK); // same as above but in o4_t
     threadgroup s_t  * ss  = (threadgroup s_t  *) (shmem_f16 + 2*sgitg*SH + 2*Q*DK); // scratch buffer for attention, mask and diagonal matrix
     threadgroup k_t  * sk  = (threadgroup k_t  *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K in shared memory
@@ -3548,20 +3546,20 @@ kernel void kernel_flash_attn_ext(
         // O = diag(ms)*O
         {
-            s8x8_t mm;
-            simdgroup_load(mm, ss + 2*C, TS, 0, false);
+            s8x8_t ms;
+            simdgroup_load(ms, ss + 2*C, TS, 0, false);
 #pragma unroll(DV8)
             for (short i = 0; i < DV8; ++i) {
-                simdgroup_multiply(lo[i], mm, lo[i]);
+                simdgroup_multiply(lo[i], ms, lo[i]);
             }
         }
         // O = O + (Q*K^T)*V
         {
             for (short cc = 0; cc < C/8; ++cc) {
-                s8x8_t ms;
-                simdgroup_load(ms, ss + 8*cc, TS, 0, false);
+                s8x8_t vs;
+                simdgroup_load(vs, ss + 8*cc, TS, 0, false);
                 if (is_same<vd4x4_t, v4x4_t>::value) {
                     // we can read directly from global memory
@@ -3572,7 +3570,7 @@ kernel void kernel_flash_attn_ext(
                         v8x8_t mv;
                         simdgroup_load(mv, pv + i*8, args.nb21/sizeof(v_t), 0, false); // TODO: use ne20
-                        simdgroup_multiply_accumulate(lo[i], ms, mv, lo[i]);
+                        simdgroup_multiply_accumulate(lo[i], vs, mv, lo[i]);
                     }
                 } else {
                     for (short ii = 0; ii < DV16; ii += 4) {
@@ -3593,10 +3591,10 @@ kernel void kernel_flash_attn_ext(
                             v8x8_t mv;
                             simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
-                            simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
+                            simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], vs, mv, lo[2*(ii + k) + 0]);
                             simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false);
-                            simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
+                            simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], vs, mv, lo[2*(ii + k) + 1]);
                         }
                     } else {
                         if (ii + tx < DV16) {
@@ -3611,10 +3609,10 @@ kernel void kernel_flash_attn_ext(
                             v8x8_t mv;
                             simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
-                            simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
+                            simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], vs, mv, lo[2*(ii + k) + 0]);
                             simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false);
-                            simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
+                            simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], vs, mv, lo[2*(ii + k) + 1]);
                         }
                     }
                 }
@@ -3624,83 +3622,80 @@ kernel void kernel_flash_attn_ext(
         }
         // these are needed for reducing the results from the simdgroups (reuse the ss buffer)
-        for (short j = 0; j < Q; ++j) {
-            if (tiisg == 0) {
-                ss[j*TS + 0] = S[j];
-                ss[j*TS + 1] = M[j];
-            }
-        }
+        for (short j = tiisg; j < Q; j += NW) {
+            ss[j*TS + 0] = S[j];
+            ss[j*TS + 1] = M[j];
+        }
     }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    threadgroup float  * so  = (threadgroup float  *) (shmem_f16 + 0*DK); // reuse query data for accumulation
+    threadgroup float4 * so4 = (threadgroup float4 *) (shmem_f16 + 0*DK);
+    // store result to shared memory in F32
+    if (sgitg == 0) {
+        for (short i = 0; i < DV8; ++i) {
+            //simdgroup_store(lo[i], so + i*8, DV, 0, false);
+            simdgroup_float8x8 t(1.0f);
+            simdgroup_multiply(t, lo[i], t);
+            simdgroup_store(t, so + i*8, DV, 0, false);
+        }
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
     // reduce the warps sequentially
     for (ushort sg = 1; sg < nsg; ++sg) {
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-        // each simdgroup stores its output to shared memory, reusing sq
         if (sgitg == sg) {
-            for (short i = 0; i < DV8; ++i) {
-                simdgroup_store(lo[i], so + i*8, DV, 0, false);
-            }
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-        // the first simdgroup accumulates the results from the other simdgroups
-        if (sgitg == 0) {
-            for (short j = 0; j < Q; ++j) {
-                const float S0 = ss[j*TS +         0];
-                const float S1 = ss[j*TS + sg*SH + 0];
-                const float M0 = ss[j*TS +         1];
-                const float M1 = ss[j*TS + sg*SH + 1];
+            for (short j = tiisg; j < Q; j += NW) {
+                const float S0 = ss[j*TS - 1*SH + 0];
+                const float S1 = ss[j*TS        + 0];
+                const float M0 = ss[j*TS - 1*SH + 1];
+                const float M1 = ss[j*TS        + 1];
                 const float M = max(M0, M1);
-                const float ms0 = exp(M0 - M);
-                const float ms1 = exp(M1 - M);
+                float ms0 = exp(M0 - M);
+                float ms1 = exp(M1 - M);
                 const float S = S0*ms0 + S1*ms1;
-                if (tiisg == 0) {
-                    ss[j*TS + 0] = S;
-                    ss[j*TS + 1] = M;
-                    ss[j*TS + 2*C + j        ] = ms0;
-                    ss[j*TS + 2*C + j + sg*SH] = ms1;
-                }
+                ss[j*TS + 0] = S;
+                ss[j*TS + 1] = M;
+                ss[j*TS + 2*C + j - 1*SH] = ms0;
+                ss[j*TS + 2*C + j       ] = ms1;
             }
+            //simdgroup_barrier(mem_flags::mem_threadgroup);
             // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
             {
                 s8x8_t ms0;
                 s8x8_t ms1;
-                simdgroup_load(ms0, ss + 2*C,         TS, 0, false);
-                simdgroup_load(ms1, ss + 2*C + sg*SH, TS, 0, false);
+                simdgroup_load(ms0, ss + 2*C - 1*SH, TS, 0, false);
+                simdgroup_load(ms1, ss + 2*C,        TS, 0, false);
 #pragma unroll(DV8)
                 for (short i = 0; i < DV8; ++i) {
-                    o8x8_t t;
+                    simdgroup_float8x8 t;
                     simdgroup_load    (t, so + i*8, DV, 0, false);
-                    simdgroup_multiply(t, ms1, t);
-                    simdgroup_multiply_accumulate(lo[i], ms0, lo[i], t);
+                    simdgroup_multiply(t, ms0, t);
+                    simdgroup_multiply_accumulate(t, ms1, lo[i], t);
+                    simdgroup_store(t, so + i*8, DV, 0, false);
                 }
             }
         }
-    }
-    // store result to shared memory (reuse sq)
-    if (sgitg == 0) {
-        for (short i = 0; i < DV8; ++i) {
-            simdgroup_store(lo[i], so + i*8, DV, 0, false);
-        }
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
-    threadgroup s_t * sf = (threadgroup s_t *) (shmem_f16 + 2*Q*DK);
+    threadgroup s_t * sf = (threadgroup s_t *) (shmem_f16 + 2*(nsg-1)*SH + 2*Q*DK);
     // final rescale with 1/S and store to global memory
     for (short j = sgitg; j < Q && iq1 + j < args.ne01; j += nsg) {
@@ -3723,8 +3718,8 @@ kernel void kernel_flash_attn_ext(
     half,  half4x4,  simdgroup_half8x8,  \
     float,           simdgroup_float8x8, \
     float,           simdgroup_float8x8, \
-    float, float4,   simdgroup_float8x8
-    //half, half4,   simdgroup_half8x8
+    half,  half4,    simdgroup_half8x8
+    //float, float4, simdgroup_float8x8
 #define FA_TYPES_BF \
     bfloat, bfloat4,   simdgroup_bfloat8x8, \
@@ -3732,8 +3727,8 @@ kernel void kernel_flash_attn_ext(
     bfloat, bfloat4x4, simdgroup_bfloat8x8, \
     float,             simdgroup_float8x8, \
     float,             simdgroup_float8x8, \
-    float, float4,     simdgroup_float8x8
-    //half, half4,     simdgroup_half8x8
+    half,  half4,      simdgroup_half8x8
+    //float, float4,   simdgroup_float8x8
 typedef decltype(kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64, 64>) flash_attn_ext_t;
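Side note (not part of the diff): the per-simdgroup reduction above is the usual flash-attention merge of partial results, where each partial carries a running max M, a running sum S, and an unnormalized accumulator O. A small C sketch of that merge rule, with a made-up head dimension:

// Illustrative only: merging two flash-attention partials (O is unnormalized,
// i.e. it already holds sum(exp(score - M) * V) over the rows seen so far).
#include <math.h>
#include <stdio.h>

#define DV 4 // made-up head dimension

typedef struct {
    float M;     // running max of the attention scores
    float S;     // running sum of exp(score - M)
    float O[DV]; // unnormalized output accumulator
} fa_partial;

static void fa_merge(fa_partial *dst, const fa_partial *a, const fa_partial *b) {
    const float M   = fmaxf(a->M, b->M);
    const float ms0 = expf(a->M - M);
    const float ms1 = expf(b->M - M);
    dst->M = M;
    dst->S = a->S*ms0 + b->S*ms1;
    for (int i = 0; i < DV; ++i) {
        dst->O[i] = a->O[i]*ms0 + b->O[i]*ms1; // O = diag(ms0)*O_0 + diag(ms1)*O_1
    }
    // the true attention output is dst->O[i] / dst->S, applied in the final rescale step
}

int main(void) {
    fa_partial a = { 1.0f, 2.0f, {2.0f, 2.0f, 2.0f, 2.0f} };
    fa_partial b = { 3.0f, 1.0f, {1.5f, 1.5f, 1.5f, 1.5f} };
    fa_partial r;
    fa_merge(&r, &a, &b);
    printf("M = %g, S = %g, O[0]/S = %g\n", r.M, r.S, r.O[0]/r.S);
    return 0;
}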


@@ -0,0 +1,283 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK4_0 32
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
half d;
uint8_t qs[QK4_0 / 2];
};
// This function requires the original shuffled weights.
// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
// packed together in a byte, so are (q[1], q[17]) and so on.
inline float block_q_4_0_dot_y_flat(
global uchar * x,
global half * dh,
float sumy,
float16 yl,
int il
) {
float d = *dh;
global ushort * qs = ((global ushort *)x + il/2);
float acc = 0.f;
acc += yl.s0 * (qs[0] & 0x000F);
acc += yl.s1 * (qs[0] & 0x0F00);
acc += yl.s8 * (qs[0] & 0x00F0);
acc += yl.s9 * (qs[0] & 0xF000);
acc += yl.s2 * (qs[1] & 0x000F);
acc += yl.s3 * (qs[1] & 0x0F00);
acc += yl.sa * (qs[1] & 0x00F0);
acc += yl.sb * (qs[1] & 0xF000);
acc += yl.s4 * (qs[2] & 0x000F);
acc += yl.s5 * (qs[2] & 0x0F00);
acc += yl.sc * (qs[2] & 0x00F0);
acc += yl.sd * (qs[2] & 0xF000);
acc += yl.s6 * (qs[3] & 0x000F);
acc += yl.s7 * (qs[3] & 0x0F00);
acc += yl.se * (qs[3] & 0x00F0);
acc += yl.sf * (qs[3] & 0xF000);
return d * (sumy * -8.f + acc);
}
//
// This variant outputs 8 values.
//
#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH
#ifdef INTEL_GPU
#define N_DST 8 // each SIMD group works on 8 rows
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // subgroup size
#elif defined (ADRENO_GPU)
#define N_DST 8
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif
inline void mul_vec_q_n_f32_8x_flat(
global char * src0_q,
global half * src0_d,
global float * src1,
global float * dst,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
const ulong nb = ne00/QK4_0;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = 0;
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
int i12 = im%ne12;
int i13 = im/ne12;
// The number of scales is the same as the number of blocks.
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
global uchar * x = (global uchar *) src0_q + offset0_q;
global half * d = (global half *) src0_d + offset0_d;
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
float16 yl;
float8 sumf = 0.f;
int ix = get_sub_group_local_id()/2;
int il = 8*(get_sub_group_local_id()%2);
global float * yb = y + ix*QK4_0 + il;
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
float sumy = 0.f;
sumy += yb[0];
sumy += yb[1];
sumy += yb[2];
sumy += yb[3];
sumy += yb[4];
sumy += yb[5];
sumy += yb[6];
sumy += yb[7];
sumy += yb[16];
sumy += yb[17];
sumy += yb[18];
sumy += yb[19];
sumy += yb[20];
sumy += yb[21];
sumy += yb[22];
sumy += yb[23];
yl.s0 = yb[0];
yl.s1 = yb[1]/256.f;
yl.s2 = yb[2];
yl.s3 = yb[3]/256.f;
yl.s4 = yb[4];
yl.s5 = yb[5]/256.f;
yl.s6 = yb[6];
yl.s7 = yb[7]/256.f;
yl.s8 = yb[16]/16.f;
yl.s9 = yb[17]/4096.f;
yl.sa = yb[18]/16.f;
yl.sb = yb[19]/4096.f;
yl.sc = yb[20]/16.f;
yl.sd = yb[21]/4096.f;
yl.se = yb[22]/16.f;
yl.sf = yb[23]/4096.f;
sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
yb += QK4_0 * (N_SIMDWIDTH/2);
}
float8 tot = (float8)(
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
);
if (get_sub_group_local_id() == 0) {
if (first_row + 0 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
}
if (first_row + 1 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
}
if (first_row + 2 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
}
if (first_row + 3 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
}
if (first_row + 4 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
}
if (first_row + 5 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
}
if (first_row + 6 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
}
if (first_row + 7 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
}
}
}
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_id_q4_0_f32_8x_flat(
global char * src0_q,
global half * src0_d,
global float * src1,
ulong offset1,
global char * src2,
ulong offset2,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
ulong nb00,
ulong nb02,
int ne10,
int ne11,
int ne12,
ulong nb11,
ulong nb12,
int ne20,
int ne21,
ulong nb21,
int ne0,
int ne1,
int r2,
int r3
) {
src1 = (global float *)((global char *)src1 + offset1);
src2 = (global char *)((global char *)src2 + offset2);
dst = (global float *)((global char *)dst + offsetd);
const int iid1 = get_group_id(2)/ne20;
const int idx = get_group_id(2)%ne20;
const int i02 = ((global int *)(src2 + iid1*nb21))[idx];
const int i11 = idx%ne11;
const int i12 = iid1;
const int i1 = idx;
const int i2 = i12;
global char * src0_q_cur = src0_q + (i02*nb02/nb00)*(QK4_0/2);
global half * src0_d_cur = src0_d + (i02*nb02/nb00);
global float * src1_cur = (global float *)((global char *) src1 + i11*nb11 + i12*nb12);
global float * dst_cur = dst + i1*ne0 + i2*ne1*ne0;
mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}
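To make the shuffled q4_0 layout described in the kernel comments concrete, here is a small host-side C sketch (not part of the diff) under the stated packing: byte j of a block holds q[j] in its low nibble and q[j + 16] in its high nibble, and each value decodes as (nibble - 8) * d. The scale is stored as half on the device; the sketch uses float for simplicity.

// Host-side reference sketch: dequantize one q4_0 block with the shuffled packing.
#include <stdint.h>
#include <stdio.h>

#define QK4_0 32

typedef struct {
    float   d;             // scale (half on device, float here for simplicity)
    uint8_t qs[QK4_0 / 2]; // 16 bytes, two 4-bit values per byte
} block_q4_0_ref;

static void dequantize_q4_0_ref(const block_q4_0_ref *b, float out[QK4_0]) {
    for (int j = 0; j < QK4_0 / 2; ++j) {
        const int lo = b->qs[j] & 0x0F;        // q[j]
        const int hi = (b->qs[j] >> 4) & 0x0F; // q[j + 16]
        out[j]             = (lo - 8) * b->d;
        out[j + QK4_0 / 2] = (hi - 8) * b->d;
    }
}

int main(void) {
    block_q4_0_ref b = { .d = 0.5f };
    for (int j = 0; j < QK4_0 / 2; ++j) {
        b.qs[j] = (uint8_t) ((j & 0x0F) | ((15 - j) << 4)); // arbitrary test pattern
    }
    float x[QK4_0];
    dequantize_q4_0_ref(&b, x);
    printf("x[0] = %g, x[16] = %g\n", x[0], x[16]);
    return 0;
}

This is also why block_q_4_0_dot_y_flat above can fold the -8 offset into a single "sumy * -8.f" term instead of subtracting it per element.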

File diff suppressed because it is too large.


@@ -333,7 +333,9 @@ class TensorNameMap:
             "encoder.layers.{bid}.mlp.fc11",            # nomic-bert
             "encoder.layers.{bid}.mlp.fc1",             # nomic-bert-moe
             "model.layers.{bid}.mlp.c_fc",              # starcoder2
-            "encoder.layer.{bid}.mlp.gated_layers_v",   # jina-bert-v2
+            "encoder.layer.{bid}.mlp.gated_layers_v",   # jina-bert-v2 (split up/gate, no longer used)
+            "encoder.layer.{bid}.mlp.gated_layers",     # jina-bert-v2 (GEGLU)
+            "encoder.layer.{bid}.mlp.up_gated_layer",   # jina-v2-code (GEGLU)
             "model.layers.{bid}.residual_mlp.w3",       # arctic
             "encoder.layers.{bid}.mlp.dense_h_to_4h",   # chatglm
             "transformer.h.{bid}.mlp.c_fc_1",           # exaone
@@ -370,7 +372,7 @@ class TensorNameMap:
             "model.layers.layers.{bid}.mlp.gate_proj",  # plamo
             "model.layers.{bid}.feed_forward.w1",       # internlm2
             "encoder.layers.{bid}.mlp.fc12",            # nomic-bert
-            "encoder.layer.{bid}.mlp.gated_layers_w",   # jina-bert-v2
+            "encoder.layer.{bid}.mlp.gated_layers_w",   # jina-bert-v2 (split up/gate, no longer used)
             "transformer.h.{bid}.mlp.linear_1",         # refact
             "model.layers.{bid}.residual_mlp.w1",       # arctic
             "transformer.h.{bid}.mlp.c_fc_0",           # exaone


@@ -250,22 +250,6 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
         }
     }
-void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
-    GGML_UNUSED(ubatch);
-    const int64_t n_kv = kv_state->get_n_kv();
-    if (s_mask) {
-        GGML_ASSERT(ggml_backend_buffer_is_host(s_mask->buffer));
-        float * data = (float *) s_mask->data;
-        // clear unused states
-        for (int i = 0; i < n_kv; ++i) {
-            data[i] = kv_state->s_mask(i);
-        }
-    }
-}
 void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     GGML_UNUSED(ubatch);
@@ -650,6 +634,7 @@ ggml_tensor * llm_graph_context::build_ffn(
             {
                 // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
                 int64_t split_point = cur->ne[0] / 2;
+                // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
                 ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
                 ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
@@ -663,7 +648,7 @@ ggml_tensor * llm_graph_context::build_ffn(
             {
                 // Split into two equal parts
                 int64_t split_point = cur->ne[0] / 2;
-                // TODO: these conts should not be needed
+                // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
                 ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
                 ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
@@ -986,23 +971,6 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
     return cur;
 }
-ggml_tensor * llm_graph_context::build_inp_s_mask() const {
-    const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
-    auto inp = std::make_unique<llm_graph_input_s_mask>(kv_state);
-    const auto n_kv = kv_state->get_n_kv();
-    auto & cur = inp->s_mask;
-    cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
-    ggml_set_input(cur);
-    res->add_input(std::move(inp));
-    return cur;
-}
 ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
     auto inp = std::make_unique<llm_graph_input_cross_embd>(cross);
@@ -1455,43 +1423,53 @@ ggml_tensor * llm_graph_context::build_attn(
     return cur;
 }
-ggml_tensor * llm_graph_context::build_copy_mask_state(
+ggml_tensor * llm_graph_context::build_recurrent_state(
         ggml_cgraph * gf,
         ggml_tensor * s,
         ggml_tensor * state_copy,
-        ggml_tensor * state_mask,
-            int32_t   n_state,
-            int32_t   n_seqs) const {
+            int32_t   state_size,
+            int32_t   n_seqs,
+               bool   avoid_copies) const {
     const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
     const auto n_kv = kv_state->get_n_kv();
     const auto kv_head = kv_state->get_head();
+    const auto rs_zero = kv_state->get_rs_z();
-    ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_state->get_size());
-    // copy states
-    // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
-    // this shrinks the tensors's ne[1] to n_kv
-    states = ggml_get_rows(ctx0, states, state_copy);
-    // clear states of sequences which are starting at the beginning of this batch
-    // FIXME: zero-out NANs?
-    states = ggml_mul(ctx0, states, state_mask);
-    // copy states which won't be changed further (between n_seqs and n_kv)
-    ggml_build_forward_expand(gf,
-        ggml_cpy(ctx0,
-            ggml_view_1d(ctx0, states, n_state*(n_kv - n_seqs), (n_seqs          )*n_state*ggml_element_size(states)),
-            ggml_view_1d(ctx0, s,      n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s))));
-    // the part of the states that will be used and modified
-    return ggml_view_2d(ctx0, states, n_state, n_seqs, states->nb[1], 0);
+    ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, kv_state->get_size());
+    // Clear a single state which will then be copied to the other cleared states.
+    // Note that this is a no-op when the view is zero-sized.
+    ggml_tensor * state_zero = ggml_view_1d(ctx0, states, state_size*(rs_zero >= 0), rs_zero*states->nb[1]*(rs_zero >= 0));
+    ggml_build_forward_expand(gf, ggml_scale_inplace(ctx0, state_zero, 0));
+    ggml_tensor * output_states;
+    if (!avoid_copies) {
+        // copy states
+        // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
+        // {state_size, kv_size} -> {state_size, n_seqs}
+        output_states = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_seqs, 0));
+        ggml_build_forward_expand(gf, output_states);
+    } else {
+        // FIXME: make the gathering operation happen before the copy below
+        //        (maybe with an optional lambda function passed as a parameter instead of `avoid_copies`?)
+        output_states = states;
+    }
+    // copy extra states which won't be changed further (between n_seqs and n_kv)
+    ggml_tensor * states_extra = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_kv - n_seqs, n_seqs*state_copy->nb[0]));
+    ggml_build_forward_expand(gf,
+        ggml_cpy(ctx0,
+            states_extra,
+            ggml_view_1d(ctx0, s, state_size*(n_kv - n_seqs), (kv_head + n_seqs)*state_size*ggml_element_size(s))));
+    return output_states;
 }
 ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
          ggml_cgraph * gf,
          ggml_tensor * state_copy,
-         ggml_tensor * state_mask,
   const llama_ubatch & ubatch,
                  int   il) const {
     const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
@@ -1502,8 +1480,8 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
     ggml_tensor * token_shift_all = kv_state->get_k_l(il);
-    ggml_tensor * token_shift = build_copy_mask_state(
-            gf, token_shift_all, state_copy, state_mask,
+    ggml_tensor * token_shift = build_recurrent_state(
+            gf, token_shift_all, state_copy,
             hparams.n_embd_k_s(), n_seqs);
     token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs);
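As a rough mental model (not part of the diff): build_recurrent_state now gathers one state row per destination slot through the copy-index tensor, after zeroing the single slot selected as the blank state (rs_z), so freshly started sequences read zeros instead of being masked out. A plain C sketch of that semantics, with made-up slot and state sizes:

// Conceptual sketch only: what the copy-index gather amounts to on the CPU.
// copy[i] names the source slot whose state should end up in slot i;
// rs_z is the slot cleared so that new sequences start from zeros.
#include <stdio.h>
#include <string.h>

#define N_SLOTS    4 // made-up number of recurrent slots
#define STATE_SIZE 3 // made-up per-slot state size

static void gather_states(float states[N_SLOTS][STATE_SIZE],
                          const int copy[N_SLOTS], int rs_z,
                          float out[N_SLOTS][STATE_SIZE]) {
    if (rs_z >= 0) {
        memset(states[rs_z], 0, sizeof(states[rs_z])); // blank state for fresh sequences
    }
    for (int i = 0; i < N_SLOTS; ++i) {
        memcpy(out[i], states[copy[i]], sizeof(out[i])); // out[i] = states[copy[i]]
    }
}

int main(void) {
    float states[N_SLOTS][STATE_SIZE] = {{1,1,1},{2,2,2},{3,3,3},{4,4,4}};
    float out[N_SLOTS][STATE_SIZE];
    const int copy[N_SLOTS] = {0, 0, 2, 3}; // slot 1 continues from slot 0's state
    gather_states(states, copy, /*rs_z=*/3, out);
    printf("out[1][0] = %g, out[3][0] = %g\n", out[1][0], out[3][0]); // 1 and 0
    return 0;
}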


@@ -200,18 +200,6 @@ public:
     const llama_kv_cache_recurrent_state * kv_state;
 };
-class llm_graph_input_s_mask : public llm_graph_input_i {
-public:
-    llm_graph_input_s_mask(const llama_kv_cache_recurrent_state * kv_state) : kv_state(kv_state) {}
-    virtual ~llm_graph_input_s_mask() = default;
-    void set_input(const llama_ubatch * ubatch) override;
-    ggml_tensor * s_mask; // F32 [1, n_kv]
-    const llama_kv_cache_recurrent_state * kv_state;
-};
 class llm_graph_input_cross_embd : public llm_graph_input_i {
 public:
     llm_graph_input_cross_embd(
@@ -521,7 +509,6 @@ struct llm_graph_context {
     ggml_tensor * build_inp_mean() const;
     ggml_tensor * build_inp_cls() const;
     ggml_tensor * build_inp_s_copy() const;
-    ggml_tensor * build_inp_s_mask() const;
     ggml_tensor * build_inp_cross_embd() const;
     ggml_tensor * build_inp_pos_bucket_enc() const;
@@ -606,18 +593,17 @@ struct llm_graph_context {
     // recurrent
     //
-    ggml_tensor * build_copy_mask_state(
+    ggml_tensor * build_recurrent_state(
             ggml_cgraph * gf,
             ggml_tensor * s,
             ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
-                int32_t   n_state,
-                int32_t   n_seqs) const;
+                int32_t   state_size,
+                int32_t   n_seqs,
+                   bool   avoid_copies = false) const;
     ggml_tensor * build_rwkv_token_shift_load(
             ggml_cgraph * gf,
             ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
      const llama_ubatch & ubatch,
                     int   il) const;


@@ -406,21 +406,12 @@ bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatche
     bool success = true;
-    // TODO: here we have to verify that all ubatches can fit in the cells
-    //       however, the current implementation is broken because it relies on s_copy() and s_mask() to update the cells
-    //       during the compute of each ubatch. to reproduce, uncomment the following loop and run:
-    //
-    //         $ llama-parallel -m ./mamba-130m/ggml-model-f16.gguf -np 5 -ns 8
-    //
-    //       recovery from failures when the batch does not fit in the KV cache will not work correctly until this is fixed
-    //
-    GGML_UNUSED(ubatches);
-    //for (const auto & ubatch : ubatches) {
-    //    if (!find_slot(ubatch)) {
-    //        success = false;
-    //        break;
-    //    }
-    //}
+    for (const auto & ubatch : ubatches) {
+        if (!find_slot(ubatch)) {
+            success = false;
+            break;
+        }
+    }
     // restore the original state
     cells = std::move(org_cells);
@@ -431,14 +422,13 @@ bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatche
 }
 bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
-    const uint32_t n_tokens = ubatch.n_tokens;
     const uint32_t n_seqs = ubatch.n_seqs;
     const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
     // if we have enough unused cells before the current head ->
     //   better to start searching from the beginning of the cache, hoping to fill it
-    if (head > used + 2*n_tokens) {
+    if (head > used + 2*n_seqs) {
         head = 0;
     }
@@ -534,16 +524,16 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
                     empty_cell.src = orig_cell.src;
                     orig_cell.seq_id.erase(seq_id);
                     empty_cell.seq_id.insert(seq_id); // will be overwritten
+                    GGML_ASSERT(!orig_cell.is_empty()); // has at least one remaining seq_id
                 }
                 seq_meta.tail = next_empty_cell;
                 // find next empty cell
                 if (s + 1 < n_seqs) {
-                    next_empty_cell += 1;
                     for (uint32_t i = 0; i < size; ++i) {
+                        next_empty_cell += 1;
                         if (next_empty_cell >= size) { next_empty_cell -= size; }
                         kv_cell & cell = cells[next_empty_cell];
                         if (cell.is_empty()) { break; }
-                        next_empty_cell += 1;
                     }
                 }
             }
@@ -553,8 +543,8 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
     // gather and re-order
     for (uint32_t s = 0; s < n_seqs; ++s) {
-        int32_t dst_id = s + min;
-        int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
+        const int32_t dst_id = s + min;
+        const int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
         if (dst_id != src_id) {
             kv_cell & dst_cell = cells[dst_id];
             kv_cell & src_cell = cells[src_id];
@@ -563,12 +553,14 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
             std::swap(dst_cell.src, src_cell.src);
             std::swap(dst_cell.seq_id, src_cell.seq_id);
-            // swap tails (assuming they NEVER overlap)
-            for (const llama_seq_id seq_id : src_cell.seq_id) {
-                cells[seq_id].tail = src_id;
-            }
-            for (const llama_seq_id seq_id : dst_cell.seq_id) {
-                cells[seq_id].tail = dst_id;
+            // swap tails
+            for (uint32_t i = 0; i < size; ++i) {
+                int32_t & tail = cells[i].tail;
+                if (tail == src_id) {
+                    tail = dst_id;
+                } else if (tail == dst_id) {
+                    tail = src_id;
+                }
             }
         }
     }
@@ -576,7 +568,7 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
     // update the pos of the used seqs
     for (uint32_t s = 0; s < n_seqs; ++s) {
         const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
-        int32_t cell_id = s + min;
+        const int32_t cell_id = s + min;
         kv_cell & cell = cells[cell_id];
         if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
@@ -594,6 +586,38 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
         }
     }
+    // Find first cell without src refs, to use as the zero-ed state
+    {
+        // TODO: bake-in src refcounts in the cell metadata
+        std::vector<int32_t> refcounts(size, 0);
+        for (size_t i = 0; i < size; ++i) {
+            const int32_t src = cells[i].src;
+            if (src >= 0) {
+                refcounts[src] += 1;
+            }
+        }
+        rs_z = -1;
+        for (int i = min; i <= max; ++i) {
+            if (refcounts[i] == 0) {
+                rs_z = i;
+                break;
+            }
+        }
+        for (int i = min; i <= max; ++i) {
+            if (cells[i].src < 0) {
+                GGML_ASSERT(rs_z >= 0);
+                cells[i].src0 = rs_z;
+            } else {
+                // Stage the source ids for all used cells to allow correct seq_* behavior
+                // and still make these values available when setting the inputs
+                cells[i].src0 = cells[i].src;
+            }
+            cells[i].src = i; // avoid moving or clearing twice
+        }
+    }
     // allow getting the range of used cells, from head to head + n
     head = min;
     n    = max - min + 1;
@@ -605,47 +629,8 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
 }
 bool llama_kv_cache_recurrent::get_can_shift() const {
-    return false;
-}
-int32_t llama_kv_cache_recurrent::s_copy(int i) const {
-    const uint32_t cell_id = i + head;
-    //////////////////////////////////////////////
-    // TODO: this should not mutate the KV cache !
-    kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);
-    // prevent out-of-bound sources
-    if (cell.src < 0 || (uint32_t) cell.src >= size) {
-        cell.src = cell_id;
-    }
-    int32_t res = cell.src;
-    // TODO: do not mutate the KV cache
-    // ensure copy only happens once
-    if (cell.src != (int32_t) cell_id) {
-        cell.src = cell_id;
-    }
-    return res;
-}
-float llama_kv_cache_recurrent::s_mask(int i) const {
-    const uint32_t cell_id = i + head;
-    //////////////////////////////////////////////
-    // TODO: this should not mutate the KV cache !
-    kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);
-    float res = (float) (cell.src >= 0);
-    // only clear once
-    if (cell.src < 0) {
-        cell.src = cell_id;
-    }
-    return res;
+    // shifting the pos is trivial for recurrent models
+    return true;
 }
 size_t llama_kv_cache_recurrent::total_size() const {
@@ -1111,6 +1096,10 @@ uint32_t llama_kv_cache_recurrent_state::get_head() const {
     return is_full ? 0 : kv->head;
 }
+int32_t llama_kv_cache_recurrent_state::get_rs_z() const {
+    return is_full ? 0 : kv->rs_z;
+}
 uint32_t llama_kv_cache_recurrent_state::get_size() const {
     return kv->size;
 }
@@ -1124,9 +1113,5 @@ ggml_tensor * llama_kv_cache_recurrent_state::get_v_l(int32_t il) const {
 }
 int32_t llama_kv_cache_recurrent_state::s_copy(int i) const {
-    return kv->s_copy(i);
-}
-float llama_kv_cache_recurrent_state::s_mask(int i) const {
-    return kv->s_mask(i);
+    return kv->cells[i + kv->head].src0;
 }


@@ -57,10 +57,6 @@ public:
     bool get_can_shift() const override;
-    // TODO: temporary methods - they are not really const as they do const_cast<>, fix this
-    int32_t s_copy(int i) const;
-    float   s_mask(int i) const;
     // state write/load
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
@@ -73,10 +69,14 @@ public:
     // computed before each graph build
     uint32_t n = 0;
+    // first zero-ed state
+    int32_t rs_z = -1;
     // TODO: optimize for recurrent state needs
     struct kv_cell {
         llama_pos pos = -1;
-        int32_t   src = -1; // used to copy states
+        int32_t   src  = -1; // used to know where states should be copied from
+        int32_t   src0 = -1; // like src, but only used when setting the inputs (allowing to copy once)
         int32_t   tail = -1;
         std::set<llama_seq_id> seq_id;
@@ -157,13 +157,13 @@ public:
     uint32_t get_n_kv() const;
     uint32_t get_head() const;
+    int32_t  get_rs_z() const;
     uint32_t get_size() const;
     ggml_tensor * get_k_l(int32_t il) const;
     ggml_tensor * get_v_l(int32_t il) const;
     int32_t s_copy(int i) const;
-    float   s_mask(int i) const;
 private:
     const llama_memory_status status;


@@ -127,6 +127,9 @@ llama_kv_cache_unified::llama_kv_cache_unified(
                 ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                 ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
     }
+    const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
+    debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
 }
 void llama_kv_cache_unified::clear(bool data) {
@@ -462,7 +465,7 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
         for (uint32_t i = 0; i < n_kv; ++i) {
             assert(dinfo.ids[i] <= n_kv);
-            if (dinfo.ids[i] == n_kv) {
+            if (dinfo.ids[i] == n_kv || dinfo.ids[i] == i) {
                 continue;
             }
@@ -512,21 +515,17 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
         head_cur = 0;
     }
-    // otherwise, one cell per token.
     if (n_tokens > cells.size()) {
         LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size());
         return -1;
     }
-//#define FIND_SLOT_DEBUG 1
-#if FIND_SLOT_DEBUG
-    LLAMA_LOG_WARN("begin: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", cells.used_max_p1(), cells.get_used(), head, n_swa);
-    // for debugging
-    {
-        if (n_swa > 0) {
+    if (debug > 0) {
+        LLAMA_LOG_CONT("\n");
+        LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", __func__, cells.used_max_p1(), cells.get_used(), head, get_size(), n_swa);
+        if ((debug == 2 && n_swa > 0) || debug > 2) {
             std::string ss;
             for (uint32_t i = 0; i < cells.size(); ++i) {
                 if (cells.is_empty(i)) {
                     ss += '.';
@@ -534,11 +533,35 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
                     ss += std::to_string(cells.seq_get(i));
                 }
                 if (i%256 == 255) {
+                    ss += " *";
                     ss += '\n';
                 }
             }
+            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
        }
-        LLAMA_LOG_WARN("\n%s\n", ss.c_str());
+        if ((debug == 2 && n_swa > 0) || debug > 2) {
+            std::string ss;
+            for (uint32_t i = 0; i < cells.size(); ++i) {
+                std::string cur;
+                if (cells.is_empty(i)) {
+                    cur = '.';
+                } else {
+                    cur = std::to_string(cells.pos_get(i));
+                }
+                const int n = cur.size();
+                for (int j = 0; j < 5 - n; ++j) {
+                    cur += ' ';
+                }
+                ss += cur;
+                if (i%256 == 255) {
+                    ss += " *";
+                }
+                if (i%64 == 63) {
+                    ss += '\n';
+                }
+            }
+            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
        }
        for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
@@ -546,9 +569,9 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
                continue;
            }
-            LLAMA_LOG_WARN("kv_cells: n_swa = %4d, min[%d] = %5d, max[%d] = %5d\n", n_swa, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
+            LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
+        }
     }
-#endif
     uint32_t n_tested = 0;
@@ -559,21 +582,15 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
             continue;
         }
-        // keep track of what the minimum sequence positions would be if we accept the ubatch
-        llama_seq_id seq_pos_min[LLAMA_MAX_PARALLEL_SEQUENCES];
-        for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
-            seq_pos_min[s] = cells.seq_pos_min(s);
-        }
         bool found = true;
         for (uint32_t i = 0; i < n_tokens; i++) {
-            const llama_pos    pos    = ubatch.pos[i];
-            const llama_seq_id seq_id = ubatch.seq_id[i][0];
+            //const llama_pos    pos    = ubatch.pos[i];
+            //const llama_seq_id seq_id = ubatch.seq_id[i][0];
             // can we use this cell? either:
             // - the cell is empty
             // - the cell is occupied only by one sequence:
-            //   - mask causally, if the sequence is the same as the one we are inserting
+            //   - (disabled) mask causally, if the sequence is the same as the one we are inserting
             //   - mask SWA, using current max pos for that sequence in the cache
             //   always insert in the cell with minimum pos
             bool can_use = cells.is_empty(head_cur + i);
@@ -581,21 +598,17 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
             if (!can_use && cells.seq_count(head_cur + i) == 1) {
                 const llama_pos pos_cell = cells.pos_get(head_cur + i);
-                // causal mask
-                if (cells.seq_has(head_cur + i, seq_id)) {
-                    can_use = pos_cell >= pos;
-                }
+                // (disabled) causal mask
+                // note: it's better to purge any "future" tokens beforehand
+                //if (cells.seq_has(head_cur + i, seq_id)) {
+                //    can_use = pos_cell >= pos;
+                //}
                 if (!can_use) {
                     const llama_seq_id seq_id_cell = cells.seq_get(head_cur + i);
                     // SWA mask
-                    // note: we insert only in the cell with minimum pos in order to preserve the invariant that
-                    //       all positions between [pos_min, pos_max] for each sequence will be present in the cache
-                    //       ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
-                    if (pos_cell == seq_pos_min[seq_id_cell] &&
-                        is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
-                        seq_pos_min[seq_id_cell]++;
+                    if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
                         can_use = true;
                     }
                 }
@@ -623,8 +636,22 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
 }
 void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch) {
+    // keep track of the max sequence position that we would overwrite with this ubatch
+    // for non-SWA cache, this would be always empty
+    llama_seq_id seq_pos_max_rm[LLAMA_MAX_PARALLEL_SEQUENCES];
+    for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+        seq_pos_max_rm[s] = -1;
+    }
     for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
         if (!cells.is_empty(head_cur + i)) {
+            assert(cells.seq_count(head_cur + i) == 1);
+            const llama_seq_id seq_id = cells.seq_get(head_cur + i);
+            const llama_pos    pos    = cells.pos_get(head_cur + i);
+            seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos);
             cells.rm(head_cur + i);
         }
@@ -635,6 +662,22 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch
         }
     }
+    // note: we want to preserve the invariant that all positions between [pos_min, pos_max] for each sequence
+    //       will be present in the cache. so we have to purge any position which is less than those we would overwrite
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
+    for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+        if (seq_pos_max_rm[s] == -1) {
+            continue;
+        }
+        if (cells.seq_pos_min(s) <= seq_pos_max_rm[s]) {
+            LLAMA_LOG_DEBUG("%s: purging positions [%d, %d] of sequence %d from KV cache\n",
+                    __func__, cells.seq_pos_min(s), seq_pos_max_rm[s], s);
+            seq_rm(s, cells.seq_pos_min(s), seq_pos_max_rm[s] + 1);
+        }
+    }
     // move the head at the end of the slot
     head = head_cur + ubatch.n_tokens;
 }
@@ -944,11 +987,9 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
     const auto & n_embd_head_k = hparams.n_embd_head_k;
     //const auto & n_embd_head_v = hparams.n_embd_head_v;
-    //GGML_ASSERT(kv_self->size == n_ctx);
     auto inp = std::make_unique<llm_graph_input_k_shift>(this);
-    inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cparams.n_ctx);
+    inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cells.size());
     ggml_set_input(inp->k_shift);
     for (const auto & layer : layers) {


@@ -158,6 +158,8 @@ private:
     // SWA
     const uint32_t n_swa = 0;
+    int debug = 0;
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
     std::vector<ggml_context_ptr> ctxs;


@@ -80,6 +80,9 @@ public:
         assert(isrc < pos.size());
         assert(idst < pos.size());
+        assert(pos[idst] == -1);
+        assert(pos[isrc] != -1);
         pos  [idst] = pos  [isrc];
         shift[idst] = shift[isrc];
         seq  [idst] = seq  [isrc];
@@ -144,9 +147,10 @@ public:
         assert(pos[i] != -1);
         seq_pos_rm(i);
+        seq[i].reset();
         pos[i] = -1;
-        seq[i].reset();
+        shift[i] = 0;
         used.erase(i);
     }
@@ -164,6 +168,7 @@ public:
         if (seq[i].none()) {
             pos[i] = -1;
+            shift[i] = 0;
             used.erase(i);
@@ -192,6 +197,7 @@ public:
             seq[i].reset();
             pos[i] = -1;
+            shift[i] = 0;
             used.erase(i);
@@ -317,21 +323,20 @@ public:
         pos[i]   += d;
         shift[i] += d;
-        seq_pos_add(i);
         has_shift = true;
         if (pos[i] < 0) {
-            seq_pos_rm(i);
             seq[i].reset();
             pos[i] = -1;
+            shift[i] = 0;
             used.erase(i);
             return true;
         }
+        seq_pos_add(i);
         return false;
     }


@@ -2320,8 +2320,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
                     layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
-                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
                     layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                     layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
@@ -6143,7 +6143,7 @@ struct llm_build_bert : public llm_graph_context {
                         model.layers[il].ffn_gate, NULL,                        NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
-                        LLM_FFN_GELU, LLM_FFN_PAR, il);
+                        model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
                 cb(cur, "ffn_out", il);
             } else {
                 cur = build_ffn(cur,
@@ -8957,7 +8957,6 @@ struct llm_build_mamba : public llm_graph_context {
         inpL = build_inp_embd(model.tok_embd);
         ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
         for (int il = 0; il < n_layer; ++il) {
             // norm
@@ -8966,8 +8965,7 @@ struct llm_build_mamba : public llm_graph_context {
                     LLM_NORM_RMS, il);
             cb(cur, "attn_norm", il);
-            //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il);
-            cur = build_mamba_layer(gf, cur, state_copy, state_mask, ubatch, il);
+            cur = build_mamba_layer(gf, cur, state_copy, ubatch, il);
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
@@ -9008,7 +9006,6 @@ struct llm_build_mamba : public llm_graph_context {
             ggml_cgraph * gf,
             ggml_tensor * cur,
             ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
      const llama_ubatch & ubatch,
                     int   il) const {
         const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
@@ -9035,12 +9032,12 @@ struct llm_build_mamba : public llm_graph_context {
         ggml_tensor * ssm_states_all  = kv_state->get_v_l(il);
         // (ab)using the KV cache to store the states
-        ggml_tensor * conv = build_copy_mask_state(
-                gf, conv_states_all, state_copy, state_mask,
+        ggml_tensor * conv = build_recurrent_state(
+                gf, conv_states_all, state_copy,
                 hparams.n_embd_k_s(), n_seqs);
         conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
-        ggml_tensor * ssm = build_copy_mask_state(
-                gf, ssm_states_all, state_copy, state_mask,
+        ggml_tensor * ssm = build_recurrent_state(
+                gf, ssm_states_all, state_copy,
                 hparams.n_embd_v_s(), n_seqs);
         ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
@@ -11756,7 +11753,6 @@ struct llm_build_rwkv6_base : public llm_graph_context {
             ggml_tensor * cur,
             ggml_tensor * x_prev,
             ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
      const llama_ubatch & ubatch,
                     int   il) const {
         const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
@@ -11880,8 +11876,8 @@ struct llm_build_rwkv6_base : public llm_graph_context {
             k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
         }
-        ggml_tensor * wkv_state = build_copy_mask_state(
-                gf, kv_state->get_v_l(il), state_copy, state_mask,
+        ggml_tensor * wkv_state = build_recurrent_state(
+                gf, kv_state->get_v_l(il), state_copy,
                 hparams.n_embd_v_s(), n_seqs);
         ggml_tensor * wkv_output;
@@ -11937,7 +11933,6 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
         inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
         ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -11948,7 +11943,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
             inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
             ggml_tensor * token_shift = build_rwkv_token_shift_load(
-                    gf, state_copy, state_mask, ubatch, il
+                    gf, state_copy, ubatch, il
                     );
             ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
@@ -11964,7 +11959,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
                     1
                     );
-            cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
+            cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, ubatch, il);
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
@@ -12035,7 +12030,6 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
         inpL = build_inp_embd(model.tok_embd);
         ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -12046,7 +12040,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
            inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
            ggml_tensor * token_shift = build_rwkv_token_shift_load(
-                    gf, state_copy, state_mask, ubatch, il
+                    gf, state_copy, ubatch, il
                    );
            ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
@@ -12059,7 +12053,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
                    1
                    );
-            cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
+            cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, ubatch, il);
            token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
            ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
@@ -12151,7 +12145,6 @@ struct llm_build_rwkv7_base : public llm_graph_context {
            ggml_tensor * cur,
            ggml_tensor * x_prev,
            ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
            ggml_tensor *& first_layer_value,
      const llama_ubatch & ubatch,
                    int   il) const {
@@ -12234,8 +12227,8 @@ struct llm_build_rwkv7_base : public llm_graph_context {
        v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
        a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
-        ggml_tensor * wkv_state = build_copy_mask_state(
-                gf, kv_state->get_v_l(il), state_copy, state_mask,
+        ggml_tensor * wkv_state = build_recurrent_state(
+                gf, kv_state->get_v_l(il), state_copy,
                hparams.n_embd_v_s(), n_seqs);
        ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
@@ -12293,7 +12286,6 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
        inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
        ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
        const auto n_embd = hparams.n_embd;
        const auto n_seq_tokens = ubatch.n_seq_tokens;
@ -12304,7 +12296,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
ggml_tensor * token_shift = build_rwkv_token_shift_load( ggml_tensor * token_shift = build_rwkv_token_shift_load(
gf, state_copy, state_mask, ubatch, il gf, state_copy, ubatch, il
); );
ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
@ -12320,7 +12312,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
1 1
); );
cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il); cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, v_first, ubatch, il);
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
cb(ffn_inp, "ffn_inp", il); cb(ffn_inp, "ffn_inp", il);
@ -12387,7 +12379,6 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
ggml_tensor * state_copy = build_inp_s_copy(); ggml_tensor * state_copy = build_inp_s_copy();
ggml_tensor * state_mask = build_inp_s_mask();
const auto n_embd = hparams.n_embd; const auto n_embd = hparams.n_embd;
const auto n_seq_tokens = ubatch.n_seq_tokens; const auto n_seq_tokens = ubatch.n_seq_tokens;
@ -12398,7 +12389,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
ggml_tensor * token_shift = build_rwkv_token_shift_load( ggml_tensor * token_shift = build_rwkv_token_shift_load(
gf, state_copy, state_mask, ubatch, il gf, state_copy, ubatch, il
); );
ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
@ -12411,7 +12402,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
1 1
); );
cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il); cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, v_first, ubatch, il);
token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il)); ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
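
Taken together, the hunks above are a single mechanical API change: the separate state_mask graph input (built with build_inp_s_mask()) is removed, build_copy_mask_state() is renamed to build_recurrent_state(), and the rwkv6/rwkv7 time-mix and token-shift helpers drop their state_mask parameter. A condensed before/after view of one call site, restating the hunks above rather than standalone compilable code:

// before: callers passed both the copy indices and a mask
ggml_tensor * conv = build_copy_mask_state(
        gf, conv_states_all, state_copy, state_mask,
        hparams.n_embd_k_s(), n_seqs);

// after: only state_copy is passed; the same pattern repeats in the
// mamba, rwkv6, rwkv6qwen2, rwkv7 and arwkv7 builders
ggml_tensor * conv = build_recurrent_state(
        gf, conv_states_all, state_copy,
        hparams.n_embd_k_s(), n_seqs);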

tests/test-tokenizers-repo.sh Executable file (36 additions)
View file

@@ -0,0 +1,36 @@
#!/bin/bash
if [ $# -lt 2 ]; then
printf "Usage: $0 <git-repo> <target-folder> [<test-exe>]\n"
exit 1
fi
if [ $# -eq 3 ]; then
toktest=$3
else
toktest="./test-tokenizer-0"
fi
if [ ! -x $toktest ]; then
printf "Test executable \"$toktest\" not found!\n"
exit 1
fi
repo=$1
folder=$2
if [ -d $folder ] && [ -d $folder/.git ]; then
(cd $folder; git pull)
else
git clone $repo $folder
fi
shopt -s globstar
for gguf in $folder/**/*.gguf; do
if [ -f $gguf.inp ] && [ -f $gguf.out ]; then
$toktest $gguf
else
printf "Found \"$gguf\" without matching inp/out files, ignoring...\n"
fi
done
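
Usage note: given the argument handling above, a typical invocation would look like tests/test-tokenizers-repo.sh <vocab-repo-url> <target-folder> [<path-to-test-tokenizer-0>] (placeholders, not values from the patch). The script clones the repo, or pulls if <target-folder> already holds a checkout, then runs the tokenizer test on every *.gguf that has matching .inp and .out files and skips the rest.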

Binary file not shown.

View file

@@ -233,6 +233,7 @@ struct server_task {
 slot_params defaults;
 defaults.sampling = params_base.sampling;
 defaults.speculative = params_base.speculative;
+defaults.n_keep = params_base.n_keep;
 // enabling this will output extra debug information in the HTTP responses from the server
 params.verbose = params_base.verbosity > 9;
@@ -2060,6 +2061,7 @@ struct server_context {
 SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
 slot.params.sampling = params_base.sampling;
+slot.params.n_keep = params_base.n_keep;
 slot.callback_on_release = [this](int) {
 queue_tasks.pop_deferred_task();
@@ -3556,9 +3558,6 @@ struct server_context {
 const llama_tokens & cached_text_tokens = slot.cache_tokens.get_text_tokens();
 llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id);
-// keep track of total number of tokens generated in the draft
-slot.n_draft_total += draft.size();
 // ignore small drafts
 if (slot.params.speculative.n_min > (int) draft.size()) {
 SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
@@ -3566,6 +3565,9 @@ struct server_context {
 continue;
 }
+// keep track of total number of drafted tokens tested
+slot.n_draft_total += draft.size();
 // construct the speculation batch
 common_batch_clear(slot.batch_spec);
 common_batch_add (slot.batch_spec, id, slot.n_past, { slot.id }, true);
@@ -3584,7 +3586,7 @@ struct server_context {
 slot.n_past += ids.size();
 slot.n_decoded += ids.size();
-// update how many tokens out of draft was accepted
+// update how many tokens out of those tested were accepted
 slot.n_draft_accepted += ids.size() - 1;
 slot.cache_tokens.push_back(id);
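
The last three hunks move the slot.n_draft_total update below the small-draft check, so drafts that are ignored for being shorter than n_min no longer inflate the denominator of the acceptance statistics. A standalone sketch of the resulting bookkeeping, using hypothetical names rather than the server's structs, and assuming the acceptance rate is reported as n_draft_accepted over n_draft_total:

#include <cstdio>
#include <vector>

struct draft_stats {
    int n_draft_total    = 0; // drafted tokens that were actually tested
    int n_draft_accepted = 0; // tested tokens the target model accepted
};

// Count a draft only if it is large enough to be tested, mirroring the
// reordered check in the patch: small drafts are skipped entirely.
static void account_draft(draft_stats & stats, const std::vector<int> & draft,
                          int n_min, int n_accepted) {
    if (n_min > (int) draft.size()) {
        return; // ignored draft: neither counter changes
    }
    stats.n_draft_total    += (int) draft.size();
    stats.n_draft_accepted += n_accepted;
}

int main() {
    draft_stats stats;
    account_draft(stats, {11, 12, 13, 14}, 2, 3); // tested, 3 of 4 accepted
    account_draft(stats, {15},             2, 0); // too small, ignored
    if (stats.n_draft_total > 0) {
        std::printf("accepted %d of %d drafted tokens (%.0f%%)\n",
                    stats.n_draft_accepted, stats.n_draft_total,
                    100.0 * stats.n_draft_accepted / stats.n_draft_total);
    }
    return 0;
}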

View file

@@ -41,6 +41,10 @@ html {
 max-width: 900px;
 }
+.chat-bubble {
+@apply break-words;
+}
 .chat-bubble-base-300 {
 --tw-bg-opacity: 1;
 --tw-text-opacity: 1;