diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 5f54909dc..02ff18885 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -238,14 +238,19 @@ jobs:
       matrix:
         include:
           - build: 'cpu-x64'
+            arch: 'x64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
           #- build: 'openblas-x64'
+          #  arch: 'x64'
           #  defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
           - build: 'vulkan-x64'
+            arch: 'x64'
             defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
           - build: 'cpu-arm64'
+            arch: 'arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
           - build: 'opencl-adreno-arm64'
+            arch: 'arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'

     steps:
@@ -312,6 +317,8 @@ jobs:
       - name: libCURL
         id: get_libcurl
         uses: ./.github/actions/windows-setup-curl
+        with:
+          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}

       - name: Build
         id: cmake_build
@@ -339,7 +346,7 @@ jobs:
         env:
           CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
-          Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
+          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
           7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*

       - name: Upload artifacts
diff --git a/common/arg.cpp b/common/arg.cpp
index 835f9f5a8..a06967eb4 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2586,7 +2586,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_junk = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+    ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"--pos"}, "N",
         string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
@@ -2649,7 +2649,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.is_pp_shared = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"-npp"}, "n0,n1,...",
         "number of prompt tokens",
@@ -2881,6 +2881,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.chat_template = read_file(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"--no-prefill-assistant"},
+        string_format(
+            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+        ),
+        [](common_params & params) {
+            params.prefill_assistant = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps",
"--slot-prompt-similarity"}, "SIMILARITY", string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), diff --git a/common/common.h b/common/common.h index 0ea544919..fd712bd8c 100644 --- a/common/common.h +++ b/common/common.h @@ -364,6 +364,7 @@ struct common_params { bool use_jinja = false; // NOLINT bool enable_chat_template = true; common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response std::vector api_keys; diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 34b79d2d0..7c3683745 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -415,6 +415,13 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96, @@ -1362,6 +1369,13 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128, flash_attn_ext_q8_0_hk192_hv128, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, flash_attn_ext_q8_0_h256, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512, flash_attn_ext_q8_0_hk576_hv512, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64, flash_attn_ext_vec_f16_h64, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64, flash_attn_ext_vec_bf16_h64, has_simdgroup_reduction && use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64, flash_attn_ext_vec_q4_0_h64, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64, flash_attn_ext_vec_q4_1_h64, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64, flash_attn_ext_vec_q5_0_h64, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64, flash_attn_ext_vec_q5_1_h64, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64, flash_attn_ext_vec_q8_0_h64, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96, flash_attn_ext_vec_f16_h96, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96, flash_attn_ext_vec_bf16_h96, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96, flash_attn_ext_vec_q4_0_h96, has_simdgroup_reduction); @@ -4358,7 +4372,7 @@ static bool ggml_metal_encode_node( // TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0) // for 
now avoiding mainly to keep the number of templates/kernels a bit lower // these are now trivial to add after: https://github.com/ggml-org/llama.cpp/pull/12612 - if (ne01 >= 20 || (ne00%128 != 0 && ne00 != 96 && ne00 != 192 && ne00 != 576)) { + if (ne01 >= 20 || (ne00%128 != 0 && ne00 != 64 && ne00 != 96 && ne00 != 192 && ne00 != 576)) { switch (src1->type) { case GGML_TYPE_F16: { @@ -4539,6 +4553,24 @@ static bool ggml_metal_encode_node( use_vec_kernel = true; switch (ne00) { + case 64: + { + switch (src1->type) { + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported type: %d\n", src1->type); + GGML_LOG_ERROR("add template specialization for this type\n"); + GGML_ABORT("add template specialization for this type"); + } + } + } break; case 96: { switch (src1->type) { diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 122ae5973..e94b6cd75 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4124,6 +4124,16 @@ kernel void kernel_flash_attn_ext_vec( typedef decltype(kernel_flash_attn_ext_vec) flash_attn_ext_vec_t; +template [[host_name("kernel_flash_attn_ext_vec_f16_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#if defined(GGML_METAL_USE_BF16) +template [[host_name("kernel_flash_attn_ext_vec_bf16_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#endif +template [[host_name("kernel_flash_attn_ext_vec_q4_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; + template [[host_name("kernel_flash_attn_ext_vec_f16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; #if defined(GGML_METAL_USE_BF16) template [[host_name("kernel_flash_attn_ext_vec_bf16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 9be30cb24..948a49956 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -5896,10 +5896,17 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_pipeline *pipelines; bool small_rows = N <= get_fa_num_small_rows(path); + // coopmat1 does not actually support "small rows" (it needs 16 rows). + // So use scalar instead. 
if (small_rows && path == FA_COOPMAT1) { path = FA_SCALAR; } + // scalar is faster than coopmat2 when N==1 + if (N == 1 && path == FA_COOPMAT2) { + path = FA_SCALAR; + } + bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32; switch (path) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp index 168355768..ce230a8f7 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp @@ -9,60 +9,13 @@ #extension GL_KHR_shader_subgroup_shuffle : enable #include "types.comp" +#include "flash_attn_base.comp" -layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; - -layout (constant_id = 0) const uint32_t WorkGroupSize = 128; -layout (constant_id = 1) const uint32_t Br = 1; -layout (constant_id = 2) const uint32_t Bc = 32; -layout (constant_id = 3) const uint32_t D = 32; - -layout (constant_id = 5) const uint32_t D_split = 16; const uint32_t D_per_thread = D / D_split; const uint32_t cols_per_iter = WorkGroupSize / D_split; const uint32_t cols_per_thread = Bc / cols_per_iter; -layout (push_constant) uniform parameter { - uint32_t N; - uint32_t KV; - - uint32_t ne1; - uint32_t ne2; - uint32_t ne3; - - uint32_t neq2; - uint32_t neq3; - uint32_t nek2; - uint32_t nek3; - uint32_t nev2; - uint32_t nev3; - uint32_t nem1; - - uint32_t nb01; - uint32_t nb02; - uint32_t nb03; - uint32_t nb11; - uint32_t nb12; - uint32_t nb13; - uint32_t nb21; - uint32_t nb22; - uint32_t nb23; - uint32_t nb31; - - float scale; - float max_bias; - float logit_softcap; - - uint32_t mask; - uint32_t n_head_log2; - float m0; - float m1; - - uint32_t gqa_ratio; - uint32_t split_kv; - uint32_t k_num; -} p; layout (binding = 0) readonly buffer Q {float data_q[];}; layout (binding = 0) readonly buffer QV4 {vec4 data_qv4[];}; @@ -71,39 +24,6 @@ layout (binding = 1) readonly buffer KV4 {f16vec4 data_kv4[];}; layout (binding = 2) readonly buffer V {float16_t data_v[];}; layout (binding = 2) readonly buffer VV4 {f16vec4 data_vv4[];}; layout (binding = 3) readonly buffer M {float16_t data_m[];}; -layout (binding = 4) writeonly buffer O {D_TYPE data_o[];}; - -#if defined(A_TYPE_PACKED16) -#define BINDING_IDX_K 0 -#define BINDING_IDX_V 1 -layout (binding = 1) readonly buffer KV_PACKED16 {A_TYPE_PACKED16 data_packed16[];} kv_packed[2]; -#endif - -#if defined(DATA_A_Q4_0) -#define BLOCK_BYTE_SIZE 18 - -vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) { - uint vui_lo = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]); - uint vui_hi = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]); - uint shift = (iqs & 0x10) >> 2; - vui_lo >>= shift; - vui_hi >>= shift; - - return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f); -} -#endif - -#if defined(DATA_A_Q8_0) -#define BLOCK_BYTE_SIZE 34 -vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) { - const i8vec2 v0 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147 - const i8vec2 v1 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy; - - return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y); -} -#endif - -#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) // Store the output when doing grouped query 
attention. // Rows index by Q's dimension 2, and the first N rows are valid. @@ -114,27 +34,6 @@ D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TY return elem; } -// Store column zero. This is used to save per-row m and L values for split_k. -ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N) -{ - if (r < N && c == 0) { - uint32_t offset = iq2 + r; - data_o[o_offset + offset] = D_TYPE(elem); - } - return elem; -} - -// Load the slope matrix, indexed by Q's dimension 2. -ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2) -{ - const uint32_t h = iq2 + (r % p.gqa_ratio); - - const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1); - const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1); - - return ACC_TYPE(pow(base, ACC_TYPE(exph))); -} - shared FLOAT_TYPE tmpsh[WorkGroupSize]; shared vec4 tmpshv4[WorkGroupSize]; @@ -146,58 +45,12 @@ void main() { init_iq_shmem(gl_WorkGroupSize); #endif - const uint32_t tid = gl_LocalInvocationIndex; - const uint32_t N = p.N; - const uint32_t KV = p.KV; + init_indices(); + const uint32_t tid = gl_LocalInvocationIndex; const uint32_t d_tid = gl_LocalInvocationIndex % D_split; const uint32_t col_tid = gl_LocalInvocationIndex / D_split; - uint32_t i = gl_WorkGroupID.x; - uint32_t split_k_index = 0; - - if (p.k_num > 1) { - i = 0; - split_k_index = gl_WorkGroupID.x; - } - - const uint32_t Tr = CEIL_DIV(N, Br); - - const uint32_t start_j = split_k_index * p.split_kv / Bc; - const uint32_t end_j = CEIL_DIV(min(KV, (split_k_index + 1) * p.split_kv), Bc); - - // When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y. - // When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2. - const uint32_t iq2 = gl_WorkGroupID.y * p.gqa_ratio; - const uint32_t iq3 = gl_WorkGroupID.z; - - // broadcast factors - const uint32_t rk2 = p.neq2/p.nek2; - const uint32_t rk3 = p.neq3/p.nek3; - - const uint32_t rv2 = p.neq2/p.nev2; - const uint32_t rv3 = p.neq3/p.nev3; - - // k indices - const uint32_t ik3 = iq3 / rk3; - const uint32_t ik2 = iq2 / rk2; - - // v indices - const uint32_t iv3 = iq3 / rv3; - const uint32_t iv2 = iq2 / rv2; - - // nb?1 are already divided by the type size and are in units of elements. - // When using grouped query attention, Q is indexed by iq2, so the stride - // should be nb02 (which is in bytes). - uint32_t q_stride = p.gqa_ratio > 1 ? (p.nb02 / 4) : p.nb01; - uint32_t k_stride = p.nb11; - uint32_t v_stride = p.nb21; - // When using grouped query attention, all rows use the same mask (stride 0). - // "p.gqa_ratio >> 16" is just a roundabout way of writing zero - // that prevents the compiler from folding the "&" through the select - // and breaking the alignment detection. - uint32_t m_stride = (p.gqa_ratio > 1) ? 
(p.gqa_ratio >> 16) : KV; - uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4; [[unroll]] for (uint32_t idx = 0; idx < Br * D / 4; idx += gl_WorkGroupSize.x) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp new file mode 100644 index 000000000..61d90e2d8 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp @@ -0,0 +1,162 @@ + +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; + +layout (constant_id = 0) const uint32_t WorkGroupSize = 128; +layout (constant_id = 1) const uint32_t Br = 1; +layout (constant_id = 2) const uint32_t Bc = 32; +layout (constant_id = 3) const uint32_t D = 32; +layout (constant_id = 4) const uint32_t Clamp = 0; +layout (constant_id = 5) const uint32_t D_split = 16; + + +layout (push_constant) uniform parameter { + uint32_t N; + uint32_t KV; + + uint32_t ne1; + uint32_t ne2; + uint32_t ne3; + + uint32_t neq2; + uint32_t neq3; + uint32_t nek2; + uint32_t nek3; + uint32_t nev2; + uint32_t nev3; + uint32_t nem1; + + uint32_t nb01; + uint32_t nb02; + uint32_t nb03; + uint32_t nb11; + uint32_t nb12; + uint32_t nb13; + uint32_t nb21; + uint32_t nb22; + uint32_t nb23; + uint32_t nb31; + + float scale; + float max_bias; + float logit_softcap; + + uint32_t mask; + uint32_t n_head_log2; + float m0; + float m1; + + uint32_t gqa_ratio; + uint32_t split_kv; + uint32_t k_num; +} p; + +layout (binding = 4) writeonly buffer O {D_TYPE data_o[];}; + +#if defined(A_TYPE_PACKED16) +#define BINDING_IDX_K 0 +#define BINDING_IDX_V 1 +layout (binding = 1) readonly buffer KV_PACKED16 {A_TYPE_PACKED16 data_packed16[];} kv_packed[2]; +#endif + +#if defined(DATA_A_Q4_0) +#define BLOCK_BYTE_SIZE 18 + +vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) { + uint vui_lo = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]); + uint vui_hi = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]); + uint shift = (iqs & 0x10) >> 2; + vui_lo >>= shift; + vui_hi >>= shift; + + return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f); +} +#endif + +#if defined(DATA_A_Q8_0) +#define BLOCK_BYTE_SIZE 34 +vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) { + const i8vec2 v0 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147 + const i8vec2 v1 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy; + + return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y); +} +#endif + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + + +// Store column zero. This is used to save per-row m and L values for split_k. +ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N) +{ + if (r < N && c == 0) { + uint32_t offset = iq2 + r; + data_o[o_offset + offset] = D_TYPE(elem); + } + return elem; +} + +// Load the slope matrix, indexed by Q's dimension 2. +ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2) +{ + const uint32_t h = iq2 + (r % p.gqa_ratio); + + const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1); + const int exph = int(h < p.n_head_log2 ? 
h + 1 : 2*(h - p.n_head_log2) + 1); + + return ACC_TYPE(pow(base, ACC_TYPE(exph))); +} + +uint32_t i, N, KV, split_k_index, Tr, start_j, end_j, + iq2, iq3, rk2, rk3, rv2, rv3, ik2, ik3, iv2, iv3, + q_stride, k_stride, v_stride, m_stride; + +void init_indices() +{ + N = p.N; + KV = p.KV; + + i = gl_WorkGroupID.x; + split_k_index = 0; + + if (p.k_num > 1) { + i = 0; + split_k_index = gl_WorkGroupID.x; + } + + Tr = CEIL_DIV(N, Br); + + start_j = split_k_index * p.split_kv / Bc; + end_j = CEIL_DIV(min(KV, (split_k_index + 1) * p.split_kv), Bc); + + // When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y. + // When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2. + iq2 = gl_WorkGroupID.y * p.gqa_ratio; + iq3 = gl_WorkGroupID.z; + + // broadcast factors + rk2 = p.neq2/p.nek2; + rk3 = p.neq3/p.nek3; + + rv2 = p.neq2/p.nev2; + rv3 = p.neq3/p.nev3; + + // k indices + ik3 = iq3 / rk3; + ik2 = iq2 / rk2; + + // v indices + iv3 = iq3 / rv3; + iv2 = iq2 / rv2; + + // nb?1 are already divided by the type size and are in units of elements. + // When using grouped query attention, Q is indexed by iq2, so the stride + // should be nb02 (which is in bytes). + q_stride = p.gqa_ratio > 1 ? (p.nb02 / 4) : p.nb01; + k_stride = p.nb11; + v_stride = p.nb21; + // When using grouped query attention, all rows use the same mask (stride 0). + // "p.gqa_ratio >> 16" is just a roundabout way of writing zero + // that prevents the compiler from folding the "&" through the select + // and breaking the alignment detection. + m_stride = (p.gqa_ratio > 1) ? (p.gqa_ratio >> 16) : KV; +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp index 8b86b623b..da478be24 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp @@ -11,14 +11,7 @@ #extension GL_KHR_cooperative_matrix : enable #include "types.comp" - -layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; - -layout (constant_id = 1) const uint32_t Br = 1; -layout (constant_id = 2) const uint32_t Bc = 32; -layout (constant_id = 3) const uint32_t D = 32; - -layout (constant_id = 5) const uint32_t D_split = 16; +#include "flash_attn_base.comp" const uint32_t D_per_thread = D / D_split; const uint32_t row_split = 4; @@ -26,46 +19,6 @@ const uint32_t rows_per_thread = Br / row_split; const uint32_t cols_per_iter = gl_WorkGroupSize.x / D_split / row_split; const uint32_t cols_per_thread = Bc / cols_per_iter; -layout (push_constant) uniform parameter { - uint32_t N; - uint32_t KV; - - uint32_t ne1; - uint32_t ne2; - uint32_t ne3; - - uint32_t neq2; - uint32_t neq3; - uint32_t nek2; - uint32_t nek3; - uint32_t nev2; - uint32_t nev3; - uint32_t nem1; - - uint32_t nb01; - uint32_t nb02; - uint32_t nb03; - uint32_t nb11; - uint32_t nb12; - uint32_t nb13; - uint32_t nb21; - uint32_t nb22; - uint32_t nb23; - uint32_t nb31; - - float scale; - float max_bias; - float logit_softcap; - - uint32_t mask; - uint32_t n_head_log2; - float m0; - float m1; - - uint32_t gqa_ratio; - uint32_t split_kv; - uint32_t k_num; -} p; layout (binding = 0) readonly buffer Q {float data_q[];}; layout (binding = 0) readonly buffer QV4 {vec4 data_qv4[];}; @@ -74,39 +27,6 @@ layout (binding = 1) readonly buffer KV4 {f16vec4 data_kv4[];}; layout (binding = 2) readonly buffer V {float16_t data_v[];}; layout (binding = 2) readonly buffer VV4 {f16vec4 data_vv4[];}; layout 
(binding = 3) readonly buffer M {float16_t data_m[];}; -layout (binding = 4) writeonly buffer O {D_TYPE data_o[];}; - -#if defined(A_TYPE_PACKED16) -#define BINDING_IDX_K 0 -#define BINDING_IDX_V 1 -layout (binding = 1) readonly buffer KV_PACKED16 {A_TYPE_PACKED16 data_packed16[];} kv_packed[2]; -#endif - -#if defined(DATA_A_Q4_0) -#define BLOCK_BYTE_SIZE 18 - -vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) { - uint vui_lo = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]); - uint vui_hi = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]); - uint shift = (iqs & 0x10) >> 2; - vui_lo >>= shift; - vui_hi >>= shift; - - return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f); -} -#endif - -#if defined(DATA_A_Q8_0) -#define BLOCK_BYTE_SIZE 34 -vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) { - const i8vec2 v0 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147 - const i8vec2 v1 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy; - - return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y); -} -#endif - -#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) // Store the output when doing grouped query attention. // Rows index by Q's dimension 2, and the first N rows are valid. @@ -117,27 +37,6 @@ D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TY return elem; } -// Store column zero. This is used to save per-row m and L values for split_k. -ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N) -{ - if (r < N && c == 0) { - uint32_t offset = iq2 + r; - data_o[o_offset + offset] = D_TYPE(elem); - } - return elem; -} - -// Load the slope matrix, indexed by Q's dimension 2. -ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2) -{ - const uint32_t h = iq2 + (r % p.gqa_ratio); - - const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1); - const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1); - - return ACC_TYPE(pow(base, ACC_TYPE(exph))); -} - // These need to be supported N,M values for a MatBc x MatBr x 16 coopmatmuladd const uint32_t MatBr = 16; const uint32_t MatBc = 16; @@ -162,9 +61,9 @@ void main() { init_iq_shmem(gl_WorkGroupSize); #endif + init_indices(); + const uint32_t tid = gl_LocalInvocationIndex; - const uint32_t N = p.N; - const uint32_t KV = p.KV; const uint32_t threads_per_rowgroup = gl_WorkGroupSize.x / row_split; const uint32_t row_tid = gl_LocalInvocationIndex / threads_per_rowgroup; @@ -173,51 +72,6 @@ void main() { #define tile_row(r) (row_tid * rows_per_thread + (r)) - uint32_t i = gl_WorkGroupID.x; - uint32_t split_k_index = 0; - - if (p.k_num > 1) { - i = 0; - split_k_index = gl_WorkGroupID.x; - } - - const uint32_t Tr = CEIL_DIV(N, Br); - - const uint32_t start_j = split_k_index * p.split_kv / Bc; - const uint32_t end_j = CEIL_DIV(min(KV, (split_k_index + 1) * p.split_kv), Bc); - - // When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y. - // When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2. 
- const uint32_t iq2 = gl_WorkGroupID.y * p.gqa_ratio; - const uint32_t iq3 = gl_WorkGroupID.z; - - // broadcast factors - const uint32_t rk2 = p.neq2/p.nek2; - const uint32_t rk3 = p.neq3/p.nek3; - - const uint32_t rv2 = p.neq2/p.nev2; - const uint32_t rv3 = p.neq3/p.nev3; - - // k indices - const uint32_t ik3 = iq3 / rk3; - const uint32_t ik2 = iq2 / rk2; - - // v indices - const uint32_t iv3 = iq3 / rv3; - const uint32_t iv2 = iq2 / rv2; - - // nb?1 are already divided by the type size and are in units of elements. - // When using grouped query attention, Q is indexed by iq2, so the stride - // should be nb02 (which is in bytes). - uint32_t q_stride = p.gqa_ratio > 1 ? (p.nb02 / 4) : p.nb01; - uint32_t k_stride = p.nb11; - uint32_t v_stride = p.nb21; - // When using grouped query attention, all rows use the same mask (stride 0). - // "p.gqa_ratio >> 16" is just a roundabout way of writing zero - // that prevents the compiler from folding the "&" through the select - // and breaking the alignment detection. - uint32_t m_stride = (p.gqa_ratio > 1) ? (p.gqa_ratio >> 16) : KV; - uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4; [[unroll]] for (uint32_t idx = 0; idx < Br * D / 4; idx += gl_WorkGroupSize.x) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index b926a578a..6acf67a03 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -18,62 +18,12 @@ #include "types.comp" #include "dequant_funcs_cm2.comp" - -layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; - -layout (constant_id = 1) const uint32_t Br = 32; -layout (constant_id = 2) const uint32_t Bc = 32; -layout (constant_id = 3) const uint32_t D = 32; -layout (constant_id = 4) const uint32_t Clamp = gl_CooperativeMatrixClampModeConstantNV; - -layout (push_constant) uniform parameter { - uint32_t N; - uint32_t KV; - - uint32_t ne1; - uint32_t ne2; - uint32_t ne3; - - uint32_t neq2; - uint32_t neq3; - uint32_t nek2; - uint32_t nek3; - uint32_t nev2; - uint32_t nev3; - uint32_t nem1; - - uint32_t nb01; - uint32_t nb02; - uint32_t nb03; - uint32_t nb11; - uint32_t nb12; - uint32_t nb13; - uint32_t nb21; - uint32_t nb22; - uint32_t nb23; - uint32_t nb31; - - float scale; - float max_bias; - float logit_softcap; - - uint32_t mask; - uint32_t n_head_log2; - float m0; - float m1; - - uint32_t gqa_ratio; - uint32_t split_kv; - uint32_t k_num; -} p; +#include "flash_attn_base.comp" layout (binding = 0) readonly buffer Q {uint8_t data_q[];}; layout (binding = 1) readonly buffer K {uint8_t data_k[];}; layout (binding = 2) readonly buffer V {uint8_t data_v[];}; layout (binding = 3) readonly buffer M {uint8_t data_m[];}; -layout (binding = 4) writeonly buffer O {D_TYPE data_o[];}; - -#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) ACC_TYPE maxReduce(const in ACC_TYPE x, const in ACC_TYPE y) { return max(x, y); @@ -118,67 +68,12 @@ D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TY return elem; } -// Store column zero. This is used to save per-row m and L values for split_k. -ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N) -{ - if (r < N && c == 0) { - uint32_t offset = iq2 + r; - data_o[o_offset + offset] = D_TYPE(elem); - } - return elem; -} - -// Load the slope matrix, indexed by Q's dimension 2. 
-ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2) -{ - const uint32_t h = iq2 + (r % p.gqa_ratio); - - const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1); - const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1); - - return ACC_TYPE(pow(base, ACC_TYPE(exph))); -} - void main() { #ifdef NEEDS_INIT_IQ_SHMEM init_iq_shmem(gl_WorkGroupSize); #endif - const uint32_t N = p.N; - const uint32_t KV = p.KV; - - uint32_t i = gl_WorkGroupID.x; - uint32_t split_k_index = 0; - - if (p.k_num > 1) { - i = 0; - split_k_index = gl_WorkGroupID.x; - } - - const uint32_t Tr = CEIL_DIV(N, Br); - - const uint32_t start_j = split_k_index * p.split_kv / Bc; - const uint32_t end_j = CEIL_DIV(min(KV, (split_k_index + 1) * p.split_kv), Bc); - - // When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y. - // When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2. - const uint32_t iq2 = gl_WorkGroupID.y * p.gqa_ratio; - const uint32_t iq3 = gl_WorkGroupID.z; - - // broadcast factors - const uint32_t rk2 = p.neq2/p.nek2; - const uint32_t rk3 = p.neq3/p.nek3; - - const uint32_t rv2 = p.neq2/p.nev2; - const uint32_t rv3 = p.neq3/p.nev3; - - // k indices - const uint32_t ik3 = iq3 / rk3; - const uint32_t ik2 = iq2 / rk2; - - // v indices - const uint32_t iv3 = iq3 / rv3; - const uint32_t iv2 = iq2 / rv2; + init_indices(); tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutQ = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); tensorLayoutNV<2, Clamp> tensorLayoutK = createTensorLayoutNV(2, Clamp); @@ -195,17 +90,6 @@ void main() { tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D); tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D); - // nb?1 are already divided by the type size and are in units of elements. - // When using grouped query attention, Q is indexed by iq2, so the stride - // should be nb02 (which is in bytes). - uint32_t q_stride = p.gqa_ratio > 1 ? (p.nb02 / 4) : p.nb01; - uint32_t k_stride = p.nb11; - uint32_t v_stride = p.nb21; - // When using grouped query attention, all rows use the same mask (stride 0). - // "p.gqa_ratio >> 16" is just a roundabout way of writing zero - // that prevents the compiler from folding the "&" through the select - // and breaking the alignment detection. - uint32_t m_stride = (p.gqa_ratio > 1) ? (p.gqa_ratio >> 16) : KV; // hint to the compiler that strides are aligned for the aligned variant of the shader if (Clamp != gl_CooperativeMatrixClampModeConstantNV) { diff --git a/klite.embd b/klite.embd index 9650fbd08..8fe1265f5 100644 --- a/klite.embd +++ b/klite.embd @@ -12,7 +12,7 @@ Current version indicated by LITEVER below. -->
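For reference, the per-head ALiBi slope that perElemOpComputeSlope() now computes in the shared flash_attn_base.comp can be sketched on the host as follows. This is a minimal C++ sketch, not part of the patch; the helper name alibi_slope is hypothetical, and n_head_log2, m0 and m1 are assumed to carry the same values as the push constants used by the shader.

    #include <cmath>
    #include <cstdint>

    // Hypothetical host-side mirror of perElemOpComputeSlope() in flash_attn_base.comp:
    // heads below n_head_log2 use base m0 with exponent h+1, the rest use base m1
    // with exponent 2*(h - n_head_log2) + 1.
    static float alibi_slope(uint32_t h, uint32_t n_head_log2, float m0, float m1) {
        const float base = h < n_head_log2 ? m0 : m1;
        const int   exph = h < n_head_log2 ? (int) h + 1 : 2 * ((int) h - (int) n_head_log2) + 1;
        return std::pow(base, (float) exph);
    }

In the shader, h is derived from iq2 plus the row's position within the gqa_ratio group, so each grouped-query row picks up the slope of its own head.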
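Similarly, the start_j/end_j computation that init_indices() performs when split_k is active can be illustrated with a short C++ sketch under the same assumptions (split_kv, KV and Bc match the push constants and specialization constants; split_kv_range is a hypothetical name, not code from the patch).

    #include <algorithm>
    #include <cstdint>

    // Same CEIL_DIV as flash_attn_base.comp.
    #define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))

    // Hypothetical mirror of the start_j/end_j computation in init_indices():
    // each split_k workgroup handles the Bc-wide KV column blocks covering
    // [split_k_index * split_kv, (split_k_index + 1) * split_kv), clamped to KV.
    static void split_kv_range(uint32_t split_k_index, uint32_t split_kv, uint32_t KV, uint32_t Bc,
                               uint32_t & start_j, uint32_t & end_j) {
        start_j = split_k_index * split_kv / Bc;
        end_j   = CEIL_DIV(std::min(KV, (split_k_index + 1) * split_kv), Bc);
    }

Moving this logic (together with the push-constant block and the shared per-element helpers) into flash_attn_base.comp is what lets the scalar, coopmat1 and coopmat2 variants delete their duplicated copies in this patch.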