Merge branch 'upstream' into concedo_experimental

# Conflicts: # .github/workflows/build.yml # docs/development/HOWTO-add-model.md # ggml/src/ggml-sycl/rope.cpp # tests/test-backend-ops.cpp
2025-09-11 09:34:37 +00:00 · 2025-07-09 19:25:28 +08:00 · 2025-07-09 19:25:28 +08:00 · b8c1fc7c9e
commit b8c1fc7c9e
parent f8a49aa8e6 ffd59e7d18
30 changed files with 1784 additions and 263 deletions
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -2722,7 +2722,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL],  "get_rows_iq4_nl_f32",  get_rows_iq4_nl_f32_len,  get_rows_iq4_nl_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 2, 3 * sizeof(uint32_t), {1, 1, 1}, {}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 2, 4 * sizeof(uint32_t), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true);
    ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_len, quantize_q8_1_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);

    for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
@@ -6276,13 +6276,13 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    const uint32_t shader_core_count = ctx->device->shader_core_count ? ctx->device->shader_core_count : 16;

    // Try to use split_k when KV is large enough to be worth the overhead
-    if (workgroups_x == 1 && shader_core_count > 0 && KV >= 512) {
+    if (workgroups_x == 1 && shader_core_count > 0) {
        // Try to run two workgroups per SM.
        split_k = shader_core_count * 2 / (workgroups_y * workgroups_z);
        if (split_k > 1) {
            // Try to evenly split KV into split_k chunks, but it needs to be a multiple
            // of "align", so recompute split_k based on that.
-            split_kv = ROUNDUP_POW2(KV / split_k, pipelines[1]->align);
+            split_kv = ROUNDUP_POW2(std::max(1u, KV / split_k), pipelines[1]->align);
            split_k = CEIL_DIV(KV, split_kv);
            workgroups_x = split_k;
        }
@@ -6416,7 +6416,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
                                        vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
                                        vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
                                    },
-                                    pc2, { (uint32_t)ne1, 1, (uint32_t)ne3 });
+                                    pc2, { (uint32_t)ne1, HSV, (uint32_t)ne3 });
    } else {
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
                                    {