Merge commit '4ccea213bc' into concedo_experimental

# Conflicts: # .devops/cpu.Dockerfile # .devops/cuda.Dockerfile # .devops/intel.Dockerfile # .devops/musa.Dockerfile # .devops/rocm.Dockerfile # .github/workflows/bench.yml.disabled # .github/workflows/build.yml # .github/workflows/server.yml # CMakeLists.txt # build-xcframework.sh # ci/run.sh # common/CMakeLists.txt # examples/llama.android/llama/build.gradle.kts # examples/perplexity/perplexity.cpp # examples/run/CMakeLists.txt # examples/server/tests/README.md # examples/sycl/win-build-sycl.bat # ggml/src/ggml-cann/aclnn_ops.cpp # ggml/src/ggml-cann/aclnn_ops.h # ggml/src/ggml-cpu/CMakeLists.txt # ggml/src/ggml-cpu/ggml-cpu.c # licenses/LICENSE-linenoise # scripts/sync-ggml.last # tests/CMakeLists.txt
2025-09-11 09:34:37 +00:00 · 2025-04-08 21:26:23 +08:00 · 2025-04-08 21:26:23 +08:00 · b99ee451f8
commit b99ee451f8
parent 822cf2430e 4ccea213bc
29 changed files with 11032 additions and 12914 deletions
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@ -1842,6 +1842,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
            // can't use 256 for D==80.
            uint32_t wg_size = (small_rows && (D % 32) == 0) ? 256 : 128;
            auto rows_cols = fa_rows_cols(D, clamp, type, small_rows);
+            // mask dim1 is padded to 64, we rely on this to avoid clamping mask loads
+            GGML_ASSERT((GGML_KQ_MASK_PAD % rows_cols[0]) == 0);
            return {wg_size, rows_cols[0], rows_cols[1], (D), clamp};
        };

@ -5528,6 +5530,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
                   // the "aligned" shader variant will forcibly align strides, for performance
                   (q_stride & 7) == 0 && (k_stride & 7) == 0 && (v_stride & 7) == 0;

+    // mask dim1 is padded to 64, we rely on this to avoid clamping mask loads
+    GGML_ASSERT((nem1 % GGML_KQ_MASK_PAD) == 0);
+
    vk_pipeline pipeline = pipelines[aligned];
    assert(pipeline);