ggml : fix FA mask dim 2 and 3 (#14505)

* ggml : fix FA mask dim 2 and 3

  ggml-ci

* backends : unsupport batched FA in CUDA and Vulkan

  ggml-ci

* vulkan : disable FA for mask->ne[2] != 1
2025-09-11 09:34:37 +00:00 · 2025-07-03 10:46:57 +03:00 · 2025-07-03 10:46:57 +03:00 · 9067487c44
commit 9067487c44
parent d4cdd9c1c3
9 changed files with 26 additions and 15 deletions
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -10265,6 +10265,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) {
                    return false;
                }
+                // TODO: support broadcast
+                // note: this was initially implemented in https://github.com/ggml-org/llama.cpp/pull/14449, but
+                //       the interface of ggml_flash_attn_ext() changed in https://github.com/ggml-org/llama.cpp/pull/14505
+                if (op->src[0]->ne[3] != 1 || (op->src[3] && op->src[3]->ne[2] != 1)) {
+                    return false;
+                }
                // It's straightforward to support different K/V dequant, but would
                // significantly increase the number of pipelines
                if (op->src[1]->type != op->src[2]->type) {