ggml : support bcast ggml_soft_max_ext, ggml_flash_attn_ext (#14435)

ggml-ci
2025-09-11 09:34:37 +00:00 · 2025-06-27 21:50:57 +03:00 · 2025-06-27 21:50:57 +03:00 · ec68e84c32
commit ec68e84c32
parent 307e79d33d
11 changed files with 250 additions and 156 deletions
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -10248,6 +10248,11 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) {
                    return false;
                }
+                // TODO: support broadcast
+                // ref: https://github.com/ggml-org/llama.cpp/pull/14435
+                if (op->src[0]->ne[3] != 1) {
+                    return false;
+                }
                // It's straightforward to support different K/V dequant, but would
                // significantly increase the number of pipelines
                if (op->src[1]->type != op->src[2]->type) {
@@ -10406,7 +10411,15 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
        case GGML_OP_SCALE:
        case GGML_OP_PAD:
        case GGML_OP_DIAG_MASK_INF:
+            return true;
        case GGML_OP_SOFT_MAX:
+            // TODO: support batching
+            if (op->src[0]->ne[3] != 1) {
+                return false;
+            }
+            // TODO: support broadcast
+            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
+            return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
        case GGML_OP_SOFT_MAX_BACK:
        case GGML_OP_ARGSORT:
        case GGML_OP_SUM: