vulkan: fix noncontig check for mat_mul_id splitting (#14683)

* vulkan: fix noncontig check for mat_mul_id splitting

Remove the supports_op check that rejected op->src[2]->ne[0] > 4096 (mat_mul_id splitting now handles this case)

* vulkan: fix batched matmul dequant for Q*_K
This commit is contained in:
Jeff Bolz 2025-07-15 14:51:09 -05:00 committed by GitHub
parent 10a0351a97
commit ba1ceb3456
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 6 additions and 10 deletions

View file

@ -4922,7 +4922,7 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
return return
tensor->nb[0] == ggml_type_size(tensor->type) && tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; (tensor->ne[3] == 1 || tensor->nb[3] == tensor->nb[2]*tensor->ne[2]);
} }
static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) { static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) {
@ -10356,10 +10356,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
// If there's not enough shared memory for row_ids and the result tile, fallback to CPU // If there's not enough shared memory for row_ids and the result tile, fallback to CPU
return false; return false;
} }
// Check against size of shared memory variable
if (op->src[2]->ne[0] > 4096) {
return false;
}
} }
switch (src0_type) { switch (src0_type) {
case GGML_TYPE_F32: case GGML_TYPE_F32:

View file

@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
void main() { void main() {
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) { [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
const uint i = gl_WorkGroupID.x * 256 + wgy; const uint i = gl_WorkGroupID.x * 256 + wgy;
if (i >= p.M * p.K / QUANT_K) { if (i >= p.nel / QUANT_K) {
return; return;
} }

View file

@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
void main() { void main() {
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) { [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
const uint i = uint(gl_WorkGroupID.x * 256 + wgy); const uint i = uint(gl_WorkGroupID.x * 256 + wgy);
if (i >= p.M * p.K / QUANT_K) { if (i >= p.nel / QUANT_K) {
return; return;
} }

View file

@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
void main() { void main() {
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) { [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
const uint ib = gl_WorkGroupID.x * 256 + wgy; const uint ib = gl_WorkGroupID.x * 256 + wgy;
if (ib >= p.M * p.K / QUANT_K) { if (ib >= p.nel / QUANT_K) {
return; return;
} }

View file

@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
void main() { void main() {
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) { [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
const uint ib = gl_WorkGroupID.x * 256 + wgy; const uint ib = gl_WorkGroupID.x * 256 + wgy;
if (ib >= p.M * p.K / QUANT_K) { if (ib >= p.nel / QUANT_K) {
return; return;
} }

View file

@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
void main() { void main() {
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) { [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
const uint i = gl_WorkGroupID.x * 256 + wgy; const uint i = gl_WorkGroupID.x * 256 + wgy;
if (i >= p.M * p.K / QUANT_K) { if (i >= p.nel / QUANT_K) {
return; return;
} }
const uint tid = gl_LocalInvocationID.x; const uint tid = gl_LocalInvocationID.x;