SYCL: add BF16 to DMMV kernel path (~4x tg speedup on Intel Arc) (#21580)

* SYCL: add BF16 to DMMV kernel path for ~4x token generation speedup

BF16 models had no dedicated token generation kernel — they fell through
to the generic full-GEMM path, resulting in ~14% memory bandwidth
utilization on Intel Arc GPUs. This adds BF16 support to the DMMV
(dequantize mul-mat-vec) path, matching the existing F16 implementation.

Fixes #20478

* SYCL: fix BF16 DMMV out-of-bounds when ncols % 64 != 0

The qk=1 kernel (used for F16 and BF16) iterates with stride
2*GGML_SYCL_DMMV_X (= 64 on Intel targets where WARP_SIZE=16). When
ncols is a multiple of DMMV_X (32) but not of 2*DMMV_X (64), the last
warp iteration accesses elements at col >= ncols, producing NaN for the
final row and wrong values for interior rows.

Fix: tighten can_use_dequantize_mul_mat_vec to require ne[0] %
(2*DMMV_X) == 0 for F16/BF16 types, and update the ASSERT in the BF16
launcher to match. Quantized types use block-structured kernels with
different access patterns and keep the existing DMMV_X check.

Verified: test-backend-ops MUL_MAT passes 913/913 on Intel Arc Pro B70.
Previously failing: m=128/129 n=1 k=1056 cases (NaN and ERR > 0.0005).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Katostrofik 2026-05-22 08:48:24 -04:00 committed by GitHub
parent 95feeab52e
commit 8cc67efcd4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 53 additions and 2 deletions

View file

@ -3,6 +3,13 @@
#include "dequantize.hpp"
#include "presets.hpp"
#if defined(__INTEL_LLVM_COMPILER)
#if __has_include(<sycl/ext/oneapi/bfloat16.hpp>)
#include <sycl/ext/oneapi/bfloat16.hpp>
#define GGML_SYCL_DMMV_HAS_BF16
#endif
#endif
static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
const sycl::half *x = (const sycl::half *)vx;
@ -11,6 +18,16 @@ static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat
v.y() = x[ib + iqs + 1];
}
#ifdef GGML_SYCL_DMMV_HAS_BF16
static void convert_bf16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
const sycl::ext::oneapi::bfloat16 *x = (const sycl::ext::oneapi::bfloat16 *)vx;
// automatic bfloat16 -> float type cast if dfloat == float
v.x() = x[ib + iqs + 0];
v.y() = x[ib + iqs + 1];
}
#endif
static void convert_f32(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
const float * x = (const float *) vx;
@ -217,6 +234,28 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
}
}
#ifdef GGML_SYCL_DMMV_HAS_BF16
static void convert_mul_mat_vec_bf16_sycl(const void *vx, const dfloat *y,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
// The qk=1 kernel iterates with stride 2*GGML_SYCL_DMMV_X, so ncols must be a
// multiple of that — not just GGML_SYCL_DMMV_X — to avoid out-of-bounds reads.
GGML_ASSERT(ncols % (2*GGML_SYCL_DMMV_X) == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
dequantize_mul_mat_vec<1, 1, convert_bf16>(vx, y, dst, ncols,
nrows, item_ct1);
});
}
}
#endif
/*
DPCT1110:4: The total declared local variable size in device function
dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
@ -1497,7 +1536,8 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
bool src1_convert_f16 =
src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16 ||
src0->type == GGML_TYPE_BF16;
if (src1_convert_f16) {
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
@ -1565,6 +1605,11 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
case GGML_TYPE_F16:
convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
break;
#ifdef GGML_SYCL_DMMV_HAS_BF16
case GGML_TYPE_BF16:
convert_mul_mat_vec_bf16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
break;
#endif
default:
printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
GGML_ABORT("fatal error");

View file

@ -3455,6 +3455,7 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
return true;
default:
return false;
@ -3818,8 +3819,13 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
// The F16/BF16 qk=1 kernel iterates with stride 2*DMMV_X, requiring ne[0] to be
// a multiple of 2*DMMV_X. Quantized types use block-structured kernels that only
// need ne[0] % DMMV_X == 0.
const int64_t dmmv_x_required = (src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F16) ?
2*GGML_SYCL_DMMV_X : GGML_SYCL_DMMV_X;
return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
src0->ne[0] % dmmv_x_required == 0 && src1->ne[1] == 1;
}
static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {