From 8ad8aef447703e76cdcf74ed78dab16b92ec231a Mon Sep 17 00:00:00 2001 From: ymcki <84055651+ymcki@users.noreply.github.com> Date: Thu, 28 May 2026 12:23:21 +0800 Subject: [PATCH] opencl: OP_GATED_DELTA_NET (#23312) * OP_GATED_DELTA_NET impl * add back lanes_per_column declaration * removed has_subgroup_arithmetic and has_subgroup_clustered_reduce * removed trailing spaces and fixes indentation. Hard coded subgroup size for Adreno and Intel. Return not supported when K>1 state snapshot * support for K>1 state snapshot * removed picky indent multiple of 4 fixes * removed return that won\'t be executed --- ggml/src/ggml-opencl/CMakeLists.txt | 1 + ggml/src/ggml-opencl/ggml-opencl.cpp | 345 ++++++++++++++++-- .../ggml-opencl/kernels/gated_delta_net.cl | 247 +++++++++++++ 3 files changed, 566 insertions(+), 27 deletions(-) create mode 100644 ggml/src/ggml-opencl/kernels/gated_delta_net.cl diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index f75d089b5..446fb7279 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -164,6 +164,7 @@ set(GGML_OPENCL_KERNELS sqr sqrt ssm_conv + gated_delta_net sub sum_rows cumsum diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 42286435b..6d6c3e897 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -412,6 +412,7 @@ struct ggml_backend_opencl_context { size_t max_workgroup_size; bool fp16_support; bool has_vector_subgroup_broadcast; + bool has_qcom_subgroup_shuffle = false; // cl_qcom_subgroup_shuffle bool disable_fusion; std::regex *opfilter = nullptr; // regex of ops to not claim @@ -634,6 +635,10 @@ struct ggml_backend_opencl_context { cl_kernel kernel_conv_2d_f32; cl_kernel kernel_conv_2d_f16_f32; cl_kernel kernel_ssm_conv_f32_f32, kernel_ssm_conv_f32_f32_4; + // [size_idx][kda][tgpp] where size_idx: 0=S_V=16, 1=32, 2=64, 3=128; kda: 0 or 1. + // tgpp 0 = TG variant (COLS_PER_LANE_GROUP=1), tgpp 1 = prefill variant (COLS_PER_LANE_GROUP=4). + cl_kernel kernel_gated_delta_net_f32[4][2][2] = {}; + cl_kernel kernel_timestep_embedding; cl_kernel kernel_gemv_moe_q4_0_f32_ns, kernel_gemm_moe_q4_0_f32_ns; cl_kernel kernel_gemv_moe_q4_1_f32_ns, kernel_gemm_moe_q4_1_f32_ns; @@ -837,16 +842,16 @@ static std::vector g_ggml_backend_opencl_devices; static std::vector> g_ggml_backend_opencl_dev_ctxs; inline std::string read_file(const std::string &path) { - std::ifstream ifs(path); - if (!ifs) { - return ""; - } - std::string text; - ifs.seekg(0, std::ios::end); - text.resize(ifs.tellg()); - ifs.seekg(0, std::ios::beg); - ifs.read(&text[0], text.size()); - return text; + std::ifstream ifs(path); + if (!ifs) { + return ""; + } + std::string text; + ifs.seekg(0, std::ios::end); + text.resize(ifs.tellg()); + ifs.seekg(0, std::ios::beg); + ifs.read(&text[0], text.size()); + return text; } static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) { @@ -2463,12 +2468,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) { build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err)); if (backend_ctx->program_upscale) { - cl_int err_bilinear; - backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear); - if (err_bilinear != CL_SUCCESS) { + cl_int err_bilinear; + backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear); + if (err_bilinear != CL_SUCCESS) { GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear); backend_ctx->kernel_upscale_bilinear = nullptr; - } + } } else { backend_ctx->kernel_upscale_bilinear = nullptr; } @@ -2538,8 +2543,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) { GGML_LOG_CONT("."); } - // conv2d - { + // conv2d + { #ifdef GGML_OPENCL_EMBED_KERNELS const std::string kernel_src { #include "conv2d.cl.h" @@ -2597,6 +2602,86 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) { GGML_LOG_CONT("."); } + // gated_delta_net: one kernel per (S_V, KDA, tgpp) triple. + { + #ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gated_delta_net.cl.h" + }; + #else + const std::string kernel_src = read_file("gated_delta_net.cl"); + #endif + + const int gdn_sizes[4] = { 16, 32, 64, 128 }; + const int sg_size = backend_ctx->gpu_family == GPU_FAMILY::ADRENO ? 64 : backend_ctx->gpu_family == GPU_FAMILY::INTEL ? 32 : -1; + if (sg_size < 0) { + GGML_LOG_ERROR("Unsupported GPU Family: only Adreno and Intel are supported.\n"); + exit(1); + } + + for (int si = 0; si < 4; si++) { + const int S_V = gdn_sizes[si]; + + // MUST match the dispatcher heuristic in ggml_cl_gated_delta_net exactly. + int lanes_per_column; + if (S_V >= 128) { + lanes_per_column = 8; + } else { + lanes_per_column = std::min(S_V, sg_size); + } + + // Round LANES_PER_COLUMN down until it is: + // * power-of-two + // * divides both S_V and sg_size + while (lanes_per_column > 1 && + (((lanes_per_column & (lanes_per_column - 1)) != 0) || + (S_V % lanes_per_column) != 0 || + (sg_size % lanes_per_column) != 0)) { + lanes_per_column >>= 1; + } + + GGML_ASSERT(lanes_per_column >= 1); + GGML_ASSERT(((lanes_per_column & (lanes_per_column - 1)) == 0)); + GGML_ASSERT((S_V % lanes_per_column) == 0); + GGML_ASSERT((sg_size % lanes_per_column) == 0); + + const bool is_partial_reduce = (lanes_per_column != 1) && (lanes_per_column < sg_size); + int use_qcom_shuffle = 0; + if (is_partial_reduce) { + if (backend_ctx->has_qcom_subgroup_shuffle) { + use_qcom_shuffle = 1; + } + } + for (int kda = 0; kda < 2; kda++) { + for (int tgpp = 0; tgpp < 2; tgpp++) { + const int cpl = (tgpp == 0) ? 1 : 4; + const int spw = (tgpp == 0) ? 1 : 1; + + std::string opts = compile_opts; + opts += " -DS_V=" + std::to_string(S_V); + opts += " -DKDA=" + std::to_string(kda); + opts += " -DSUBGROUP_SIZE=" + std::to_string(sg_size); + opts += " -DLANES_PER_COLUMN=" + std::to_string(lanes_per_column); + opts += " -DCOLS_PER_LANE_GROUP=" + std::to_string(cpl); + opts += " -DUSE_QCOM_SUBGROUP_SHUFFLE=" + std::to_string(use_qcom_shuffle); + + // Since spw=1 is found to be optimal, SUBGROUPS_PER_WG > 1 code in + // the kernel is removed. If you want to experiment with spw > 1, + // Please remember to implement code to handle it. + opts += " -DSUBGROUPS_PER_WG=" + std::to_string(spw); + + cl_program prog = build_program_from_source( + backend_ctx->context, backend_ctx->device, kernel_src.c_str(), opts); + + CL_CHECK((backend_ctx->kernel_gated_delta_net_f32[si][kda][tgpp] = + clCreateKernel(prog, "kernel_gated_delta_net", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + } + } + } + GGML_LOG_CONT("."); + } + // mul_mv_id_q4_0_f32_8x_flat { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -2827,7 +2912,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) { #ifdef GGML_OPENCL_EMBED_KERNELS const std::string kernel_src { #include "gemm_noshuffle_q4_1_f32.cl.h" - }; + }; #else const std::string kernel_src = read_file("gemm_noshuffle_q4_1_f32.cl"); #endif @@ -2866,7 +2951,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) { #ifdef GGML_OPENCL_EMBED_KERNELS const std::string kernel_src { #include "gemm_noshuffle_iq4_nl_f32.cl.h" - }; + }; #else const std::string kernel_src = read_file("gemm_noshuffle_iq4_nl_f32.cl"); #endif @@ -2905,7 +2990,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) { #ifdef GGML_OPENCL_EMBED_KERNELS const std::string kernel_src { #include "gemm_noshuffle_q8_0_f32.cl.h" - }; + }; #else const std::string kernel_src = read_file("gemm_noshuffle_q8_0_f32.cl"); #endif @@ -2946,7 +3031,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) { #ifdef GGML_OPENCL_EMBED_KERNELS const std::string kernel_src { #include "gemm_noshuffle_q4_k_f32.cl.h" - }; + }; #else const std::string kernel_src = read_file("gemm_noshuffle_q4_k_f32.cl"); #endif @@ -3781,6 +3866,16 @@ static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) { clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL); ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated + // check support for qcom_subgroup_shuffle + if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") != NULL) { + GGML_LOG_INFO("ggml_opencl: cl_khr_subgroups support: true\n"); + if (strstr(ext_buffer, "cl_qcom_subgroup_shuffle") != NULL) { + backend_ctx->has_qcom_subgroup_shuffle = true; + } + } + GGML_LOG_INFO("ggml_opencl: cl_qcom_subgroup_shuffle support: %s\n", + backend_ctx->has_qcom_subgroup_shuffle ? "true" : "false"); + // Check if ext_buffer contains cl_khr_fp16 backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL; GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false"); @@ -4832,17 +4927,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: - return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; + return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; case GGML_UNARY_OP_SIGMOID: return ggml_is_contiguous(op->src[0]); case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_NEG: case GGML_UNARY_OP_EXP: - return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16; + return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16; case GGML_UNARY_OP_EXPM1: - return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16; + return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16; case GGML_UNARY_OP_SOFTPLUS: - return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16; + return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16; default: return false; } @@ -4891,6 +4986,15 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32); case GGML_OP_SSM_CONV: return (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32); + case GGML_OP_GATED_DELTA_NET: + { + // Match the Vulkan backend: only F32 -> F32, S_v in {16, 32, 64, 128}. + if (op->src[0]->type != GGML_TYPE_F32 || op->type != GGML_TYPE_F32) { + return false; + } + const int64_t S_v = op->src[2]->ne[0]; + return S_v == 16 || S_v == 32 || S_v == 64 || S_v == 128; + } case GGML_OP_CONCAT: return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; case GGML_OP_TIMESTEP_EMBEDDING: @@ -10555,7 +10659,7 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t size_t local_work_size[] = { lws0, 1, 1 }; size_t * local_work_size_ptr = local_work_size; - if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) { + if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) { local_work_size_ptr = nullptr; } @@ -17052,6 +17156,185 @@ static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } +static void ggml_cl_gated_delta_net(ggml_backend_t backend, ggml_tensor * dst) { + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + const ggml_tensor * src_q = dst->src[0]; + const ggml_tensor * src_k = dst->src[1]; + const ggml_tensor * src_v = dst->src[2]; + const ggml_tensor * src_g = dst->src[3]; + const ggml_tensor * src_beta = dst->src[4]; + const ggml_tensor * src_state = dst->src[5]; + + GGML_ASSERT(src_q && src_q->extra); + GGML_ASSERT(src_k && src_k->extra); + GGML_ASSERT(src_v && src_v->extra); + GGML_ASSERT(src_g && src_g->extra); + GGML_ASSERT(src_beta && src_beta->extra); + GGML_ASSERT(src_state && src_state->extra); + + ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *) backend->context; + + const cl_uint S_v = (cl_uint) src_v->ne[0]; + const cl_uint H_v = (cl_uint) src_v->ne[1]; + const cl_uint n_tokens = (cl_uint) src_v->ne[2]; + const cl_uint n_seqs = (cl_uint) src_v->ne[3]; + const cl_uint K = (cl_uint) src_state->ne[1]; + + int si; + switch (S_v) { + case 16: si = 0; break; + case 32: si = 1; break; + case 64: si = 2; break; + case 128: si = 3; break; + default: + GGML_ASSERT(false && "ggml_cl_gated_delta_net: unsupported S_v"); + } + + const int kda = (src_g->ne[0] == (int64_t) S_v) ? 1 : 0; + + // TODO: Optimize when S_v!=128. Not necessary for now as Qwen3.5/6 are all S_v=128 + // token generation mode (tgpp=0): + // process 1 token at a time, so columns per lane (cpl) == 1 + // prompt processing mode (tgpp=1): + // cpl=4 to process 4 tokens for single-token. 4 is chosen for Adreno 750 as per + // work-item/thread has at most 128 registers. + // All Qwen3.5/6 models are S_v == 128, so LANES_PER_COLUMN == 8 + // such that ROWS_PER_LANE = 128/8 = 16 + // Variables in the kernel: + // k_reg, q_reg, g_exp are all 16 floats + // s_shard has cpl*ROWS_PER_LANE = 4*16 = 64 floats + // Total 112 registers used. + // subgroups_per_workgroup (spw) can be set to 1,2,4,8,16 for tg and 1,2,4 for pp + // for S_v=128. + // Empirically found that when spw=1, we get the best performance for both tg and pp + const int tgpp = (n_tokens == 1) ? 0 : 1; + const int cpl = (tgpp == 0) ? 1 : 4; + // spw needs adjustment when S_v != 128 + const int spw = (tgpp == 0) ? 1 : 1; + + cl_kernel kernel = backend_ctx->kernel_gated_delta_net_f32[si][kda][tgpp]; + GGML_ASSERT(kernel != nullptr); + + const cl_uint s_off = S_v * H_v * n_tokens * n_seqs; + + const cl_uint sq1 = (cl_uint)(src_q->nb[1] / sizeof(float)); + const cl_uint sq2 = (cl_uint)(src_q->nb[2] / sizeof(float)); + const cl_uint sq3 = (cl_uint)(src_q->nb[3] / sizeof(float)); + const cl_uint sv1 = (cl_uint)(src_v->nb[1] / sizeof(float)); + const cl_uint sv2 = (cl_uint)(src_v->nb[2] / sizeof(float)); + const cl_uint sv3 = (cl_uint)(src_v->nb[3] / sizeof(float)); + const cl_uint sb1 = (cl_uint)(src_beta->nb[1] / sizeof(float)); + const cl_uint sb2 = (cl_uint)(src_beta->nb[2] / sizeof(float)); + const cl_uint sb3 = (cl_uint)(src_beta->nb[3] / sizeof(float)); + + const cl_uint H_k = (cl_uint) src_q->ne[1]; + const cl_uint rq3 = (cl_uint)(src_v->ne[3] / src_q->ne[3]); + + const float scale = 1.0f / sqrtf((float) S_v); + + ggml_tensor_extra_cl * extra_q = (ggml_tensor_extra_cl *) src_q->extra; + ggml_tensor_extra_cl * extra_k = (ggml_tensor_extra_cl *) src_k->extra; + ggml_tensor_extra_cl * extra_v = (ggml_tensor_extra_cl *) src_v->extra; + ggml_tensor_extra_cl * extra_g = (ggml_tensor_extra_cl *) src_g->extra; + ggml_tensor_extra_cl * extra_beta = (ggml_tensor_extra_cl *) src_beta->extra; + ggml_tensor_extra_cl * extra_state = (ggml_tensor_extra_cl *) src_state->extra; + ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *) dst->extra; + + const cl_ulong off_q = extra_q->offset + src_q->view_offs; + const cl_ulong off_k = extra_k->offset + src_k->view_offs; + const cl_ulong off_v = extra_v->offset + src_v->view_offs; + const cl_ulong off_g = extra_g->offset + src_g->view_offs; + const cl_ulong off_beta = extra_beta->offset + src_beta->view_offs; + const cl_ulong off_state = extra_state->offset + src_state->view_offs; + const cl_ulong off_dst = extra_dst->offset + dst->view_offs; + + int idx = 0; + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_q->data_device)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_q)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_k->data_device)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_k)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_v->data_device)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_v)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_g->data_device)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_g)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_beta->data_device)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_beta)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_state->data_device)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_state)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_dst->data_device)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_dst)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &H_v)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &n_tokens)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &n_seqs)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &s_off)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sq1)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sq2)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sq3)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sv1)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sv2)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sv3)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sb1)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sb2)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sb3)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &H_k)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &rq3)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(float), &scale)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &K)); + + // Subgroup size is 64 for Adreno and 32 for Intel + const int sg_size = backend_ctx->gpu_family == GPU_FAMILY::ADRENO ? 64 : backend_ctx->gpu_family == GPU_FAMILY::INTEL ? 32 : -1; + if (sg_size < 0) { + GGML_LOG_ERROR("Unsupported GPU Family: only Adreno and Intel are supported.\n"); + exit(1); + } + + // For the subgroup-shuffle kernel, we can safely prefer 8 lanes/column for S_v>=128 + // For the subgroup-shuffle kernel: + // S_v >= 128 -> prefer 8 lanes/column (good occupancy & register pressure tradeoff) + // else -> min(S_v, subgroup_size) + int lanes_per_column; + if ((int)S_v >= 128) { + lanes_per_column = 8; + } else { + lanes_per_column = std::min((int)S_v, sg_size); + } + + // Max workgroup size for Adreno 750 is 1024 + const int wg_size = sg_size * spw; + + // Ensure lanes_per_column is a power-of-two and divides both S_v and subgroup_size. + // (Required for lane-group shuffle-xor reduction correctness.) + while (lanes_per_column > 1 && + (((lanes_per_column & (lanes_per_column - 1)) != 0) || + (((int)S_v % lanes_per_column) != 0) || + (sg_size % lanes_per_column) != 0)) { + lanes_per_column >>= 1; + } + GGML_ASSERT(lanes_per_column >= 1); + GGML_ASSERT(((lanes_per_column & (lanes_per_column - 1)) == 0)); + GGML_ASSERT(((int)S_v % lanes_per_column) == 0); + GGML_ASSERT((sg_size % lanes_per_column) == 0); + + const int cols_per_wg = spw * (sg_size / lanes_per_column) * cpl; + GGML_ASSERT(cols_per_wg > 0); + GGML_ASSERT(((int)S_v % cols_per_wg) == 0); + + size_t global_work_size[3]; + size_t local_work_size[3]; + + global_work_size[0] = (size_t) H_v * (size_t) wg_size; + global_work_size[1] = (size_t) n_seqs; + global_work_size[2] = (size_t) S_v / (size_t) cols_per_wg; + + local_work_size[0] = (size_t) wg_size; + local_work_size[1] = 1; + local_work_size[2] = 1; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); +} + //------------------------------------------------------------------------------ // Op offloading //------------------------------------------------------------------------------ @@ -17267,8 +17550,8 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_group_norm; break; - case GGML_OP_REPEAT: - if (!any_on_device) { + case GGML_OP_REPEAT: + if (!any_on_device) { return false; } func = ggml_cl_repeat; @@ -17297,6 +17580,14 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_ssm_conv; break; + case GGML_OP_GATED_DELTA_NET: + if (!any_on_device) { + return false; + } + // GDN has 6 source tensors, so it cannot use the standard + // (src0, src1, dst) func signature. Dispatch directly and return. + ggml_cl_gated_delta_net(backend, tensor); + return true; case GGML_OP_CONCAT: if (!any_on_device) { return false; diff --git a/ggml/src/ggml-opencl/kernels/gated_delta_net.cl b/ggml/src/ggml-opencl/kernels/gated_delta_net.cl new file mode 100644 index 000000000..d11192f58 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gated_delta_net.cl @@ -0,0 +1,247 @@ +#pragma OPENCL EXTENSION cl_khr_subgroups : enable + +#ifdef cl_intel_required_subgroup_size +#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable +#define INTEL_GPU 1 +#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) +#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) +#elif defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +#ifndef S_V +#define S_V 128 +#endif +#ifndef KDA +#define KDA 0 +#endif +#ifndef SUBGROUP_SIZE +#define SUBGROUP_SIZE 64 +#endif +#ifndef LANES_PER_COLUMN +#define LANES_PER_COLUMN 8 +#endif +#ifndef COLS_PER_LANE_GROUP +#define COLS_PER_LANE_GROUP 1 +#endif +#ifndef SUBGROUPS_PER_WG +#define SUBGROUPS_PER_WG 1 +#endif +#ifndef USE_QCOM_SUBGROUP_SHUFFLE +#define USE_QCOM_SUBGROUP_SHUFFLE 0 +#endif + +#define WG_SIZE (SUBGROUP_SIZE * SUBGROUPS_PER_WG) +#define LANE_GROUPS_PER_SG (SUBGROUP_SIZE / LANES_PER_COLUMN) +#define COLS_PER_SG (LANE_GROUPS_PER_SG * COLS_PER_LANE_GROUP) +#define COLS_PER_WG (SUBGROUPS_PER_WG * COLS_PER_SG) +#define ROWS_PER_LANE (S_V / LANES_PER_COLUMN) + +#if USE_QCOM_SUBGROUP_SHUFFLE +#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable +#endif + +// XOR-based parallel sum +// This does a reduction across groups of LANES_PER_COLUMN +static inline float reduce_add_shmem(float partial, __local float * temp, uint lane) { +#if USE_QCOM_SUBGROUP_SHUFFLE + #pragma unroll + for (uint s = LANES_PER_COLUMN / 2u; s > 0u; s >>= 1u) { + partial += qcom_sub_group_shuffle_xor(partial, s, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, partial); + } + return partial; +#else + temp[lane] = partial; + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + #pragma unroll + for (uint s = LANES_PER_COLUMN / 2u; s > 0u; s >>= 1u) { + float other = temp[lane ^ s]; + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + temp[lane] += other; + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + } + const float result = temp[lane]; + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + return result; +#endif +} + +#define REDUCE_PARTIAL(partial, temp_ptr, lid) \ + ((LANES_PER_COLUMN == 1u) ? (partial) : reduce_add_shmem((partial), (temp_ptr), (lid))) + +// force compiler to optimize kernel for a specific fixed work-group size +__attribute__((reqd_work_group_size(WG_SIZE, 1, 1))) +#ifdef INTEL_GPU +REQD_SUBGROUP_SIZE_32 +#elif defined (ADRENO_GPU) +REQD_SUBGROUP_SIZE_64 +#endif +kernel void kernel_gated_delta_net( + global const char * q_buf, ulong off_q, + global const char * k_buf, ulong off_k, + global const char * v_buf, ulong off_v, + global const char * g_buf, ulong off_g, + global const char * beta_buf, ulong off_beta, + global const char * state_buf, ulong off_state, + global char * dst_buf, ulong off_dst, + uint H_v, + uint n_tokens, + uint n_seqs, + uint s_off, + uint sq1, uint sq2, uint sq3, + uint sv1, uint sv2, uint sv3, + uint sb1, uint sb2, uint sb3, + uint H_k, + uint rq3, + float scale, + uint K) { + + global const float * data_q = (global const float *)(q_buf + off_q); + global const float * data_k = (global const float *)(k_buf + off_k); + global const float * data_v = (global const float *)(v_buf + off_v); + global const float * data_g = (global const float *)(g_buf + off_g); + global const float * data_beta = (global const float *)(beta_buf + off_beta); + global const float * data_state = (global const float *)(state_buf + off_state); + global float * data_dst = (global float *)(dst_buf + off_dst); + + const uint head_id = get_group_id(0); + const uint seq_id = get_group_id(1); + const uint tid = (uint)get_local_id(0); + + const uint sg_id = get_sub_group_id(); // subgroup id + const uint sg_lid = get_sub_group_local_id(); // subgroup lane id + + const uint lane = sg_lid % LANES_PER_COLUMN; + const uint lane_group = sg_lid / LANES_PER_COLUMN; + const uint wg_col_base = get_group_id(2) * COLS_PER_WG; + const uint sg_col_base = wg_col_base + sg_id * COLS_PER_SG; + + const uint iq1 = head_id % H_k; // head index for Q and K + const uint iq3 = seq_id / rq3; // seq index for Q and K + + const uint state_size = S_V * S_V; + const uint state_base = (seq_id * K * H_v + head_id) * state_size; + const uint q_off_base = iq3 * sq3 + iq1 * sq1; + const uint v_off_base = seq_id * sv3 + head_id * sv1; + const uint gb_off_base = seq_id * sb3 + head_id * sb1; + const uint state_out_base = (seq_id * H_v + head_id) * state_size; + const uint state_size_per_snap = state_size * H_v * n_seqs; + + __local float reduce_temp[WG_SIZE]; + __local float * temp_ptr = reduce_temp + sg_id * SUBGROUP_SIZE; + + float s_shard[COLS_PER_LANE_GROUP][ROWS_PER_LANE]; + #pragma unroll + for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) { + const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group; + #pragma unroll + for (uint r = 0; r < ROWS_PER_LANE; r++) { + s_shard[cg][r] = data_state[state_base + col * S_V + r * LANES_PER_COLUMN + lane]; + } + } + + const int shift = (int)n_tokens - (int)K; + uint attn_off = (seq_id * n_tokens * H_v + head_id) * S_V; + + for (uint t = 0; t < n_tokens; t++) { + const uint q_off = q_off_base + t * sq2; + const uint k_off = q_off; + const uint v_off = v_off_base + t * sv2; + const uint gb_off = gb_off_base + t * sb2; + const float beta_val = data_beta[gb_off]; + + float k_reg[ROWS_PER_LANE]; + float q_reg[ROWS_PER_LANE]; +#if KDA + float g_exp[ROWS_PER_LANE]; + #pragma unroll + for (uint r = 0; r < ROWS_PER_LANE; r++) { + const uint i = r * LANES_PER_COLUMN + lane; + k_reg[r] = data_k[k_off + i]; + q_reg[r] = data_q[q_off + i]; + g_exp[r] = exp(data_g[gb_off * S_V + i]); + } +#else + const float g_val = exp(data_g[gb_off]); + + #pragma unroll + for (uint r = 0; r < ROWS_PER_LANE; r++) { + const uint i = r * LANES_PER_COLUMN + lane; + k_reg[r] = data_k[k_off + i]; + q_reg[r] = data_q[q_off + i]; + } +#endif + + #pragma unroll + for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) { + const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group; + float v_val = data_v[v_off + col]; + + float kv_shard = 0.0f; + #pragma unroll + for (uint r = 0; r < ROWS_PER_LANE; r++) { +#if KDA + float gs = g_exp[r] * s_shard[cg][r]; + kv_shard += gs * k_reg[r]; +#else + kv_shard += s_shard[cg][r] * k_reg[r]; +#endif + } + +#if !KDA + kv_shard *= g_val; // Applied once instead of ROWS_PER_LANE times +#endif + + const float kv_col = REDUCE_PARTIAL(kv_shard, temp_ptr, sg_lid); + + const float delta_col = (v_val - kv_col) * beta_val; + + float attn_partial = 0.0f; + #pragma unroll + for (uint r = 0; r < ROWS_PER_LANE; r++) { +#if KDA + float gs = g_exp[r] * s_shard[cg][r]; +#else + float gs = g_val * s_shard[cg][r]; +#endif + s_shard[cg][r] = gs + k_reg[r] * delta_col; + attn_partial += s_shard[cg][r] * q_reg[r]; + } + const float attn_col = REDUCE_PARTIAL(attn_partial, temp_ptr, sg_lid); + + if (lane == 0) { + data_dst[attn_off + col] = attn_col * scale; + } + } + attn_off += S_V * H_v; + + if (K > 1u) { + const int target_slot = (int)t - shift; + if (target_slot >= 0 && target_slot < (int)K) { + #pragma unroll + for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) { + const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group; + const uint slot_base = s_off + (uint)target_slot * state_size_per_snap + state_out_base; + #pragma unroll + for (uint r = 0; r < ROWS_PER_LANE; r++) { + data_dst[slot_base + col * S_V + r * LANES_PER_COLUMN + lane] = s_shard[cg][r]; + } + } + } + } + } + + if (K == 1u) { + #pragma unroll + for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) { + const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group; + #pragma unroll + for (uint r = 0; r < ROWS_PER_LANE; r++) { + data_dst[s_off + state_base + col * S_V + r * LANES_PER_COLUMN + lane] = s_shard[cg][r]; + } + } + } +}