opencl: OP_GATED_DELTA_NET (#23312)

* OP_GATED_DELTA_NET impl

* add back lanes_per_column declaration

* removed has_subgroup_arithmetic and has_subgroup_clustered_reduce

* removed trailing spaces and fixes indentation. Hard coded subgroup size for Adreno and Intel. Return not supported when K>1 state snapshot

* support for K>1 state snapshot

* removed picky indent multiple of 4 fixes

* removed return that won\'t be executed
This commit is contained in:
ymcki 2026-05-28 12:23:21 +08:00 committed by GitHub
parent f12cc6d0fa
commit 8ad8aef447
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 566 additions and 27 deletions

View file

@ -164,6 +164,7 @@ set(GGML_OPENCL_KERNELS
sqr
sqrt
ssm_conv
gated_delta_net
sub
sum_rows
cumsum

View file

@ -412,6 +412,7 @@ struct ggml_backend_opencl_context {
size_t max_workgroup_size;
bool fp16_support;
bool has_vector_subgroup_broadcast;
bool has_qcom_subgroup_shuffle = false; // cl_qcom_subgroup_shuffle
bool disable_fusion;
std::regex *opfilter = nullptr; // regex of ops to not claim
@ -634,6 +635,10 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_conv_2d_f32;
cl_kernel kernel_conv_2d_f16_f32;
cl_kernel kernel_ssm_conv_f32_f32, kernel_ssm_conv_f32_f32_4;
// [size_idx][kda][tgpp] where size_idx: 0=S_V=16, 1=32, 2=64, 3=128; kda: 0 or 1.
// tgpp 0 = TG variant (COLS_PER_LANE_GROUP=1), tgpp 1 = prefill variant (COLS_PER_LANE_GROUP=4).
cl_kernel kernel_gated_delta_net_f32[4][2][2] = {};
cl_kernel kernel_timestep_embedding;
cl_kernel kernel_gemv_moe_q4_0_f32_ns, kernel_gemm_moe_q4_0_f32_ns;
cl_kernel kernel_gemv_moe_q4_1_f32_ns, kernel_gemm_moe_q4_1_f32_ns;
@ -837,16 +842,16 @@ static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
static std::vector<std::unique_ptr<ggml_backend_opencl_device_context>> g_ggml_backend_opencl_dev_ctxs;
inline std::string read_file(const std::string &path) {
std::ifstream ifs(path);
if (!ifs) {
return "";
}
std::string text;
ifs.seekg(0, std::ios::end);
text.resize(ifs.tellg());
ifs.seekg(0, std::ios::beg);
ifs.read(&text[0], text.size());
return text;
std::ifstream ifs(path);
if (!ifs) {
return "";
}
std::string text;
ifs.seekg(0, std::ios::end);
text.resize(ifs.tellg());
ifs.seekg(0, std::ios::beg);
ifs.read(&text[0], text.size());
return text;
}
static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) {
@ -2463,12 +2468,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err));
if (backend_ctx->program_upscale) {
cl_int err_bilinear;
backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
if (err_bilinear != CL_SUCCESS) {
cl_int err_bilinear;
backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
if (err_bilinear != CL_SUCCESS) {
GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear);
backend_ctx->kernel_upscale_bilinear = nullptr;
}
}
} else {
backend_ctx->kernel_upscale_bilinear = nullptr;
}
@ -2538,8 +2543,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
GGML_LOG_CONT(".");
}
// conv2d
{
// conv2d
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "conv2d.cl.h"
@ -2597,6 +2602,86 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
GGML_LOG_CONT(".");
}
// gated_delta_net: one kernel per (S_V, KDA, tgpp) triple.
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "gated_delta_net.cl.h"
};
#else
const std::string kernel_src = read_file("gated_delta_net.cl");
#endif
const int gdn_sizes[4] = { 16, 32, 64, 128 };
const int sg_size = backend_ctx->gpu_family == GPU_FAMILY::ADRENO ? 64 : backend_ctx->gpu_family == GPU_FAMILY::INTEL ? 32 : -1;
if (sg_size < 0) {
GGML_LOG_ERROR("Unsupported GPU Family: only Adreno and Intel are supported.\n");
exit(1);
}
for (int si = 0; si < 4; si++) {
const int S_V = gdn_sizes[si];
// MUST match the dispatcher heuristic in ggml_cl_gated_delta_net exactly.
int lanes_per_column;
if (S_V >= 128) {
lanes_per_column = 8;
} else {
lanes_per_column = std::min(S_V, sg_size);
}
// Round LANES_PER_COLUMN down until it is:
// * power-of-two
// * divides both S_V and sg_size
while (lanes_per_column > 1 &&
(((lanes_per_column & (lanes_per_column - 1)) != 0) ||
(S_V % lanes_per_column) != 0 ||
(sg_size % lanes_per_column) != 0)) {
lanes_per_column >>= 1;
}
GGML_ASSERT(lanes_per_column >= 1);
GGML_ASSERT(((lanes_per_column & (lanes_per_column - 1)) == 0));
GGML_ASSERT((S_V % lanes_per_column) == 0);
GGML_ASSERT((sg_size % lanes_per_column) == 0);
const bool is_partial_reduce = (lanes_per_column != 1) && (lanes_per_column < sg_size);
int use_qcom_shuffle = 0;
if (is_partial_reduce) {
if (backend_ctx->has_qcom_subgroup_shuffle) {
use_qcom_shuffle = 1;
}
}
for (int kda = 0; kda < 2; kda++) {
for (int tgpp = 0; tgpp < 2; tgpp++) {
const int cpl = (tgpp == 0) ? 1 : 4;
const int spw = (tgpp == 0) ? 1 : 1;
std::string opts = compile_opts;
opts += " -DS_V=" + std::to_string(S_V);
opts += " -DKDA=" + std::to_string(kda);
opts += " -DSUBGROUP_SIZE=" + std::to_string(sg_size);
opts += " -DLANES_PER_COLUMN=" + std::to_string(lanes_per_column);
opts += " -DCOLS_PER_LANE_GROUP=" + std::to_string(cpl);
opts += " -DUSE_QCOM_SUBGROUP_SHUFFLE=" + std::to_string(use_qcom_shuffle);
// Since spw=1 is found to be optimal, SUBGROUPS_PER_WG > 1 code in
// the kernel is removed. If you want to experiment with spw > 1,
// Please remember to implement code to handle it.
opts += " -DSUBGROUPS_PER_WG=" + std::to_string(spw);
cl_program prog = build_program_from_source(
backend_ctx->context, backend_ctx->device, kernel_src.c_str(), opts);
CL_CHECK((backend_ctx->kernel_gated_delta_net_f32[si][kda][tgpp] =
clCreateKernel(prog, "kernel_gated_delta_net", &err), err));
CL_CHECK(clReleaseProgram(prog));
}
}
}
GGML_LOG_CONT(".");
}
// mul_mv_id_q4_0_f32_8x_flat
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@ -2827,7 +2912,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "gemm_noshuffle_q4_1_f32.cl.h"
};
};
#else
const std::string kernel_src = read_file("gemm_noshuffle_q4_1_f32.cl");
#endif
@ -2866,7 +2951,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "gemm_noshuffle_iq4_nl_f32.cl.h"
};
};
#else
const std::string kernel_src = read_file("gemm_noshuffle_iq4_nl_f32.cl");
#endif
@ -2905,7 +2990,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "gemm_noshuffle_q8_0_f32.cl.h"
};
};
#else
const std::string kernel_src = read_file("gemm_noshuffle_q8_0_f32.cl");
#endif
@ -2946,7 +3031,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "gemm_noshuffle_q4_k_f32.cl.h"
};
};
#else
const std::string kernel_src = read_file("gemm_noshuffle_q4_k_f32.cl");
#endif
@ -3781,6 +3866,16 @@ static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
// check support for qcom_subgroup_shuffle
if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") != NULL) {
GGML_LOG_INFO("ggml_opencl: cl_khr_subgroups support: true\n");
if (strstr(ext_buffer, "cl_qcom_subgroup_shuffle") != NULL) {
backend_ctx->has_qcom_subgroup_shuffle = true;
}
}
GGML_LOG_INFO("ggml_opencl: cl_qcom_subgroup_shuffle support: %s\n",
backend_ctx->has_qcom_subgroup_shuffle ? "true" : "false");
// Check if ext_buffer contains cl_khr_fp16
backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
@ -4832,17 +4927,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_GELU_QUICK:
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
case GGML_UNARY_OP_SIGMOID:
return ggml_is_contiguous(op->src[0]);
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_NEG:
case GGML_UNARY_OP_EXP:
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
case GGML_UNARY_OP_EXPM1:
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
case GGML_UNARY_OP_SOFTPLUS:
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
default:
return false;
}
@ -4891,6 +4986,15 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
(op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32);
case GGML_OP_SSM_CONV:
return (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32);
case GGML_OP_GATED_DELTA_NET:
{
// Match the Vulkan backend: only F32 -> F32, S_v in {16, 32, 64, 128}.
if (op->src[0]->type != GGML_TYPE_F32 || op->type != GGML_TYPE_F32) {
return false;
}
const int64_t S_v = op->src[2]->ne[0];
return S_v == 16 || S_v == 32 || S_v == 64 || S_v == 128;
}
case GGML_OP_CONCAT:
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
case GGML_OP_TIMESTEP_EMBEDDING:
@ -10555,7 +10659,7 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
size_t local_work_size[] = { lws0, 1, 1 };
size_t * local_work_size_ptr = local_work_size;
if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) {
if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) {
local_work_size_ptr = nullptr;
}
@ -17052,6 +17156,185 @@ static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
static void ggml_cl_gated_delta_net(ggml_backend_t backend, ggml_tensor * dst) {
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
const ggml_tensor * src_q = dst->src[0];
const ggml_tensor * src_k = dst->src[1];
const ggml_tensor * src_v = dst->src[2];
const ggml_tensor * src_g = dst->src[3];
const ggml_tensor * src_beta = dst->src[4];
const ggml_tensor * src_state = dst->src[5];
GGML_ASSERT(src_q && src_q->extra);
GGML_ASSERT(src_k && src_k->extra);
GGML_ASSERT(src_v && src_v->extra);
GGML_ASSERT(src_g && src_g->extra);
GGML_ASSERT(src_beta && src_beta->extra);
GGML_ASSERT(src_state && src_state->extra);
ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *) backend->context;
const cl_uint S_v = (cl_uint) src_v->ne[0];
const cl_uint H_v = (cl_uint) src_v->ne[1];
const cl_uint n_tokens = (cl_uint) src_v->ne[2];
const cl_uint n_seqs = (cl_uint) src_v->ne[3];
const cl_uint K = (cl_uint) src_state->ne[1];
int si;
switch (S_v) {
case 16: si = 0; break;
case 32: si = 1; break;
case 64: si = 2; break;
case 128: si = 3; break;
default:
GGML_ASSERT(false && "ggml_cl_gated_delta_net: unsupported S_v");
}
const int kda = (src_g->ne[0] == (int64_t) S_v) ? 1 : 0;
// TODO: Optimize when S_v!=128. Not necessary for now as Qwen3.5/6 are all S_v=128
// token generation mode (tgpp=0):
// process 1 token at a time, so columns per lane (cpl) == 1
// prompt processing mode (tgpp=1):
// cpl=4 to process 4 tokens for single-token. 4 is chosen for Adreno 750 as per
// work-item/thread has at most 128 registers.
// All Qwen3.5/6 models are S_v == 128, so LANES_PER_COLUMN == 8
// such that ROWS_PER_LANE = 128/8 = 16
// Variables in the kernel:
// k_reg, q_reg, g_exp are all 16 floats
// s_shard has cpl*ROWS_PER_LANE = 4*16 = 64 floats
// Total 112 registers used.
// subgroups_per_workgroup (spw) can be set to 1,2,4,8,16 for tg and 1,2,4 for pp
// for S_v=128.
// Empirically found that when spw=1, we get the best performance for both tg and pp
const int tgpp = (n_tokens == 1) ? 0 : 1;
const int cpl = (tgpp == 0) ? 1 : 4;
// spw needs adjustment when S_v != 128
const int spw = (tgpp == 0) ? 1 : 1;
cl_kernel kernel = backend_ctx->kernel_gated_delta_net_f32[si][kda][tgpp];
GGML_ASSERT(kernel != nullptr);
const cl_uint s_off = S_v * H_v * n_tokens * n_seqs;
const cl_uint sq1 = (cl_uint)(src_q->nb[1] / sizeof(float));
const cl_uint sq2 = (cl_uint)(src_q->nb[2] / sizeof(float));
const cl_uint sq3 = (cl_uint)(src_q->nb[3] / sizeof(float));
const cl_uint sv1 = (cl_uint)(src_v->nb[1] / sizeof(float));
const cl_uint sv2 = (cl_uint)(src_v->nb[2] / sizeof(float));
const cl_uint sv3 = (cl_uint)(src_v->nb[3] / sizeof(float));
const cl_uint sb1 = (cl_uint)(src_beta->nb[1] / sizeof(float));
const cl_uint sb2 = (cl_uint)(src_beta->nb[2] / sizeof(float));
const cl_uint sb3 = (cl_uint)(src_beta->nb[3] / sizeof(float));
const cl_uint H_k = (cl_uint) src_q->ne[1];
const cl_uint rq3 = (cl_uint)(src_v->ne[3] / src_q->ne[3]);
const float scale = 1.0f / sqrtf((float) S_v);
ggml_tensor_extra_cl * extra_q = (ggml_tensor_extra_cl *) src_q->extra;
ggml_tensor_extra_cl * extra_k = (ggml_tensor_extra_cl *) src_k->extra;
ggml_tensor_extra_cl * extra_v = (ggml_tensor_extra_cl *) src_v->extra;
ggml_tensor_extra_cl * extra_g = (ggml_tensor_extra_cl *) src_g->extra;
ggml_tensor_extra_cl * extra_beta = (ggml_tensor_extra_cl *) src_beta->extra;
ggml_tensor_extra_cl * extra_state = (ggml_tensor_extra_cl *) src_state->extra;
ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *) dst->extra;
const cl_ulong off_q = extra_q->offset + src_q->view_offs;
const cl_ulong off_k = extra_k->offset + src_k->view_offs;
const cl_ulong off_v = extra_v->offset + src_v->view_offs;
const cl_ulong off_g = extra_g->offset + src_g->view_offs;
const cl_ulong off_beta = extra_beta->offset + src_beta->view_offs;
const cl_ulong off_state = extra_state->offset + src_state->view_offs;
const cl_ulong off_dst = extra_dst->offset + dst->view_offs;
int idx = 0;
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_q->data_device));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_q));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_k->data_device));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_k));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_v->data_device));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_v));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_g->data_device));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_g));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_beta->data_device));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_beta));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_state->data_device));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_state));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_dst->data_device));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_dst));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &H_v));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &n_tokens));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &n_seqs));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &s_off));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sq1));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sq2));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sq3));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sv1));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sv2));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sv3));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sb1));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sb2));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sb3));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &H_k));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &rq3));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(float), &scale));
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &K));
// Subgroup size is 64 for Adreno and 32 for Intel
const int sg_size = backend_ctx->gpu_family == GPU_FAMILY::ADRENO ? 64 : backend_ctx->gpu_family == GPU_FAMILY::INTEL ? 32 : -1;
if (sg_size < 0) {
GGML_LOG_ERROR("Unsupported GPU Family: only Adreno and Intel are supported.\n");
exit(1);
}
// For the subgroup-shuffle kernel, we can safely prefer 8 lanes/column for S_v>=128
// For the subgroup-shuffle kernel:
// S_v >= 128 -> prefer 8 lanes/column (good occupancy & register pressure tradeoff)
// else -> min(S_v, subgroup_size)
int lanes_per_column;
if ((int)S_v >= 128) {
lanes_per_column = 8;
} else {
lanes_per_column = std::min((int)S_v, sg_size);
}
// Max workgroup size for Adreno 750 is 1024
const int wg_size = sg_size * spw;
// Ensure lanes_per_column is a power-of-two and divides both S_v and subgroup_size.
// (Required for lane-group shuffle-xor reduction correctness.)
while (lanes_per_column > 1 &&
(((lanes_per_column & (lanes_per_column - 1)) != 0) ||
(((int)S_v % lanes_per_column) != 0) ||
(sg_size % lanes_per_column) != 0)) {
lanes_per_column >>= 1;
}
GGML_ASSERT(lanes_per_column >= 1);
GGML_ASSERT(((lanes_per_column & (lanes_per_column - 1)) == 0));
GGML_ASSERT(((int)S_v % lanes_per_column) == 0);
GGML_ASSERT((sg_size % lanes_per_column) == 0);
const int cols_per_wg = spw * (sg_size / lanes_per_column) * cpl;
GGML_ASSERT(cols_per_wg > 0);
GGML_ASSERT(((int)S_v % cols_per_wg) == 0);
size_t global_work_size[3];
size_t local_work_size[3];
global_work_size[0] = (size_t) H_v * (size_t) wg_size;
global_work_size[1] = (size_t) n_seqs;
global_work_size[2] = (size_t) S_v / (size_t) cols_per_wg;
local_work_size[0] = (size_t) wg_size;
local_work_size[1] = 1;
local_work_size[2] = 1;
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
//------------------------------------------------------------------------------
// Op offloading
//------------------------------------------------------------------------------
@ -17267,8 +17550,8 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_group_norm;
break;
case GGML_OP_REPEAT:
if (!any_on_device) {
case GGML_OP_REPEAT:
if (!any_on_device) {
return false;
}
func = ggml_cl_repeat;
@ -17297,6 +17580,14 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_ssm_conv;
break;
case GGML_OP_GATED_DELTA_NET:
if (!any_on_device) {
return false;
}
// GDN has 6 source tensors, so it cannot use the standard
// (src0, src1, dst) func signature. Dispatch directly and return.
ggml_cl_gated_delta_net(backend, tensor);
return true;
case GGML_OP_CONCAT:
if (!any_on_device) {
return false;

View file

@ -0,0 +1,247 @@
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#ifndef S_V
#define S_V 128
#endif
#ifndef KDA
#define KDA 0
#endif
#ifndef SUBGROUP_SIZE
#define SUBGROUP_SIZE 64
#endif
#ifndef LANES_PER_COLUMN
#define LANES_PER_COLUMN 8
#endif
#ifndef COLS_PER_LANE_GROUP
#define COLS_PER_LANE_GROUP 1
#endif
#ifndef SUBGROUPS_PER_WG
#define SUBGROUPS_PER_WG 1
#endif
#ifndef USE_QCOM_SUBGROUP_SHUFFLE
#define USE_QCOM_SUBGROUP_SHUFFLE 0
#endif
#define WG_SIZE (SUBGROUP_SIZE * SUBGROUPS_PER_WG)
#define LANE_GROUPS_PER_SG (SUBGROUP_SIZE / LANES_PER_COLUMN)
#define COLS_PER_SG (LANE_GROUPS_PER_SG * COLS_PER_LANE_GROUP)
#define COLS_PER_WG (SUBGROUPS_PER_WG * COLS_PER_SG)
#define ROWS_PER_LANE (S_V / LANES_PER_COLUMN)
#if USE_QCOM_SUBGROUP_SHUFFLE
#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable
#endif
// XOR-based parallel sum
// This does a reduction across groups of LANES_PER_COLUMN
static inline float reduce_add_shmem(float partial, __local float * temp, uint lane) {
#if USE_QCOM_SUBGROUP_SHUFFLE
#pragma unroll
for (uint s = LANES_PER_COLUMN / 2u; s > 0u; s >>= 1u) {
partial += qcom_sub_group_shuffle_xor(partial, s, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, partial);
}
return partial;
#else
temp[lane] = partial;
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll
for (uint s = LANES_PER_COLUMN / 2u; s > 0u; s >>= 1u) {
float other = temp[lane ^ s];
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
temp[lane] += other;
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
}
const float result = temp[lane];
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
return result;
#endif
}
#define REDUCE_PARTIAL(partial, temp_ptr, lid) \
((LANES_PER_COLUMN == 1u) ? (partial) : reduce_add_shmem((partial), (temp_ptr), (lid)))
// force compiler to optimize kernel for a specific fixed work-group size
__attribute__((reqd_work_group_size(WG_SIZE, 1, 1)))
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_32
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_gated_delta_net(
global const char * q_buf, ulong off_q,
global const char * k_buf, ulong off_k,
global const char * v_buf, ulong off_v,
global const char * g_buf, ulong off_g,
global const char * beta_buf, ulong off_beta,
global const char * state_buf, ulong off_state,
global char * dst_buf, ulong off_dst,
uint H_v,
uint n_tokens,
uint n_seqs,
uint s_off,
uint sq1, uint sq2, uint sq3,
uint sv1, uint sv2, uint sv3,
uint sb1, uint sb2, uint sb3,
uint H_k,
uint rq3,
float scale,
uint K) {
global const float * data_q = (global const float *)(q_buf + off_q);
global const float * data_k = (global const float *)(k_buf + off_k);
global const float * data_v = (global const float *)(v_buf + off_v);
global const float * data_g = (global const float *)(g_buf + off_g);
global const float * data_beta = (global const float *)(beta_buf + off_beta);
global const float * data_state = (global const float *)(state_buf + off_state);
global float * data_dst = (global float *)(dst_buf + off_dst);
const uint head_id = get_group_id(0);
const uint seq_id = get_group_id(1);
const uint tid = (uint)get_local_id(0);
const uint sg_id = get_sub_group_id(); // subgroup id
const uint sg_lid = get_sub_group_local_id(); // subgroup lane id
const uint lane = sg_lid % LANES_PER_COLUMN;
const uint lane_group = sg_lid / LANES_PER_COLUMN;
const uint wg_col_base = get_group_id(2) * COLS_PER_WG;
const uint sg_col_base = wg_col_base + sg_id * COLS_PER_SG;
const uint iq1 = head_id % H_k; // head index for Q and K
const uint iq3 = seq_id / rq3; // seq index for Q and K
const uint state_size = S_V * S_V;
const uint state_base = (seq_id * K * H_v + head_id) * state_size;
const uint q_off_base = iq3 * sq3 + iq1 * sq1;
const uint v_off_base = seq_id * sv3 + head_id * sv1;
const uint gb_off_base = seq_id * sb3 + head_id * sb1;
const uint state_out_base = (seq_id * H_v + head_id) * state_size;
const uint state_size_per_snap = state_size * H_v * n_seqs;
__local float reduce_temp[WG_SIZE];
__local float * temp_ptr = reduce_temp + sg_id * SUBGROUP_SIZE;
float s_shard[COLS_PER_LANE_GROUP][ROWS_PER_LANE];
#pragma unroll
for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) {
const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group;
#pragma unroll
for (uint r = 0; r < ROWS_PER_LANE; r++) {
s_shard[cg][r] = data_state[state_base + col * S_V + r * LANES_PER_COLUMN + lane];
}
}
const int shift = (int)n_tokens - (int)K;
uint attn_off = (seq_id * n_tokens * H_v + head_id) * S_V;
for (uint t = 0; t < n_tokens; t++) {
const uint q_off = q_off_base + t * sq2;
const uint k_off = q_off;
const uint v_off = v_off_base + t * sv2;
const uint gb_off = gb_off_base + t * sb2;
const float beta_val = data_beta[gb_off];
float k_reg[ROWS_PER_LANE];
float q_reg[ROWS_PER_LANE];
#if KDA
float g_exp[ROWS_PER_LANE];
#pragma unroll
for (uint r = 0; r < ROWS_PER_LANE; r++) {
const uint i = r * LANES_PER_COLUMN + lane;
k_reg[r] = data_k[k_off + i];
q_reg[r] = data_q[q_off + i];
g_exp[r] = exp(data_g[gb_off * S_V + i]);
}
#else
const float g_val = exp(data_g[gb_off]);
#pragma unroll
for (uint r = 0; r < ROWS_PER_LANE; r++) {
const uint i = r * LANES_PER_COLUMN + lane;
k_reg[r] = data_k[k_off + i];
q_reg[r] = data_q[q_off + i];
}
#endif
#pragma unroll
for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) {
const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group;
float v_val = data_v[v_off + col];
float kv_shard = 0.0f;
#pragma unroll
for (uint r = 0; r < ROWS_PER_LANE; r++) {
#if KDA
float gs = g_exp[r] * s_shard[cg][r];
kv_shard += gs * k_reg[r];
#else
kv_shard += s_shard[cg][r] * k_reg[r];
#endif
}
#if !KDA
kv_shard *= g_val; // Applied once instead of ROWS_PER_LANE times
#endif
const float kv_col = REDUCE_PARTIAL(kv_shard, temp_ptr, sg_lid);
const float delta_col = (v_val - kv_col) * beta_val;
float attn_partial = 0.0f;
#pragma unroll
for (uint r = 0; r < ROWS_PER_LANE; r++) {
#if KDA
float gs = g_exp[r] * s_shard[cg][r];
#else
float gs = g_val * s_shard[cg][r];
#endif
s_shard[cg][r] = gs + k_reg[r] * delta_col;
attn_partial += s_shard[cg][r] * q_reg[r];
}
const float attn_col = REDUCE_PARTIAL(attn_partial, temp_ptr, sg_lid);
if (lane == 0) {
data_dst[attn_off + col] = attn_col * scale;
}
}
attn_off += S_V * H_v;
if (K > 1u) {
const int target_slot = (int)t - shift;
if (target_slot >= 0 && target_slot < (int)K) {
#pragma unroll
for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) {
const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group;
const uint slot_base = s_off + (uint)target_slot * state_size_per_snap + state_out_base;
#pragma unroll
for (uint r = 0; r < ROWS_PER_LANE; r++) {
data_dst[slot_base + col * S_V + r * LANES_PER_COLUMN + lane] = s_shard[cg][r];
}
}
}
}
}
if (K == 1u) {
#pragma unroll
for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) {
const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group;
#pragma unroll
for (uint r = 0; r < ROWS_PER_LANE; r++) {
data_dst[s_off + state_base + col * S_V + r * LANES_PER_COLUMN + lane] = s_shard[cg][r];
}
}
}
}