mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 01:24:36 +00:00
* opencl: add `argsort` * opencl: add `div` * opencl: add `add_rows` * opencl: add `sub` * opencl: add `sigmoid`, both `f16` and `f32` * opencl: add `group_norm`
86 lines
2.7 KiB
Common Lisp
86 lines
2.7 KiB
Common Lisp
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
|
|
#ifdef cl_intel_subgroups
|
|
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
|
#else
|
|
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
|
#endif
|
|
|
|
#ifdef cl_intel_required_subgroup_size
|
|
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
|
#define INTEL_GPU 1
|
|
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
|
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
|
#elif defined(cl_qcom_reqd_sub_group_size)
|
|
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
|
#define ADRENO_GPU 1
|
|
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
|
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
|
#endif
|
|
|
|
#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; }
|
|
|
|
enum ggml_sort_order {
|
|
GGML_SORT_ORDER_ASC,
|
|
GGML_SORT_ORDER_DESC,
|
|
};
|
|
|
|
kernel void kernel_argsort_f32_i32(
|
|
global float * src0,
|
|
ulong offset0,
|
|
global int * dst,
|
|
ulong offsetd,
|
|
const int ne00,
|
|
const int ne00_pad,
|
|
const int order,
|
|
local int * dst_row
|
|
) {
|
|
// bitonic sort
|
|
int col = get_local_id(0);
|
|
int row = get_group_id(1);
|
|
|
|
if (col >= ne00_pad) {
|
|
return;
|
|
}
|
|
|
|
src0 = (global char *)((global char *)src0 + offset0);
|
|
dst = (global float *)((global char *)dst + offsetd);
|
|
|
|
global float * x_row = src0 + row * ne00;
|
|
|
|
// initialize indices
|
|
dst_row[col] = col;
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
for (int k = 2; k <= ne00_pad; k *= 2) {
|
|
for (int j = k / 2; j > 0; j /= 2) {
|
|
int ixj = col ^ j;
|
|
if (ixj > col) {
|
|
if ((col & k) == 0) {
|
|
if (dst_row[col] >= ne00 ||
|
|
(dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ?
|
|
x_row[dst_row[col]] > x_row[dst_row[ixj]] :
|
|
x_row[dst_row[col]] < x_row[dst_row[ixj]]))
|
|
) {
|
|
SWAP(dst_row[col], dst_row[ixj], int);
|
|
}
|
|
} else {
|
|
if (dst_row[ixj] >= ne00 ||
|
|
(dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ?
|
|
x_row[dst_row[col]] < x_row[dst_row[ixj]] :
|
|
x_row[dst_row[col]] > x_row[dst_row[ixj]]))
|
|
) {
|
|
SWAP(dst_row[col], dst_row[ixj], int);
|
|
}
|
|
}
|
|
}
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
}
|
|
}
|
|
|
|
// copy the result to dst without the padding
|
|
if (col < ne00) {
|
|
dst[row * ne00 + col] = dst_row[col];
|
|
}
|
|
}
|