mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-22 11:16:08 +00:00
opencl: refactor backend initilization (#23318)
* opencl: refactor initialization * opencl: refactor GPU identification * opencl: rename for consistency * opencl: cache global mem size in dev_ctx * opencl: adjust log level * opencl: load argsort and flash_attn kernels in supports_op * argsort kernel must be built for supports_op for querying the max workgroups * flash_attn kernel has many variants, only load them when needed
This commit is contained in:
parent
510b5c2a35
commit
3a6db741a8
1 changed files with 254 additions and 175 deletions
|
|
@ -375,6 +375,11 @@ struct ggml_backend_opencl_device_context {
|
|||
ggml_backend_buffer_type buffer_type;
|
||||
|
||||
cl_context context = nullptr;
|
||||
|
||||
GPU_FAMILY gpu_family = GPU_FAMILY::UNKNOWN;
|
||||
ADRENO_GPU_GEN adreno_gen = ADRENO_GPU_GEN::ADRENO_UNKNOWN;
|
||||
|
||||
size_t global_mem_size = 0;
|
||||
};
|
||||
|
||||
// backend context
|
||||
|
|
@ -384,6 +389,18 @@ struct ggml_backend_opencl_context {
|
|||
cl_device_id device;
|
||||
std::string device_name;
|
||||
|
||||
ggml_cl_version platform_version;
|
||||
ggml_cl_version opencl_c_version;
|
||||
|
||||
// argsort is loaded in supports_op because its availability depends on how
|
||||
// many workgroups are allowed, which requires kernel compilation.
|
||||
bool kernels_loaded_argsort = false;
|
||||
// flash attn is loaded in supports_op because it contains multiple variants
|
||||
// and takes time to compile, so we want to only compile it when needed.
|
||||
bool kernels_loaded_flash_attn = false;
|
||||
// rest of the kernels are currently always loaded in alloc_buffer.
|
||||
bool kernels_loaded = false;
|
||||
|
||||
std::string driver_version;
|
||||
|
||||
GPU_FAMILY gpu_family;
|
||||
|
|
@ -781,6 +798,8 @@ struct ggml_backend_opencl_context {
|
|||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
void free() {
|
||||
clFinish(queue);
|
||||
|
||||
ref_count--;
|
||||
if (ref_count == 0) {
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
|
|
@ -793,6 +812,9 @@ struct ggml_backend_opencl_context {
|
|||
|
||||
// All registered devices with a default device in the front.
|
||||
static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
|
||||
// All device contexts associated with the devices above.
|
||||
// The devices live as long as the process, so do the contexts.
|
||||
static std::vector<std::unique_ptr<ggml_backend_opencl_device_context>> g_ggml_backend_opencl_dev_ctxs;
|
||||
|
||||
inline std::string read_file(const std::string &path) {
|
||||
std::ifstream ifs(path);
|
||||
|
|
@ -836,12 +858,120 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
|
|||
return p;
|
||||
}
|
||||
|
||||
static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_version opencl_c_version) {
|
||||
static void load_cl_kernels_argsort(ggml_backend_opencl_context *backend_ctx) {
|
||||
// compiler options for general kernels
|
||||
auto opencl_c_std =
|
||||
std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor);
|
||||
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable -cl-unsafe-math-optimizations"
|
||||
" -cl-finite-math-only -cl-fast-relaxed-math";
|
||||
|
||||
// argsort
|
||||
if (!backend_ctx->kernels_loaded_argsort) {
|
||||
cl_int err;
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "argsort.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("argsort.cl");
|
||||
#endif
|
||||
backend_ctx->program_argsort_f32_i32 =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
|
||||
backend_ctx->kernels_loaded_argsort = true;
|
||||
}
|
||||
}
|
||||
|
||||
static void load_cl_kernels_flash_attn(ggml_backend_opencl_context *backend_ctx) {
|
||||
// compiler options for general kernels
|
||||
auto opencl_c_std =
|
||||
std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor);
|
||||
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable -cl-unsafe-math-optimizations"
|
||||
" -cl-finite-math-only -cl-fast-relaxed-math";
|
||||
|
||||
// flash_attn
|
||||
if (!backend_ctx->kernels_loaded_flash_attn) {
|
||||
cl_int err;
|
||||
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src_f16 {
|
||||
#include "flash_attn_f16.cl.h"
|
||||
};
|
||||
const std::string kernel_src_f32 {
|
||||
#include "flash_attn_f32.cl.h"
|
||||
};
|
||||
const std::string kernel_src_f32_f16 {
|
||||
#include "flash_attn_f32_f16.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src_f16 = read_file("flash_attn_f16.cl");
|
||||
const std::string kernel_src_f32 = read_file("flash_attn_f32.cl");
|
||||
const std::string kernel_src_f32_f16 = read_file("flash_attn_f32_f16.cl");
|
||||
#endif
|
||||
|
||||
if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) {
|
||||
const struct { int dk; int dv; int bm; int bn; } fa_dims[] = {
|
||||
{ 40, 40, 32, 32}, { 64, 64, 64, 64}, { 80, 80, 64, 32}, { 96, 96, 64, 32},
|
||||
{112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16},
|
||||
{192, 192, 16, 16}, {256, 256, 16, 16},
|
||||
};
|
||||
|
||||
for (size_t i = 0; i < sizeof(fa_dims)/sizeof(fa_dims[0]); ++i) {
|
||||
const int dk = fa_dims[i].dk;
|
||||
const int dv = fa_dims[i].dv;
|
||||
const int bm = fa_dims[i].bm;
|
||||
const int bn = fa_dims[i].bn;
|
||||
std::string OPTS = compile_opts +
|
||||
" -D DK=" + std::to_string(dk) +
|
||||
" -D DV=" + std::to_string(dv) +
|
||||
" -D BLOCK_M=" + std::to_string(bm) +
|
||||
" -D BLOCK_N=" + std::to_string(bn);
|
||||
|
||||
cl_program prog_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16.c_str(), OPTS);
|
||||
cl_kernel k_f16, k_f16_q1;
|
||||
CL_CHECK((k_f16 = clCreateKernel(prog_f16, "flash_attn_f16", &err), err));
|
||||
CL_CHECK((k_f16_q1 = clCreateKernel(prog_f16, "flash_attn_f16_q1", &err), err));
|
||||
backend_ctx->kernels_flash_attn_f16[{dk, dv}] = k_f16;
|
||||
backend_ctx->kernels_flash_attn_f16_q1[{dk, dv}] = k_f16_q1;
|
||||
CL_CHECK(clReleaseProgram(prog_f16));
|
||||
|
||||
cl_program prog_f32 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32.c_str(), OPTS);
|
||||
cl_kernel k_f32, k_f32_q1;
|
||||
CL_CHECK((k_f32 = clCreateKernel(prog_f32, "flash_attn_f32", &err), err));
|
||||
CL_CHECK((k_f32_q1 = clCreateKernel(prog_f32, "flash_attn_f32_q1", &err), err));
|
||||
backend_ctx->kernels_flash_attn_f32[{dk, dv}] = k_f32;
|
||||
backend_ctx->kernels_flash_attn_f32_q1[{dk, dv}] = k_f32_q1;
|
||||
CL_CHECK(clReleaseProgram(prog_f32));
|
||||
|
||||
cl_program prog_f32_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32_f16.c_str(), OPTS);
|
||||
cl_kernel k_f32_f16, k_f32_f16_q1;
|
||||
CL_CHECK((k_f32_f16 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16", &err), err));
|
||||
CL_CHECK((k_f32_f16_q1 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16_q1", &err), err));
|
||||
backend_ctx->kernels_flash_attn_f32_f16[{dk, dv}] = k_f32_f16;
|
||||
backend_ctx->kernels_flash_attn_f32_f16_q1[{dk, dv}] = k_f32_f16_q1;
|
||||
CL_CHECK(clReleaseProgram(prog_f32_f16));
|
||||
|
||||
backend_ctx->kernels_flash_attn_bm[{dk, dv}] = bm;
|
||||
backend_ctx->kernels_flash_attn_bn[{dk, dv}] = bn;
|
||||
}
|
||||
backend_ctx->kernels_loaded_flash_attn = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
if (backend_ctx->kernels_loaded) {
|
||||
return;
|
||||
}
|
||||
|
||||
cl_int err;
|
||||
|
||||
// compiler options for general kernels
|
||||
auto opencl_c_std =
|
||||
std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
|
||||
std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor);
|
||||
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable -cl-unsafe-math-optimizations"
|
||||
" -cl-finite-math-only -cl-fast-relaxed-math";
|
||||
|
|
@ -1986,89 +2116,6 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// flash_attn
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src_f16 {
|
||||
#include "flash_attn_f16.cl.h"
|
||||
};
|
||||
const std::string kernel_src_f32 {
|
||||
#include "flash_attn_f32.cl.h"
|
||||
};
|
||||
const std::string kernel_src_f32_f16 {
|
||||
#include "flash_attn_f32_f16.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src_f16 = read_file("flash_attn_f16.cl");
|
||||
const std::string kernel_src_f32 = read_file("flash_attn_f32.cl");
|
||||
const std::string kernel_src_f32_f16 = read_file("flash_attn_f32_f16.cl");
|
||||
#endif
|
||||
|
||||
if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) {
|
||||
const struct { int dk; int dv; int bm; int bn; } fa_dims[] = {
|
||||
{ 40, 40, 32, 32}, { 64, 64, 64, 64}, { 80, 80, 64, 32}, { 96, 96, 64, 32},
|
||||
{112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16},
|
||||
{192, 192, 16, 16}, {256, 256, 16, 16},
|
||||
};
|
||||
|
||||
for (size_t i = 0; i < sizeof(fa_dims)/sizeof(fa_dims[0]); ++i) {
|
||||
const int dk = fa_dims[i].dk;
|
||||
const int dv = fa_dims[i].dv;
|
||||
const int bm = fa_dims[i].bm;
|
||||
const int bn = fa_dims[i].bn;
|
||||
std::string OPTS = compile_opts +
|
||||
" -D DK=" + std::to_string(dk) +
|
||||
" -D DV=" + std::to_string(dv) +
|
||||
" -D BLOCK_M=" + std::to_string(bm) +
|
||||
" -D BLOCK_N=" + std::to_string(bn);
|
||||
|
||||
cl_program prog_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16.c_str(), OPTS);
|
||||
cl_kernel k_f16, k_f16_q1;
|
||||
CL_CHECK((k_f16 = clCreateKernel(prog_f16, "flash_attn_f16", &err), err));
|
||||
CL_CHECK((k_f16_q1 = clCreateKernel(prog_f16, "flash_attn_f16_q1", &err), err));
|
||||
backend_ctx->kernels_flash_attn_f16[{dk, dv}] = k_f16;
|
||||
backend_ctx->kernels_flash_attn_f16_q1[{dk, dv}] = k_f16_q1;
|
||||
CL_CHECK(clReleaseProgram(prog_f16));
|
||||
|
||||
cl_program prog_f32 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32.c_str(), OPTS);
|
||||
cl_kernel k_f32, k_f32_q1;
|
||||
CL_CHECK((k_f32 = clCreateKernel(prog_f32, "flash_attn_f32", &err), err));
|
||||
CL_CHECK((k_f32_q1 = clCreateKernel(prog_f32, "flash_attn_f32_q1", &err), err));
|
||||
backend_ctx->kernels_flash_attn_f32[{dk, dv}] = k_f32;
|
||||
backend_ctx->kernels_flash_attn_f32_q1[{dk, dv}] = k_f32_q1;
|
||||
CL_CHECK(clReleaseProgram(prog_f32));
|
||||
|
||||
cl_program prog_f32_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32_f16.c_str(), OPTS);
|
||||
cl_kernel k_f32_f16, k_f32_f16_q1;
|
||||
CL_CHECK((k_f32_f16 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16", &err), err));
|
||||
CL_CHECK((k_f32_f16_q1 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16_q1", &err), err));
|
||||
backend_ctx->kernels_flash_attn_f32_f16[{dk, dv}] = k_f32_f16;
|
||||
backend_ctx->kernels_flash_attn_f32_f16_q1[{dk, dv}] = k_f32_f16_q1;
|
||||
CL_CHECK(clReleaseProgram(prog_f32_f16));
|
||||
|
||||
backend_ctx->kernels_flash_attn_bm[{dk, dv}] = bm;
|
||||
backend_ctx->kernels_flash_attn_bn[{dk, dv}] = bn;
|
||||
}
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
}
|
||||
|
||||
// argsort
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "argsort.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("argsort.cl");
|
||||
#endif
|
||||
backend_ctx->program_argsort_f32_i32 =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// div
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
|
|
@ -3335,13 +3382,15 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
GGML_LOG_CONT("\n");
|
||||
backend_ctx->kernels_loaded = true;
|
||||
}
|
||||
|
||||
// XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
// XXX static bool initialized = false;
|
||||
// XXX static ggml_backend_opencl_context *backend_ctx = nullptr;
|
||||
|
||||
static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev);
|
||||
static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev);
|
||||
static bool ggml_opencl_is_device_supported(ggml_backend_dev_t dev);
|
||||
|
||||
namespace /* anonymous */ {
|
||||
extern struct ggml_backend_device_i ggml_backend_opencl_device_i;
|
||||
|
|
@ -3554,13 +3603,13 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
|
|||
/* .context = */ dev_ctx.get(),
|
||||
});
|
||||
|
||||
if (!ggml_cl2_init(&found_devices.back())) {
|
||||
if (!ggml_opencl_is_device_supported(&found_devices.back())) {
|
||||
found_devices.pop_back();
|
||||
GGML_LOG_INFO("ggml_opencl: drop unsupported device.\n");
|
||||
GGML_LOG_WARN("ggml_opencl: drop unsupported device '%s'.\n", dev->name);
|
||||
continue;
|
||||
}
|
||||
|
||||
dev_ctx.release();
|
||||
g_ggml_backend_opencl_dev_ctxs.push_back(std::move(dev_ctx));
|
||||
}
|
||||
|
||||
if (found_devices.size()) {
|
||||
|
|
@ -3577,8 +3626,79 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
|
|||
return found_devices;
|
||||
}
|
||||
|
||||
// check if device should be accepted
|
||||
static bool ggml_opencl_is_device_supported(ggml_backend_dev_t dev) {
|
||||
GGML_ASSERT(dev);
|
||||
GGML_ASSERT(dev->context);
|
||||
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
|
||||
GGML_ASSERT(dev_ctx->platform);
|
||||
GGML_ASSERT(dev_ctx->device);
|
||||
|
||||
if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
|
||||
strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
|
||||
strstr(dev_ctx->device_version.c_str(), "Adreno")) {
|
||||
dev_ctx->gpu_family = GPU_FAMILY::ADRENO;
|
||||
|
||||
// Usually device version contains the detailed device name
|
||||
dev_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
|
||||
if (dev_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
|
||||
dev_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
|
||||
}
|
||||
} else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
|
||||
dev_ctx->gpu_family = GPU_FAMILY::INTEL;
|
||||
} else {
|
||||
GGML_LOG_WARN("ggml_opencl: unsupported GPU '%s'.\n", dev_ctx->device_name.c_str());
|
||||
dev_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
|
||||
|
||||
// Check device OpenCL version, OpenCL 2.0 or above is required
|
||||
ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, dev_ctx->device);
|
||||
if (opencl_c_version.major < 2) {
|
||||
GGML_LOG_WARN("ggml_opencl: OpenCL 2.0 or above is required\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (dev_ctx->gpu_family != GPU_FAMILY::ADRENO) {
|
||||
GGML_LOG_WARN("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
|
||||
"run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
size_t ext_str_size;
|
||||
clGetDeviceInfo(dev_ctx->device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
|
||||
|
||||
char *ext_buffer = (char *)alloca(ext_str_size + 1);
|
||||
clGetDeviceInfo(dev_ctx->device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
|
||||
ext_buffer[ext_str_size] = '\0';
|
||||
|
||||
// Check if ext_buffer contains cl_khr_fp16
|
||||
bool fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
|
||||
if (!fp16_support) {
|
||||
GGML_LOG_WARN("ggml_opencl: device does not support FP16\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
// If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
|
||||
// optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
|
||||
if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
|
||||
strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
|
||||
GGML_LOG_WARN("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
|
||||
"(note that subgroups is an optional feature in OpenCL 3.0)\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
clGetDeviceInfo(dev_ctx->device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &dev_ctx->global_mem_size, NULL);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Initialize device if it is supported (returns nullptr if it is not).
|
||||
static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
|
||||
GGML_ASSERT(dev);
|
||||
GGML_ASSERT(dev->context);
|
||||
|
||||
|
|
@ -3600,34 +3720,13 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|||
// when the associated device is initialized
|
||||
backend_ctx->ref_count = 0;
|
||||
|
||||
if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
|
||||
strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
|
||||
strstr(dev_ctx->device_version.c_str(), "Adreno")) {
|
||||
backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
|
||||
// Usually device version contains the detailed device name
|
||||
backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
|
||||
if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
|
||||
backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
|
||||
}
|
||||
|
||||
backend_ctx->gpu_family = dev_ctx->gpu_family;
|
||||
backend_ctx->adreno_gen = dev_ctx->adreno_gen;
|
||||
if (backend_ctx->gpu_family == GPU_FAMILY::ADRENO) {
|
||||
// Use wave size of 64 for all Adreno GPUs.
|
||||
backend_ctx->adreno_wave_size = 64;
|
||||
} else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
|
||||
backend_ctx->gpu_family = GPU_FAMILY::INTEL;
|
||||
} else {
|
||||
GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str());
|
||||
backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
|
||||
GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
|
||||
"run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
|
||||
return nullptr;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Populate backend device name
|
||||
backend_ctx->device_name = dev_ctx->device_name;
|
||||
|
||||
|
|
@ -3635,13 +3734,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|||
cl_device_id device = backend_ctx->device;
|
||||
|
||||
ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
|
||||
|
||||
// Check device OpenCL version, OpenCL 2.0 or above is required
|
||||
ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
|
||||
if (opencl_c_version.major < 2) {
|
||||
GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
backend_ctx->platform_version = platform_version;
|
||||
backend_ctx->opencl_c_version = opencl_c_version;
|
||||
|
||||
// Check driver version
|
||||
size_t driver_version_str_size;
|
||||
|
|
@ -3664,34 +3760,21 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|||
char *ext_buffer = (char *)alloca(ext_str_size + 1);
|
||||
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
|
||||
ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
|
||||
|
||||
// Check if ext_buffer contains cl_khr_fp16
|
||||
backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
|
||||
GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
|
||||
|
||||
// check Adreno large buffer support
|
||||
backend_ctx->adreno_has_large_buffer = strstr(ext_buffer, "cl_qcom_large_buffer") != NULL;
|
||||
|
||||
// fp16 is required
|
||||
if (!backend_ctx->fp16_support) {
|
||||
GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
|
||||
// optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
|
||||
if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
|
||||
strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
|
||||
GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
|
||||
"(note that subgroups is an optional feature in OpenCL 3.0)\n");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
cl_uint base_align_in_bits;
|
||||
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
|
||||
GGML_ASSERT(base_align_in_bits % 8u == 0);
|
||||
backend_ctx->alignment = base_align_in_bits / 8u;
|
||||
GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
|
||||
|
||||
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &backend_ctx->global_mem_size, NULL);
|
||||
backend_ctx->global_mem_size = dev_ctx->global_mem_size;
|
||||
GGML_LOG_INFO("ggml_opencl: global mem size: %zu MB\n", backend_ctx->global_mem_size/1024/1024);
|
||||
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
|
||||
|
|
@ -3779,8 +3862,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|||
#endif
|
||||
CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
|
||||
|
||||
// Load kernels
|
||||
load_cl_kernels(backend_ctx.get(), opencl_c_version);
|
||||
// delay kernel loading until the first buffer is created
|
||||
// load_cl_kernels(backend_ctx.get());
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// Allocate intermediate buffers and images
|
||||
|
|
@ -3822,22 +3905,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|||
return dev_ctx->backend_ctx;
|
||||
}
|
||||
|
||||
static void ggml_cl2_free(ggml_backend_t backend) {
|
||||
static void ggml_cl_free(ggml_backend_t backend) {
|
||||
ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
|
||||
ctx->free();
|
||||
|
||||
// The CL context is shared by all backends, release it if all backends have been released
|
||||
bool should_release_opencl = true;
|
||||
for (auto device : g_ggml_backend_opencl_devices) {
|
||||
ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context;
|
||||
if (ctx_dev->backend_ctx->ref_count > 0) {
|
||||
should_release_opencl = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (should_release_opencl) {
|
||||
CL_CHECK(clReleaseContext(ctx->context));
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
|
@ -4421,7 +4491,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
|
|||
}
|
||||
|
||||
static void ggml_backend_opencl_free(ggml_backend_t backend) {
|
||||
ggml_cl2_free(backend);
|
||||
ggml_cl_free(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
|
|
@ -4460,14 +4530,17 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
|
|||
// enqueued to it won't start until commands in the other devices have
|
||||
// completed.
|
||||
static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
|
||||
if (g_ggml_backend_opencl_devices.size() < 2)
|
||||
return; // No other devices to synchronize with.
|
||||
if (g_ggml_backend_opencl_devices.size() < 2) {
|
||||
return; // No other devices to synchronize with.
|
||||
}
|
||||
|
||||
std::vector<cl_event> events;
|
||||
events.reserve(g_ggml_backend_opencl_devices.size());
|
||||
|
||||
for (ggml_backend_device & backend_dev : g_ggml_backend_opencl_devices) {
|
||||
auto * other_backend_ctx = ggml_cl2_init(&backend_dev);
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) backend_dev.context;
|
||||
auto * other_backend_ctx = dev_ctx->backend_ctx;
|
||||
|
||||
if (backend_ctx != other_backend_ctx) {
|
||||
cl_event ev;
|
||||
CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev));
|
||||
|
|
@ -4880,6 +4953,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|||
case GGML_OP_IM2COL:
|
||||
return true;
|
||||
case GGML_OP_ARGSORT: {
|
||||
load_cl_kernels_argsort(backend_ctx);
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;
|
||||
int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
|
||||
|
||||
|
|
@ -4897,6 +4972,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
load_cl_kernels_flash_attn(backend_ctx);
|
||||
|
||||
const ggml_tensor * q = op->src[0];
|
||||
const ggml_tensor * k = op->src[1];
|
||||
const ggml_tensor * v = op->src[2];
|
||||
|
|
@ -4964,7 +5041,7 @@ static ggml_backend_i ggml_backend_opencl_i = {
|
|||
|
||||
ggml_backend_t ggml_backend_opencl_init(void) {
|
||||
ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_opencl_reg(), 0);
|
||||
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
|
||||
ggml_backend_opencl_context *backend_ctx = ggml_cl_init(dev);
|
||||
|
||||
ggml_backend_t backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_opencl_guid(),
|
||||
|
|
@ -5343,15 +5420,13 @@ static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
|||
}
|
||||
|
||||
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device);
|
||||
return (void *) (uintptr_t) backend_ctx->alignment;
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context;
|
||||
return (void *) (uintptr_t) dev_ctx->backend_ctx->alignment;
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
||||
|
||||
ggml_cl2_init(buffer->buft->device);
|
||||
|
||||
if (tensor->view_src != nullptr) {
|
||||
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
|
||||
|
||||
|
|
@ -5391,7 +5466,8 @@ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buff
|
|||
}
|
||||
|
||||
static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context;
|
||||
ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx;
|
||||
|
||||
cl_context context = backend_ctx->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
|
@ -6626,7 +6702,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|||
static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(tensor->extra);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context;
|
||||
ggml_backend_opencl_context *backend_ctx = dev_ctx->backend_ctx;
|
||||
|
||||
cl_context context = backend_ctx->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
|
@ -7470,8 +7547,9 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|||
}
|
||||
|
||||
static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
ggml_backend_dev_t dev = buffer->buft->device;
|
||||
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context;
|
||||
ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx;
|
||||
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
||||
|
|
@ -7511,7 +7589,8 @@ static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer
|
|||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
|
||||
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer_type->device);
|
||||
ggml_backend_opencl_context *backend_ctx = ggml_cl_init(buffer_type->device);
|
||||
load_cl_kernels(backend_ctx);
|
||||
|
||||
// clCreateBuffer returns -61 for size 0
|
||||
size = std::max(size, (size_t)1);
|
||||
|
|
@ -7534,15 +7613,15 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b
|
|||
}
|
||||
|
||||
static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
|
||||
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
|
||||
return backend_ctx->alignment;
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer_type->device->context;
|
||||
return dev_ctx->backend_ctx->alignment;
|
||||
}
|
||||
|
||||
static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
|
||||
static size_t max_size = -1;
|
||||
if (max_size == (size_t)-1) {
|
||||
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
|
||||
max_size = backend_ctx->max_alloc_size;
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer_type->device->context;
|
||||
max_size = dev_ctx->backend_ctx->max_alloc_size;
|
||||
}
|
||||
return max_size;
|
||||
}
|
||||
|
|
@ -7579,14 +7658,13 @@ static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_
|
|||
|
||||
static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
|
||||
ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *) dev_ctx->backend_ctx;
|
||||
|
||||
static const size_t opencl_extra_margin = 1024ull*1024ull*1024ull;
|
||||
|
||||
// OpenCL does not provide reliable currently-free device memory.
|
||||
// Use total/global memory as a best-effort upper bound.
|
||||
// Improved safety: Reduce by a 1GiB extra margin for common --fit
|
||||
*total = backend_ctx->global_mem_size;
|
||||
*total = dev_ctx->global_mem_size;
|
||||
*free = *total > opencl_extra_margin ? *total - opencl_extra_margin : 0;
|
||||
}
|
||||
|
||||
|
|
@ -7610,7 +7688,7 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct
|
|||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
|
||||
ggml_backend_opencl_context * backend_ctx = ggml_cl_init(dev);
|
||||
// Getting a new reference to the backend, increase ref_count
|
||||
backend_ctx->ref_count++;
|
||||
|
||||
|
|
@ -7647,6 +7725,7 @@ static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_bac
|
|||
}
|
||||
|
||||
static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
ggml_cl_init(dev);
|
||||
return ggml_opencl_supports_op(dev, op);
|
||||
}
|
||||
|
||||
|
|
@ -7659,8 +7738,8 @@ static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggm
|
|||
|
||||
// Check cl_context is the same. clEnqueue* commands may not use
|
||||
// buffers from another cl_context.
|
||||
ggml_backend_opencl_context * backend_ctx0 = ggml_cl2_init(dev);
|
||||
ggml_backend_opencl_context * backend_ctx1 = ggml_cl2_init(buft->device);
|
||||
ggml_backend_opencl_context * backend_ctx0 = ggml_cl_init(dev);
|
||||
ggml_backend_opencl_context * backend_ctx1 = ggml_cl_init(buft->device);
|
||||
return backend_ctx0->context == backend_ctx1->context;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue