opencl: refactor backend initilization (#23318)

* opencl: refactor initialization

* opencl: refactor GPU identification

* opencl: rename for consistency

* opencl: cache global mem size in dev_ctx

* opencl: adjust log level

* opencl: load argsort and flash_attn kernels in supports_op

* argsort kernel must be built for supports_op for querying the max
  workgroups
* flash_attn kernel has many variants, only load them when needed
This commit is contained in:
lhez 2026-05-20 09:57:36 -07:00 committed by GitHub
parent 510b5c2a35
commit 3a6db741a8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -375,6 +375,11 @@ struct ggml_backend_opencl_device_context {
ggml_backend_buffer_type buffer_type;
cl_context context = nullptr;
GPU_FAMILY gpu_family = GPU_FAMILY::UNKNOWN;
ADRENO_GPU_GEN adreno_gen = ADRENO_GPU_GEN::ADRENO_UNKNOWN;
size_t global_mem_size = 0;
};
// backend context
@ -384,6 +389,18 @@ struct ggml_backend_opencl_context {
cl_device_id device;
std::string device_name;
ggml_cl_version platform_version;
ggml_cl_version opencl_c_version;
// argsort is loaded in supports_op because its availability depends on how
// many workgroups are allowed, which requires kernel compilation.
bool kernels_loaded_argsort = false;
// flash attn is loaded in supports_op because it contains multiple variants
// and takes time to compile, so we want to only compile it when needed.
bool kernels_loaded_flash_attn = false;
// rest of the kernels are currently always loaded in alloc_buffer.
bool kernels_loaded = false;
std::string driver_version;
GPU_FAMILY gpu_family;
@ -781,6 +798,8 @@ struct ggml_backend_opencl_context {
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
void free() {
clFinish(queue);
ref_count--;
if (ref_count == 0) {
#ifdef GGML_OPENCL_PROFILING
@ -793,6 +812,9 @@ struct ggml_backend_opencl_context {
// All registered devices with a default device in the front.
static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
// All device contexts associated with the devices above.
// The devices live as long as the process, so do the contexts.
static std::vector<std::unique_ptr<ggml_backend_opencl_device_context>> g_ggml_backend_opencl_dev_ctxs;
inline std::string read_file(const std::string &path) {
std::ifstream ifs(path);
@ -836,12 +858,120 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
return p;
}
static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_version opencl_c_version) {
static void load_cl_kernels_argsort(ggml_backend_opencl_context *backend_ctx) {
// compiler options for general kernels
auto opencl_c_std =
std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor);
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-mad-enable -cl-unsafe-math-optimizations"
" -cl-finite-math-only -cl-fast-relaxed-math";
// argsort
if (!backend_ctx->kernels_loaded_argsort) {
cl_int err;
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "argsort.cl.h"
};
#else
const std::string kernel_src = read_file("argsort.cl");
#endif
backend_ctx->program_argsort_f32_i32 =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
backend_ctx->kernels_loaded_argsort = true;
}
}
static void load_cl_kernels_flash_attn(ggml_backend_opencl_context *backend_ctx) {
// compiler options for general kernels
auto opencl_c_std =
std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor);
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-mad-enable -cl-unsafe-math-optimizations"
" -cl-finite-math-only -cl-fast-relaxed-math";
// flash_attn
if (!backend_ctx->kernels_loaded_flash_attn) {
cl_int err;
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src_f16 {
#include "flash_attn_f16.cl.h"
};
const std::string kernel_src_f32 {
#include "flash_attn_f32.cl.h"
};
const std::string kernel_src_f32_f16 {
#include "flash_attn_f32_f16.cl.h"
};
#else
const std::string kernel_src_f16 = read_file("flash_attn_f16.cl");
const std::string kernel_src_f32 = read_file("flash_attn_f32.cl");
const std::string kernel_src_f32_f16 = read_file("flash_attn_f32_f16.cl");
#endif
if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) {
const struct { int dk; int dv; int bm; int bn; } fa_dims[] = {
{ 40, 40, 32, 32}, { 64, 64, 64, 64}, { 80, 80, 64, 32}, { 96, 96, 64, 32},
{112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16},
{192, 192, 16, 16}, {256, 256, 16, 16},
};
for (size_t i = 0; i < sizeof(fa_dims)/sizeof(fa_dims[0]); ++i) {
const int dk = fa_dims[i].dk;
const int dv = fa_dims[i].dv;
const int bm = fa_dims[i].bm;
const int bn = fa_dims[i].bn;
std::string OPTS = compile_opts +
" -D DK=" + std::to_string(dk) +
" -D DV=" + std::to_string(dv) +
" -D BLOCK_M=" + std::to_string(bm) +
" -D BLOCK_N=" + std::to_string(bn);
cl_program prog_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16.c_str(), OPTS);
cl_kernel k_f16, k_f16_q1;
CL_CHECK((k_f16 = clCreateKernel(prog_f16, "flash_attn_f16", &err), err));
CL_CHECK((k_f16_q1 = clCreateKernel(prog_f16, "flash_attn_f16_q1", &err), err));
backend_ctx->kernels_flash_attn_f16[{dk, dv}] = k_f16;
backend_ctx->kernels_flash_attn_f16_q1[{dk, dv}] = k_f16_q1;
CL_CHECK(clReleaseProgram(prog_f16));
cl_program prog_f32 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32.c_str(), OPTS);
cl_kernel k_f32, k_f32_q1;
CL_CHECK((k_f32 = clCreateKernel(prog_f32, "flash_attn_f32", &err), err));
CL_CHECK((k_f32_q1 = clCreateKernel(prog_f32, "flash_attn_f32_q1", &err), err));
backend_ctx->kernels_flash_attn_f32[{dk, dv}] = k_f32;
backend_ctx->kernels_flash_attn_f32_q1[{dk, dv}] = k_f32_q1;
CL_CHECK(clReleaseProgram(prog_f32));
cl_program prog_f32_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32_f16.c_str(), OPTS);
cl_kernel k_f32_f16, k_f32_f16_q1;
CL_CHECK((k_f32_f16 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16", &err), err));
CL_CHECK((k_f32_f16_q1 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16_q1", &err), err));
backend_ctx->kernels_flash_attn_f32_f16[{dk, dv}] = k_f32_f16;
backend_ctx->kernels_flash_attn_f32_f16_q1[{dk, dv}] = k_f32_f16_q1;
CL_CHECK(clReleaseProgram(prog_f32_f16));
backend_ctx->kernels_flash_attn_bm[{dk, dv}] = bm;
backend_ctx->kernels_flash_attn_bn[{dk, dv}] = bn;
}
backend_ctx->kernels_loaded_flash_attn = true;
}
}
}
static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
if (backend_ctx->kernels_loaded) {
return;
}
cl_int err;
// compiler options for general kernels
auto opencl_c_std =
std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor);
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-mad-enable -cl-unsafe-math-optimizations"
" -cl-finite-math-only -cl-fast-relaxed-math";
@ -1986,89 +2116,6 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// flash_attn
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src_f16 {
#include "flash_attn_f16.cl.h"
};
const std::string kernel_src_f32 {
#include "flash_attn_f32.cl.h"
};
const std::string kernel_src_f32_f16 {
#include "flash_attn_f32_f16.cl.h"
};
#else
const std::string kernel_src_f16 = read_file("flash_attn_f16.cl");
const std::string kernel_src_f32 = read_file("flash_attn_f32.cl");
const std::string kernel_src_f32_f16 = read_file("flash_attn_f32_f16.cl");
#endif
if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) {
const struct { int dk; int dv; int bm; int bn; } fa_dims[] = {
{ 40, 40, 32, 32}, { 64, 64, 64, 64}, { 80, 80, 64, 32}, { 96, 96, 64, 32},
{112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16},
{192, 192, 16, 16}, {256, 256, 16, 16},
};
for (size_t i = 0; i < sizeof(fa_dims)/sizeof(fa_dims[0]); ++i) {
const int dk = fa_dims[i].dk;
const int dv = fa_dims[i].dv;
const int bm = fa_dims[i].bm;
const int bn = fa_dims[i].bn;
std::string OPTS = compile_opts +
" -D DK=" + std::to_string(dk) +
" -D DV=" + std::to_string(dv) +
" -D BLOCK_M=" + std::to_string(bm) +
" -D BLOCK_N=" + std::to_string(bn);
cl_program prog_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16.c_str(), OPTS);
cl_kernel k_f16, k_f16_q1;
CL_CHECK((k_f16 = clCreateKernel(prog_f16, "flash_attn_f16", &err), err));
CL_CHECK((k_f16_q1 = clCreateKernel(prog_f16, "flash_attn_f16_q1", &err), err));
backend_ctx->kernels_flash_attn_f16[{dk, dv}] = k_f16;
backend_ctx->kernels_flash_attn_f16_q1[{dk, dv}] = k_f16_q1;
CL_CHECK(clReleaseProgram(prog_f16));
cl_program prog_f32 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32.c_str(), OPTS);
cl_kernel k_f32, k_f32_q1;
CL_CHECK((k_f32 = clCreateKernel(prog_f32, "flash_attn_f32", &err), err));
CL_CHECK((k_f32_q1 = clCreateKernel(prog_f32, "flash_attn_f32_q1", &err), err));
backend_ctx->kernels_flash_attn_f32[{dk, dv}] = k_f32;
backend_ctx->kernels_flash_attn_f32_q1[{dk, dv}] = k_f32_q1;
CL_CHECK(clReleaseProgram(prog_f32));
cl_program prog_f32_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32_f16.c_str(), OPTS);
cl_kernel k_f32_f16, k_f32_f16_q1;
CL_CHECK((k_f32_f16 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16", &err), err));
CL_CHECK((k_f32_f16_q1 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16_q1", &err), err));
backend_ctx->kernels_flash_attn_f32_f16[{dk, dv}] = k_f32_f16;
backend_ctx->kernels_flash_attn_f32_f16_q1[{dk, dv}] = k_f32_f16_q1;
CL_CHECK(clReleaseProgram(prog_f32_f16));
backend_ctx->kernels_flash_attn_bm[{dk, dv}] = bm;
backend_ctx->kernels_flash_attn_bn[{dk, dv}] = bn;
}
GGML_LOG_CONT(".");
}
}
// argsort
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "argsort.cl.h"
};
#else
const std::string kernel_src = read_file("argsort.cl");
#endif
backend_ctx->program_argsort_f32_i32 =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
GGML_LOG_CONT(".");
}
// div
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@ -3335,13 +3382,15 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
}
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
GGML_LOG_CONT("\n");
backend_ctx->kernels_loaded = true;
}
// XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
// XXX static bool initialized = false;
// XXX static ggml_backend_opencl_context *backend_ctx = nullptr;
static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev);
static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev);
static bool ggml_opencl_is_device_supported(ggml_backend_dev_t dev);
namespace /* anonymous */ {
extern struct ggml_backend_device_i ggml_backend_opencl_device_i;
@ -3554,13 +3603,13 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
/* .context = */ dev_ctx.get(),
});
if (!ggml_cl2_init(&found_devices.back())) {
if (!ggml_opencl_is_device_supported(&found_devices.back())) {
found_devices.pop_back();
GGML_LOG_INFO("ggml_opencl: drop unsupported device.\n");
GGML_LOG_WARN("ggml_opencl: drop unsupported device '%s'.\n", dev->name);
continue;
}
dev_ctx.release();
g_ggml_backend_opencl_dev_ctxs.push_back(std::move(dev_ctx));
}
if (found_devices.size()) {
@ -3577,8 +3626,79 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
return found_devices;
}
// check if device should be accepted
static bool ggml_opencl_is_device_supported(ggml_backend_dev_t dev) {
GGML_ASSERT(dev);
GGML_ASSERT(dev->context);
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
GGML_ASSERT(dev_ctx->platform);
GGML_ASSERT(dev_ctx->device);
if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
strstr(dev_ctx->device_version.c_str(), "Adreno")) {
dev_ctx->gpu_family = GPU_FAMILY::ADRENO;
// Usually device version contains the detailed device name
dev_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
if (dev_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
dev_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
}
} else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
dev_ctx->gpu_family = GPU_FAMILY::INTEL;
} else {
GGML_LOG_WARN("ggml_opencl: unsupported GPU '%s'.\n", dev_ctx->device_name.c_str());
dev_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
return false;
}
ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
// Check device OpenCL version, OpenCL 2.0 or above is required
ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, dev_ctx->device);
if (opencl_c_version.major < 2) {
GGML_LOG_WARN("ggml_opencl: OpenCL 2.0 or above is required\n");
return false;
}
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
if (dev_ctx->gpu_family != GPU_FAMILY::ADRENO) {
GGML_LOG_WARN("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
"run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
return false;
}
#endif
size_t ext_str_size;
clGetDeviceInfo(dev_ctx->device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
char *ext_buffer = (char *)alloca(ext_str_size + 1);
clGetDeviceInfo(dev_ctx->device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
ext_buffer[ext_str_size] = '\0';
// Check if ext_buffer contains cl_khr_fp16
bool fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
if (!fp16_support) {
GGML_LOG_WARN("ggml_opencl: device does not support FP16\n");
return false;
}
// If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
// optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
GGML_LOG_WARN("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
"(note that subgroups is an optional feature in OpenCL 3.0)\n");
return false;
}
clGetDeviceInfo(dev_ctx->device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &dev_ctx->global_mem_size, NULL);
return true;
}
// Initialize device if it is supported (returns nullptr if it is not).
static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
GGML_ASSERT(dev);
GGML_ASSERT(dev->context);
@ -3600,34 +3720,13 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
// when the associated device is initialized
backend_ctx->ref_count = 0;
if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
strstr(dev_ctx->device_version.c_str(), "Adreno")) {
backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
// Usually device version contains the detailed device name
backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
}
backend_ctx->gpu_family = dev_ctx->gpu_family;
backend_ctx->adreno_gen = dev_ctx->adreno_gen;
if (backend_ctx->gpu_family == GPU_FAMILY::ADRENO) {
// Use wave size of 64 for all Adreno GPUs.
backend_ctx->adreno_wave_size = 64;
} else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
backend_ctx->gpu_family = GPU_FAMILY::INTEL;
} else {
GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str());
backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
return nullptr;
}
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
"run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
return nullptr;
}
#endif
// Populate backend device name
backend_ctx->device_name = dev_ctx->device_name;
@ -3635,13 +3734,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
cl_device_id device = backend_ctx->device;
ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
// Check device OpenCL version, OpenCL 2.0 or above is required
ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
if (opencl_c_version.major < 2) {
GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
return nullptr;
}
backend_ctx->platform_version = platform_version;
backend_ctx->opencl_c_version = opencl_c_version;
// Check driver version
size_t driver_version_str_size;
@ -3664,34 +3760,21 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
char *ext_buffer = (char *)alloca(ext_str_size + 1);
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
// Check if ext_buffer contains cl_khr_fp16
backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
// check Adreno large buffer support
backend_ctx->adreno_has_large_buffer = strstr(ext_buffer, "cl_qcom_large_buffer") != NULL;
// fp16 is required
if (!backend_ctx->fp16_support) {
GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n");
return nullptr;
}
// If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
// optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
"(note that subgroups is an optional feature in OpenCL 3.0)\n");
return nullptr;
}
cl_uint base_align_in_bits;
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
GGML_ASSERT(base_align_in_bits % 8u == 0);
backend_ctx->alignment = base_align_in_bits / 8u;
GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &backend_ctx->global_mem_size, NULL);
backend_ctx->global_mem_size = dev_ctx->global_mem_size;
GGML_LOG_INFO("ggml_opencl: global mem size: %zu MB\n", backend_ctx->global_mem_size/1024/1024);
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
@ -3779,8 +3862,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
#endif
CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
// Load kernels
load_cl_kernels(backend_ctx.get(), opencl_c_version);
// delay kernel loading until the first buffer is created
// load_cl_kernels(backend_ctx.get());
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
// Allocate intermediate buffers and images
@ -3822,22 +3905,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
return dev_ctx->backend_ctx;
}
static void ggml_cl2_free(ggml_backend_t backend) {
static void ggml_cl_free(ggml_backend_t backend) {
ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
ctx->free();
// The CL context is shared by all backends, release it if all backends have been released
bool should_release_opencl = true;
for (auto device : g_ggml_backend_opencl_devices) {
ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context;
if (ctx_dev->backend_ctx->ref_count > 0) {
should_release_opencl = false;
}
}
if (should_release_opencl) {
CL_CHECK(clReleaseContext(ctx->context));
}
}
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
@ -4421,7 +4491,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
}
static void ggml_backend_opencl_free(ggml_backend_t backend) {
ggml_cl2_free(backend);
ggml_cl_free(backend);
}
static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@ -4460,14 +4530,17 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
// enqueued to it won't start until commands in the other devices have
// completed.
static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
if (g_ggml_backend_opencl_devices.size() < 2)
return; // No other devices to synchronize with.
if (g_ggml_backend_opencl_devices.size() < 2) {
return; // No other devices to synchronize with.
}
std::vector<cl_event> events;
events.reserve(g_ggml_backend_opencl_devices.size());
for (ggml_backend_device & backend_dev : g_ggml_backend_opencl_devices) {
auto * other_backend_ctx = ggml_cl2_init(&backend_dev);
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) backend_dev.context;
auto * other_backend_ctx = dev_ctx->backend_ctx;
if (backend_ctx != other_backend_ctx) {
cl_event ev;
CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev));
@ -4880,6 +4953,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
case GGML_OP_IM2COL:
return true;
case GGML_OP_ARGSORT: {
load_cl_kernels_argsort(backend_ctx);
cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;
int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
@ -4897,6 +4972,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_FLASH_ATTN_EXT:
{
load_cl_kernels_flash_attn(backend_ctx);
const ggml_tensor * q = op->src[0];
const ggml_tensor * k = op->src[1];
const ggml_tensor * v = op->src[2];
@ -4964,7 +5041,7 @@ static ggml_backend_i ggml_backend_opencl_i = {
ggml_backend_t ggml_backend_opencl_init(void) {
ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_opencl_reg(), 0);
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
ggml_backend_opencl_context *backend_ctx = ggml_cl_init(dev);
ggml_backend_t backend = new ggml_backend {
/* .guid = */ ggml_backend_opencl_guid(),
@ -5343,15 +5420,13 @@ static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer)
}
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device);
return (void *) (uintptr_t) backend_ctx->alignment;
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context;
return (void *) (uintptr_t) dev_ctx->backend_ctx->alignment;
}
static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
ggml_cl2_init(buffer->buft->device);
if (tensor->view_src != nullptr) {
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
@ -5391,7 +5466,8 @@ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buff
}
static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context;
ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx;
cl_context context = backend_ctx->context;
cl_command_queue queue = backend_ctx->queue;
@ -6626,7 +6702,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor->extra);
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context;
ggml_backend_opencl_context *backend_ctx = dev_ctx->backend_ctx;
cl_context context = backend_ctx->context;
cl_command_queue queue = backend_ctx->queue;
@ -7470,8 +7547,9 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
}
static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
ggml_backend_dev_t dev = buffer->buft->device;
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context;
ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx;
cl_command_queue queue = backend_ctx->queue;
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
@ -7511,7 +7589,8 @@ static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer
}
static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer_type->device);
ggml_backend_opencl_context *backend_ctx = ggml_cl_init(buffer_type->device);
load_cl_kernels(backend_ctx);
// clCreateBuffer returns -61 for size 0
size = std::max(size, (size_t)1);
@ -7534,15 +7613,15 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b
}
static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
return backend_ctx->alignment;
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer_type->device->context;
return dev_ctx->backend_ctx->alignment;
}
static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
static size_t max_size = -1;
if (max_size == (size_t)-1) {
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
max_size = backend_ctx->max_alloc_size;
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer_type->device->context;
max_size = dev_ctx->backend_ctx->max_alloc_size;
}
return max_size;
}
@ -7579,14 +7658,13 @@ static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_
static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *) dev_ctx->backend_ctx;
static const size_t opencl_extra_margin = 1024ull*1024ull*1024ull;
// OpenCL does not provide reliable currently-free device memory.
// Use total/global memory as a best-effort upper bound.
// Improved safety: Reduce by a 1GiB extra margin for common --fit
*total = backend_ctx->global_mem_size;
*total = dev_ctx->global_mem_size;
*free = *total > opencl_extra_margin ? *total - opencl_extra_margin : 0;
}
@ -7610,7 +7688,7 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct
}
static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
ggml_backend_opencl_context * backend_ctx = ggml_cl_init(dev);
// Getting a new reference to the backend, increase ref_count
backend_ctx->ref_count++;
@ -7647,6 +7725,7 @@ static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_bac
}
static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
ggml_cl_init(dev);
return ggml_opencl_supports_op(dev, op);
}
@ -7659,8 +7738,8 @@ static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggm
// Check cl_context is the same. clEnqueue* commands may not use
// buffers from another cl_context.
ggml_backend_opencl_context * backend_ctx0 = ggml_cl2_init(dev);
ggml_backend_opencl_context * backend_ctx1 = ggml_cl2_init(buft->device);
ggml_backend_opencl_context * backend_ctx0 = ggml_cl_init(dev);
ggml_backend_opencl_context * backend_ctx1 = ggml_cl_init(buft->device);
return backend_ctx0->context == backend_ctx1->context;
}