diff --git a/Makefile b/Makefile index 99ce886ac..5e1591a72 100644 --- a/Makefile +++ b/Makefile @@ -55,8 +55,8 @@ ifdef KCPP_SANITIZE CFLAGS += -fsanitize=undefined -fsanitize-undefined-trap-on-error CXXFLAGS += -fsanitize=undefined -fsanitize-undefined-trap-on-error endif -CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/ttscpp/include -I./otherarch/ttscpp/src -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK -CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/ttscpp/include -I./otherarch/ttscpp/src -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK +CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/ttscpp/include -I./otherarch/ttscpp/src -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK +CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/ttscpp/include -I./otherarch/ttscpp/src -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK ifndef KCPP_DEBUG CFLAGS += -DNDEBUG -s diff --git a/OpenCL.dll b/OpenCL.dll deleted file mode 100644 index 16281318b..000000000 Binary files a/OpenCL.dll and /dev/null differ diff --git a/clblast.dll b/clblast.dll deleted file mode 100644 index 68012638d..000000000 Binary files a/clblast.dll and /dev/null differ diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 1c36e231b..cadc157ea 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -701,14 +701,7 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - union { char padding[8]; - union { - char trimmed_pad_1[3]; - char clblast_offload_gpu; //we sneak the flag for gpu offloading for clblast into the padding - char trimmed_pad_2[4]; - }; - }; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 546bf85e2..180f6a128 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -76,9 +76,6 @@ // precomputed f32 table for f16 (256 KB) (simd-mappings.h) float ggml_table_f32_f16[1 << 16]; -#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions -#include "ggml_v3b-opencl.h" -#endif #if defined(__ARM_ARCH) struct ggml_arm_arch_features_type { int sve_cnt; @@ -1263,15 +1260,6 @@ void ggml_compute_forward_mul_mat( // compute by src0 rows // TODO: extract to "extra_op" -#if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } - return; - } -#endif - #if GGML_USE_LLAMAFILE // broadcast factors const int64_t r2 = ne12 / ne02; @@ -3628,11 +3616,6 @@ struct ggml_cplan ggml_graph_plan( { const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type; -#if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) { - cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node); - } else -#endif if (node->src[1]->type != vec_dot_type) { size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); cur = MAX(cur, cur2); @@ -4561,9 +4544,6 @@ void ggml_cpu_init(void) { #if defined(__ARM_ARCH) ggml_init_arm_arch_features(); #endif -#if defined(GGML_USE_CLBLAST) - ggml_cl_init(); -#endif #if defined(__riscv) ggml_init_riscv_arch_features(); diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 0c3288c39..5721a2b56 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -2324,9 +2324,6 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons static const ggml::cpu::repack::tensor_traits iq4_nl_4x4_q8_0; bool permit_repack = true; -#if defined(GGML_USE_CLBLAST) - permit_repack = false; //kcpp: clblast cannot handle repacking -#endif // instance for Q8_0 static const ggml::cpu::repack::tensor_traits q8_0_4x4_q8_0; diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 54c667fdc..6c9900018 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -158,7 +158,7 @@ static int savestate_limit = 0; static std::vector savestates; inline int kcpp_cpu_has_blas(void) { -#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL) +#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) return 1; #else return 0; @@ -2389,21 +2389,6 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } } - #if defined(GGML_USE_CLBLAST) - if(file_format==FileFormat::GGUF_GENERIC && model_params.n_gpu_layers>0) - { - if(file_format_meta.model_architecture == GGUFArch::ARCH_FALCON) - { - printf("\nOpenCL does not support GPU Layer offloading for this model architecture! GPU Offload has been disabled.\n"); - model_params.n_gpu_layers = 0; - } - else if(file_format_meta.n_expert_count>1) - { - printf("\nOpenCL cannot use regular GPU offloading for this model architecture. A fallback GPU offloader will be used with degraded performance.\n"); - - } - } - #endif #if defined(GGML_USE_CUDA) if(kcpp_parseinfo_maindevice>0) { @@ -3278,7 +3263,7 @@ int GetThreadsToUse(bool blasmode) { if (blasmode) { - #if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) + #if defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) return kcpp_data->n_blasthreads; #else return std::min(kcpp_data->n_blasthreads, 4); diff --git a/include/CL/Utils/Context.h b/include/CL/Utils/Context.h deleted file mode 100644 index cebb8b9eb..000000000 --- a/include/CL/Utils/Context.h +++ /dev/null @@ -1,50 +0,0 @@ -#pragma once - -// OpenCL Utils includes -#include "OpenCLUtils_Export.h" - -// OpenCL includes -#include - -// STL includes -#include - -UTILS_EXPORT -cl_context cl_util_get_context(const cl_uint plat_id, const cl_uint dev_id, - const cl_device_type type, cl_int* const error); -UTILS_EXPORT -cl_device_id cl_util_get_device(const cl_uint plat_id, const cl_uint dev_id, - const cl_device_type type, cl_int* const error); - -UTILS_EXPORT -cl_int cl_util_print_device_info(const cl_device_id device); - -UTILS_EXPORT -char* cl_util_get_device_info(const cl_device_id device, - const cl_device_info info, cl_int* const error); -UTILS_EXPORT -char* cl_util_get_platform_info(const cl_platform_id platform, - const cl_platform_info info, - cl_int* const error); - -// build program and show log if build is not successful -UTILS_EXPORT -cl_int cl_util_build_program(const cl_program pr, const cl_device_id dev, - const char* const opt); - -#define GET_CURRENT_TIMER(time) \ - struct timespec time; \ - timespec_get(&time, TIME_UTC); \ - { \ - } - -#define TIMER_DIFFERENCE(dt, time1, time2) \ - { \ - dt = (time2.tv_sec - time1.tv_sec) * 1000000000 \ - + (time2.tv_nsec - time1.tv_nsec); \ - } - -#define START_TIMER GET_CURRENT_TIMER(start_timer1) -#define STOP_TIMER(dt) \ - GET_CURRENT_TIMER(stop_timer2) \ - TIMER_DIFFERENCE(dt, start_timer1, stop_timer2) diff --git a/include/CL/Utils/Context.hpp b/include/CL/Utils/Context.hpp deleted file mode 100644 index bd1110c34..000000000 --- a/include/CL/Utils/Context.hpp +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -// OpenCL SDK includes -#include "OpenCLUtilsCpp_Export.h" - -#include - -// OpenCL includes -#include - -namespace cl { -namespace util { - Context UTILSCPP_EXPORT get_context(cl_uint plat_id, cl_uint dev_id, - cl_device_type type, - cl_int* error = nullptr); -} -} diff --git a/include/CL/Utils/Detail.hpp b/include/CL/Utils/Detail.hpp deleted file mode 100644 index 49cccd02c..000000000 --- a/include/CL/Utils/Detail.hpp +++ /dev/null @@ -1,84 +0,0 @@ -#pragma once - -// STL includes -#include -#include // std::forward, std::integer_sequence -#include // std::tuple, std::get -#include // std::initializer_list - -namespace cl { -namespace util { - namespace detail { - // Borrowed from: - // https://www.fluentcpp.com/2019/03/05/for_each_arg-applying-a-function-to-each-argument-of-a-function-in-cpp/ - template F for_each_arg(F f, Args&&... args) - { - (void)std::initializer_list{ ( - (void)f(std::forward(args)), 0)... }; - return f; - } - - namespace impl { - // Borrowed from: https://stackoverflow.com/a/16387374/1476661 - template - void for_each_in_tuple(T&& t, F&& f, - std::integer_sequence) - { - auto l = { - (std::forward(f)(std::get(std::forward(t))), 0)... - }; - (void)l; - } - } - template - void for_each_in_tuple(std::tuple const& t, F&& f) - { - impl::for_each_in_tuple( - t, std::forward(f), - std::make_integer_sequence()); - } - - namespace impl { - // Borrowed from - // https://codereview.stackexchange.com/questions/193420/apply-a-function-to-each-element-of-a-tuple-map-a-tuple - template - auto transform_tuple(Tuple&& t, F&& f, std::index_sequence) - { - return std::make_tuple(std::forward(f)(std::get(t))...); - } - } - template - auto transform_tuple(const std::tuple& t, F&& f) - { - return impl::transform_tuple( - t, std::forward(f), - std::make_index_sequence{}); - } - - namespace impl { - // Borrowed from - // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2013/n3658.html - // with modifications of Casey Carter at - // https://stackoverflow.com/a/51365112/1476661 - template - auto apply(F&& f, Tuple&& args, std::index_sequence) - -> decltype(std::forward(f)( - std::get(std::forward(args))...)) - { - return std::forward(f)( - std::get(std::forward(args))...); - } - } - template >::value>> - auto apply(F&& f, Tuple&& args) - -> decltype(impl::apply(std::forward(f), - std::forward(args), Indices())) - { - return impl::apply(std::forward(f), std::forward(args), - Indices()); - } - } -} -} diff --git a/include/CL/Utils/Device.hpp b/include/CL/Utils/Device.hpp deleted file mode 100644 index 0a8aed8b6..000000000 --- a/include/CL/Utils/Device.hpp +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -#include "OpenCLUtilsCpp_Export.h" -#include - -#include - -namespace cl { -namespace util { - bool UTILSCPP_EXPORT opencl_c_version_contains( - const cl::Device& device, const cl::string& version_fragment); - - bool UTILSCPP_EXPORT supports_extension(const cl::Device& device, - const cl::string& extension); - -#ifdef CL_VERSION_3_0 - bool UTILSCPP_EXPORT supports_feature(const cl::Device& device, - const cl::string& feature_name); -#endif -} -} diff --git a/include/CL/Utils/Error.h b/include/CL/Utils/Error.h deleted file mode 100644 index bf6e24bf5..000000000 --- a/include/CL/Utils/Error.h +++ /dev/null @@ -1,88 +0,0 @@ -#pragma once - -// OpenCL Utils includes -#include "OpenCLUtils_Export.h" - -// OpenCL Utils includes -#include - -// STL includes -#include // fprintf - -// OpenCL includes -#include - -// RET = function returns error code -// PAR = functions sets error code in the paremeter - -#ifdef _DEBUG - -#define OCLERROR_RET(func, err, label) \ - do \ - { \ - err = func; \ - if (err != CL_SUCCESS) \ - { \ - cl_util_print_error(err); \ - fprintf(stderr, "on line %d, in file %s\n%s\n", __LINE__, \ - __FILE__, #func); \ - goto label; \ - } \ - } while (0) - -#define OCLERROR_PAR(func, err, label) \ - do \ - { \ - func; \ - if (err != CL_SUCCESS) \ - { \ - cl_util_print_error(err); \ - fprintf(stderr, "on line %d, in file %s\n%s\n", __LINE__, \ - __FILE__, #func); \ - goto label; \ - } \ - } while (0) - -#define MEM_CHECK(func, err, label) \ - do \ - { \ - if ((func) == NULL) \ - { \ - err = CL_OUT_OF_HOST_MEMORY; \ - cl_util_print_error(err); \ - fprintf(stderr, "on line %d, in file %s\n%s\n", __LINE__, \ - __FILE__, #func); \ - goto label; \ - } \ - } while (0) - -#else - -#define OCLERROR_RET(func, err, label) \ - do \ - { \ - err = func; \ - if (err != CL_SUCCESS) goto label; \ - } while (0) - -#define OCLERROR_PAR(func, err, label) \ - do \ - { \ - func; \ - if (err != CL_SUCCESS) goto label; \ - } while (0) - -#define MEM_CHECK(func, err, label) \ - do \ - { \ - if ((func) == NULL) \ - { \ - err = CL_OUT_OF_HOST_MEMORY; \ - goto label; \ - } \ - } while (0) - -#endif - -UTILS_EXPORT -void cl_util_print_error(cl_int error); diff --git a/include/CL/Utils/Error.hpp b/include/CL/Utils/Error.hpp deleted file mode 100644 index 50df2f7b3..000000000 --- a/include/CL/Utils/Error.hpp +++ /dev/null @@ -1,70 +0,0 @@ -#pragma once - -// OpenCL Utils includes -#include "OpenCLUtilsCpp_Export.h" - -// OpenCL Utils includes -#include - -// OpenCL includes -#include - -namespace cl { -namespace util { -#if defined(CL_HPP_ENABLE_EXCEPTIONS) - /*! \brief Exception class - * - * This may be thrown by SDK utility functions when - * CL_HPP_ENABLE_EXCEPTIONS is defined. - */ - class Error : public std::exception { - private: - int err_; - const char* errStr_; - - public: - /*! \brief Create a new SDK error exception for a given error code - * and corresponding message. - * - * \param err error code value. - * - * \param errStr a descriptive string that must remain in scope until - * handling of the exception has concluded. If set, it - * will be returned by what(). - */ - Error(cl_int err, const char* errStr = NULL): err_(err), errStr_(errStr) - {} - - ~Error() throw() {} - - /*! \brief Get error string associated with exception - * - * \return A memory pointer to the error message string. - */ - virtual const char* what() const throw() - { - if (errStr_ == NULL) - { - return "empty"; - } - else - { - return errStr_; - } - } - - /*! \brief Get error code associated with exception - * - * \return The error code. - */ - cl_int err(void) const { return err_; } - }; -#endif - - namespace detail { - UTILSCPP_EXPORT cl_int errHandler(cl_int err, cl_int* errPtr, - const char* errStr = nullptr); - } - -} -} diff --git a/include/CL/Utils/ErrorCodes.h b/include/CL/Utils/ErrorCodes.h deleted file mode 100644 index a1f9d4a8e..000000000 --- a/include/CL/Utils/ErrorCodes.h +++ /dev/null @@ -1,5 +0,0 @@ -#pragma once - -#define CL_UTIL_INDEX_OUT_OF_RANGE -2000 -#define CL_UTIL_DEVICE_NOT_INTEROPERABLE -2001 -#define CL_UTIL_FILE_OPERATION_ERROR -2002 diff --git a/include/CL/Utils/Event.h b/include/CL/Utils/Event.h deleted file mode 100644 index f144e2157..000000000 --- a/include/CL/Utils/Event.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -// OpenCL Utils includes -#include "OpenCLUtils_Export.h" - -// OpenCL includes -#include - -UTILS_EXPORT -cl_ulong cl_util_get_event_duration(const cl_event event, - const cl_profiling_info start, - const cl_profiling_info end, - cl_int* const error); \ No newline at end of file diff --git a/include/CL/Utils/Event.hpp b/include/CL/Utils/Event.hpp deleted file mode 100644 index e5a6577eb..000000000 --- a/include/CL/Utils/Event.hpp +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -// OpenCL SDK includes -#include "OpenCLUtilsCpp_Export.h" - -// STL includes -#include - -// OpenCL includes -#include - -namespace cl { -namespace util { - template - auto get_duration(cl::Event& ev) - { - return std::chrono::duration_cast(std::chrono::nanoseconds{ - ev.getProfilingInfo() - ev.getProfilingInfo() }); - } -} -} diff --git a/include/CL/Utils/File.h b/include/CL/Utils/File.h deleted file mode 100644 index 62c8e95a7..000000000 --- a/include/CL/Utils/File.h +++ /dev/null @@ -1,42 +0,0 @@ -#pragma once - -// OpenCL Utils includes -#include "OpenCLUtils_Export.h" - -// OpenCL includes -#include - -// read all the text file contents securely in ANSI C89 -// return pointer to C-string with file contents -// can handle streams with no known size and no support for fseek -// based on https://stackoverflow.com/questions/14002954/ by Nominal Animal -UTILS_EXPORT -char* cl_util_read_text_file(const char* const filename, size_t* const length, - cl_int* const error); - -// read all the binary file contents securely in ANSI C89 -// return pointer to file contents -// can handle streams with no known size and no support for fseek -// based on https://stackoverflow.com/questions/14002954/ by Nominal Animal -UTILS_EXPORT -unsigned char* cl_util_read_binary_file(const char* const filename, - size_t* const length, - cl_int* const error); - -// write binaries of OpenCL compiled program -// binaries are written as separate files for each device -// with file name "(program_file_name)_(name of device).bin" -// based on variant of Logan -// http://logan.tw/posts/2014/11/22/pre-compile-the-opencl-kernel-program-part-2/ -UTILS_EXPORT -cl_int cl_util_write_binaries(const cl_program program, - const char* const program_file_name); - -// read binaries of OpenCL compiled program -// from files of file names "(program_file_name)_(name of device).bin" -UTILS_EXPORT -cl_program cl_util_read_binaries(const cl_context context, - const cl_device_id* const devices, - const cl_uint num_devices, - const char* const program_file_name, - cl_int* const error); diff --git a/include/CL/Utils/File.hpp b/include/CL/Utils/File.hpp deleted file mode 100644 index 984a32388..000000000 --- a/include/CL/Utils/File.hpp +++ /dev/null @@ -1,49 +0,0 @@ -#pragma once - -// OpenCL SDK includes -#include - -// STL includes -#include -#include - -namespace cl { -namespace util { - // Scott Meyers, Effective STL, Addison-Wesley Professional, 2001, Item 29 - // with error handling - UTILSCPP_EXPORT - std::string read_text_file(const char* const filename, cl_int* const error) - { - std::ifstream in(filename); - if (in.good()) - { - try - { - std::string red((std::istreambuf_iterator(in)), - std::istreambuf_iterator()); - if (in.good() && in.eof()) - { - if (error != nullptr) *error = CL_SUCCESS; - return red; - } - else - { - detail::errHandler(CL_UTIL_FILE_OPERATION_ERROR, error, - "File read error!"); - return std::string(); - } - } catch (std::bad_alloc& ex) - { - detail::errHandler(CL_OUT_OF_RESOURCES, error, - "Bad allocation!"); - return std::string(); - } - } - else - { - detail::errHandler(CL_INVALID_VALUE, error, "No file!"); - return std::string(); - } - } -} -} diff --git a/include/CL/Utils/InteropContext.hpp b/include/CL/Utils/InteropContext.hpp deleted file mode 100644 index 2d8fdbfdc..000000000 --- a/include/CL/Utils/InteropContext.hpp +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#include "OpenCLUtilsCpp_Export.h" -#include - -#include - -namespace cl { -namespace util { - vector - UTILSCPP_EXPORT get_interop_context_properties(const cl::Device& plat, - cl_int* error = nullptr); - - Context UTILSCPP_EXPORT get_interop_context(int plat_id, int dev_id, - cl_device_type type, - cl_int* error = nullptr); -} -} diff --git a/include/CL/Utils/OpenCLUtilsCpp_Export.h b/include/CL/Utils/OpenCLUtilsCpp_Export.h deleted file mode 100644 index b063c9fe1..000000000 --- a/include/CL/Utils/OpenCLUtilsCpp_Export.h +++ /dev/null @@ -1,42 +0,0 @@ - -#ifndef UTILSCPP_EXPORT_H -#define UTILSCPP_EXPORT_H - -#ifdef OPENCLUTILSCPP_STATIC_DEFINE -# define UTILSCPP_EXPORT -# define OPENCLUTILSCPP_NO_EXPORT -#else -# ifndef UTILSCPP_EXPORT -# ifdef OpenCLUtilsCpp_EXPORTS - /* We are building this library */ -# define UTILSCPP_EXPORT -# else - /* We are using this library */ -# define UTILSCPP_EXPORT -# endif -# endif - -# ifndef OPENCLUTILSCPP_NO_EXPORT -# define OPENCLUTILSCPP_NO_EXPORT -# endif -#endif - -#ifndef OPENCLUTILSCPP_DEPRECATED -# define OPENCLUTILSCPP_DEPRECATED __declspec(deprecated) -#endif - -#ifndef OPENCLUTILSCPP_DEPRECATED_EXPORT -# define OPENCLUTILSCPP_DEPRECATED_EXPORT UTILSCPP_EXPORT OPENCLUTILSCPP_DEPRECATED -#endif - -#ifndef OPENCLUTILSCPP_DEPRECATED_NO_EXPORT -# define OPENCLUTILSCPP_DEPRECATED_NO_EXPORT OPENCLUTILSCPP_NO_EXPORT OPENCLUTILSCPP_DEPRECATED -#endif - -#if 0 /* DEFINE_NO_DEPRECATED */ -# ifndef OPENCLUTILSCPP_NO_DEPRECATED -# define OPENCLUTILSCPP_NO_DEPRECATED -# endif -#endif - -#endif /* UTILSCPP_EXPORT_H */ diff --git a/include/CL/Utils/OpenCLUtils_Export.h b/include/CL/Utils/OpenCLUtils_Export.h deleted file mode 100644 index 2db857ec0..000000000 --- a/include/CL/Utils/OpenCLUtils_Export.h +++ /dev/null @@ -1,42 +0,0 @@ - -#ifndef UTILS_EXPORT_H -#define UTILS_EXPORT_H - -#ifdef OPENCLUTILS_STATIC_DEFINE -# define UTILS_EXPORT -# define OPENCLUTILS_NO_EXPORT -#else -# ifndef UTILS_EXPORT -# ifdef OpenCLUtils_EXPORTS - /* We are building this library */ -# define UTILS_EXPORT -# else - /* We are using this library */ -# define UTILS_EXPORT -# endif -# endif - -# ifndef OPENCLUTILS_NO_EXPORT -# define OPENCLUTILS_NO_EXPORT -# endif -#endif - -#ifndef OPENCLUTILS_DEPRECATED -# define OPENCLUTILS_DEPRECATED __declspec(deprecated) -#endif - -#ifndef OPENCLUTILS_DEPRECATED_EXPORT -# define OPENCLUTILS_DEPRECATED_EXPORT UTILS_EXPORT OPENCLUTILS_DEPRECATED -#endif - -#ifndef OPENCLUTILS_DEPRECATED_NO_EXPORT -# define OPENCLUTILS_DEPRECATED_NO_EXPORT OPENCLUTILS_NO_EXPORT OPENCLUTILS_DEPRECATED -#endif - -#if 0 /* DEFINE_NO_DEPRECATED */ -# ifndef OPENCLUTILS_NO_DEPRECATED -# define OPENCLUTILS_NO_DEPRECATED -# endif -#endif - -#endif /* UTILS_EXPORT_H */ diff --git a/include/CL/Utils/Platform.hpp b/include/CL/Utils/Platform.hpp deleted file mode 100644 index 9957d76ba..000000000 --- a/include/CL/Utils/Platform.hpp +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -#include "OpenCLUtilsCpp_Export.h" -#include - -#include - -namespace cl { -namespace util { - bool UTILSCPP_EXPORT supports_extension(const cl::Platform& platform, - const cl::string& extension); - - bool UTILSCPP_EXPORT platform_version_contains( - const cl::Platform& platform, const cl::string& version_fragment); -} -} diff --git a/include/CL/Utils/Utils.h b/include/CL/Utils/Utils.h deleted file mode 100644 index 9d1c2471f..000000000 --- a/include/CL/Utils/Utils.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -// OpenCL Utils includes -#include "OpenCLUtils_Export.h" - -#include -#include -#include - -// OpenCL includes -#include diff --git a/include/CL/Utils/Utils.hpp b/include/CL/Utils/Utils.hpp deleted file mode 100644 index 40dd8d795..000000000 --- a/include/CL/Utils/Utils.hpp +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -// OpenCL Utils includes -#include "OpenCLUtils_Export.h" - -#include -#include -#include -#include -#include -#include - -// OpenCL includes -#include diff --git a/include/CL/cl.h b/include/CL/cl.h deleted file mode 100644 index 6c700ab17..000000000 --- a/include/CL/cl.h +++ /dev/null @@ -1,1936 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef __OPENCL_CL_H -#define __OPENCL_CL_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/******************************************************************************/ - -typedef struct _cl_platform_id * cl_platform_id; -typedef struct _cl_device_id * cl_device_id; -typedef struct _cl_context * cl_context; -typedef struct _cl_command_queue * cl_command_queue; -typedef struct _cl_mem * cl_mem; -typedef struct _cl_program * cl_program; -typedef struct _cl_kernel * cl_kernel; -typedef struct _cl_event * cl_event; -typedef struct _cl_sampler * cl_sampler; - -typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ -typedef cl_ulong cl_bitfield; -typedef cl_ulong cl_properties; -typedef cl_bitfield cl_device_type; -typedef cl_uint cl_platform_info; -typedef cl_uint cl_device_info; -typedef cl_bitfield cl_device_fp_config; -typedef cl_uint cl_device_mem_cache_type; -typedef cl_uint cl_device_local_mem_type; -typedef cl_bitfield cl_device_exec_capabilities; -#ifdef CL_VERSION_2_0 -typedef cl_bitfield cl_device_svm_capabilities; -#endif -typedef cl_bitfield cl_command_queue_properties; -#ifdef CL_VERSION_1_2 -typedef intptr_t cl_device_partition_property; -typedef cl_bitfield cl_device_affinity_domain; -#endif - -typedef intptr_t cl_context_properties; -typedef cl_uint cl_context_info; -#ifdef CL_VERSION_2_0 -typedef cl_properties cl_queue_properties; -#endif -typedef cl_uint cl_command_queue_info; -typedef cl_uint cl_channel_order; -typedef cl_uint cl_channel_type; -typedef cl_bitfield cl_mem_flags; -#ifdef CL_VERSION_2_0 -typedef cl_bitfield cl_svm_mem_flags; -#endif -typedef cl_uint cl_mem_object_type; -typedef cl_uint cl_mem_info; -#ifdef CL_VERSION_1_2 -typedef cl_bitfield cl_mem_migration_flags; -#endif -typedef cl_uint cl_image_info; -#ifdef CL_VERSION_1_1 -typedef cl_uint cl_buffer_create_type; -#endif -typedef cl_uint cl_addressing_mode; -typedef cl_uint cl_filter_mode; -typedef cl_uint cl_sampler_info; -typedef cl_bitfield cl_map_flags; -#ifdef CL_VERSION_2_0 -typedef intptr_t cl_pipe_properties; -typedef cl_uint cl_pipe_info; -#endif -typedef cl_uint cl_program_info; -typedef cl_uint cl_program_build_info; -#ifdef CL_VERSION_1_2 -typedef cl_uint cl_program_binary_type; -#endif -typedef cl_int cl_build_status; -typedef cl_uint cl_kernel_info; -#ifdef CL_VERSION_1_2 -typedef cl_uint cl_kernel_arg_info; -typedef cl_uint cl_kernel_arg_address_qualifier; -typedef cl_uint cl_kernel_arg_access_qualifier; -typedef cl_bitfield cl_kernel_arg_type_qualifier; -#endif -typedef cl_uint cl_kernel_work_group_info; -#ifdef CL_VERSION_2_1 -typedef cl_uint cl_kernel_sub_group_info; -#endif -typedef cl_uint cl_event_info; -typedef cl_uint cl_command_type; -typedef cl_uint cl_profiling_info; -#ifdef CL_VERSION_2_0 -typedef cl_properties cl_sampler_properties; -typedef cl_uint cl_kernel_exec_info; -#endif -#ifdef CL_VERSION_3_0 -typedef cl_bitfield cl_device_atomic_capabilities; -typedef cl_bitfield cl_device_device_enqueue_capabilities; -typedef cl_uint cl_khronos_vendor_id; -typedef cl_properties cl_mem_properties; -typedef cl_uint cl_version; -#endif - -typedef struct _cl_image_format { - cl_channel_order image_channel_order; - cl_channel_type image_channel_data_type; -} cl_image_format; - -#ifdef CL_VERSION_1_2 - -typedef struct _cl_image_desc { - cl_mem_object_type image_type; - size_t image_width; - size_t image_height; - size_t image_depth; - size_t image_array_size; - size_t image_row_pitch; - size_t image_slice_pitch; - cl_uint num_mip_levels; - cl_uint num_samples; -#ifdef CL_VERSION_2_0 -#if defined(__GNUC__) - __extension__ /* Prevents warnings about anonymous union in -pedantic builds */ -#endif -#if defined(_MSC_VER) && !defined(__STDC__) -#pragma warning( push ) -#pragma warning( disable : 4201 ) /* Prevents warning about nameless struct/union in /W4 builds */ -#endif -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wc11-extensions" /* Prevents warning about nameless union being C11 extension*/ -#endif -#if defined(_MSC_VER) && defined(__STDC__) - /* Anonymous unions are not supported in /Za builds */ -#else - union { -#endif -#endif - cl_mem buffer; -#ifdef CL_VERSION_2_0 -#if defined(_MSC_VER) && defined(__STDC__) - /* Anonymous unions are not supported in /Za builds */ -#else - cl_mem mem_object; - }; -#endif -#if defined(_MSC_VER) && !defined(__STDC__) -#pragma warning( pop ) -#endif -#ifdef __clang__ -#pragma clang diagnostic pop -#endif -#endif -} cl_image_desc; - -#endif - -#ifdef CL_VERSION_1_1 - -typedef struct _cl_buffer_region { - size_t origin; - size_t size; -} cl_buffer_region; - -#endif - -#ifdef CL_VERSION_3_0 - -#define CL_NAME_VERSION_MAX_NAME_SIZE 64 - -typedef struct _cl_name_version { - cl_version version; - char name[CL_NAME_VERSION_MAX_NAME_SIZE]; -} cl_name_version; - -#endif - -/******************************************************************************/ - -/* Error Codes */ -#define CL_SUCCESS 0 -#define CL_DEVICE_NOT_FOUND -1 -#define CL_DEVICE_NOT_AVAILABLE -2 -#define CL_COMPILER_NOT_AVAILABLE -3 -#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 -#define CL_OUT_OF_RESOURCES -5 -#define CL_OUT_OF_HOST_MEMORY -6 -#define CL_PROFILING_INFO_NOT_AVAILABLE -7 -#define CL_MEM_COPY_OVERLAP -8 -#define CL_IMAGE_FORMAT_MISMATCH -9 -#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 -#define CL_BUILD_PROGRAM_FAILURE -11 -#define CL_MAP_FAILURE -12 -#ifdef CL_VERSION_1_1 -#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 -#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 -#endif -#ifdef CL_VERSION_1_2 -#define CL_COMPILE_PROGRAM_FAILURE -15 -#define CL_LINKER_NOT_AVAILABLE -16 -#define CL_LINK_PROGRAM_FAILURE -17 -#define CL_DEVICE_PARTITION_FAILED -18 -#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 -#endif - -#define CL_INVALID_VALUE -30 -#define CL_INVALID_DEVICE_TYPE -31 -#define CL_INVALID_PLATFORM -32 -#define CL_INVALID_DEVICE -33 -#define CL_INVALID_CONTEXT -34 -#define CL_INVALID_QUEUE_PROPERTIES -35 -#define CL_INVALID_COMMAND_QUEUE -36 -#define CL_INVALID_HOST_PTR -37 -#define CL_INVALID_MEM_OBJECT -38 -#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 -#define CL_INVALID_IMAGE_SIZE -40 -#define CL_INVALID_SAMPLER -41 -#define CL_INVALID_BINARY -42 -#define CL_INVALID_BUILD_OPTIONS -43 -#define CL_INVALID_PROGRAM -44 -#define CL_INVALID_PROGRAM_EXECUTABLE -45 -#define CL_INVALID_KERNEL_NAME -46 -#define CL_INVALID_KERNEL_DEFINITION -47 -#define CL_INVALID_KERNEL -48 -#define CL_INVALID_ARG_INDEX -49 -#define CL_INVALID_ARG_VALUE -50 -#define CL_INVALID_ARG_SIZE -51 -#define CL_INVALID_KERNEL_ARGS -52 -#define CL_INVALID_WORK_DIMENSION -53 -#define CL_INVALID_WORK_GROUP_SIZE -54 -#define CL_INVALID_WORK_ITEM_SIZE -55 -#define CL_INVALID_GLOBAL_OFFSET -56 -#define CL_INVALID_EVENT_WAIT_LIST -57 -#define CL_INVALID_EVENT -58 -#define CL_INVALID_OPERATION -59 -#define CL_INVALID_GL_OBJECT -60 -#define CL_INVALID_BUFFER_SIZE -61 -#define CL_INVALID_MIP_LEVEL -62 -#define CL_INVALID_GLOBAL_WORK_SIZE -63 -#ifdef CL_VERSION_1_1 -#define CL_INVALID_PROPERTY -64 -#endif -#ifdef CL_VERSION_1_2 -#define CL_INVALID_IMAGE_DESCRIPTOR -65 -#define CL_INVALID_COMPILER_OPTIONS -66 -#define CL_INVALID_LINKER_OPTIONS -67 -#define CL_INVALID_DEVICE_PARTITION_COUNT -68 -#endif -#ifdef CL_VERSION_2_0 -#define CL_INVALID_PIPE_SIZE -69 -#define CL_INVALID_DEVICE_QUEUE -70 -#endif -#ifdef CL_VERSION_2_2 -#define CL_INVALID_SPEC_ID -71 -#define CL_MAX_SIZE_RESTRICTION_EXCEEDED -72 -#endif - - -/* cl_bool */ -#define CL_FALSE 0 -#define CL_TRUE 1 -#ifdef CL_VERSION_1_2 -#define CL_BLOCKING CL_TRUE -#define CL_NON_BLOCKING CL_FALSE -#endif - -/* cl_platform_info */ -#define CL_PLATFORM_PROFILE 0x0900 -#define CL_PLATFORM_VERSION 0x0901 -#define CL_PLATFORM_NAME 0x0902 -#define CL_PLATFORM_VENDOR 0x0903 -#define CL_PLATFORM_EXTENSIONS 0x0904 -#ifdef CL_VERSION_2_1 -#define CL_PLATFORM_HOST_TIMER_RESOLUTION 0x0905 -#endif -#ifdef CL_VERSION_3_0 -#define CL_PLATFORM_NUMERIC_VERSION 0x0906 -#define CL_PLATFORM_EXTENSIONS_WITH_VERSION 0x0907 -#endif - -/* cl_device_type - bitfield */ -#define CL_DEVICE_TYPE_DEFAULT (1 << 0) -#define CL_DEVICE_TYPE_CPU (1 << 1) -#define CL_DEVICE_TYPE_GPU (1 << 2) -#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) -#ifdef CL_VERSION_1_2 -#define CL_DEVICE_TYPE_CUSTOM (1 << 4) -#endif -#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF - -/* cl_device_info */ -#define CL_DEVICE_TYPE 0x1000 -#define CL_DEVICE_VENDOR_ID 0x1001 -#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 -#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 -#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 -#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B -#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C -#define CL_DEVICE_ADDRESS_BITS 0x100D -#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E -#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F -#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 -#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 -#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 -#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 -#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 -#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 -#define CL_DEVICE_IMAGE_SUPPORT 0x1016 -#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 -#define CL_DEVICE_MAX_SAMPLERS 0x1018 -#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 -#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A -#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B -#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C -#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D -#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E -#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F -#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 -#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 -#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 -#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 -#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 -#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 -#define CL_DEVICE_ENDIAN_LITTLE 0x1026 -#define CL_DEVICE_AVAILABLE 0x1027 -#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 -#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 -#define CL_DEVICE_QUEUE_PROPERTIES 0x102A /* deprecated */ -#ifdef CL_VERSION_2_0 -#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES 0x102A -#endif -#define CL_DEVICE_NAME 0x102B -#define CL_DEVICE_VENDOR 0x102C -#define CL_DRIVER_VERSION 0x102D -#define CL_DEVICE_PROFILE 0x102E -#define CL_DEVICE_VERSION 0x102F -#define CL_DEVICE_EXTENSIONS 0x1030 -#define CL_DEVICE_PLATFORM 0x1031 -#ifdef CL_VERSION_1_2 -#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 -#endif -/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG which is already defined in "cl_ext.h" */ -#ifdef CL_VERSION_1_1 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 -#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 /* deprecated */ -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C -#define CL_DEVICE_OPENCL_C_VERSION 0x103D -#endif -#ifdef CL_VERSION_1_2 -#define CL_DEVICE_LINKER_AVAILABLE 0x103E -#define CL_DEVICE_BUILT_IN_KERNELS 0x103F -#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 -#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 -#define CL_DEVICE_PARENT_DEVICE 0x1042 -#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 -#define CL_DEVICE_PARTITION_PROPERTIES 0x1044 -#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 -#define CL_DEVICE_PARTITION_TYPE 0x1046 -#define CL_DEVICE_REFERENCE_COUNT 0x1047 -#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 -#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 -#endif -#ifdef CL_VERSION_2_0 -#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A -#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B -#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS 0x104C -#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE 0x104D -#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES 0x104E -#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE 0x104F -#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE 0x1050 -#define CL_DEVICE_MAX_ON_DEVICE_QUEUES 0x1051 -#define CL_DEVICE_MAX_ON_DEVICE_EVENTS 0x1052 -#define CL_DEVICE_SVM_CAPABILITIES 0x1053 -#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE 0x1054 -#define CL_DEVICE_MAX_PIPE_ARGS 0x1055 -#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS 0x1056 -#define CL_DEVICE_PIPE_MAX_PACKET_SIZE 0x1057 -#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT 0x1058 -#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT 0x1059 -#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT 0x105A -#endif -#ifdef CL_VERSION_2_1 -#define CL_DEVICE_IL_VERSION 0x105B -#define CL_DEVICE_MAX_NUM_SUB_GROUPS 0x105C -#define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D -#endif -#ifdef CL_VERSION_3_0 -#define CL_DEVICE_NUMERIC_VERSION 0x105E -#define CL_DEVICE_EXTENSIONS_WITH_VERSION 0x1060 -#define CL_DEVICE_ILS_WITH_VERSION 0x1061 -#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION 0x1062 -#define CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES 0x1063 -#define CL_DEVICE_ATOMIC_FENCE_CAPABILITIES 0x1064 -#define CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT 0x1065 -#define CL_DEVICE_OPENCL_C_ALL_VERSIONS 0x1066 -#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x1067 -#define CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT 0x1068 -#define CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT 0x1069 -/* 0x106A to 0x106E - Reserved for upcoming KHR extension */ -#define CL_DEVICE_OPENCL_C_FEATURES 0x106F -#define CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES 0x1070 -#define CL_DEVICE_PIPE_SUPPORT 0x1071 -#define CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED 0x1072 -#endif - -/* cl_device_fp_config - bitfield */ -#define CL_FP_DENORM (1 << 0) -#define CL_FP_INF_NAN (1 << 1) -#define CL_FP_ROUND_TO_NEAREST (1 << 2) -#define CL_FP_ROUND_TO_ZERO (1 << 3) -#define CL_FP_ROUND_TO_INF (1 << 4) -#define CL_FP_FMA (1 << 5) -#ifdef CL_VERSION_1_1 -#define CL_FP_SOFT_FLOAT (1 << 6) -#endif -#ifdef CL_VERSION_1_2 -#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7) -#endif - -/* cl_device_mem_cache_type */ -#define CL_NONE 0x0 -#define CL_READ_ONLY_CACHE 0x1 -#define CL_READ_WRITE_CACHE 0x2 - -/* cl_device_local_mem_type */ -#define CL_LOCAL 0x1 -#define CL_GLOBAL 0x2 - -/* cl_device_exec_capabilities - bitfield */ -#define CL_EXEC_KERNEL (1 << 0) -#define CL_EXEC_NATIVE_KERNEL (1 << 1) - -/* cl_command_queue_properties - bitfield */ -#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) -#define CL_QUEUE_PROFILING_ENABLE (1 << 1) -#ifdef CL_VERSION_2_0 -#define CL_QUEUE_ON_DEVICE (1 << 2) -#define CL_QUEUE_ON_DEVICE_DEFAULT (1 << 3) -#endif - -/* cl_context_info */ -#define CL_CONTEXT_REFERENCE_COUNT 0x1080 -#define CL_CONTEXT_DEVICES 0x1081 -#define CL_CONTEXT_PROPERTIES 0x1082 -#ifdef CL_VERSION_1_1 -#define CL_CONTEXT_NUM_DEVICES 0x1083 -#endif - -/* cl_context_properties */ -#define CL_CONTEXT_PLATFORM 0x1084 -#ifdef CL_VERSION_1_2 -#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085 -#endif - -#ifdef CL_VERSION_1_2 - -/* cl_device_partition_property */ -#define CL_DEVICE_PARTITION_EQUALLY 0x1086 -#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087 -#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0 -#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088 - -#endif - -#ifdef CL_VERSION_1_2 - -/* cl_device_affinity_domain */ -#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0) -#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1) -#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2) -#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3) -#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4) -#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5) - -#endif - -#ifdef CL_VERSION_2_0 - -/* cl_device_svm_capabilities */ -#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER (1 << 0) -#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER (1 << 1) -#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM (1 << 2) -#define CL_DEVICE_SVM_ATOMICS (1 << 3) - -#endif - -/* cl_command_queue_info */ -#define CL_QUEUE_CONTEXT 0x1090 -#define CL_QUEUE_DEVICE 0x1091 -#define CL_QUEUE_REFERENCE_COUNT 0x1092 -#define CL_QUEUE_PROPERTIES 0x1093 -#ifdef CL_VERSION_2_0 -#define CL_QUEUE_SIZE 0x1094 -#endif -#ifdef CL_VERSION_2_1 -#define CL_QUEUE_DEVICE_DEFAULT 0x1095 -#endif -#ifdef CL_VERSION_3_0 -#define CL_QUEUE_PROPERTIES_ARRAY 0x1098 -#endif - -/* cl_mem_flags and cl_svm_mem_flags - bitfield */ -#define CL_MEM_READ_WRITE (1 << 0) -#define CL_MEM_WRITE_ONLY (1 << 1) -#define CL_MEM_READ_ONLY (1 << 2) -#define CL_MEM_USE_HOST_PTR (1 << 3) -#define CL_MEM_ALLOC_HOST_PTR (1 << 4) -#define CL_MEM_COPY_HOST_PTR (1 << 5) -/* reserved (1 << 6) */ -#ifdef CL_VERSION_1_2 -#define CL_MEM_HOST_WRITE_ONLY (1 << 7) -#define CL_MEM_HOST_READ_ONLY (1 << 8) -#define CL_MEM_HOST_NO_ACCESS (1 << 9) -#endif -#ifdef CL_VERSION_2_0 -#define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10) /* used by cl_svm_mem_flags only */ -#define CL_MEM_SVM_ATOMICS (1 << 11) /* used by cl_svm_mem_flags only */ -#define CL_MEM_KERNEL_READ_AND_WRITE (1 << 12) -#endif - -#ifdef CL_VERSION_1_2 - -/* cl_mem_migration_flags - bitfield */ -#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0) -#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1) - -#endif - -/* cl_channel_order */ -#define CL_R 0x10B0 -#define CL_A 0x10B1 -#define CL_RG 0x10B2 -#define CL_RA 0x10B3 -#define CL_RGB 0x10B4 -#define CL_RGBA 0x10B5 -#define CL_BGRA 0x10B6 -#define CL_ARGB 0x10B7 -#define CL_INTENSITY 0x10B8 -#define CL_LUMINANCE 0x10B9 -#ifdef CL_VERSION_1_1 -#define CL_Rx 0x10BA -#define CL_RGx 0x10BB -#define CL_RGBx 0x10BC -#endif -#ifdef CL_VERSION_1_2 -#define CL_DEPTH 0x10BD -#define CL_DEPTH_STENCIL 0x10BE -#endif -#ifdef CL_VERSION_2_0 -#define CL_sRGB 0x10BF -#define CL_sRGBx 0x10C0 -#define CL_sRGBA 0x10C1 -#define CL_sBGRA 0x10C2 -#define CL_ABGR 0x10C3 -#endif - -/* cl_channel_type */ -#define CL_SNORM_INT8 0x10D0 -#define CL_SNORM_INT16 0x10D1 -#define CL_UNORM_INT8 0x10D2 -#define CL_UNORM_INT16 0x10D3 -#define CL_UNORM_SHORT_565 0x10D4 -#define CL_UNORM_SHORT_555 0x10D5 -#define CL_UNORM_INT_101010 0x10D6 -#define CL_SIGNED_INT8 0x10D7 -#define CL_SIGNED_INT16 0x10D8 -#define CL_SIGNED_INT32 0x10D9 -#define CL_UNSIGNED_INT8 0x10DA -#define CL_UNSIGNED_INT16 0x10DB -#define CL_UNSIGNED_INT32 0x10DC -#define CL_HALF_FLOAT 0x10DD -#define CL_FLOAT 0x10DE -#ifdef CL_VERSION_1_2 -#define CL_UNORM_INT24 0x10DF -#endif -#ifdef CL_VERSION_2_1 -#define CL_UNORM_INT_101010_2 0x10E0 -#endif - -/* cl_mem_object_type */ -#define CL_MEM_OBJECT_BUFFER 0x10F0 -#define CL_MEM_OBJECT_IMAGE2D 0x10F1 -#define CL_MEM_OBJECT_IMAGE3D 0x10F2 -#ifdef CL_VERSION_1_2 -#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 -#define CL_MEM_OBJECT_IMAGE1D 0x10F4 -#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 -#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 -#endif -#ifdef CL_VERSION_2_0 -#define CL_MEM_OBJECT_PIPE 0x10F7 -#endif - -/* cl_mem_info */ -#define CL_MEM_TYPE 0x1100 -#define CL_MEM_FLAGS 0x1101 -#define CL_MEM_SIZE 0x1102 -#define CL_MEM_HOST_PTR 0x1103 -#define CL_MEM_MAP_COUNT 0x1104 -#define CL_MEM_REFERENCE_COUNT 0x1105 -#define CL_MEM_CONTEXT 0x1106 -#ifdef CL_VERSION_1_1 -#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 -#define CL_MEM_OFFSET 0x1108 -#endif -#ifdef CL_VERSION_2_0 -#define CL_MEM_USES_SVM_POINTER 0x1109 -#endif -#ifdef CL_VERSION_3_0 -#define CL_MEM_PROPERTIES 0x110A -#endif - -/* cl_image_info */ -#define CL_IMAGE_FORMAT 0x1110 -#define CL_IMAGE_ELEMENT_SIZE 0x1111 -#define CL_IMAGE_ROW_PITCH 0x1112 -#define CL_IMAGE_SLICE_PITCH 0x1113 -#define CL_IMAGE_WIDTH 0x1114 -#define CL_IMAGE_HEIGHT 0x1115 -#define CL_IMAGE_DEPTH 0x1116 -#ifdef CL_VERSION_1_2 -#define CL_IMAGE_ARRAY_SIZE 0x1117 -#define CL_IMAGE_BUFFER 0x1118 -#define CL_IMAGE_NUM_MIP_LEVELS 0x1119 -#define CL_IMAGE_NUM_SAMPLES 0x111A -#endif - - -/* cl_pipe_info */ -#ifdef CL_VERSION_2_0 -#define CL_PIPE_PACKET_SIZE 0x1120 -#define CL_PIPE_MAX_PACKETS 0x1121 -#endif -#ifdef CL_VERSION_3_0 -#define CL_PIPE_PROPERTIES 0x1122 -#endif - -/* cl_addressing_mode */ -#define CL_ADDRESS_NONE 0x1130 -#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 -#define CL_ADDRESS_CLAMP 0x1132 -#define CL_ADDRESS_REPEAT 0x1133 -#ifdef CL_VERSION_1_1 -#define CL_ADDRESS_MIRRORED_REPEAT 0x1134 -#endif - -/* cl_filter_mode */ -#define CL_FILTER_NEAREST 0x1140 -#define CL_FILTER_LINEAR 0x1141 - -/* cl_sampler_info */ -#define CL_SAMPLER_REFERENCE_COUNT 0x1150 -#define CL_SAMPLER_CONTEXT 0x1151 -#define CL_SAMPLER_NORMALIZED_COORDS 0x1152 -#define CL_SAMPLER_ADDRESSING_MODE 0x1153 -#define CL_SAMPLER_FILTER_MODE 0x1154 -#ifdef CL_VERSION_2_0 -/* These enumerants are for the cl_khr_mipmap_image extension. - They have since been added to cl_ext.h with an appropriate - KHR suffix, but are left here for backwards compatibility. */ -#define CL_SAMPLER_MIP_FILTER_MODE 0x1155 -#define CL_SAMPLER_LOD_MIN 0x1156 -#define CL_SAMPLER_LOD_MAX 0x1157 -#endif -#ifdef CL_VERSION_3_0 -#define CL_SAMPLER_PROPERTIES 0x1158 -#endif - -/* cl_map_flags - bitfield */ -#define CL_MAP_READ (1 << 0) -#define CL_MAP_WRITE (1 << 1) -#ifdef CL_VERSION_1_2 -#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) -#endif - -/* cl_program_info */ -#define CL_PROGRAM_REFERENCE_COUNT 0x1160 -#define CL_PROGRAM_CONTEXT 0x1161 -#define CL_PROGRAM_NUM_DEVICES 0x1162 -#define CL_PROGRAM_DEVICES 0x1163 -#define CL_PROGRAM_SOURCE 0x1164 -#define CL_PROGRAM_BINARY_SIZES 0x1165 -#define CL_PROGRAM_BINARIES 0x1166 -#ifdef CL_VERSION_1_2 -#define CL_PROGRAM_NUM_KERNELS 0x1167 -#define CL_PROGRAM_KERNEL_NAMES 0x1168 -#endif -#ifdef CL_VERSION_2_1 -#define CL_PROGRAM_IL 0x1169 -#endif -#ifdef CL_VERSION_2_2 -#define CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT 0x116A -#define CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT 0x116B -#endif - -/* cl_program_build_info */ -#define CL_PROGRAM_BUILD_STATUS 0x1181 -#define CL_PROGRAM_BUILD_OPTIONS 0x1182 -#define CL_PROGRAM_BUILD_LOG 0x1183 -#ifdef CL_VERSION_1_2 -#define CL_PROGRAM_BINARY_TYPE 0x1184 -#endif -#ifdef CL_VERSION_2_0 -#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185 -#endif - -#ifdef CL_VERSION_1_2 - -/* cl_program_binary_type */ -#define CL_PROGRAM_BINARY_TYPE_NONE 0x0 -#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1 -#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2 -#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4 - -#endif - -/* cl_build_status */ -#define CL_BUILD_SUCCESS 0 -#define CL_BUILD_NONE -1 -#define CL_BUILD_ERROR -2 -#define CL_BUILD_IN_PROGRESS -3 - -/* cl_kernel_info */ -#define CL_KERNEL_FUNCTION_NAME 0x1190 -#define CL_KERNEL_NUM_ARGS 0x1191 -#define CL_KERNEL_REFERENCE_COUNT 0x1192 -#define CL_KERNEL_CONTEXT 0x1193 -#define CL_KERNEL_PROGRAM 0x1194 -#ifdef CL_VERSION_1_2 -#define CL_KERNEL_ATTRIBUTES 0x1195 -#endif - -#ifdef CL_VERSION_1_2 - -/* cl_kernel_arg_info */ -#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196 -#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197 -#define CL_KERNEL_ARG_TYPE_NAME 0x1198 -#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199 -#define CL_KERNEL_ARG_NAME 0x119A - -#endif - -#ifdef CL_VERSION_1_2 - -/* cl_kernel_arg_address_qualifier */ -#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B -#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C -#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D -#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E - -#endif - -#ifdef CL_VERSION_1_2 - -/* cl_kernel_arg_access_qualifier */ -#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0 -#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1 -#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2 -#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3 - -#endif - -#ifdef CL_VERSION_1_2 - -/* cl_kernel_arg_type_qualifier */ -#define CL_KERNEL_ARG_TYPE_NONE 0 -#define CL_KERNEL_ARG_TYPE_CONST (1 << 0) -#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1) -#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2) -#ifdef CL_VERSION_2_0 -#define CL_KERNEL_ARG_TYPE_PIPE (1 << 3) -#endif - -#endif - -/* cl_kernel_work_group_info */ -#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 -#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 -#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 -#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 -#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 -#ifdef CL_VERSION_1_2 -#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 -#endif - -#ifdef CL_VERSION_2_1 - -/* cl_kernel_sub_group_info */ -#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE 0x2033 -#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE 0x2034 -#define CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT 0x11B8 -#define CL_KERNEL_MAX_NUM_SUB_GROUPS 0x11B9 -#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS 0x11BA - -#endif - -#ifdef CL_VERSION_2_0 - -/* cl_kernel_exec_info */ -#define CL_KERNEL_EXEC_INFO_SVM_PTRS 0x11B6 -#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM 0x11B7 - -#endif - -/* cl_event_info */ -#define CL_EVENT_COMMAND_QUEUE 0x11D0 -#define CL_EVENT_COMMAND_TYPE 0x11D1 -#define CL_EVENT_REFERENCE_COUNT 0x11D2 -#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 -#ifdef CL_VERSION_1_1 -#define CL_EVENT_CONTEXT 0x11D4 -#endif - -/* cl_command_type */ -#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 -#define CL_COMMAND_TASK 0x11F1 -#define CL_COMMAND_NATIVE_KERNEL 0x11F2 -#define CL_COMMAND_READ_BUFFER 0x11F3 -#define CL_COMMAND_WRITE_BUFFER 0x11F4 -#define CL_COMMAND_COPY_BUFFER 0x11F5 -#define CL_COMMAND_READ_IMAGE 0x11F6 -#define CL_COMMAND_WRITE_IMAGE 0x11F7 -#define CL_COMMAND_COPY_IMAGE 0x11F8 -#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 -#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA -#define CL_COMMAND_MAP_BUFFER 0x11FB -#define CL_COMMAND_MAP_IMAGE 0x11FC -#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD -#define CL_COMMAND_MARKER 0x11FE -#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF -#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 -#ifdef CL_VERSION_1_1 -#define CL_COMMAND_READ_BUFFER_RECT 0x1201 -#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 -#define CL_COMMAND_COPY_BUFFER_RECT 0x1203 -#define CL_COMMAND_USER 0x1204 -#endif -#ifdef CL_VERSION_1_2 -#define CL_COMMAND_BARRIER 0x1205 -#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206 -#define CL_COMMAND_FILL_BUFFER 0x1207 -#define CL_COMMAND_FILL_IMAGE 0x1208 -#endif -#ifdef CL_VERSION_2_0 -#define CL_COMMAND_SVM_FREE 0x1209 -#define CL_COMMAND_SVM_MEMCPY 0x120A -#define CL_COMMAND_SVM_MEMFILL 0x120B -#define CL_COMMAND_SVM_MAP 0x120C -#define CL_COMMAND_SVM_UNMAP 0x120D -#endif -#ifdef CL_VERSION_3_0 -#define CL_COMMAND_SVM_MIGRATE_MEM 0x120E -#endif - -/* command execution status */ -#define CL_COMPLETE 0x0 -#define CL_RUNNING 0x1 -#define CL_SUBMITTED 0x2 -#define CL_QUEUED 0x3 - -/* cl_buffer_create_type */ -#ifdef CL_VERSION_1_1 -#define CL_BUFFER_CREATE_TYPE_REGION 0x1220 -#endif - -/* cl_profiling_info */ -#define CL_PROFILING_COMMAND_QUEUED 0x1280 -#define CL_PROFILING_COMMAND_SUBMIT 0x1281 -#define CL_PROFILING_COMMAND_START 0x1282 -#define CL_PROFILING_COMMAND_END 0x1283 -#ifdef CL_VERSION_2_0 -#define CL_PROFILING_COMMAND_COMPLETE 0x1284 -#endif - -/* cl_device_atomic_capabilities - bitfield */ -#ifdef CL_VERSION_3_0 -#define CL_DEVICE_ATOMIC_ORDER_RELAXED (1 << 0) -#define CL_DEVICE_ATOMIC_ORDER_ACQ_REL (1 << 1) -#define CL_DEVICE_ATOMIC_ORDER_SEQ_CST (1 << 2) -#define CL_DEVICE_ATOMIC_SCOPE_WORK_ITEM (1 << 3) -#define CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP (1 << 4) -#define CL_DEVICE_ATOMIC_SCOPE_DEVICE (1 << 5) -#define CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES (1 << 6) -#endif - -/* cl_device_device_enqueue_capabilities - bitfield */ -#ifdef CL_VERSION_3_0 -#define CL_DEVICE_QUEUE_SUPPORTED (1 << 0) -#define CL_DEVICE_QUEUE_REPLACEABLE_DEFAULT (1 << 1) -#endif - -/* cl_khronos_vendor_id */ -#define CL_KHRONOS_VENDOR_ID_CODEPLAY 0x10004 - -#ifdef CL_VERSION_3_0 - -/* cl_version */ -#define CL_VERSION_MAJOR_BITS (10) -#define CL_VERSION_MINOR_BITS (10) -#define CL_VERSION_PATCH_BITS (12) - -#define CL_VERSION_MAJOR_MASK ((1 << CL_VERSION_MAJOR_BITS) - 1) -#define CL_VERSION_MINOR_MASK ((1 << CL_VERSION_MINOR_BITS) - 1) -#define CL_VERSION_PATCH_MASK ((1 << CL_VERSION_PATCH_BITS) - 1) - -#define CL_VERSION_MAJOR(version) \ - ((version) >> (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS)) - -#define CL_VERSION_MINOR(version) \ - (((version) >> CL_VERSION_PATCH_BITS) & CL_VERSION_MINOR_MASK) - -#define CL_VERSION_PATCH(version) ((version) & CL_VERSION_PATCH_MASK) - -#define CL_MAKE_VERSION(major, minor, patch) \ - ((((major) & CL_VERSION_MAJOR_MASK) \ - << (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS)) | \ - (((minor) & CL_VERSION_MINOR_MASK) << CL_VERSION_PATCH_BITS) | \ - ((patch) & CL_VERSION_PATCH_MASK)) - -#endif - -/********************************************************************************************************/ - -/* Platform API */ -extern CL_API_ENTRY cl_int CL_API_CALL -clGetPlatformIDs(cl_uint num_entries, - cl_platform_id * platforms, - cl_uint * num_platforms) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetPlatformInfo(cl_platform_id platform, - cl_platform_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -/* Device APIs */ -extern CL_API_ENTRY cl_int CL_API_CALL -clGetDeviceIDs(cl_platform_id platform, - cl_device_type device_type, - cl_uint num_entries, - cl_device_id * devices, - cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetDeviceInfo(cl_device_id device, - cl_device_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_int CL_API_CALL -clCreateSubDevices(cl_device_id in_device, - const cl_device_partition_property * properties, - cl_uint num_devices, - cl_device_id * out_devices, - cl_uint * num_devices_ret) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2; - -#endif - -#ifdef CL_VERSION_2_1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetDefaultDeviceCommandQueue(cl_context context, - cl_device_id device, - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_2_1; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetDeviceAndHostTimer(cl_device_id device, - cl_ulong* device_timestamp, - cl_ulong* host_timestamp) CL_API_SUFFIX__VERSION_2_1; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetHostTimer(cl_device_id device, - cl_ulong * host_timestamp) CL_API_SUFFIX__VERSION_2_1; - -#endif - -/* Context APIs */ -extern CL_API_ENTRY cl_context CL_API_CALL -clCreateContext(const cl_context_properties * properties, - cl_uint num_devices, - const cl_device_id * devices, - void (CL_CALLBACK * pfn_notify)(const char * errinfo, - const void * private_info, - size_t cb, - void * user_data), - void * user_data, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_context CL_API_CALL -clCreateContextFromType(const cl_context_properties * properties, - cl_device_type device_type, - void (CL_CALLBACK * pfn_notify)(const char * errinfo, - const void * private_info, - size_t cb, - void * user_data), - void * user_data, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainContext(cl_context context) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseContext(cl_context context) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetContextInfo(cl_context context, - cl_context_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_3_0 - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetContextDestructorCallback(cl_context context, - void (CL_CALLBACK* pfn_notify)(cl_context context, - void* user_data), - void* user_data) CL_API_SUFFIX__VERSION_3_0; - -#endif - -/* Command Queue APIs */ - -#ifdef CL_VERSION_2_0 - -extern CL_API_ENTRY cl_command_queue CL_API_CALL -clCreateCommandQueueWithProperties(cl_context context, - cl_device_id device, - const cl_queue_properties * properties, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetCommandQueueInfo(cl_command_queue command_queue, - cl_command_queue_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -/* Memory Object APIs */ -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateBuffer(cl_context context, - cl_mem_flags flags, - size_t size, - void * host_ptr, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateSubBuffer(cl_mem buffer, - cl_mem_flags flags, - cl_buffer_create_type buffer_create_type, - const void * buffer_create_info, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1; - -#endif - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateImage(cl_context context, - cl_mem_flags flags, - const cl_image_format * image_format, - const cl_image_desc * image_desc, - void * host_ptr, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -#endif - -#ifdef CL_VERSION_2_0 - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreatePipe(cl_context context, - cl_mem_flags flags, - cl_uint pipe_packet_size, - cl_uint pipe_max_packets, - const cl_pipe_properties * properties, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0; - -#endif - -#ifdef CL_VERSION_3_0 - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateBufferWithProperties(cl_context context, - const cl_mem_properties * properties, - cl_mem_flags flags, - size_t size, - void * host_ptr, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_3_0; - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateImageWithProperties(cl_context context, - const cl_mem_properties * properties, - cl_mem_flags flags, - const cl_image_format * image_format, - const cl_image_desc * image_desc, - void * host_ptr, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_3_0; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetSupportedImageFormats(cl_context context, - cl_mem_flags flags, - cl_mem_object_type image_type, - cl_uint num_entries, - cl_image_format * image_formats, - cl_uint * num_image_formats) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetMemObjectInfo(cl_mem memobj, - cl_mem_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetImageInfo(cl_mem image, - cl_image_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_2_0 - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetPipeInfo(cl_mem pipe, - cl_pipe_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_2_0; - -#endif - -#ifdef CL_VERSION_1_1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetMemObjectDestructorCallback(cl_mem memobj, - void (CL_CALLBACK * pfn_notify)(cl_mem memobj, - void * user_data), - void * user_data) CL_API_SUFFIX__VERSION_1_1; - -#endif - -/* SVM Allocation APIs */ - -#ifdef CL_VERSION_2_0 - -extern CL_API_ENTRY void * CL_API_CALL -clSVMAlloc(cl_context context, - cl_svm_mem_flags flags, - size_t size, - cl_uint alignment) CL_API_SUFFIX__VERSION_2_0; - -extern CL_API_ENTRY void CL_API_CALL -clSVMFree(cl_context context, - void * svm_pointer) CL_API_SUFFIX__VERSION_2_0; - -#endif - -/* Sampler APIs */ - -#ifdef CL_VERSION_2_0 - -extern CL_API_ENTRY cl_sampler CL_API_CALL -clCreateSamplerWithProperties(cl_context context, - const cl_sampler_properties * sampler_properties, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetSamplerInfo(cl_sampler sampler, - cl_sampler_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -/* Program Object APIs */ -extern CL_API_ENTRY cl_program CL_API_CALL -clCreateProgramWithSource(cl_context context, - cl_uint count, - const char ** strings, - const size_t * lengths, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_program CL_API_CALL -clCreateProgramWithBinary(cl_context context, - cl_uint num_devices, - const cl_device_id * device_list, - const size_t * lengths, - const unsigned char ** binaries, - cl_int * binary_status, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_program CL_API_CALL -clCreateProgramWithBuiltInKernels(cl_context context, - cl_uint num_devices, - const cl_device_id * device_list, - const char * kernel_names, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -#endif - -#ifdef CL_VERSION_2_1 - -extern CL_API_ENTRY cl_program CL_API_CALL -clCreateProgramWithIL(cl_context context, - const void* il, - size_t length, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_1; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clBuildProgram(cl_program program, - cl_uint num_devices, - const cl_device_id * device_list, - const char * options, - void (CL_CALLBACK * pfn_notify)(cl_program program, - void * user_data), - void * user_data) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_int CL_API_CALL -clCompileProgram(cl_program program, - cl_uint num_devices, - const cl_device_id * device_list, - const char * options, - cl_uint num_input_headers, - const cl_program * input_headers, - const char ** header_include_names, - void (CL_CALLBACK * pfn_notify)(cl_program program, - void * user_data), - void * user_data) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_program CL_API_CALL -clLinkProgram(cl_context context, - cl_uint num_devices, - const cl_device_id * device_list, - const char * options, - cl_uint num_input_programs, - const cl_program * input_programs, - void (CL_CALLBACK * pfn_notify)(cl_program program, - void * user_data), - void * user_data, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -#endif - -#ifdef CL_VERSION_2_2 - -extern CL_API_ENTRY CL_API_PREFIX__VERSION_2_2_DEPRECATED cl_int CL_API_CALL -clSetProgramReleaseCallback(cl_program program, - void (CL_CALLBACK * pfn_notify)(cl_program program, - void * user_data), - void * user_data) CL_API_SUFFIX__VERSION_2_2_DEPRECATED; - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetProgramSpecializationConstant(cl_program program, - cl_uint spec_id, - size_t spec_size, - const void* spec_value) CL_API_SUFFIX__VERSION_2_2; - -#endif - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_int CL_API_CALL -clUnloadPlatformCompiler(cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetProgramInfo(cl_program program, - cl_program_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetProgramBuildInfo(cl_program program, - cl_device_id device, - cl_program_build_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -/* Kernel Object APIs */ -extern CL_API_ENTRY cl_kernel CL_API_CALL -clCreateKernel(cl_program program, - const char * kernel_name, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clCreateKernelsInProgram(cl_program program, - cl_uint num_kernels, - cl_kernel * kernels, - cl_uint * num_kernels_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_2_1 - -extern CL_API_ENTRY cl_kernel CL_API_CALL -clCloneKernel(cl_kernel source_kernel, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_1; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetKernelArg(cl_kernel kernel, - cl_uint arg_index, - size_t arg_size, - const void * arg_value) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_2_0 - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetKernelArgSVMPointer(cl_kernel kernel, - cl_uint arg_index, - const void * arg_value) CL_API_SUFFIX__VERSION_2_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetKernelExecInfo(cl_kernel kernel, - cl_kernel_exec_info param_name, - size_t param_value_size, - const void * param_value) CL_API_SUFFIX__VERSION_2_0; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetKernelInfo(cl_kernel kernel, - cl_kernel_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetKernelArgInfo(cl_kernel kernel, - cl_uint arg_indx, - cl_kernel_arg_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetKernelWorkGroupInfo(cl_kernel kernel, - cl_device_id device, - cl_kernel_work_group_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_2_1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetKernelSubGroupInfo(cl_kernel kernel, - cl_device_id device, - cl_kernel_sub_group_info param_name, - size_t input_value_size, - const void* input_value, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_2_1; - -#endif - -/* Event Object APIs */ -extern CL_API_ENTRY cl_int CL_API_CALL -clWaitForEvents(cl_uint num_events, - const cl_event * event_list) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetEventInfo(cl_event event, - cl_event_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -extern CL_API_ENTRY cl_event CL_API_CALL -clCreateUserEvent(cl_context context, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetUserEventStatus(cl_event event, - cl_int execution_status) CL_API_SUFFIX__VERSION_1_1; - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetEventCallback(cl_event event, - cl_int command_exec_callback_type, - void (CL_CALLBACK * pfn_notify)(cl_event event, - cl_int event_command_status, - void * user_data), - void * user_data) CL_API_SUFFIX__VERSION_1_1; - -#endif - -/* Profiling APIs */ -extern CL_API_ENTRY cl_int CL_API_CALL -clGetEventProfilingInfo(cl_event event, - cl_profiling_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -/* Flush and Finish APIs */ -extern CL_API_ENTRY cl_int CL_API_CALL -clFlush(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; - -/* Enqueued Commands APIs */ -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReadBuffer(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_read, - size_t offset, - size_t size, - void * ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReadBufferRect(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_read, - const size_t * buffer_origin, - const size_t * host_origin, - const size_t * region, - size_t buffer_row_pitch, - size_t buffer_slice_pitch, - size_t host_row_pitch, - size_t host_slice_pitch, - void * ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_1; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueWriteBuffer(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_write, - size_t offset, - size_t size, - const void * ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueWriteBufferRect(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_write, - const size_t * buffer_origin, - const size_t * host_origin, - const size_t * region, - size_t buffer_row_pitch, - size_t buffer_slice_pitch, - size_t host_row_pitch, - size_t host_slice_pitch, - const void * ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_1; - -#endif - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueFillBuffer(cl_command_queue command_queue, - cl_mem buffer, - const void * pattern, - size_t pattern_size, - size_t offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueCopyBuffer(cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_buffer, - size_t src_offset, - size_t dst_offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueCopyBufferRect(cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_buffer, - const size_t * src_origin, - const size_t * dst_origin, - const size_t * region, - size_t src_row_pitch, - size_t src_slice_pitch, - size_t dst_row_pitch, - size_t dst_slice_pitch, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_1; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReadImage(cl_command_queue command_queue, - cl_mem image, - cl_bool blocking_read, - const size_t * origin, - const size_t * region, - size_t row_pitch, - size_t slice_pitch, - void * ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueWriteImage(cl_command_queue command_queue, - cl_mem image, - cl_bool blocking_write, - const size_t * origin, - const size_t * region, - size_t input_row_pitch, - size_t input_slice_pitch, - const void * ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueFillImage(cl_command_queue command_queue, - cl_mem image, - const void * fill_color, - const size_t * origin, - const size_t * region, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueCopyImage(cl_command_queue command_queue, - cl_mem src_image, - cl_mem dst_image, - const size_t * src_origin, - const size_t * dst_origin, - const size_t * region, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueCopyImageToBuffer(cl_command_queue command_queue, - cl_mem src_image, - cl_mem dst_buffer, - const size_t * src_origin, - const size_t * region, - size_t dst_offset, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueCopyBufferToImage(cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_image, - size_t src_offset, - const size_t * dst_origin, - const size_t * region, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY void * CL_API_CALL -clEnqueueMapBuffer(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_map, - cl_map_flags map_flags, - size_t offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY void * CL_API_CALL -clEnqueueMapImage(cl_command_queue command_queue, - cl_mem image, - cl_bool blocking_map, - cl_map_flags map_flags, - const size_t * origin, - const size_t * region, - size_t * image_row_pitch, - size_t * image_slice_pitch, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueUnmapMemObject(cl_command_queue command_queue, - cl_mem memobj, - void * mapped_ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMigrateMemObjects(cl_command_queue command_queue, - cl_uint num_mem_objects, - const cl_mem * mem_objects, - cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -#endif - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueNDRangeKernel(cl_command_queue command_queue, - cl_kernel kernel, - cl_uint work_dim, - const size_t * global_work_offset, - const size_t * global_work_size, - const size_t * local_work_size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueNativeKernel(cl_command_queue command_queue, - void (CL_CALLBACK * user_func)(void *), - void * args, - size_t cb_args, - cl_uint num_mem_objects, - const cl_mem * mem_list, - const void ** args_mem_loc, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMarkerWithWaitList(cl_command_queue command_queue, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueBarrierWithWaitList(cl_command_queue command_queue, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -#endif - -#ifdef CL_VERSION_2_0 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMFree(cl_command_queue command_queue, - cl_uint num_svm_pointers, - void * svm_pointers[], - void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue, - cl_uint num_svm_pointers, - void * svm_pointers[], - void * user_data), - void * user_data, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_2_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMemcpy(cl_command_queue command_queue, - cl_bool blocking_copy, - void * dst_ptr, - const void * src_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_2_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMemFill(cl_command_queue command_queue, - void * svm_ptr, - const void * pattern, - size_t pattern_size, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_2_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMap(cl_command_queue command_queue, - cl_bool blocking_map, - cl_map_flags flags, - void * svm_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_2_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMUnmap(cl_command_queue command_queue, - void * svm_ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_2_0; - -#endif - -#ifdef CL_VERSION_2_1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMigrateMem(cl_command_queue command_queue, - cl_uint num_svm_pointers, - const void ** svm_pointers, - const size_t * sizes, - cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_2_1; - -#endif - -#ifdef CL_VERSION_1_2 - -/* Extension function access - * - * Returns the extension function address for the given function name, - * or NULL if a valid function can not be found. The client must - * check to make sure the address is not NULL, before using or - * calling the returned function address. - */ -extern CL_API_ENTRY void * CL_API_CALL -clGetExtensionFunctionAddressForPlatform(cl_platform_id platform, - const char * func_name) CL_API_SUFFIX__VERSION_1_2; - -#endif - -#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS - /* - * WARNING: - * This API introduces mutable state into the OpenCL implementation. It has been REMOVED - * to better facilitate thread safety. The 1.0 API is not thread safe. It is not tested by the - * OpenCL 1.1 conformance test, and consequently may not work or may not work dependably. - * It is likely to be non-performant. Use of this API is not advised. Use at your own risk. - * - * Software developers previously relying on this API are instructed to set the command queue - * properties when creating the queue, instead. - */ - extern CL_API_ENTRY cl_int CL_API_CALL - clSetCommandQueueProperty(cl_command_queue command_queue, - cl_command_queue_properties properties, - cl_bool enable, - cl_command_queue_properties * old_properties) CL_API_SUFFIX__VERSION_1_0_DEPRECATED; -#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */ - -/* Deprecated OpenCL 1.1 APIs */ -extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL -clCreateImage2D(cl_context context, - cl_mem_flags flags, - const cl_image_format * image_format, - size_t image_width, - size_t image_height, - size_t image_row_pitch, - void * host_ptr, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL -clCreateImage3D(cl_context context, - cl_mem_flags flags, - const cl_image_format * image_format, - size_t image_width, - size_t image_height, - size_t image_depth, - size_t image_row_pitch, - size_t image_slice_pitch, - void * host_ptr, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL -clEnqueueMarker(cl_command_queue command_queue, - cl_event * event) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL -clEnqueueWaitForEvents(cl_command_queue command_queue, - cl_uint num_events, - const cl_event * event_list) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL -clEnqueueBarrier(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL -clUnloadCompiler(void) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL -clGetExtensionFunctionAddress(const char * func_name) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -/* Deprecated OpenCL 2.0 APIs */ -extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL -clCreateCommandQueue(cl_context context, - cl_device_id device, - cl_command_queue_properties properties, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2_DEPRECATED; - -extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL -clCreateSampler(cl_context context, - cl_bool normalized_coords, - cl_addressing_mode addressing_mode, - cl_filter_mode filter_mode, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2_DEPRECATED; - -extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL -clEnqueueTask(cl_command_queue command_queue, - cl_kernel kernel, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2_DEPRECATED; - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_H */ diff --git a/include/CL/cl2.hpp b/include/CL/cl2.hpp deleted file mode 100644 index a332962a8..000000000 --- a/include/CL/cl2.hpp +++ /dev/null @@ -1,18 +0,0 @@ -// -// Copyright (c) 2020 The Khronos Group Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -#include -#pragma message("cl2.hpp has been renamed to opencl.hpp to make it clear that it supports all versions of OpenCL. Please include opencl.hpp directly.") diff --git a/include/CL/cl_d3d10.h b/include/CL/cl_d3d10.h deleted file mode 100644 index 0d9950bed..000000000 --- a/include/CL/cl_d3d10.h +++ /dev/null @@ -1,154 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef __OPENCL_CL_D3D10_H -#define __OPENCL_CL_D3D10_H - -#if defined(_MSC_VER) -#if _MSC_VER >=1500 -#pragma warning( push ) -#pragma warning( disable : 4201 ) -#pragma warning( disable : 5105 ) -#endif -#endif -#include -#if defined(_MSC_VER) -#if _MSC_VER >=1500 -#pragma warning( pop ) -#endif -#endif -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/****************************************************************************** - * cl_khr_d3d10_sharing */ -#define cl_khr_d3d10_sharing 1 - -typedef cl_uint cl_d3d10_device_source_khr; -typedef cl_uint cl_d3d10_device_set_khr; - -/******************************************************************************/ - -/* Error Codes */ -#define CL_INVALID_D3D10_DEVICE_KHR -1002 -#define CL_INVALID_D3D10_RESOURCE_KHR -1003 -#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004 -#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005 - -/* cl_d3d10_device_source_nv */ -#define CL_D3D10_DEVICE_KHR 0x4010 -#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011 - -/* cl_d3d10_device_set_nv */ -#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012 -#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013 - -/* cl_context_info */ -#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014 -#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C - -/* cl_mem_info */ -#define CL_MEM_D3D10_RESOURCE_KHR 0x4015 - -/* cl_image_info */ -#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016 - -/* cl_command_type */ -#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017 -#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018 - -/******************************************************************************/ - -typedef cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)( - cl_platform_id platform, - cl_d3d10_device_source_khr d3d_device_source, - void * d3d_object, - cl_d3d10_device_set_khr d3d_device_set, - cl_uint num_entries, - cl_device_id * devices, - cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D10Buffer * resource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D10Texture2D * resource, - UINT subresource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D10Texture3D * resource, - UINT subresource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -/*************************************************************** -* cl_intel_sharing_format_query_d3d10 -***************************************************************/ -#define cl_intel_sharing_format_query_d3d10 1 - -/* when cl_khr_d3d10_sharing is supported */ - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetSupportedD3D10TextureFormatsINTEL( - cl_context context, - cl_mem_flags flags, - cl_mem_object_type image_type, - cl_uint num_entries, - DXGI_FORMAT* d3d10_formats, - cl_uint* num_texture_formats) ; - -typedef cl_int (CL_API_CALL * -clGetSupportedD3D10TextureFormatsINTEL_fn)( - cl_context context, - cl_mem_flags flags, - cl_mem_object_type image_type, - cl_uint num_entries, - DXGI_FORMAT* d3d10_formats, - cl_uint* num_texture_formats) ; - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_D3D10_H */ - diff --git a/include/CL/cl_d3d11.h b/include/CL/cl_d3d11.h deleted file mode 100644 index 9393e5c84..000000000 --- a/include/CL/cl_d3d11.h +++ /dev/null @@ -1,156 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef __OPENCL_CL_D3D11_H -#define __OPENCL_CL_D3D11_H - -#if defined(_MSC_VER) -#if _MSC_VER >=1500 -#pragma warning( push ) -#pragma warning( disable : 4201 ) -#pragma warning( disable : 5105 ) -#endif -#endif -#include -#if defined(_MSC_VER) -#if _MSC_VER >=1500 -#pragma warning( pop ) -#endif -#endif -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/****************************************************************************** - * cl_khr_d3d11_sharing */ -#define cl_khr_d3d11_sharing 1 - -typedef cl_uint cl_d3d11_device_source_khr; -typedef cl_uint cl_d3d11_device_set_khr; - -/******************************************************************************/ - -/* Error Codes */ -#define CL_INVALID_D3D11_DEVICE_KHR -1006 -#define CL_INVALID_D3D11_RESOURCE_KHR -1007 -#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008 -#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009 - -/* cl_d3d11_device_source */ -#define CL_D3D11_DEVICE_KHR 0x4019 -#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A - -/* cl_d3d11_device_set */ -#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B -#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C - -/* cl_context_info */ -#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D -#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D - -/* cl_mem_info */ -#define CL_MEM_D3D11_RESOURCE_KHR 0x401E - -/* cl_image_info */ -#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F - -/* cl_command_type */ -#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020 -#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021 - -/******************************************************************************/ - -typedef cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)( - cl_platform_id platform, - cl_d3d11_device_source_khr d3d_device_source, - void * d3d_object, - cl_d3d11_device_set_khr d3d_device_set, - cl_uint num_entries, - cl_device_id * devices, - cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D11Buffer * resource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D11Texture2D * resource, - UINT subresource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D11Texture3D * resource, - UINT subresource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -/*************************************************************** -* cl_intel_sharing_format_query_d3d11 -***************************************************************/ -#define cl_intel_sharing_format_query_d3d11 1 - -/* when cl_khr_d3d11_sharing is supported */ - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetSupportedD3D11TextureFormatsINTEL( - cl_context context, - cl_mem_flags flags, - cl_mem_object_type image_type, - cl_uint plane, - cl_uint num_entries, - DXGI_FORMAT* d3d11_formats, - cl_uint* num_texture_formats) ; - -typedef cl_int (CL_API_CALL * -clGetSupportedD3D11TextureFormatsINTEL_fn)( - cl_context context, - cl_mem_flags flags, - cl_mem_object_type image_type, - cl_uint plane, - cl_uint num_entries, - DXGI_FORMAT* d3d11_formats, - cl_uint* num_texture_formats) ; - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_D3D11_H */ - diff --git a/include/CL/cl_dx9_media_sharing.h b/include/CL/cl_dx9_media_sharing.h deleted file mode 100644 index fd03bbdc2..000000000 --- a/include/CL/cl_dx9_media_sharing.h +++ /dev/null @@ -1,268 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H -#define __OPENCL_CL_DX9_MEDIA_SHARING_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/******************************************************************************/ -/* cl_khr_dx9_media_sharing */ -#define cl_khr_dx9_media_sharing 1 - -typedef cl_uint cl_dx9_media_adapter_type_khr; -typedef cl_uint cl_dx9_media_adapter_set_khr; - -#if defined(_WIN32) -#if defined(_MSC_VER) -#if _MSC_VER >=1500 -#pragma warning( push ) -#pragma warning( disable : 4201 ) -#pragma warning( disable : 5105 ) -#endif -#endif -#include -#if defined(_MSC_VER) -#if _MSC_VER >=1500 -#pragma warning( pop ) -#endif -#endif -typedef struct _cl_dx9_surface_info_khr -{ - IDirect3DSurface9 *resource; - HANDLE shared_handle; -} cl_dx9_surface_info_khr; -#endif - - -/******************************************************************************/ - -/* Error Codes */ -#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010 -#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011 -#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012 -#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013 - -/* cl_media_adapter_type_khr */ -#define CL_ADAPTER_D3D9_KHR 0x2020 -#define CL_ADAPTER_D3D9EX_KHR 0x2021 -#define CL_ADAPTER_DXVA_KHR 0x2022 - -/* cl_media_adapter_set_khr */ -#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023 -#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024 - -/* cl_context_info */ -#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025 -#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026 -#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027 - -/* cl_mem_info */ -#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028 -#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029 - -/* cl_image_info */ -#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A - -/* cl_command_type */ -#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B -#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C - -/******************************************************************************/ - -typedef cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)( - cl_platform_id platform, - cl_uint num_media_adapters, - cl_dx9_media_adapter_type_khr * media_adapter_type, - void * media_adapters, - cl_dx9_media_adapter_set_khr media_adapter_set, - cl_uint num_entries, - cl_device_id * devices, - cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)( - cl_context context, - cl_mem_flags flags, - cl_dx9_media_adapter_type_khr adapter_type, - void * surface_info, - cl_uint plane, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -/*************************************** -* cl_intel_dx9_media_sharing extension * -****************************************/ - -#define cl_intel_dx9_media_sharing 1 - -typedef cl_uint cl_dx9_device_source_intel; -typedef cl_uint cl_dx9_device_set_intel; - -/* error codes */ -#define CL_INVALID_DX9_DEVICE_INTEL -1010 -#define CL_INVALID_DX9_RESOURCE_INTEL -1011 -#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012 -#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013 - -/* cl_dx9_device_source_intel */ -#define CL_D3D9_DEVICE_INTEL 0x4022 -#define CL_D3D9EX_DEVICE_INTEL 0x4070 -#define CL_DXVA_DEVICE_INTEL 0x4071 - -/* cl_dx9_device_set_intel */ -#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024 -#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025 - -/* cl_context_info */ -#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026 -#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072 -#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073 - -/* cl_mem_info */ -#define CL_MEM_DX9_RESOURCE_INTEL 0x4027 -#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074 - -/* cl_image_info */ -#define CL_IMAGE_DX9_PLANE_INTEL 0x4075 - -/* cl_command_type */ -#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A -#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B -/******************************************************************************/ - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetDeviceIDsFromDX9INTEL( - cl_platform_id platform, - cl_dx9_device_source_intel dx9_device_source, - void* dx9_object, - cl_dx9_device_set_intel dx9_device_set, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_1; - -typedef cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)( - cl_platform_id platform, - cl_dx9_device_source_intel dx9_device_source, - void* dx9_object, - cl_dx9_device_set_intel dx9_device_set, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_1; - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromDX9MediaSurfaceINTEL( - cl_context context, - cl_mem_flags flags, - IDirect3DSurface9* resource, - HANDLE sharedHandle, - UINT plane, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1; - -typedef cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)( - cl_context context, - cl_mem_flags flags, - IDirect3DSurface9* resource, - HANDLE sharedHandle, - UINT plane, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueAcquireDX9ObjectsINTEL( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_1; - -typedef cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_1; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReleaseDX9ObjectsINTEL( - cl_command_queue command_queue, - cl_uint num_objects, - cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_1; - -typedef cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_1; - -/*************************************************************** -* cl_intel_sharing_format_query_dx9 -***************************************************************/ -#define cl_intel_sharing_format_query_dx9 1 - -/* when cl_khr_dx9_media_sharing or cl_intel_dx9_media_sharing is supported */ - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetSupportedDX9MediaSurfaceFormatsINTEL( - cl_context context, - cl_mem_flags flags, - cl_mem_object_type image_type, - cl_uint plane, - cl_uint num_entries, - D3DFORMAT* dx9_formats, - cl_uint* num_surface_formats) ; - -typedef cl_int (CL_API_CALL * -clGetSupportedDX9MediaSurfaceFormatsINTEL_fn)( - cl_context context, - cl_mem_flags flags, - cl_mem_object_type image_type, - cl_uint plane, - cl_uint num_entries, - D3DFORMAT* dx9_formats, - cl_uint* num_surface_formats) ; - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */ - diff --git a/include/CL/cl_dx9_media_sharing_intel.h b/include/CL/cl_dx9_media_sharing_intel.h deleted file mode 100644 index f6518d7f6..000000000 --- a/include/CL/cl_dx9_media_sharing_intel.h +++ /dev/null @@ -1,18 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#include -#pragma message("The Intel DX9 media sharing extensions have been moved into cl_dx9_media_sharing.h. Please include cl_dx9_media_sharing.h directly.") diff --git a/include/CL/cl_egl.h b/include/CL/cl_egl.h deleted file mode 100644 index 357a37c02..000000000 --- a/include/CL/cl_egl.h +++ /dev/null @@ -1,120 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef __OPENCL_CL_EGL_H -#define __OPENCL_CL_EGL_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - - -/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */ -#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F -#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D -#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E - -/* Error type for clCreateFromEGLImageKHR */ -#define CL_INVALID_EGL_OBJECT_KHR -1093 -#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092 - -/* CLeglImageKHR is an opaque handle to an EGLImage */ -typedef void* CLeglImageKHR; - -/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */ -typedef void* CLeglDisplayKHR; - -/* CLeglSyncKHR is an opaque handle to an EGLSync object */ -typedef void* CLeglSyncKHR; - -/* properties passed to clCreateFromEGLImageKHR */ -typedef intptr_t cl_egl_image_properties_khr; - - -#define cl_khr_egl_image 1 - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromEGLImageKHR(cl_context context, - CLeglDisplayKHR egldisplay, - CLeglImageKHR eglimage, - cl_mem_flags flags, - const cl_egl_image_properties_khr * properties, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)( - cl_context context, - CLeglDisplayKHR egldisplay, - CLeglImageKHR eglimage, - cl_mem_flags flags, - const cl_egl_image_properties_khr * properties, - cl_int * errcode_ret); - - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event); - - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event); - - -#define cl_khr_egl_event 1 - -extern CL_API_ENTRY cl_event CL_API_CALL -clCreateEventFromEGLSyncKHR(cl_context context, - CLeglSyncKHR sync, - CLeglDisplayKHR display, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)( - cl_context context, - CLeglSyncKHR sync, - CLeglDisplayKHR display, - cl_int * errcode_ret); - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_EGL_H */ diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h deleted file mode 100644 index 3eba7ed1b..000000000 --- a/include/CL/cl_ext.h +++ /dev/null @@ -1,2634 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -/* cl_ext.h contains OpenCL extensions which don't have external */ -/* (OpenGL, D3D) dependencies. */ - -#ifndef __CL_EXT_H -#define __CL_EXT_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include - -/*************************************************************** -* cl_khr_command_buffer -***************************************************************/ -#define cl_khr_command_buffer 1 -#define CL_KHR_COMMAND_BUFFER_EXTENSION_NAME \ - "cl_khr_command_buffer" - -typedef cl_bitfield cl_device_command_buffer_capabilities_khr; -typedef struct _cl_command_buffer_khr* cl_command_buffer_khr; -typedef cl_uint cl_sync_point_khr; -typedef cl_uint cl_command_buffer_info_khr; -typedef cl_uint cl_command_buffer_state_khr; -typedef cl_properties cl_command_buffer_properties_khr; -typedef cl_bitfield cl_command_buffer_flags_khr; -typedef cl_properties cl_ndrange_kernel_command_properties_khr; -typedef struct _cl_mutable_command_khr* cl_mutable_command_khr; - -/* cl_device_info */ -#define CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR 0x12A9 -#define CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR 0x12AA - -/* cl_device_command_buffer_capabilities_khr - bitfield */ -#define CL_COMMAND_BUFFER_CAPABILITY_KERNEL_PRINTF_KHR (1 << 0) -#define CL_COMMAND_BUFFER_CAPABILITY_DEVICE_SIDE_ENQUEUE_KHR (1 << 1) -#define CL_COMMAND_BUFFER_CAPABILITY_SIMULTANEOUS_USE_KHR (1 << 2) -#define CL_COMMAND_BUFFER_CAPABILITY_OUT_OF_ORDER_KHR (1 << 3) - -/* cl_command_buffer_properties_khr */ -#define CL_COMMAND_BUFFER_FLAGS_KHR 0x1293 - -/* cl_command_buffer_flags_khr */ -#define CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR (1 << 0) - -/* Error codes */ -#define CL_INVALID_COMMAND_BUFFER_KHR -1138 -#define CL_INVALID_SYNC_POINT_WAIT_LIST_KHR -1139 -#define CL_INCOMPATIBLE_COMMAND_QUEUE_KHR -1140 - -/* cl_command_buffer_info_khr */ -#define CL_COMMAND_BUFFER_QUEUES_KHR 0x1294 -#define CL_COMMAND_BUFFER_NUM_QUEUES_KHR 0x1295 -#define CL_COMMAND_BUFFER_REFERENCE_COUNT_KHR 0x1296 -#define CL_COMMAND_BUFFER_STATE_KHR 0x1297 -#define CL_COMMAND_BUFFER_PROPERTIES_ARRAY_KHR 0x1298 - -/* cl_command_buffer_state_khr */ -#define CL_COMMAND_BUFFER_STATE_RECORDING_KHR 0 -#define CL_COMMAND_BUFFER_STATE_EXECUTABLE_KHR 1 -#define CL_COMMAND_BUFFER_STATE_PENDING_KHR 2 -#define CL_COMMAND_BUFFER_STATE_INVALID_KHR 3 - -/* cl_command_type */ -#define CL_COMMAND_COMMAND_BUFFER_KHR 0x12A8 - - -typedef cl_command_buffer_khr (CL_API_CALL * -clCreateCommandBufferKHR_fn)( - cl_uint num_queues, - const cl_command_queue* queues, - const cl_command_buffer_properties_khr* properties, - cl_int* errcode_ret) ; - -typedef cl_int (CL_API_CALL * -clFinalizeCommandBufferKHR_fn)( - cl_command_buffer_khr command_buffer) ; - -typedef cl_int (CL_API_CALL * -clRetainCommandBufferKHR_fn)( - cl_command_buffer_khr command_buffer) ; - -typedef cl_int (CL_API_CALL * -clReleaseCommandBufferKHR_fn)( - cl_command_buffer_khr command_buffer) ; - -typedef cl_int (CL_API_CALL * -clEnqueueCommandBufferKHR_fn)( - cl_uint num_queues, - cl_command_queue* queues, - cl_command_buffer_khr command_buffer, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) ; - -typedef cl_int (CL_API_CALL * -clCommandBarrierWithWaitListKHR_fn)( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -typedef cl_int (CL_API_CALL * -clCommandCopyBufferKHR_fn)( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_buffer, - size_t src_offset, - size_t dst_offset, - size_t size, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -typedef cl_int (CL_API_CALL * -clCommandCopyBufferRectKHR_fn)( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_buffer, - const size_t* src_origin, - const size_t* dst_origin, - const size_t* region, - size_t src_row_pitch, - size_t src_slice_pitch, - size_t dst_row_pitch, - size_t dst_slice_pitch, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -typedef cl_int (CL_API_CALL * -clCommandCopyBufferToImageKHR_fn)( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_image, - size_t src_offset, - const size_t* dst_origin, - const size_t* region, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -typedef cl_int (CL_API_CALL * -clCommandCopyImageKHR_fn)( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - cl_mem src_image, - cl_mem dst_image, - const size_t* src_origin, - const size_t* dst_origin, - const size_t* region, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -typedef cl_int (CL_API_CALL * -clCommandCopyImageToBufferKHR_fn)( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - cl_mem src_image, - cl_mem dst_buffer, - const size_t* src_origin, - const size_t* region, - size_t dst_offset, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -typedef cl_int (CL_API_CALL * -clCommandFillBufferKHR_fn)( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - cl_mem buffer, - const void* pattern, - size_t pattern_size, - size_t offset, - size_t size, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -typedef cl_int (CL_API_CALL * -clCommandFillImageKHR_fn)( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - cl_mem image, - const void* fill_color, - const size_t* origin, - const size_t* region, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -typedef cl_int (CL_API_CALL * -clCommandNDRangeKernelKHR_fn)( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - const cl_ndrange_kernel_command_properties_khr* properties, - cl_kernel kernel, - cl_uint work_dim, - const size_t* global_work_offset, - const size_t* global_work_size, - const size_t* local_work_size, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -typedef cl_int (CL_API_CALL * -clGetCommandBufferInfoKHR_fn)( - cl_command_buffer_khr command_buffer, - cl_command_buffer_info_khr param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) ; - -#ifndef CL_NO_PROTOTYPES - -extern CL_API_ENTRY cl_command_buffer_khr CL_API_CALL -clCreateCommandBufferKHR( - cl_uint num_queues, - const cl_command_queue* queues, - const cl_command_buffer_properties_khr* properties, - cl_int* errcode_ret) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clFinalizeCommandBufferKHR( - cl_command_buffer_khr command_buffer) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainCommandBufferKHR( - cl_command_buffer_khr command_buffer) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseCommandBufferKHR( - cl_command_buffer_khr command_buffer) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueCommandBufferKHR( - cl_uint num_queues, - cl_command_queue* queues, - cl_command_buffer_khr command_buffer, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clCommandBarrierWithWaitListKHR( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clCommandCopyBufferKHR( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_buffer, - size_t src_offset, - size_t dst_offset, - size_t size, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clCommandCopyBufferRectKHR( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_buffer, - const size_t* src_origin, - const size_t* dst_origin, - const size_t* region, - size_t src_row_pitch, - size_t src_slice_pitch, - size_t dst_row_pitch, - size_t dst_slice_pitch, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clCommandCopyBufferToImageKHR( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_image, - size_t src_offset, - const size_t* dst_origin, - const size_t* region, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clCommandCopyImageKHR( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - cl_mem src_image, - cl_mem dst_image, - const size_t* src_origin, - const size_t* dst_origin, - const size_t* region, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clCommandCopyImageToBufferKHR( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - cl_mem src_image, - cl_mem dst_buffer, - const size_t* src_origin, - const size_t* region, - size_t dst_offset, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clCommandFillBufferKHR( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - cl_mem buffer, - const void* pattern, - size_t pattern_size, - size_t offset, - size_t size, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clCommandFillImageKHR( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - cl_mem image, - const void* fill_color, - const size_t* origin, - const size_t* region, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clCommandNDRangeKernelKHR( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - const cl_ndrange_kernel_command_properties_khr* properties, - cl_kernel kernel, - cl_uint work_dim, - const size_t* global_work_offset, - const size_t* global_work_size, - const size_t* local_work_size, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetCommandBufferInfoKHR( - cl_command_buffer_khr command_buffer, - cl_command_buffer_info_khr param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) ; - -#endif /* CL_NO_PROTOTYPES */ - -/*************************************************************** -* cl_khr_command_buffer_mutable_dispatch -***************************************************************/ -#define cl_khr_command_buffer_mutable_dispatch 1 -#define CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_NAME \ - "cl_khr_command_buffer_mutable_dispatch" - -typedef cl_uint cl_command_buffer_structure_type_khr; -typedef cl_bitfield cl_mutable_dispatch_fields_khr; -typedef cl_uint cl_mutable_command_info_khr; -typedef struct _cl_mutable_dispatch_arg_khr { - cl_uint arg_index; - size_t arg_size; - const void* arg_value; -} cl_mutable_dispatch_arg_khr; -typedef struct _cl_mutable_dispatch_exec_info_khr { - cl_uint param_name; - size_t param_value_size; - const void* param_value; -} cl_mutable_dispatch_exec_info_khr; -typedef struct _cl_mutable_dispatch_config_khr { - cl_command_buffer_structure_type_khr type; - const void* next; - cl_mutable_command_khr command; - cl_uint num_args; - cl_uint num_svm_args; - cl_uint num_exec_infos; - cl_uint work_dim; - const cl_mutable_dispatch_arg_khr* arg_list; - const cl_mutable_dispatch_arg_khr* arg_svm_list; - const cl_mutable_dispatch_exec_info_khr* exec_info_list; - const size_t* global_work_offset; - const size_t* global_work_size; - const size_t* local_work_size; -} cl_mutable_dispatch_config_khr; -typedef struct _cl_mutable_base_config_khr { - cl_command_buffer_structure_type_khr type; - const void* next; - cl_uint num_mutable_dispatch; - const cl_mutable_dispatch_config_khr* mutable_dispatch_list; -} cl_mutable_base_config_khr; - -/* cl_command_buffer_flags_khr - bitfield */ -#define CL_COMMAND_BUFFER_MUTABLE_KHR (1 << 1) - -/* Error codes */ -#define CL_INVALID_MUTABLE_COMMAND_KHR -1141 - -/* cl_device_info */ -#define CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR 0x12B0 - -/* cl_ndrange_kernel_command_properties_khr */ -#define CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR 0x12B1 - -/* cl_mutable_dispatch_fields_khr - bitfield */ -#define CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR (1 << 0) -#define CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR (1 << 1) -#define CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR (1 << 2) -#define CL_MUTABLE_DISPATCH_ARGUMENTS_KHR (1 << 3) -#define CL_MUTABLE_DISPATCH_EXEC_INFO_KHR (1 << 4) - -/* cl_mutable_command_info_khr */ -#define CL_MUTABLE_COMMAND_COMMAND_QUEUE_KHR 0x12A0 -#define CL_MUTABLE_COMMAND_COMMAND_BUFFER_KHR 0x12A1 -#define CL_MUTABLE_COMMAND_COMMAND_TYPE_KHR 0x12AD -#define CL_MUTABLE_DISPATCH_PROPERTIES_ARRAY_KHR 0x12A2 -#define CL_MUTABLE_DISPATCH_KERNEL_KHR 0x12A3 -#define CL_MUTABLE_DISPATCH_DIMENSIONS_KHR 0x12A4 -#define CL_MUTABLE_DISPATCH_GLOBAL_WORK_OFFSET_KHR 0x12A5 -#define CL_MUTABLE_DISPATCH_GLOBAL_WORK_SIZE_KHR 0x12A6 -#define CL_MUTABLE_DISPATCH_LOCAL_WORK_SIZE_KHR 0x12A7 - -/* cl_command_buffer_structure_type_khr */ -#define CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR 0 -#define CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR 1 - - -typedef cl_int (CL_API_CALL * -clUpdateMutableCommandsKHR_fn)( - cl_command_buffer_khr command_buffer, - const cl_mutable_base_config_khr* mutable_config) ; - -typedef cl_int (CL_API_CALL * -clGetMutableCommandInfoKHR_fn)( - cl_mutable_command_khr command, - cl_mutable_command_info_khr param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) ; - -#ifndef CL_NO_PROTOTYPES - -extern CL_API_ENTRY cl_int CL_API_CALL -clUpdateMutableCommandsKHR( - cl_command_buffer_khr command_buffer, - const cl_mutable_base_config_khr* mutable_config) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetMutableCommandInfoKHR( - cl_mutable_command_khr command, - cl_mutable_command_info_khr param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) ; - -#endif /* CL_NO_PROTOTYPES */ - -/* cl_khr_fp64 extension - no extension #define since it has no functions */ -/* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 120 */ - -#if CL_TARGET_OPENCL_VERSION <= 110 -#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 -#endif - -/* cl_khr_fp16 extension - no extension #define since it has no functions */ -#define CL_DEVICE_HALF_FP_CONFIG 0x1033 - -/* Memory object destruction - * - * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR - * - * Registers a user callback function that will be called when the memory object is deleted and its resources - * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback - * stack associated with memobj. The registered user callback functions are called in the reverse order in - * which they were registered. The user callback functions are called and then the memory object is deleted - * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be - * notified when the memory referenced by host_ptr, specified when the memory object is created and used as - * the storage bits for the memory object, can be reused or freed. - * - * The application may not call CL api's with the cl_mem object passed to the pfn_notify. - * - * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) - * before using. - */ -#define cl_APPLE_SetMemObjectDestructor 1 -extern CL_API_ENTRY cl_int CL_API_CALL clSetMemObjectDestructorAPPLE( cl_mem memobj, - void (* pfn_notify)(cl_mem memobj, void * user_data), - void * user_data) CL_API_SUFFIX__VERSION_1_0; - - -/* Context Logging Functions - * - * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext(). - * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) - * before using. - * - * clLogMessagesToSystemLog forwards on all log messages to the Apple System Logger - */ -#define cl_APPLE_ContextLoggingFunctions 1 -extern CL_API_ENTRY void CL_API_CALL clLogMessagesToSystemLogAPPLE( const char * errstr, - const void * private_info, - size_t cb, - void * user_data) CL_API_SUFFIX__VERSION_1_0; - -/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */ -extern CL_API_ENTRY void CL_API_CALL clLogMessagesToStdoutAPPLE( const char * errstr, - const void * private_info, - size_t cb, - void * user_data) CL_API_SUFFIX__VERSION_1_0; - -/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */ -extern CL_API_ENTRY void CL_API_CALL clLogMessagesToStderrAPPLE( const char * errstr, - const void * private_info, - size_t cb, - void * user_data) CL_API_SUFFIX__VERSION_1_0; - - -/************************ -* cl_khr_icd extension * -************************/ -#define cl_khr_icd 1 - -/* cl_platform_info */ -#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920 - -/* Additional Error Codes */ -#define CL_PLATFORM_NOT_FOUND_KHR -1001 - -extern CL_API_ENTRY cl_int CL_API_CALL -clIcdGetPlatformIDsKHR(cl_uint num_entries, - cl_platform_id * platforms, - cl_uint * num_platforms); - -typedef cl_int -(CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(cl_uint num_entries, - cl_platform_id * platforms, - cl_uint * num_platforms); - - -/******************************* - * cl_khr_il_program extension * - *******************************/ -#define cl_khr_il_program 1 - -/* New property to clGetDeviceInfo for retrieving supported intermediate - * languages - */ -#define CL_DEVICE_IL_VERSION_KHR 0x105B - -/* New property to clGetProgramInfo for retrieving for retrieving the IL of a - * program - */ -#define CL_PROGRAM_IL_KHR 0x1169 - -extern CL_API_ENTRY cl_program CL_API_CALL -clCreateProgramWithILKHR(cl_context context, - const void * il, - size_t length, - cl_int * errcode_ret); - -typedef cl_program -(CL_API_CALL *clCreateProgramWithILKHR_fn)(cl_context context, - const void * il, - size_t length, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -/* Extension: cl_khr_image2d_from_buffer - * - * This extension allows a 2D image to be created from a cl_mem buffer without - * a copy. The type associated with a 2D image created from a buffer in an - * OpenCL program is image2d_t. Both the sampler and sampler-less read_image - * built-in functions are supported for 2D images and 2D images created from - * a buffer. Similarly, the write_image built-ins are also supported for 2D - * images created from a buffer. - * - * When the 2D image from buffer is created, the client must specify the - * width, height, image format (i.e. channel order and channel data type) - * and optionally the row pitch. - * - * The pitch specified must be a multiple of - * CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR pixels. - * The base address of the buffer must be aligned to - * CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR pixels. - */ - -#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR 0x104A -#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR 0x104B - - -/************************************** - * cl_khr_initialize_memory extension * - **************************************/ - -#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030 - - -/************************************** - * cl_khr_terminate_context extension * - **************************************/ - -#define CL_CONTEXT_TERMINATED_KHR -1121 - -#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031 -#define CL_CONTEXT_TERMINATE_KHR 0x2032 - -#define cl_khr_terminate_context 1 -extern CL_API_ENTRY cl_int CL_API_CALL -clTerminateContextKHR(cl_context context) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int -(CL_API_CALL *clTerminateContextKHR_fn)(cl_context context) CL_API_SUFFIX__VERSION_1_2; - - -/* - * Extension: cl_khr_spir - * - * This extension adds support to create an OpenCL program object from a - * Standard Portable Intermediate Representation (SPIR) instance - */ - -#define CL_DEVICE_SPIR_VERSIONS 0x40E0 -#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1 - - -/***************************************** - * cl_khr_create_command_queue extension * - *****************************************/ -#define cl_khr_create_command_queue 1 - -typedef cl_properties cl_queue_properties_khr; - -extern CL_API_ENTRY cl_command_queue CL_API_CALL -clCreateCommandQueueWithPropertiesKHR(cl_context context, - cl_device_id device, - const cl_queue_properties_khr* properties, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_command_queue -(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)(cl_context context, - cl_device_id device, - const cl_queue_properties_khr* properties, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; - - -/****************************************** -* cl_nv_device_attribute_query extension * -******************************************/ - -/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */ -#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 -#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 -#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 -#define CL_DEVICE_WARP_SIZE_NV 0x4003 -#define CL_DEVICE_GPU_OVERLAP_NV 0x4004 -#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 -#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 - - -/********************************* -* cl_amd_device_attribute_query * -*********************************/ - -#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036 -#define CL_DEVICE_TOPOLOGY_AMD 0x4037 -#define CL_DEVICE_BOARD_NAME_AMD 0x4038 -#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039 -#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040 -#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041 -#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042 -#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043 -#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044 -#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045 -#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046 -#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047 -#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048 -#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD 0x4049 -#define CL_DEVICE_GFXIP_MAJOR_AMD 0x404A -#define CL_DEVICE_GFXIP_MINOR_AMD 0x404B -#define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD 0x404C -#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD 0x4030 -#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD 0x4031 -#define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD 0x4033 -#define CL_DEVICE_PCIE_ID_AMD 0x4034 - - -/********************************* -* cl_arm_printf extension -*********************************/ - -#define CL_PRINTF_CALLBACK_ARM 0x40B0 -#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1 - - -/*********************************** -* cl_ext_device_fission extension -***********************************/ -#define cl_ext_device_fission 1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseDeviceEXT(cl_device_id device) CL_API_SUFFIX__VERSION_1_1; - -typedef cl_int -(CL_API_CALL *clReleaseDeviceEXT_fn)(cl_device_id device) CL_API_SUFFIX__VERSION_1_1; - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainDeviceEXT(cl_device_id device) CL_API_SUFFIX__VERSION_1_1; - -typedef cl_int -(CL_API_CALL *clRetainDeviceEXT_fn)(cl_device_id device) CL_API_SUFFIX__VERSION_1_1; - -typedef cl_ulong cl_device_partition_property_ext; -extern CL_API_ENTRY cl_int CL_API_CALL -clCreateSubDevicesEXT(cl_device_id in_device, - const cl_device_partition_property_ext * properties, - cl_uint num_entries, - cl_device_id * out_devices, - cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_1; - -typedef cl_int -(CL_API_CALL * clCreateSubDevicesEXT_fn)(cl_device_id in_device, - const cl_device_partition_property_ext * properties, - cl_uint num_entries, - cl_device_id * out_devices, - cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_1; - -/* cl_device_partition_property_ext */ -#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050 -#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051 -#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052 -#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053 - -/* clDeviceGetInfo selectors */ -#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054 -#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055 -#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056 -#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057 -#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058 - -/* error codes */ -#define CL_DEVICE_PARTITION_FAILED_EXT -1057 -#define CL_INVALID_PARTITION_COUNT_EXT -1058 -#define CL_INVALID_PARTITION_NAME_EXT -1059 - -/* CL_AFFINITY_DOMAINs */ -#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1 -#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2 -#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3 -#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4 -#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10 -#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100 - -/* cl_device_partition_property_ext list terminators */ -#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0) -#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0) -#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1) - - -/*********************************** - * cl_ext_migrate_memobject extension definitions - ***********************************/ -#define cl_ext_migrate_memobject 1 - -typedef cl_bitfield cl_mem_migration_flags_ext; - -#define CL_MIGRATE_MEM_OBJECT_HOST_EXT 0x1 - -#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT 0x4040 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMigrateMemObjectEXT(cl_command_queue command_queue, - cl_uint num_mem_objects, - const cl_mem * mem_objects, - cl_mem_migration_flags_ext flags, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event); - -typedef cl_int -(CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)(cl_command_queue command_queue, - cl_uint num_mem_objects, - const cl_mem * mem_objects, - cl_mem_migration_flags_ext flags, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event); - - -/********************************* -* cl_ext_cxx_for_opencl extension -*********************************/ -#define cl_ext_cxx_for_opencl 1 - -#define CL_DEVICE_CXX_FOR_OPENCL_NUMERIC_VERSION_EXT 0x4230 - -/********************************* -* cl_qcom_ext_host_ptr extension -*********************************/ -#define cl_qcom_ext_host_ptr 1 - -#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29) - -#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0 -#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1 -#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2 -#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3 -#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4 -#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5 -#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6 -#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7 - -typedef cl_uint cl_image_pitch_info_qcom; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetDeviceImageInfoQCOM(cl_device_id device, - size_t image_width, - size_t image_height, - const cl_image_format *image_format, - cl_image_pitch_info_qcom param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - -typedef struct _cl_mem_ext_host_ptr -{ - /* Type of external memory allocation. */ - /* Legal values will be defined in layered extensions. */ - cl_uint allocation_type; - - /* Host cache policy for this external memory allocation. */ - cl_uint host_cache_policy; - -} cl_mem_ext_host_ptr; - - -/******************************************* -* cl_qcom_ext_host_ptr_iocoherent extension -********************************************/ - -/* Cache policy specifying io-coherence */ -#define CL_MEM_HOST_IOCOHERENT_QCOM 0x40A9 - - -/********************************* -* cl_qcom_ion_host_ptr extension -*********************************/ - -#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8 - -typedef struct _cl_mem_ion_host_ptr -{ - /* Type of external memory allocation. */ - /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */ - cl_mem_ext_host_ptr ext_host_ptr; - - /* ION file descriptor */ - int ion_filedesc; - - /* Host pointer to the ION allocated memory */ - void* ion_hostptr; - -} cl_mem_ion_host_ptr; - - -/********************************* -* cl_qcom_android_native_buffer_host_ptr extension -*********************************/ - -#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6 - -typedef struct _cl_mem_android_native_buffer_host_ptr -{ - /* Type of external memory allocation. */ - /* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. */ - cl_mem_ext_host_ptr ext_host_ptr; - - /* Virtual pointer to the android native buffer */ - void* anb_ptr; - -} cl_mem_android_native_buffer_host_ptr; - - -/****************************************** - * cl_img_yuv_image extension * - ******************************************/ - -/* Image formats used in clCreateImage */ -#define CL_NV21_IMG 0x40D0 -#define CL_YV12_IMG 0x40D1 - - -/****************************************** - * cl_img_cached_allocations extension * - ******************************************/ - -/* Flag values used by clCreateBuffer */ -#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26) -#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27) - - -/****************************************** - * cl_img_use_gralloc_ptr extension * - ******************************************/ -#define cl_img_use_gralloc_ptr 1 - -/* Flag values used by clCreateBuffer */ -#define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28) - -/* To be used by clGetEventInfo: */ -#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2 -#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3 - -/* Error codes from clEnqueueAcquireGrallocObjectsIMG and clEnqueueReleaseGrallocObjectsIMG */ -#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4 -#define CL_INVALID_GRALLOC_OBJECT_IMG 0x40D5 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueAcquireGrallocObjectsIMG(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReleaseGrallocObjectsIMG(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -/****************************************** - * cl_img_generate_mipmap extension * - ******************************************/ -#define cl_img_generate_mipmap 1 - -typedef cl_uint cl_mipmap_filter_mode_img; - -/* To be used by clEnqueueGenerateMipmapIMG */ -#define CL_MIPMAP_FILTER_ANY_IMG 0x0 -#define CL_MIPMAP_FILTER_BOX_IMG 0x1 - -/* To be used by clGetEventInfo */ -#define CL_COMMAND_GENERATE_MIPMAP_IMG 0x40D6 - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueGenerateMipmapIMG(cl_command_queue command_queue, - cl_mem src_image, - cl_mem dst_image, - cl_mipmap_filter_mode_img mipmap_filter_mode, - const size_t *array_region, - const size_t *mip_region, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -/****************************************** - * cl_img_mem_properties extension * - ******************************************/ -#define cl_img_mem_properties 1 - -/* To be used by clCreateBufferWithProperties */ -#define CL_MEM_ALLOC_FLAGS_IMG 0x40D7 - -/* To be used wiith the CL_MEM_ALLOC_FLAGS_IMG property */ -typedef cl_bitfield cl_mem_alloc_flags_img; - -/* To be used with cl_mem_alloc_flags_img */ -#define CL_MEM_ALLOC_RELAX_REQUIREMENTS_IMG (1 << 0) - -/********************************* -* cl_khr_subgroups extension -*********************************/ -#define cl_khr_subgroups 1 - -#if !defined(CL_VERSION_2_1) -/* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in CL.h. - In hindsight, there should have been a khr suffix on this type for - the extension, but keeping it un-suffixed to maintain backwards - compatibility. */ -typedef cl_uint cl_kernel_sub_group_info; -#endif - -/* cl_kernel_sub_group_info */ -#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033 -#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034 - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetKernelSubGroupInfoKHR(cl_kernel in_kernel, - cl_device_id in_device, - cl_kernel_sub_group_info param_name, - size_t input_value_size, - const void * input_value, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_2_0_DEPRECATED; - -typedef cl_int -(CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel in_kernel, - cl_device_id in_device, - cl_kernel_sub_group_info param_name, - size_t input_value_size, - const void * input_value, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_2_0_DEPRECATED; - - -/********************************* -* cl_khr_mipmap_image extension -*********************************/ - -/* cl_sampler_properties */ -#define CL_SAMPLER_MIP_FILTER_MODE_KHR 0x1155 -#define CL_SAMPLER_LOD_MIN_KHR 0x1156 -#define CL_SAMPLER_LOD_MAX_KHR 0x1157 - - -/********************************* -* cl_khr_priority_hints extension -*********************************/ -/* This extension define is for backwards compatibility. - It shouldn't be required since this extension has no new functions. */ -#define cl_khr_priority_hints 1 - -typedef cl_uint cl_queue_priority_khr; - -/* cl_command_queue_properties */ -#define CL_QUEUE_PRIORITY_KHR 0x1096 - -/* cl_queue_priority_khr */ -#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0) -#define CL_QUEUE_PRIORITY_MED_KHR (1<<1) -#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2) - - -/********************************* -* cl_khr_throttle_hints extension -*********************************/ -/* This extension define is for backwards compatibility. - It shouldn't be required since this extension has no new functions. */ -#define cl_khr_throttle_hints 1 - -typedef cl_uint cl_queue_throttle_khr; - -/* cl_command_queue_properties */ -#define CL_QUEUE_THROTTLE_KHR 0x1097 - -/* cl_queue_throttle_khr */ -#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0) -#define CL_QUEUE_THROTTLE_MED_KHR (1<<1) -#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2) - - -/********************************* -* cl_khr_subgroup_named_barrier -*********************************/ -/* This extension define is for backwards compatibility. - It shouldn't be required since this extension has no new functions. */ -#define cl_khr_subgroup_named_barrier 1 - -/* cl_device_info */ -#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035 - - -/********************************* -* cl_khr_extended_versioning -*********************************/ - -#define cl_khr_extended_versioning 1 - -#define CL_VERSION_MAJOR_BITS_KHR (10) -#define CL_VERSION_MINOR_BITS_KHR (10) -#define CL_VERSION_PATCH_BITS_KHR (12) - -#define CL_VERSION_MAJOR_MASK_KHR ((1 << CL_VERSION_MAJOR_BITS_KHR) - 1) -#define CL_VERSION_MINOR_MASK_KHR ((1 << CL_VERSION_MINOR_BITS_KHR) - 1) -#define CL_VERSION_PATCH_MASK_KHR ((1 << CL_VERSION_PATCH_BITS_KHR) - 1) - -#define CL_VERSION_MAJOR_KHR(version) ((version) >> (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) -#define CL_VERSION_MINOR_KHR(version) (((version) >> CL_VERSION_PATCH_BITS_KHR) & CL_VERSION_MINOR_MASK_KHR) -#define CL_VERSION_PATCH_KHR(version) ((version) & CL_VERSION_PATCH_MASK_KHR) - -#define CL_MAKE_VERSION_KHR(major, minor, patch) \ - ((((major) & CL_VERSION_MAJOR_MASK_KHR) << (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) | \ - (((minor) & CL_VERSION_MINOR_MASK_KHR) << CL_VERSION_PATCH_BITS_KHR) | \ - ((patch) & CL_VERSION_PATCH_MASK_KHR)) - -typedef cl_uint cl_version_khr; - -#define CL_NAME_VERSION_MAX_NAME_SIZE_KHR 64 - -typedef struct _cl_name_version_khr -{ - cl_version_khr version; - char name[CL_NAME_VERSION_MAX_NAME_SIZE_KHR]; -} cl_name_version_khr; - -/* cl_platform_info */ -#define CL_PLATFORM_NUMERIC_VERSION_KHR 0x0906 -#define CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR 0x0907 - -/* cl_device_info */ -#define CL_DEVICE_NUMERIC_VERSION_KHR 0x105E -#define CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR 0x105F -#define CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR 0x1060 -#define CL_DEVICE_ILS_WITH_VERSION_KHR 0x1061 -#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR 0x1062 - - -/********************************* -* cl_khr_device_uuid extension -*********************************/ -#define cl_khr_device_uuid 1 - -#define CL_UUID_SIZE_KHR 16 -#define CL_LUID_SIZE_KHR 8 - -#define CL_DEVICE_UUID_KHR 0x106A -#define CL_DRIVER_UUID_KHR 0x106B -#define CL_DEVICE_LUID_VALID_KHR 0x106C -#define CL_DEVICE_LUID_KHR 0x106D -#define CL_DEVICE_NODE_MASK_KHR 0x106E - - -/*************************************************************** -* cl_khr_pci_bus_info -***************************************************************/ -#define cl_khr_pci_bus_info 1 - -typedef struct _cl_device_pci_bus_info_khr { - cl_uint pci_domain; - cl_uint pci_bus; - cl_uint pci_device; - cl_uint pci_function; -} cl_device_pci_bus_info_khr; - -/* cl_device_info */ -#define CL_DEVICE_PCI_BUS_INFO_KHR 0x410F - - -/*************************************************************** -* cl_khr_suggested_local_work_size -***************************************************************/ -#define cl_khr_suggested_local_work_size 1 - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetKernelSuggestedLocalWorkSizeKHR( - cl_command_queue command_queue, - cl_kernel kernel, - cl_uint work_dim, - const size_t* global_work_offset, - const size_t* global_work_size, - size_t* suggested_local_work_size) CL_API_SUFFIX__VERSION_3_0; - -typedef cl_int (CL_API_CALL * -clGetKernelSuggestedLocalWorkSizeKHR_fn)( - cl_command_queue command_queue, - cl_kernel kernel, - cl_uint work_dim, - const size_t* global_work_offset, - const size_t* global_work_size, - size_t* suggested_local_work_size) CL_API_SUFFIX__VERSION_3_0; - - -/*************************************************************** -* cl_khr_integer_dot_product -***************************************************************/ -#define cl_khr_integer_dot_product 1 - -typedef cl_bitfield cl_device_integer_dot_product_capabilities_khr; - -/* cl_device_integer_dot_product_capabilities_khr */ -#define CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR (1 << 0) -#define CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR (1 << 1) - -typedef struct _cl_device_integer_dot_product_acceleration_properties_khr { - cl_bool signed_accelerated; - cl_bool unsigned_accelerated; - cl_bool mixed_signedness_accelerated; - cl_bool accumulating_saturating_signed_accelerated; - cl_bool accumulating_saturating_unsigned_accelerated; - cl_bool accumulating_saturating_mixed_signedness_accelerated; -} cl_device_integer_dot_product_acceleration_properties_khr; - -/* cl_device_info */ -#define CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR 0x1073 -#define CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR 0x1074 -#define CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR 0x1075 - - -/*************************************************************** -* cl_khr_external_memory -***************************************************************/ -#define cl_khr_external_memory 1 - -typedef cl_uint cl_external_memory_handle_type_khr; - -/* cl_platform_info */ -#define CL_PLATFORM_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR 0x2044 - -/* cl_device_info */ -#define CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR 0x204F - -/* cl_mem_properties */ -#define CL_DEVICE_HANDLE_LIST_KHR 0x2051 -#define CL_DEVICE_HANDLE_LIST_END_KHR 0 - -/* cl_command_type */ -#define CL_COMMAND_ACQUIRE_EXTERNAL_MEM_OBJECTS_KHR 0x2047 -#define CL_COMMAND_RELEASE_EXTERNAL_MEM_OBJECTS_KHR 0x2048 - - -typedef cl_int (CL_API_CALL * -clEnqueueAcquireExternalMemObjectsKHR_fn)( - cl_command_queue command_queue, - cl_uint num_mem_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_3_0; - -typedef cl_int (CL_API_CALL * -clEnqueueReleaseExternalMemObjectsKHR_fn)( - cl_command_queue command_queue, - cl_uint num_mem_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_3_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueAcquireExternalMemObjectsKHR( - cl_command_queue command_queue, - cl_uint num_mem_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_3_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReleaseExternalMemObjectsKHR( - cl_command_queue command_queue, - cl_uint num_mem_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_3_0; - -/*************************************************************** -* cl_khr_external_memory_dma_buf -***************************************************************/ -#define cl_khr_external_memory_dma_buf 1 - -/* cl_external_memory_handle_type_khr */ -#define CL_EXTERNAL_MEMORY_HANDLE_DMA_BUF_KHR 0x2067 - -/*************************************************************** -* cl_khr_external_memory_dx -***************************************************************/ -#define cl_khr_external_memory_dx 1 - -/* cl_external_memory_handle_type_khr */ -#define CL_EXTERNAL_MEMORY_HANDLE_D3D11_TEXTURE_KHR 0x2063 -#define CL_EXTERNAL_MEMORY_HANDLE_D3D11_TEXTURE_KMT_KHR 0x2064 -#define CL_EXTERNAL_MEMORY_HANDLE_D3D12_HEAP_KHR 0x2065 -#define CL_EXTERNAL_MEMORY_HANDLE_D3D12_RESOURCE_KHR 0x2066 - -/*************************************************************** -* cl_khr_external_memory_opaque_fd -***************************************************************/ -#define cl_khr_external_memory_opaque_fd 1 - -/* cl_external_memory_handle_type_khr */ -#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR 0x2060 - -/*************************************************************** -* cl_khr_external_memory_win32 -***************************************************************/ -#define cl_khr_external_memory_win32 1 - -/* cl_external_memory_handle_type_khr */ -#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR 0x2061 -#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR 0x2062 - -/*************************************************************** -* cl_khr_external_semaphore -***************************************************************/ -#define cl_khr_external_semaphore 1 - -typedef struct _cl_semaphore_khr * cl_semaphore_khr; -typedef cl_uint cl_external_semaphore_handle_type_khr; - -/* cl_platform_info */ -#define CL_PLATFORM_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR 0x2037 -#define CL_PLATFORM_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR 0x2038 - -/* cl_device_info */ -#define CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR 0x204D -#define CL_DEVICE_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR 0x204E - -/* cl_semaphore_properties_khr */ -#define CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR 0x203F -#define CL_SEMAPHORE_EXPORT_HANDLE_TYPES_LIST_END_KHR 0 - - -typedef cl_int (CL_API_CALL * -clGetSemaphoreHandleForTypeKHR_fn)( - cl_semaphore_khr sema_object, - cl_device_id device, - cl_external_semaphore_handle_type_khr handle_type, - size_t handle_size, - void* handle_ptr, - size_t* handle_size_ret) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetSemaphoreHandleForTypeKHR( - cl_semaphore_khr sema_object, - cl_device_id device, - cl_external_semaphore_handle_type_khr handle_type, - size_t handle_size, - void* handle_ptr, - size_t* handle_size_ret) CL_API_SUFFIX__VERSION_1_2; - -/*************************************************************** -* cl_khr_external_semaphore_dx_fence -***************************************************************/ -#define cl_khr_external_semaphore_dx_fence 1 - -/* cl_external_semaphore_handle_type_khr */ -#define CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR 0x2059 - -/*************************************************************** -* cl_khr_external_semaphore_opaque_fd -***************************************************************/ -#define cl_khr_external_semaphore_opaque_fd 1 - -/* cl_external_semaphore_handle_type_khr */ -#define CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR 0x2055 - -/*************************************************************** -* cl_khr_external_semaphore_sync_fd -***************************************************************/ -#define cl_khr_external_semaphore_sync_fd 1 - -/* cl_external_semaphore_handle_type_khr */ -#define CL_SEMAPHORE_HANDLE_SYNC_FD_KHR 0x2058 - -/*************************************************************** -* cl_khr_external_semaphore_win32 -***************************************************************/ -#define cl_khr_external_semaphore_win32 1 - -/* cl_external_semaphore_handle_type_khr */ -#define CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR 0x2056 -#define CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR 0x2057 - -/*************************************************************** -* cl_khr_semaphore -***************************************************************/ -#define cl_khr_semaphore 1 - -/* type cl_semaphore_khr */ -typedef cl_properties cl_semaphore_properties_khr; -typedef cl_uint cl_semaphore_info_khr; -typedef cl_uint cl_semaphore_type_khr; -typedef cl_ulong cl_semaphore_payload_khr; - -/* cl_semaphore_type */ -#define CL_SEMAPHORE_TYPE_BINARY_KHR 1 - -/* cl_platform_info */ -#define CL_PLATFORM_SEMAPHORE_TYPES_KHR 0x2036 - -/* cl_device_info */ -#define CL_DEVICE_SEMAPHORE_TYPES_KHR 0x204C - -/* cl_semaphore_info_khr */ -#define CL_SEMAPHORE_CONTEXT_KHR 0x2039 -#define CL_SEMAPHORE_REFERENCE_COUNT_KHR 0x203A -#define CL_SEMAPHORE_PROPERTIES_KHR 0x203B -#define CL_SEMAPHORE_PAYLOAD_KHR 0x203C - -/* cl_semaphore_info_khr or cl_semaphore_properties_khr */ -#define CL_SEMAPHORE_TYPE_KHR 0x203D -/* enum CL_DEVICE_HANDLE_LIST_KHR */ -/* enum CL_DEVICE_HANDLE_LIST_END_KHR */ - -/* cl_command_type */ -#define CL_COMMAND_SEMAPHORE_WAIT_KHR 0x2042 -#define CL_COMMAND_SEMAPHORE_SIGNAL_KHR 0x2043 - -/* Error codes */ -#define CL_INVALID_SEMAPHORE_KHR -1142 - - -typedef cl_semaphore_khr (CL_API_CALL * -clCreateSemaphoreWithPropertiesKHR_fn)( - cl_context context, - const cl_semaphore_properties_khr* sema_props, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL * -clEnqueueWaitSemaphoresKHR_fn)( - cl_command_queue command_queue, - cl_uint num_sema_objects, - const cl_semaphore_khr* sema_objects, - const cl_semaphore_payload_khr* sema_payload_list, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL * -clEnqueueSignalSemaphoresKHR_fn)( - cl_command_queue command_queue, - cl_uint num_sema_objects, - const cl_semaphore_khr* sema_objects, - const cl_semaphore_payload_khr* sema_payload_list, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL * -clGetSemaphoreInfoKHR_fn)( - cl_semaphore_khr sema_object, - cl_semaphore_info_khr param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL * -clReleaseSemaphoreKHR_fn)( - cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL * -clRetainSemaphoreKHR_fn)( - cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_semaphore_khr CL_API_CALL -clCreateSemaphoreWithPropertiesKHR( - cl_context context, - const cl_semaphore_properties_khr* sema_props, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueWaitSemaphoresKHR( - cl_command_queue command_queue, - cl_uint num_sema_objects, - const cl_semaphore_khr* sema_objects, - const cl_semaphore_payload_khr* sema_payload_list, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSignalSemaphoresKHR( - cl_command_queue command_queue, - cl_uint num_sema_objects, - const cl_semaphore_khr* sema_objects, - const cl_semaphore_payload_khr* sema_payload_list, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetSemaphoreInfoKHR( - cl_semaphore_khr sema_object, - cl_semaphore_info_khr param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseSemaphoreKHR( - cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainSemaphoreKHR( - cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2; - -/********************************** - * cl_arm_import_memory extension * - **********************************/ -#define cl_arm_import_memory 1 - -typedef intptr_t cl_import_properties_arm; - -/* Default and valid proporties name for cl_arm_import_memory */ -#define CL_IMPORT_TYPE_ARM 0x40B2 - -/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */ -#define CL_IMPORT_TYPE_HOST_ARM 0x40B3 - -/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */ -#define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4 - -/* Protected memory property */ -#define CL_IMPORT_TYPE_PROTECTED_ARM 0x40B5 - -/* Android hardware buffer type value for CL_IMPORT_TYPE_ARM property */ -#define CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM 0x41E2 - -/* Data consistency with host property */ -#define CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM 0x41E3 - -/* Index of plane in a multiplanar hardware buffer */ -#define CL_IMPORT_ANDROID_HARDWARE_BUFFER_PLANE_INDEX_ARM 0x41EF - -/* Index of layer in a multilayer hardware buffer */ -#define CL_IMPORT_ANDROID_HARDWARE_BUFFER_LAYER_INDEX_ARM 0x41F0 - -/* Import memory size value to indicate a size for the whole buffer */ -#define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM SIZE_MAX - -/* This extension adds a new function that allows for direct memory import into - * OpenCL via the clImportMemoryARM function. - * - * Memory imported through this interface will be mapped into the device's page - * tables directly, providing zero copy access. It will never fall back to copy - * operations and aliased buffers. - * - * Types of memory supported for import are specified as additional extension - * strings. - * - * This extension produces cl_mem allocations which are compatible with all other - * users of cl_mem in the standard API. - * - * This extension maps pages with the same properties as the normal buffer creation - * function clCreateBuffer. - */ -extern CL_API_ENTRY cl_mem CL_API_CALL -clImportMemoryARM( cl_context context, - cl_mem_flags flags, - const cl_import_properties_arm *properties, - void *memory, - size_t size, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - - -/****************************************** - * cl_arm_shared_virtual_memory extension * - ******************************************/ -#define cl_arm_shared_virtual_memory 1 - -/* Used by clGetDeviceInfo */ -#define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6 - -/* Used by clGetMemObjectInfo */ -#define CL_MEM_USES_SVM_POINTER_ARM 0x40B7 - -/* Used by clSetKernelExecInfoARM: */ -#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8 -#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9 - -/* To be used by clGetEventInfo: */ -#define CL_COMMAND_SVM_FREE_ARM 0x40BA -#define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB -#define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC -#define CL_COMMAND_SVM_MAP_ARM 0x40BD -#define CL_COMMAND_SVM_UNMAP_ARM 0x40BE - -/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */ -#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0) -#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1) -#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2) -#define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3) - -/* Flag values used by clSVMAllocARM: */ -#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10) -#define CL_MEM_SVM_ATOMICS_ARM (1 << 11) - -typedef cl_bitfield cl_svm_mem_flags_arm; -typedef cl_uint cl_kernel_exec_info_arm; -typedef cl_bitfield cl_device_svm_capabilities_arm; - -extern CL_API_ENTRY void * CL_API_CALL -clSVMAllocARM(cl_context context, - cl_svm_mem_flags_arm flags, - size_t size, - cl_uint alignment) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY void CL_API_CALL -clSVMFreeARM(cl_context context, - void * svm_pointer) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMFreeARM(cl_command_queue command_queue, - cl_uint num_svm_pointers, - void * svm_pointers[], - void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue, - cl_uint num_svm_pointers, - void * svm_pointers[], - void * user_data), - void * user_data, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMemcpyARM(cl_command_queue command_queue, - cl_bool blocking_copy, - void * dst_ptr, - const void * src_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMemFillARM(cl_command_queue command_queue, - void * svm_ptr, - const void * pattern, - size_t pattern_size, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMapARM(cl_command_queue command_queue, - cl_bool blocking_map, - cl_map_flags flags, - void * svm_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMUnmapARM(cl_command_queue command_queue, - void * svm_ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetKernelArgSVMPointerARM(cl_kernel kernel, - cl_uint arg_index, - const void * arg_value) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetKernelExecInfoARM(cl_kernel kernel, - cl_kernel_exec_info_arm param_name, - size_t param_value_size, - const void * param_value) CL_API_SUFFIX__VERSION_1_2; - -/******************************** - * cl_arm_get_core_id extension * - ********************************/ - -#ifdef CL_VERSION_1_2 - -#define cl_arm_get_core_id 1 - -/* Device info property for bitfield of cores present */ -#define CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM 0x40BF - -#endif /* CL_VERSION_1_2 */ - -/********************************* -* cl_arm_job_slot_selection -*********************************/ - -#define cl_arm_job_slot_selection 1 - -/* cl_device_info */ -#define CL_DEVICE_JOB_SLOTS_ARM 0x41E0 - -/* cl_command_queue_properties */ -#define CL_QUEUE_JOB_SLOT_ARM 0x41E1 - -/********************************* -* cl_arm_scheduling_controls -*********************************/ - -#define cl_arm_scheduling_controls 1 - -typedef cl_bitfield cl_device_scheduling_controls_capabilities_arm; - -/* cl_device_info */ -#define CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM 0x41E4 - -#define CL_DEVICE_SCHEDULING_KERNEL_BATCHING_ARM (1 << 0) -#define CL_DEVICE_SCHEDULING_WORKGROUP_BATCH_SIZE_ARM (1 << 1) -#define CL_DEVICE_SCHEDULING_WORKGROUP_BATCH_SIZE_MODIFIER_ARM (1 << 2) -#define CL_DEVICE_SCHEDULING_DEFERRED_FLUSH_ARM (1 << 3) -#define CL_DEVICE_SCHEDULING_REGISTER_ALLOCATION_ARM (1 << 4) -#define CL_DEVICE_SCHEDULING_WARP_THROTTLING_ARM (1 << 5) -#define CL_DEVICE_SCHEDULING_COMPUTE_UNIT_BATCH_QUEUE_SIZE_ARM (1 << 6) - -#define CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM 0x41EB -#define CL_DEVICE_MAX_WARP_COUNT_ARM 0x41EA - -/* cl_kernel_info */ -#define CL_KERNEL_MAX_WARP_COUNT_ARM 0x41E9 - -/* cl_kernel_exec_info */ -#define CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM 0x41E5 -#define CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM 0x41E6 -#define CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM 0x41E8 -#define CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM 0x41F1 - -/* cl_queue_properties */ -#define CL_QUEUE_KERNEL_BATCHING_ARM 0x41E7 -#define CL_QUEUE_DEFERRED_FLUSH_ARM 0x41EC - -/************************************** -* cl_arm_controlled_kernel_termination -***************************************/ - -#define cl_arm_controlled_kernel_termination 1 - -/* Error code to indicate kernel terminated with failure */ -#define CL_COMMAND_TERMINATED_ITSELF_WITH_FAILURE_ARM -1108 - -/* cl_device_info */ -#define CL_DEVICE_CONTROLLED_TERMINATION_CAPABILITIES_ARM 0x41EE - -/* Bit fields for controlled termination feature query */ -typedef cl_bitfield cl_device_controlled_termination_capabilities_arm; - -#define CL_DEVICE_CONTROLLED_TERMINATION_SUCCESS_ARM (1 << 0) -#define CL_DEVICE_CONTROLLED_TERMINATION_FAILURE_ARM (1 << 1) -#define CL_DEVICE_CONTROLLED_TERMINATION_QUERY_ARM (1 << 2) - -/* cl_event_info */ -#define CL_EVENT_COMMAND_TERMINATION_REASON_ARM 0x41ED - -/* Values returned for event termination reason query */ -typedef cl_uint cl_command_termination_reason_arm; - -#define CL_COMMAND_TERMINATION_COMPLETION_ARM 0 -#define CL_COMMAND_TERMINATION_CONTROLLED_SUCCESS_ARM 1 -#define CL_COMMAND_TERMINATION_CONTROLLED_FAILURE_ARM 2 -#define CL_COMMAND_TERMINATION_ERROR_ARM 3 - -/************************************* -* cl_arm_protected_memory_allocation * -*************************************/ - -#define cl_arm_protected_memory_allocation 1 - -#define CL_MEM_PROTECTED_ALLOC_ARM (1ULL << 36) - -/****************************************** -* cl_intel_exec_by_local_thread extension * -******************************************/ - -#define cl_intel_exec_by_local_thread 1 - -#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31) - -/*************************************************************** -* cl_intel_device_attribute_query -***************************************************************/ - -#define cl_intel_device_attribute_query 1 - -typedef cl_bitfield cl_device_feature_capabilities_intel; - -/* cl_device_feature_capabilities_intel */ -#define CL_DEVICE_FEATURE_FLAG_DP4A_INTEL (1 << 0) -#define CL_DEVICE_FEATURE_FLAG_DPAS_INTEL (1 << 1) - -/* cl_device_info */ -#define CL_DEVICE_IP_VERSION_INTEL 0x4250 -#define CL_DEVICE_ID_INTEL 0x4251 -#define CL_DEVICE_NUM_SLICES_INTEL 0x4252 -#define CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL 0x4253 -#define CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL 0x4254 -#define CL_DEVICE_NUM_THREADS_PER_EU_INTEL 0x4255 -#define CL_DEVICE_FEATURE_CAPABILITIES_INTEL 0x4256 - -/*********************************************** -* cl_intel_device_partition_by_names extension * -************************************************/ - -#define cl_intel_device_partition_by_names 1 - -#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052 -#define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1 - -/************************************************ -* cl_intel_accelerator extension * -* cl_intel_motion_estimation extension * -* cl_intel_advanced_motion_estimation extension * -*************************************************/ - -#define cl_intel_accelerator 1 -#define cl_intel_motion_estimation 1 -#define cl_intel_advanced_motion_estimation 1 - -typedef struct _cl_accelerator_intel* cl_accelerator_intel; -typedef cl_uint cl_accelerator_type_intel; -typedef cl_uint cl_accelerator_info_intel; - -typedef struct _cl_motion_estimation_desc_intel { - cl_uint mb_block_type; - cl_uint subpixel_mode; - cl_uint sad_adjust_mode; - cl_uint search_path_type; -} cl_motion_estimation_desc_intel; - -/* error codes */ -#define CL_INVALID_ACCELERATOR_INTEL -1094 -#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095 -#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096 -#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097 - -/* cl_accelerator_type_intel */ -#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0 - -/* cl_accelerator_info_intel */ -#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090 -#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091 -#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092 -#define CL_ACCELERATOR_TYPE_INTEL 0x4093 - -/* cl_motion_detect_desc_intel flags */ -#define CL_ME_MB_TYPE_16x16_INTEL 0x0 -#define CL_ME_MB_TYPE_8x8_INTEL 0x1 -#define CL_ME_MB_TYPE_4x4_INTEL 0x2 - -#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 -#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 -#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2 - -#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 -#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1 - -#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0 -#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1 -#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5 - -#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0 -#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1 -#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2 -#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x4 - -#define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1 -#define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2 -#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3 - -#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16 -#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21 -#define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32 -#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43 -#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48 - -#define CL_ME_COST_PENALTY_NONE_INTEL 0x0 -#define CL_ME_COST_PENALTY_LOW_INTEL 0x1 -#define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2 -#define CL_ME_COST_PENALTY_HIGH_INTEL 0x3 - -#define CL_ME_COST_PRECISION_QPEL_INTEL 0x0 -#define CL_ME_COST_PRECISION_HPEL_INTEL 0x1 -#define CL_ME_COST_PRECISION_PEL_INTEL 0x2 -#define CL_ME_COST_PRECISION_DPEL_INTEL 0x3 - -#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 -#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 -#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 - -#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 -#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 -#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 -#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 -#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 -#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 - -#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 -#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 -#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 - -/* cl_device_info */ -#define CL_DEVICE_ME_VERSION_INTEL 0x407E - -#define CL_ME_VERSION_LEGACY_INTEL 0x0 -#define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1 -#define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2 - -extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL -clCreateAcceleratorINTEL( - cl_context context, - cl_accelerator_type_intel accelerator_type, - size_t descriptor_size, - const void* descriptor, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)( - cl_context context, - cl_accelerator_type_intel accelerator_type, - size_t descriptor_size, - const void* descriptor, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetAcceleratorInfoINTEL( - cl_accelerator_intel accelerator, - cl_accelerator_info_intel param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)( - cl_accelerator_intel accelerator, - cl_accelerator_info_intel param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainAcceleratorINTEL( - cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)( - cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseAcceleratorINTEL( - cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)( - cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2; - -/****************************************** -* cl_intel_simultaneous_sharing extension * -*******************************************/ - -#define cl_intel_simultaneous_sharing 1 - -#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104 -#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105 - -/*********************************** -* cl_intel_egl_image_yuv extension * -************************************/ - -#define cl_intel_egl_image_yuv 1 - -#define CL_EGL_YUV_PLANE_INTEL 0x4107 - -/******************************** -* cl_intel_packed_yuv extension * -*********************************/ - -#define cl_intel_packed_yuv 1 - -#define CL_YUYV_INTEL 0x4076 -#define CL_UYVY_INTEL 0x4077 -#define CL_YVYU_INTEL 0x4078 -#define CL_VYUY_INTEL 0x4079 - -/******************************************** -* cl_intel_required_subgroup_size extension * -*********************************************/ - -#define cl_intel_required_subgroup_size 1 - -#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108 -#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109 -#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A - -/**************************************** -* cl_intel_driver_diagnostics extension * -*****************************************/ - -#define cl_intel_driver_diagnostics 1 - -typedef cl_uint cl_diagnostics_verbose_level; - -#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106 - -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL ( 0xff ) -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL ( 1 ) -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL ( 1 << 1 ) -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL ( 1 << 2 ) - -/******************************** -* cl_intel_planar_yuv extension * -*********************************/ - -#define CL_NV12_INTEL 0x410E - -#define CL_MEM_NO_ACCESS_INTEL ( 1 << 24 ) -#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL ( 1 << 25 ) - -#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL 0x417E -#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL 0x417F - -/******************************************************* -* cl_intel_device_side_avc_motion_estimation extension * -********************************************************/ - -#define CL_DEVICE_AVC_ME_VERSION_INTEL 0x410B -#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C -#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL 0x410D - -#define CL_AVC_ME_VERSION_0_INTEL 0x0 /* No support. */ -#define CL_AVC_ME_VERSION_1_INTEL 0x1 /* First supported version. */ - -#define CL_AVC_ME_MAJOR_16x16_INTEL 0x0 -#define CL_AVC_ME_MAJOR_16x8_INTEL 0x1 -#define CL_AVC_ME_MAJOR_8x16_INTEL 0x2 -#define CL_AVC_ME_MAJOR_8x8_INTEL 0x3 - -#define CL_AVC_ME_MINOR_8x8_INTEL 0x0 -#define CL_AVC_ME_MINOR_8x4_INTEL 0x1 -#define CL_AVC_ME_MINOR_4x8_INTEL 0x2 -#define CL_AVC_ME_MINOR_4x4_INTEL 0x3 - -#define CL_AVC_ME_MAJOR_FORWARD_INTEL 0x0 -#define CL_AVC_ME_MAJOR_BACKWARD_INTEL 0x1 -#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2 - -#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0 -#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E -#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D -#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B -#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77 -#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F -#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F -#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F - -#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0 -#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1 -#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2 -#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3 -#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4 -#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5 -#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6 -#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7 -#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8 -#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL 0x9 -#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL 0x2 -#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL 0xa - -#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 -#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2 - -#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 -#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 -#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3 - -#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0 -#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1 -#define CL_AVC_ME_COST_PRECISION_PEL_INTEL 0x2 -#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3 - -#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10 -#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15 -#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20 -#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B -#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30 - -#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0 -#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2 -#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4 -#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8 - -#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0 -#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000 - -#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL ( 0x1 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL ( 0x2 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL ( 0x3 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL ( 0x55 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL ( 0xAA << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL ( 0xFF << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL ( 0x1 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL ( 0x2 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL ( 0x1 << 26 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL ( 0x2 << 26 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL ( 0x1 << 28 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL ( 0x2 << 28 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL ( 0x1 << 30 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL ( 0x2 << 30 ) - -#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00 -#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80 - -#define CL_AVC_ME_INTRA_16x16_INTEL 0x0 -#define CL_AVC_ME_INTRA_8x8_INTEL 0x1 -#define CL_AVC_ME_INTRA_4x4_INTEL 0x2 - -#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6 -#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5 -#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3 - -#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60 -#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10 -#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8 -#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4 - -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 -#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 -#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 -#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 - -#define CL_AVC_ME_FRAME_FORWARD_INTEL 0x1 -#define CL_AVC_ME_FRAME_BACKWARD_INTEL 0x2 -#define CL_AVC_ME_FRAME_DUAL_INTEL 0x3 - -#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0 -#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1 -#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2 - -#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0 -#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1 - -/******************************************* -* cl_intel_unified_shared_memory extension * -********************************************/ -#define cl_intel_unified_shared_memory 1 - -typedef cl_bitfield cl_device_unified_shared_memory_capabilities_intel; -typedef cl_properties cl_mem_properties_intel; -typedef cl_bitfield cl_mem_alloc_flags_intel; -typedef cl_uint cl_mem_info_intel; -typedef cl_uint cl_unified_shared_memory_type_intel; -typedef cl_uint cl_mem_advice_intel; - -/* cl_device_info */ -#define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL 0x4190 -#define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL 0x4191 -#define CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4192 -#define CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4193 -#define CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL 0x4194 - -/* cl_device_unified_shared_memory_capabilities_intel - bitfield */ -#define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL (1 << 0) -#define CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL (1 << 1) -#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL (1 << 2) -#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL (1 << 3) - -/* cl_mem_properties_intel */ -#define CL_MEM_ALLOC_FLAGS_INTEL 0x4195 - -/* cl_mem_alloc_flags_intel - bitfield */ -#define CL_MEM_ALLOC_WRITE_COMBINED_INTEL (1 << 0) -#define CL_MEM_ALLOC_INITIAL_PLACEMENT_DEVICE_INTEL (1 << 1) -#define CL_MEM_ALLOC_INITIAL_PLACEMENT_HOST_INTEL (1 << 2) - -/* cl_mem_alloc_info_intel */ -#define CL_MEM_ALLOC_TYPE_INTEL 0x419A -#define CL_MEM_ALLOC_BASE_PTR_INTEL 0x419B -#define CL_MEM_ALLOC_SIZE_INTEL 0x419C -#define CL_MEM_ALLOC_DEVICE_INTEL 0x419D - -/* cl_unified_shared_memory_type_intel */ -#define CL_MEM_TYPE_UNKNOWN_INTEL 0x4196 -#define CL_MEM_TYPE_HOST_INTEL 0x4197 -#define CL_MEM_TYPE_DEVICE_INTEL 0x4198 -#define CL_MEM_TYPE_SHARED_INTEL 0x4199 - -/* cl_kernel_exec_info */ -#define CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL 0x4200 -#define CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL 0x4201 -#define CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL 0x4202 -#define CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL 0x4203 - -/* cl_command_type */ -#define CL_COMMAND_MEMFILL_INTEL 0x4204 -#define CL_COMMAND_MEMCPY_INTEL 0x4205 -#define CL_COMMAND_MIGRATEMEM_INTEL 0x4206 -#define CL_COMMAND_MEMADVISE_INTEL 0x4207 - - -typedef void* (CL_API_CALL * -clHostMemAllocINTEL_fn)( - cl_context context, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret) ; - -typedef void* (CL_API_CALL * -clDeviceMemAllocINTEL_fn)( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret) ; - -typedef void* (CL_API_CALL * -clSharedMemAllocINTEL_fn)( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret) ; - -typedef cl_int (CL_API_CALL * -clMemFreeINTEL_fn)( - cl_context context, - void* ptr) ; - -typedef cl_int (CL_API_CALL * -clMemBlockingFreeINTEL_fn)( - cl_context context, - void* ptr) ; - -typedef cl_int (CL_API_CALL * -clGetMemAllocInfoINTEL_fn)( - cl_context context, - const void* ptr, - cl_mem_info_intel param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) ; - -typedef cl_int (CL_API_CALL * -clSetKernelArgMemPointerINTEL_fn)( - cl_kernel kernel, - cl_uint arg_index, - const void* arg_value) ; - -typedef cl_int (CL_API_CALL * -clEnqueueMemFillINTEL_fn)( - cl_command_queue command_queue, - void* dst_ptr, - const void* pattern, - size_t pattern_size, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) ; - -typedef cl_int (CL_API_CALL * -clEnqueueMemcpyINTEL_fn)( - cl_command_queue command_queue, - cl_bool blocking, - void* dst_ptr, - const void* src_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) ; - -typedef cl_int (CL_API_CALL * -clEnqueueMemAdviseINTEL_fn)( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_advice_intel advice, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) ; - -#ifndef CL_NO_PROTOTYPES - -extern CL_API_ENTRY void* CL_API_CALL -clHostMemAllocINTEL( - cl_context context, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret) ; - -extern CL_API_ENTRY void* CL_API_CALL -clDeviceMemAllocINTEL( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret) ; - -extern CL_API_ENTRY void* CL_API_CALL -clSharedMemAllocINTEL( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clMemFreeINTEL( - cl_context context, - void* ptr) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clMemBlockingFreeINTEL( - cl_context context, - void* ptr) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetMemAllocInfoINTEL( - cl_context context, - const void* ptr, - cl_mem_info_intel param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetKernelArgMemPointerINTEL( - cl_kernel kernel, - cl_uint arg_index, - const void* arg_value) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMemFillINTEL( - cl_command_queue command_queue, - void* dst_ptr, - const void* pattern, - size_t pattern_size, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMemcpyINTEL( - cl_command_queue command_queue, - cl_bool blocking, - void* dst_ptr, - const void* src_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) ; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMemAdviseINTEL( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_advice_intel advice, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) ; - -#endif /* CL_NO_PROTOTYPES */ - -#if defined(CL_VERSION_1_2) -/* Requires OpenCL 1.2 for cl_mem_migration_flags: */ - -typedef cl_int (CL_API_CALL * -clEnqueueMigrateMemINTEL_fn)( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) ; - -#ifndef CL_NO_PROTOTYPES - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMigrateMemINTEL( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) ; - -#endif /* CL_NO_PROTOTYPES */ - -#endif /* defined(CL_VERSION_1_2) */ - -/* deprecated, use clEnqueueMemFillINTEL instead */ - -typedef cl_int (CL_API_CALL * -clEnqueueMemsetINTEL_fn)( - cl_command_queue command_queue, - void* dst_ptr, - cl_int value, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) ; - -#ifndef CL_NO_PROTOTYPES - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMemsetINTEL( - cl_command_queue command_queue, - void* dst_ptr, - cl_int value, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) ; - -#endif /* CL_NO_PROTOTYPES */ - -/*************************************************************** -* cl_intel_mem_alloc_buffer_location -***************************************************************/ -#define cl_intel_mem_alloc_buffer_location 1 -#define CL_INTEL_MEM_ALLOC_BUFFER_LOCATION_EXTENSION_NAME \ - "cl_intel_mem_alloc_buffer_location" - -/* cl_mem_properties_intel */ -#define CL_MEM_ALLOC_BUFFER_LOCATION_INTEL 0x419E - -/* cl_mem_alloc_info_intel */ -/* enum CL_MEM_ALLOC_BUFFER_LOCATION_INTEL */ - -/*************************************************** -* cl_intel_create_buffer_with_properties extension * -****************************************************/ - -#define cl_intel_create_buffer_with_properties 1 - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateBufferWithPropertiesINTEL( - cl_context context, - const cl_mem_properties_intel* properties, - cl_mem_flags flags, - size_t size, - void * host_ptr, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_mem (CL_API_CALL * -clCreateBufferWithPropertiesINTEL_fn)( - cl_context context, - const cl_mem_properties_intel* properties, - cl_mem_flags flags, - size_t size, - void * host_ptr, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -/****************************************** -* cl_intel_mem_channel_property extension * -*******************************************/ - -#define CL_MEM_CHANNEL_INTEL 0x4213 - -/********************************* -* cl_intel_mem_force_host_memory * -**********************************/ - -#define cl_intel_mem_force_host_memory 1 - -/* cl_mem_flags */ -#define CL_MEM_FORCE_HOST_MEMORY_INTEL (1 << 20) - -/*************************************************************** -* cl_intel_command_queue_families -***************************************************************/ -#define cl_intel_command_queue_families 1 - -typedef cl_bitfield cl_command_queue_capabilities_intel; - -#define CL_QUEUE_FAMILY_MAX_NAME_SIZE_INTEL 64 - -typedef struct _cl_queue_family_properties_intel { - cl_command_queue_properties properties; - cl_command_queue_capabilities_intel capabilities; - cl_uint count; - char name[CL_QUEUE_FAMILY_MAX_NAME_SIZE_INTEL]; -} cl_queue_family_properties_intel; - -/* cl_device_info */ -#define CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL 0x418B - -/* cl_queue_properties */ -#define CL_QUEUE_FAMILY_INTEL 0x418C -#define CL_QUEUE_INDEX_INTEL 0x418D - -/* cl_command_queue_capabilities_intel */ -#define CL_QUEUE_DEFAULT_CAPABILITIES_INTEL 0 -#define CL_QUEUE_CAPABILITY_CREATE_SINGLE_QUEUE_EVENTS_INTEL (1 << 0) -#define CL_QUEUE_CAPABILITY_CREATE_CROSS_QUEUE_EVENTS_INTEL (1 << 1) -#define CL_QUEUE_CAPABILITY_SINGLE_QUEUE_EVENT_WAIT_LIST_INTEL (1 << 2) -#define CL_QUEUE_CAPABILITY_CROSS_QUEUE_EVENT_WAIT_LIST_INTEL (1 << 3) -#define CL_QUEUE_CAPABILITY_TRANSFER_BUFFER_INTEL (1 << 8) -#define CL_QUEUE_CAPABILITY_TRANSFER_BUFFER_RECT_INTEL (1 << 9) -#define CL_QUEUE_CAPABILITY_MAP_BUFFER_INTEL (1 << 10) -#define CL_QUEUE_CAPABILITY_FILL_BUFFER_INTEL (1 << 11) -#define CL_QUEUE_CAPABILITY_TRANSFER_IMAGE_INTEL (1 << 12) -#define CL_QUEUE_CAPABILITY_MAP_IMAGE_INTEL (1 << 13) -#define CL_QUEUE_CAPABILITY_FILL_IMAGE_INTEL (1 << 14) -#define CL_QUEUE_CAPABILITY_TRANSFER_BUFFER_IMAGE_INTEL (1 << 15) -#define CL_QUEUE_CAPABILITY_TRANSFER_IMAGE_BUFFER_INTEL (1 << 16) -#define CL_QUEUE_CAPABILITY_MARKER_INTEL (1 << 24) -#define CL_QUEUE_CAPABILITY_BARRIER_INTEL (1 << 25) -#define CL_QUEUE_CAPABILITY_KERNEL_INTEL (1 << 26) - -/*************************************************************** -* cl_intel_queue_no_sync_operations -***************************************************************/ - -#define cl_intel_queue_no_sync_operations 1 - -/* addition to cl_command_queue_properties */ -#define CL_QUEUE_NO_SYNC_OPERATIONS_INTEL (1 << 29) - -/*************************************************************** -* cl_intel_sharing_format_query -***************************************************************/ -#define cl_intel_sharing_format_query 1 - -/*************************************************************** -* cl_ext_image_requirements_info -***************************************************************/ - -#ifdef CL_VERSION_3_0 - -#define cl_ext_image_requirements_info 1 - -typedef cl_uint cl_image_requirements_info_ext; - -#define CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT 0x1290 -#define CL_IMAGE_REQUIREMENTS_BASE_ADDRESS_ALIGNMENT_EXT 0x1292 -#define CL_IMAGE_REQUIREMENTS_SIZE_EXT 0x12B2 -#define CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT 0x12B3 -#define CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT 0x12B4 -#define CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT 0x12B5 -#define CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT 0x12B6 - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetImageRequirementsInfoEXT( - cl_context context, - const cl_mem_properties* properties, - cl_mem_flags flags, - const cl_image_format* image_format, - const cl_image_desc* image_desc, - cl_image_requirements_info_ext param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_3_0; - -typedef cl_int (CL_API_CALL * -clGetImageRequirementsInfoEXT_fn)( - cl_context context, - const cl_mem_properties* properties, - cl_mem_flags flags, - const cl_image_format* image_format, - const cl_image_desc* image_desc, - cl_image_requirements_info_ext param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_3_0; - -#endif - -/*************************************************************** -* cl_ext_image_from_buffer -***************************************************************/ - -#ifdef CL_VERSION_3_0 - -#define cl_ext_image_from_buffer 1 - -#define CL_IMAGE_REQUIREMENTS_SLICE_PITCH_ALIGNMENT_EXT 0x1291 - -#endif - -#ifdef __cplusplus -} -#endif - - -#endif /* __CL_EXT_H */ diff --git a/include/CL/cl_ext_intel.h b/include/CL/cl_ext_intel.h deleted file mode 100644 index a7ae87a34..000000000 --- a/include/CL/cl_ext_intel.h +++ /dev/null @@ -1,19 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - ******************************************************************************/ - -#include -#pragma message("The Intel extensions have been moved into cl_ext.h. Please include cl_ext.h directly.") diff --git a/include/CL/cl_gl.h b/include/CL/cl_gl.h deleted file mode 100644 index 327746508..000000000 --- a/include/CL/cl_gl.h +++ /dev/null @@ -1,194 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2021 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef __OPENCL_CL_GL_H -#define __OPENCL_CL_GL_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef cl_uint cl_gl_object_type; -typedef cl_uint cl_gl_texture_info; -typedef cl_uint cl_gl_platform_info; -typedef struct __GLsync *cl_GLsync; - -/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */ -#define CL_GL_OBJECT_BUFFER 0x2000 -#define CL_GL_OBJECT_TEXTURE2D 0x2001 -#define CL_GL_OBJECT_TEXTURE3D 0x2002 -#define CL_GL_OBJECT_RENDERBUFFER 0x2003 -#ifdef CL_VERSION_1_2 -#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E -#define CL_GL_OBJECT_TEXTURE1D 0x200F -#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010 -#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011 -#endif - -/* cl_gl_texture_info */ -#define CL_GL_TEXTURE_TARGET 0x2004 -#define CL_GL_MIPMAP_LEVEL 0x2005 -#ifdef CL_VERSION_1_2 -#define CL_GL_NUM_SAMPLES 0x2012 -#endif - - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromGLBuffer(cl_context context, - cl_mem_flags flags, - cl_GLuint bufobj, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromGLTexture(cl_context context, - cl_mem_flags flags, - cl_GLenum target, - cl_GLint miplevel, - cl_GLuint texture, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -#endif - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromGLRenderbuffer(cl_context context, - cl_mem_flags flags, - cl_GLuint renderbuffer, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetGLObjectInfo(cl_mem memobj, - cl_gl_object_type * gl_object_type, - cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetGLTextureInfo(cl_mem memobj, - cl_gl_texture_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueAcquireGLObjects(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReleaseGLObjects(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - - -/* Deprecated OpenCL 1.1 APIs */ -extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL -clCreateFromGLTexture2D(cl_context context, - cl_mem_flags flags, - cl_GLenum target, - cl_GLint miplevel, - cl_GLuint texture, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL -clCreateFromGLTexture3D(cl_context context, - cl_mem_flags flags, - cl_GLenum target, - cl_GLint miplevel, - cl_GLuint texture, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -/* cl_khr_gl_sharing extension */ - -#define cl_khr_gl_sharing 1 - -typedef cl_uint cl_gl_context_info; - -/* Additional Error Codes */ -#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 - -/* cl_gl_context_info */ -#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 -#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 - -/* Additional cl_context_properties */ -#define CL_GL_CONTEXT_KHR 0x2008 -#define CL_EGL_DISPLAY_KHR 0x2009 -#define CL_GLX_DISPLAY_KHR 0x200A -#define CL_WGL_HDC_KHR 0x200B -#define CL_CGL_SHAREGROUP_KHR 0x200C - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetGLContextInfoKHR(const cl_context_properties * properties, - cl_gl_context_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( - const cl_context_properties * properties, - cl_gl_context_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); - -/* - * cl_khr_gl_event extension - */ -#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D - -extern CL_API_ENTRY cl_event CL_API_CALL -clCreateEventFromGLsyncKHR(cl_context context, - cl_GLsync sync, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1; - -/*************************************************************** -* cl_intel_sharing_format_query_gl -***************************************************************/ -#define cl_intel_sharing_format_query_gl 1 - -/* when cl_khr_gl_sharing is supported */ - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetSupportedGLTextureFormatsINTEL( - cl_context context, - cl_mem_flags flags, - cl_mem_object_type image_type, - cl_uint num_entries, - cl_GLenum* gl_formats, - cl_uint* num_texture_formats) ; - -typedef cl_int (CL_API_CALL * -clGetSupportedGLTextureFormatsINTEL_fn)( - cl_context context, - cl_mem_flags flags, - cl_mem_object_type image_type, - cl_uint num_entries, - cl_GLenum* gl_formats, - cl_uint* num_texture_formats) ; - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_GL_H */ diff --git a/include/CL/cl_gl_ext.h b/include/CL/cl_gl_ext.h deleted file mode 100644 index 8ec818167..000000000 --- a/include/CL/cl_gl_ext.h +++ /dev/null @@ -1,18 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2021 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#include -#pragma message("All OpenGL-related extensions have been moved into cl_gl.h. Please include cl_gl.h directly.") diff --git a/include/CL/cl_half.h b/include/CL/cl_half.h deleted file mode 100644 index ecc422332..000000000 --- a/include/CL/cl_half.h +++ /dev/null @@ -1,440 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2019-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -/** - * This is a header-only utility library that provides OpenCL host code with - * routines for converting to/from cl_half values. - * - * Example usage: - * - * #include - * ... - * cl_half h = cl_half_from_float(0.5f, CL_HALF_RTE); - * cl_float f = cl_half_to_float(h); - */ - -#ifndef OPENCL_CL_HALF_H -#define OPENCL_CL_HALF_H - -#include - -#include - -#ifdef __cplusplus -extern "C" { -#endif - - -/** - * Rounding mode used when converting to cl_half. - */ -typedef enum -{ - CL_HALF_RTE, // round to nearest even - CL_HALF_RTZ, // round towards zero - CL_HALF_RTP, // round towards positive infinity - CL_HALF_RTN, // round towards negative infinity -} cl_half_rounding_mode; - - -/* Private utility macros. */ -#define CL_HALF_EXP_MASK 0x7C00 -#define CL_HALF_MAX_FINITE_MAG 0x7BFF - - -/* - * Utility to deal with values that overflow when converting to half precision. - */ -static inline cl_half cl_half_handle_overflow(cl_half_rounding_mode rounding_mode, - uint16_t sign) -{ - if (rounding_mode == CL_HALF_RTZ) - { - // Round overflow towards zero -> largest finite number (preserving sign) - return (sign << 15) | CL_HALF_MAX_FINITE_MAG; - } - else if (rounding_mode == CL_HALF_RTP && sign) - { - // Round negative overflow towards positive infinity -> most negative finite number - return (1 << 15) | CL_HALF_MAX_FINITE_MAG; - } - else if (rounding_mode == CL_HALF_RTN && !sign) - { - // Round positive overflow towards negative infinity -> largest finite number - return CL_HALF_MAX_FINITE_MAG; - } - - // Overflow to infinity - return (sign << 15) | CL_HALF_EXP_MASK; -} - -/* - * Utility to deal with values that underflow when converting to half precision. - */ -static inline cl_half cl_half_handle_underflow(cl_half_rounding_mode rounding_mode, - uint16_t sign) -{ - if (rounding_mode == CL_HALF_RTP && !sign) - { - // Round underflow towards positive infinity -> smallest positive value - return (sign << 15) | 1; - } - else if (rounding_mode == CL_HALF_RTN && sign) - { - // Round underflow towards negative infinity -> largest negative value - return (sign << 15) | 1; - } - - // Flush to zero - return (sign << 15); -} - - -/** - * Convert a cl_float to a cl_half. - */ -static inline cl_half cl_half_from_float(cl_float f, cl_half_rounding_mode rounding_mode) -{ - // Type-punning to get direct access to underlying bits - union - { - cl_float f; - uint32_t i; - } f32; - f32.f = f; - - // Extract sign bit - uint16_t sign = f32.i >> 31; - - // Extract FP32 exponent and mantissa - uint32_t f_exp = (f32.i >> (CL_FLT_MANT_DIG - 1)) & 0xFF; - uint32_t f_mant = f32.i & ((1 << (CL_FLT_MANT_DIG - 1)) - 1); - - // Remove FP32 exponent bias - int32_t exp = f_exp - CL_FLT_MAX_EXP + 1; - - // Add FP16 exponent bias - uint16_t h_exp = (uint16_t)(exp + CL_HALF_MAX_EXP - 1); - - // Position of the bit that will become the FP16 mantissa LSB - uint32_t lsb_pos = CL_FLT_MANT_DIG - CL_HALF_MANT_DIG; - - // Check for NaN / infinity - if (f_exp == 0xFF) - { - if (f_mant) - { - // NaN -> propagate mantissa and silence it - uint16_t h_mant = (uint16_t)(f_mant >> lsb_pos); - h_mant |= 0x200; - return (sign << 15) | CL_HALF_EXP_MASK | h_mant; - } - else - { - // Infinity -> zero mantissa - return (sign << 15) | CL_HALF_EXP_MASK; - } - } - - // Check for zero - if (!f_exp && !f_mant) - { - return (sign << 15); - } - - // Check for overflow - if (exp >= CL_HALF_MAX_EXP) - { - return cl_half_handle_overflow(rounding_mode, sign); - } - - // Check for underflow - if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1)) - { - return cl_half_handle_underflow(rounding_mode, sign); - } - - // Check for value that will become denormal - if (exp < -14) - { - // Denormal -> include the implicit 1 from the FP32 mantissa - h_exp = 0; - f_mant |= 1 << (CL_FLT_MANT_DIG - 1); - - // Mantissa shift amount depends on exponent - lsb_pos = -exp + (CL_FLT_MANT_DIG - 25); - } - - // Generate FP16 mantissa by shifting FP32 mantissa - uint16_t h_mant = (uint16_t)(f_mant >> lsb_pos); - - // Check whether we need to round - uint32_t halfway = 1 << (lsb_pos - 1); - uint32_t mask = (halfway << 1) - 1; - switch (rounding_mode) - { - case CL_HALF_RTE: - if ((f_mant & mask) > halfway) - { - // More than halfway -> round up - h_mant += 1; - } - else if ((f_mant & mask) == halfway) - { - // Exactly halfway -> round to nearest even - if (h_mant & 0x1) - h_mant += 1; - } - break; - case CL_HALF_RTZ: - // Mantissa has already been truncated -> do nothing - break; - case CL_HALF_RTP: - if ((f_mant & mask) && !sign) - { - // Round positive numbers up - h_mant += 1; - } - break; - case CL_HALF_RTN: - if ((f_mant & mask) && sign) - { - // Round negative numbers down - h_mant += 1; - } - break; - } - - // Check for mantissa overflow - if (h_mant & 0x400) - { - h_exp += 1; - h_mant = 0; - } - - return (sign << 15) | (h_exp << 10) | h_mant; -} - - -/** - * Convert a cl_double to a cl_half. - */ -static inline cl_half cl_half_from_double(cl_double d, cl_half_rounding_mode rounding_mode) -{ - // Type-punning to get direct access to underlying bits - union - { - cl_double d; - uint64_t i; - } f64; - f64.d = d; - - // Extract sign bit - uint16_t sign = f64.i >> 63; - - // Extract FP64 exponent and mantissa - uint64_t d_exp = (f64.i >> (CL_DBL_MANT_DIG - 1)) & 0x7FF; - uint64_t d_mant = f64.i & (((uint64_t)1 << (CL_DBL_MANT_DIG - 1)) - 1); - - // Remove FP64 exponent bias - int64_t exp = d_exp - CL_DBL_MAX_EXP + 1; - - // Add FP16 exponent bias - uint16_t h_exp = (uint16_t)(exp + CL_HALF_MAX_EXP - 1); - - // Position of the bit that will become the FP16 mantissa LSB - uint32_t lsb_pos = CL_DBL_MANT_DIG - CL_HALF_MANT_DIG; - - // Check for NaN / infinity - if (d_exp == 0x7FF) - { - if (d_mant) - { - // NaN -> propagate mantissa and silence it - uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos); - h_mant |= 0x200; - return (sign << 15) | CL_HALF_EXP_MASK | h_mant; - } - else - { - // Infinity -> zero mantissa - return (sign << 15) | CL_HALF_EXP_MASK; - } - } - - // Check for zero - if (!d_exp && !d_mant) - { - return (sign << 15); - } - - // Check for overflow - if (exp >= CL_HALF_MAX_EXP) - { - return cl_half_handle_overflow(rounding_mode, sign); - } - - // Check for underflow - if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1)) - { - return cl_half_handle_underflow(rounding_mode, sign); - } - - // Check for value that will become denormal - if (exp < -14) - { - // Include the implicit 1 from the FP64 mantissa - h_exp = 0; - d_mant |= (uint64_t)1 << (CL_DBL_MANT_DIG - 1); - - // Mantissa shift amount depends on exponent - lsb_pos = (uint32_t)(-exp + (CL_DBL_MANT_DIG - 25)); - } - - // Generate FP16 mantissa by shifting FP64 mantissa - uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos); - - // Check whether we need to round - uint64_t halfway = (uint64_t)1 << (lsb_pos - 1); - uint64_t mask = (halfway << 1) - 1; - switch (rounding_mode) - { - case CL_HALF_RTE: - if ((d_mant & mask) > halfway) - { - // More than halfway -> round up - h_mant += 1; - } - else if ((d_mant & mask) == halfway) - { - // Exactly halfway -> round to nearest even - if (h_mant & 0x1) - h_mant += 1; - } - break; - case CL_HALF_RTZ: - // Mantissa has already been truncated -> do nothing - break; - case CL_HALF_RTP: - if ((d_mant & mask) && !sign) - { - // Round positive numbers up - h_mant += 1; - } - break; - case CL_HALF_RTN: - if ((d_mant & mask) && sign) - { - // Round negative numbers down - h_mant += 1; - } - break; - } - - // Check for mantissa overflow - if (h_mant & 0x400) - { - h_exp += 1; - h_mant = 0; - } - - return (sign << 15) | (h_exp << 10) | h_mant; -} - - -/** - * Convert a cl_half to a cl_float. - */ -static inline cl_float cl_half_to_float(cl_half h) -{ - // Type-punning to get direct access to underlying bits - union - { - cl_float f; - uint32_t i; - } f32; - - // Extract sign bit - uint16_t sign = h >> 15; - - // Extract FP16 exponent and mantissa - uint16_t h_exp = (h >> (CL_HALF_MANT_DIG - 1)) & 0x1F; - uint16_t h_mant = h & 0x3FF; - - // Remove FP16 exponent bias - int32_t exp = h_exp - CL_HALF_MAX_EXP + 1; - - // Add FP32 exponent bias - uint32_t f_exp = exp + CL_FLT_MAX_EXP - 1; - - // Check for NaN / infinity - if (h_exp == 0x1F) - { - if (h_mant) - { - // NaN -> propagate mantissa and silence it - uint32_t f_mant = h_mant << (CL_FLT_MANT_DIG - CL_HALF_MANT_DIG); - f_mant |= 0x400000; - f32.i = (sign << 31) | 0x7F800000 | f_mant; - return f32.f; - } - else - { - // Infinity -> zero mantissa - f32.i = (sign << 31) | 0x7F800000; - return f32.f; - } - } - - // Check for zero / denormal - if (h_exp == 0) - { - if (h_mant == 0) - { - // Zero -> zero exponent - f_exp = 0; - } - else - { - // Denormal -> normalize it - // - Shift mantissa to make most-significant 1 implicit - // - Adjust exponent accordingly - uint32_t shift = 0; - while ((h_mant & 0x400) == 0) - { - h_mant <<= 1; - shift++; - } - h_mant &= 0x3FF; - f_exp -= shift - 1; - } - } - - f32.i = (sign << 31) | (f_exp << 23) | (h_mant << 13); - return f32.f; -} - - -#undef CL_HALF_EXP_MASK -#undef CL_HALF_MAX_FINITE_MAG - - -#ifdef __cplusplus -} -#endif - - -#endif /* OPENCL_CL_HALF_H */ diff --git a/include/CL/cl_icd.h b/include/CL/cl_icd.h deleted file mode 100644 index 360b87030..000000000 --- a/include/CL/cl_icd.h +++ /dev/null @@ -1,1294 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2019-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef OPENCL_CL_ICD_H -#define OPENCL_CL_ICD_H - -#include -#include -#include -#include - -#if defined(_WIN32) -#include -#include -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * This file contains pointer type definitions for each of the CL API calls as - * well as a type definition for the dispatch table used by the Khronos ICD - * loader (see cl_khr_icd extension specification for background). - */ - -/* API function pointer definitions */ - -// Platform APIs -typedef cl_int(CL_API_CALL *cl_api_clGetPlatformIDs)( - cl_uint num_entries, cl_platform_id *platforms, - cl_uint *num_platforms) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clGetPlatformInfo)( - cl_platform_id platform, cl_platform_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -// Device APIs -typedef cl_int(CL_API_CALL *cl_api_clGetDeviceIDs)( - cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, - cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clGetDeviceInfo)( - cl_device_id device, cl_device_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -typedef cl_int(CL_API_CALL *cl_api_clCreateSubDevices)( - cl_device_id in_device, - const cl_device_partition_property *partition_properties, - cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices); - -typedef cl_int(CL_API_CALL *cl_api_clRetainDevice)( - cl_device_id device) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int(CL_API_CALL *cl_api_clReleaseDevice)( - cl_device_id device) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clCreateSubDevices; -typedef void *cl_api_clRetainDevice; -typedef void *cl_api_clReleaseDevice; - -#endif - -// Context APIs -typedef cl_context(CL_API_CALL *cl_api_clCreateContext)( - const cl_context_properties *properties, cl_uint num_devices, - const cl_device_id *devices, - void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), - void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_context(CL_API_CALL *cl_api_clCreateContextFromType)( - const cl_context_properties *properties, cl_device_type device_type, - void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), - void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clRetainContext)( - cl_context context) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clReleaseContext)( - cl_context context) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clGetContextInfo)( - cl_context context, cl_context_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -// Command Queue APIs -typedef cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueue)( - cl_context context, cl_device_id device, - cl_command_queue_properties properties, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_2_0 - -typedef -cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueueWithProperties)( - cl_context /* context */, cl_device_id /* device */, - const cl_queue_properties * /* properties */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; - -#else - -typedef void *cl_api_clCreateCommandQueueWithProperties; - -#endif - -typedef cl_int(CL_API_CALL *cl_api_clRetainCommandQueue)( - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clReleaseCommandQueue)( - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clGetCommandQueueInfo)( - cl_command_queue command_queue, cl_command_queue_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -// Memory Object APIs -typedef cl_mem(CL_API_CALL *cl_api_clCreateBuffer)( - cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -typedef cl_mem(CL_API_CALL *cl_api_clCreateImage)( - cl_context context, cl_mem_flags flags, const cl_image_format *image_format, - const cl_image_desc *image_desc, void *host_ptr, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clCreateImage; - -#endif - -#ifdef CL_VERSION_3_0 - -typedef cl_mem(CL_API_CALL *cl_api_clCreateBufferWithProperties)( - cl_context context, const cl_mem_properties *properties, cl_mem_flags flags, - size_t size, void *host_ptr, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_3_0; - -typedef cl_mem(CL_API_CALL *cl_api_clCreateImageWithProperties)( - cl_context context, const cl_mem_properties *properties, cl_mem_flags flags, - const cl_image_format *image_format, const cl_image_desc *image_desc, - void *host_ptr, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_3_0; - -typedef cl_int(CL_API_CALL* cl_api_clSetContextDestructorCallback)( - cl_context context, - void(CL_CALLBACK* pfn_notify)(cl_context context, void* user_data), - void* user_data) CL_API_SUFFIX__VERSION_3_0; - -#else - -typedef void *cl_api_clCreateBufferWithProperties; -typedef void *cl_api_clCreateImageWithProperties; -typedef void *cl_api_clSetContextDestructorCallback; - -#endif - -typedef cl_int(CL_API_CALL *cl_api_clRetainMemObject)( - cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clReleaseMemObject)( - cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clGetSupportedImageFormats)( - cl_context context, cl_mem_flags flags, cl_mem_object_type image_type, - cl_uint num_entries, cl_image_format *image_formats, - cl_uint *num_image_formats) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clGetMemObjectInfo)( - cl_mem memobj, cl_mem_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clGetImageInfo)( - cl_mem image, cl_image_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_2_0 - -typedef cl_mem(CL_API_CALL *cl_api_clCreatePipe)( - cl_context /* context */, cl_mem_flags /* flags */, - cl_uint /* pipe_packet_size */, cl_uint /* pipe_max_packets */, - const cl_pipe_properties * /* properties */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; - -typedef cl_int(CL_API_CALL *cl_api_clGetPipeInfo)( - cl_mem /* pipe */, cl_pipe_info /* param_name */, - size_t /* param_value_size */, void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0; - -typedef void *(CL_API_CALL *cl_api_clSVMAlloc)( - cl_context /* context */, cl_svm_mem_flags /* flags */, size_t /* size */, - unsigned int /* alignment */)CL_API_SUFFIX__VERSION_2_0; - -typedef void(CL_API_CALL *cl_api_clSVMFree)( - cl_context /* context */, - void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0; - -#else - -typedef void *cl_api_clCreatePipe; -typedef void *cl_api_clGetPipeInfo; -typedef void *cl_api_clSVMAlloc; -typedef void *cl_api_clSVMFree; - -#endif - -// Sampler APIs -typedef cl_sampler(CL_API_CALL *cl_api_clCreateSampler)( - cl_context context, cl_bool normalized_coords, - cl_addressing_mode addressing_mode, cl_filter_mode filter_mode, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clRetainSampler)( - cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clReleaseSampler)( - cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clGetSamplerInfo)( - cl_sampler sampler, cl_sampler_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_2_0 - -typedef -cl_sampler(CL_API_CALL *cl_api_clCreateSamplerWithProperties)( - cl_context /* context */, - const cl_sampler_properties * /* sampler_properties */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; - -#else - -typedef void *cl_api_clCreateSamplerWithProperties; - -#endif - -// Program Object APIs -typedef cl_program(CL_API_CALL *cl_api_clCreateProgramWithSource)( - cl_context context, cl_uint count, const char **strings, - const size_t *lengths, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_program(CL_API_CALL *cl_api_clCreateProgramWithBinary)( - cl_context context, cl_uint num_devices, const cl_device_id *device_list, - const size_t *lengths, const unsigned char **binaries, - cl_int *binary_status, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -typedef -cl_program(CL_API_CALL *cl_api_clCreateProgramWithBuiltInKernels)( - cl_context context, cl_uint num_devices, const cl_device_id *device_list, - const char *kernel_names, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clCreateProgramWithBuiltInKernels; - -#endif - -typedef cl_int(CL_API_CALL *cl_api_clRetainProgram)( - cl_program program) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clReleaseProgram)( - cl_program program) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clBuildProgram)( - cl_program program, cl_uint num_devices, const cl_device_id *device_list, - const char *options, - void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), - void *user_data) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -typedef cl_int(CL_API_CALL *cl_api_clCompileProgram)( - cl_program program, cl_uint num_devices, const cl_device_id *device_list, - const char *options, cl_uint num_input_headers, - const cl_program *input_headers, const char **header_include_names, - void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), - void *user_data) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_program(CL_API_CALL *cl_api_clLinkProgram)( - cl_context context, cl_uint num_devices, const cl_device_id *device_list, - const char *options, cl_uint num_input_programs, - const cl_program *input_programs, - void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), - void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clCompileProgram; -typedef void *cl_api_clLinkProgram; - -#endif - -#ifdef CL_VERSION_2_2 - -typedef -cl_int(CL_API_CALL *cl_api_clSetProgramSpecializationConstant)( - cl_program program, cl_uint spec_id, size_t spec_size, - const void *spec_value) CL_API_SUFFIX__VERSION_2_2; - -typedef cl_int(CL_API_CALL *cl_api_clSetProgramReleaseCallback)( - cl_program program, - void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), - void *user_data) CL_API_SUFFIX__VERSION_2_2; - -#else - -typedef void *cl_api_clSetProgramSpecializationConstant; -typedef void *cl_api_clSetProgramReleaseCallback; - -#endif - -#ifdef CL_VERSION_1_2 - -typedef cl_int(CL_API_CALL *cl_api_clUnloadPlatformCompiler)( - cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clUnloadPlatformCompiler; - -#endif - -typedef cl_int(CL_API_CALL *cl_api_clGetProgramInfo)( - cl_program program, cl_program_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clGetProgramBuildInfo)( - cl_program program, cl_device_id device, cl_program_build_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -// Kernel Object APIs -typedef cl_kernel(CL_API_CALL *cl_api_clCreateKernel)( - cl_program program, const char *kernel_name, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clCreateKernelsInProgram)( - cl_program program, cl_uint num_kernels, cl_kernel *kernels, - cl_uint *num_kernels_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clRetainKernel)( - cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clReleaseKernel)( - cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clSetKernelArg)( - cl_kernel kernel, cl_uint arg_index, size_t arg_size, - const void *arg_value) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clGetKernelInfo)( - cl_kernel kernel, cl_kernel_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -typedef cl_int(CL_API_CALL *cl_api_clGetKernelArgInfo)( - cl_kernel kernel, cl_uint arg_indx, cl_kernel_arg_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clGetKernelArgInfo; - -#endif - -typedef cl_int(CL_API_CALL *cl_api_clGetKernelWorkGroupInfo)( - cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_2_0 - -typedef cl_int(CL_API_CALL *cl_api_clSetKernelArgSVMPointer)( - cl_kernel /* kernel */, cl_uint /* arg_index */, - const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0; - -typedef cl_int(CL_API_CALL *cl_api_clSetKernelExecInfo)( - cl_kernel /* kernel */, cl_kernel_exec_info /* param_name */, - size_t /* param_value_size */, - const void * /* param_value */) CL_API_SUFFIX__VERSION_2_0; - -typedef cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfoKHR)( - cl_kernel /* in_kernel */, cl_device_id /*in_device*/, - cl_kernel_sub_group_info /* param_name */, size_t /*input_value_size*/, - const void * /*input_value*/, size_t /*param_value_size*/, - void * /*param_value*/, - size_t * /*param_value_size_ret*/) CL_API_SUFFIX__VERSION_2_0; - -#else - -typedef void *cl_api_clSetKernelArgSVMPointer; -typedef void *cl_api_clSetKernelExecInfo; -typedef void *cl_api_clGetKernelSubGroupInfoKHR; - -#endif - -// Event Object APIs -typedef cl_int(CL_API_CALL *cl_api_clWaitForEvents)( - cl_uint num_events, const cl_event *event_list) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clGetEventInfo)( - cl_event event, cl_event_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clRetainEvent)(cl_event event) - CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clReleaseEvent)(cl_event event) - CL_API_SUFFIX__VERSION_1_0; - -// Profiling APIs -typedef cl_int(CL_API_CALL *cl_api_clGetEventProfilingInfo)( - cl_event event, cl_profiling_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -// Flush and Finish APIs -typedef cl_int(CL_API_CALL *cl_api_clFlush)( - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clFinish)( - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; - -// Enqueued Commands APIs -typedef cl_int(CL_API_CALL *cl_api_clEnqueueReadBuffer)( - cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, - size_t offset, size_t cb, void *ptr, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueReadBufferRect)( - cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, - const size_t *buffer_origin, const size_t *host_origin, - const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, - size_t host_row_pitch, size_t host_slice_pitch, void *ptr, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_1; - -#else - -typedef void *cl_api_clEnqueueReadBufferRect; - -#endif - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueWriteBuffer)( - cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, - size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueWriteBufferRect)( - cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, - const size_t *buffer_origin, const size_t *host_origin, - const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, - size_t host_row_pitch, size_t host_slice_pitch, const void *ptr, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_1; - -#else - -typedef void *cl_api_clEnqueueWriteBufferRect; - -#endif - -#ifdef CL_VERSION_1_2 - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueFillBuffer)( - cl_command_queue command_queue, cl_mem buffer, const void *pattern, - size_t pattern_size, size_t offset, size_t cb, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clEnqueueFillBuffer; - -#endif - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueCopyBuffer)( - cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, - size_t src_offset, size_t dst_offset, size_t cb, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_1 - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferRect)( - cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, - const size_t *src_origin, const size_t *dst_origin, const size_t *region, - size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, - size_t dst_slice_pitch, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_1; - -#else - -typedef void *cl_api_clEnqueueCopyBufferRect; - -#endif - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueReadImage)( - cl_command_queue command_queue, cl_mem image, cl_bool blocking_read, - const size_t *origin, const size_t *region, size_t row_pitch, - size_t slice_pitch, void *ptr, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueWriteImage)( - cl_command_queue command_queue, cl_mem image, cl_bool blocking_write, - const size_t *origin, const size_t *region, size_t input_row_pitch, - size_t input_slice_pitch, const void *ptr, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueFillImage)( - cl_command_queue command_queue, cl_mem image, const void *fill_color, - const size_t origin[3], const size_t region[3], - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clEnqueueFillImage; - -#endif - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueCopyImage)( - cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image, - const size_t *src_origin, const size_t *dst_origin, const size_t *region, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueCopyImageToBuffer)( - cl_command_queue command_queue, cl_mem src_image, cl_mem dst_buffer, - const size_t *src_origin, const size_t *region, size_t dst_offset, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferToImage)( - cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image, - size_t src_offset, const size_t *dst_origin, const size_t *region, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -typedef void *(CL_API_CALL *cl_api_clEnqueueMapBuffer)( - cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, - cl_map_flags map_flags, size_t offset, size_t cb, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0; - -typedef void *(CL_API_CALL *cl_api_clEnqueueMapImage)( - cl_command_queue command_queue, cl_mem image, cl_bool blocking_map, - cl_map_flags map_flags, const size_t *origin, const size_t *region, - size_t *image_row_pitch, size_t *image_slice_pitch, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueUnmapMemObject)( - cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueMigrateMemObjects)( - cl_command_queue command_queue, cl_uint num_mem_objects, - const cl_mem *mem_objects, cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clEnqueueMigrateMemObjects; - -#endif - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueNDRangeKernel)( - cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, - const size_t *global_work_offset, const size_t *global_work_size, - const size_t *local_work_size, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueTask)( - cl_command_queue command_queue, cl_kernel kernel, - cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueNativeKernel)( - cl_command_queue command_queue, void(CL_CALLBACK *user_func)(void *), - void *args, size_t cb_args, cl_uint num_mem_objects, const cl_mem *mem_list, - const void **args_mem_loc, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef CL_VERSION_1_2 - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueMarkerWithWaitList)( - cl_command_queue command_queue, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueBarrierWithWaitList)( - cl_command_queue command_queue, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -typedef void *( - CL_API_CALL *cl_api_clGetExtensionFunctionAddressForPlatform)( - cl_platform_id platform, - const char *function_name)CL_API_SUFFIX__VERSION_1_2; - -#else - -typedef void *cl_api_clEnqueueMarkerWithWaitList; -typedef void *cl_api_clEnqueueBarrierWithWaitList; -typedef void *cl_api_clGetExtensionFunctionAddressForPlatform; - -#endif - -// Shared Virtual Memory APIs - -#ifdef CL_VERSION_2_0 - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueSVMFree)( - cl_command_queue /* command_queue */, cl_uint /* num_svm_pointers */, - void ** /* svm_pointers */, - void(CL_CALLBACK *pfn_free_func)(cl_command_queue /* queue */, - cl_uint /* num_svm_pointers */, - void ** /* svm_pointers[] */, - void * /* user_data */), - void * /* user_data */, cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemcpy)( - cl_command_queue /* command_queue */, cl_bool /* blocking_copy */, - void * /* dst_ptr */, const void * /* src_ptr */, size_t /* size */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemFill)( - cl_command_queue /* command_queue */, void * /* svm_ptr */, - const void * /* pattern */, size_t /* pattern_size */, size_t /* size */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueSVMMap)( - cl_command_queue /* command_queue */, cl_bool /* blocking_map */, - cl_map_flags /* map_flags */, void * /* svm_ptr */, size_t /* size */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueSVMUnmap)( - cl_command_queue /* command_queue */, void * /* svm_ptr */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; - -#else - -typedef void *cl_api_clEnqueueSVMFree; -typedef void *cl_api_clEnqueueSVMMemcpy; -typedef void *cl_api_clEnqueueSVMMemFill; -typedef void *cl_api_clEnqueueSVMMap; -typedef void *cl_api_clEnqueueSVMUnmap; - -#endif - -// Deprecated APIs -typedef cl_int(CL_API_CALL *cl_api_clSetCommandQueueProperty)( - cl_command_queue command_queue, cl_command_queue_properties properties, - cl_bool enable, cl_command_queue_properties *old_properties) - CL_API_SUFFIX__VERSION_1_0_DEPRECATED; - -typedef cl_mem(CL_API_CALL *cl_api_clCreateImage2D)( - cl_context context, cl_mem_flags flags, const cl_image_format *image_format, - size_t image_width, size_t image_height, size_t image_row_pitch, - void *host_ptr, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -typedef cl_mem(CL_API_CALL *cl_api_clCreateImage3D)( - cl_context context, cl_mem_flags flags, const cl_image_format *image_format, - size_t image_width, size_t image_height, size_t image_depth, - size_t image_row_pitch, size_t image_slice_pitch, void *host_ptr, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -typedef cl_int(CL_API_CALL *cl_api_clUnloadCompiler)(void) - CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueMarker)( - cl_command_queue command_queue, - cl_event *event) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueWaitForEvents)( - cl_command_queue command_queue, cl_uint num_events, - const cl_event *event_list) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueBarrier)( - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -typedef void *(CL_API_CALL *cl_api_clGetExtensionFunctionAddress)( - const char *function_name)CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -// GL and other APIs -typedef cl_mem(CL_API_CALL *cl_api_clCreateFromGLBuffer)( - cl_context context, cl_mem_flags flags, cl_GLuint bufobj, - int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture)( - cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, - cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture2D)( - cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, - cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture3D)( - cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, - cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_mem(CL_API_CALL *cl_api_clCreateFromGLRenderbuffer)( - cl_context context, cl_mem_flags flags, cl_GLuint renderbuffer, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clGetGLObjectInfo)( - cl_mem memobj, cl_gl_object_type *gl_object_type, - cl_GLuint *gl_object_name) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clGetGLTextureInfo)( - cl_mem memobj, cl_gl_texture_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueAcquireGLObjects)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueReleaseGLObjects)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -/* cl_khr_gl_sharing */ -typedef cl_int(CL_API_CALL *cl_api_clGetGLContextInfoKHR)( - const cl_context_properties *properties, cl_gl_context_info param_name, - size_t param_value_size, void *param_value, size_t *param_value_size_ret); - -/* cl_khr_gl_event */ -typedef cl_event(CL_API_CALL *cl_api_clCreateEventFromGLsyncKHR)( - cl_context context, cl_GLsync sync, cl_int *errcode_ret); - -#if defined(_WIN32) - -/* cl_khr_d3d10_sharing */ - -typedef cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D10KHR)( - cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source, - void *d3d_object, cl_d3d10_device_set_khr d3d_device_set, - cl_uint num_entries, cl_device_id *devices, - cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10BufferKHR)( - cl_context context, cl_mem_flags flags, ID3D10Buffer *resource, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture2DKHR)( - cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource, - UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture3DKHR)( - cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource, - UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef -cl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D10ObjectsKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -typedef -cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D10ObjectsKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromD3D10KHR( - cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source, - void *d3d_object, cl_d3d10_device_set_khr d3d_device_set, - cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices); - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromD3D10BufferKHR(cl_context context, cl_mem_flags flags, - ID3D10Buffer *resource, cl_int *errcode_ret); - -extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10Texture2DKHR( - cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource, - UINT subresource, cl_int *errcode_ret); - -extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10Texture3DKHR( - cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource, - UINT subresource, cl_int *errcode_ret); - -extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireD3D10ObjectsKHR( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); - -extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D10ObjectsKHR( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); - -/* cl_khr_d3d11_sharing */ -typedef cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D11KHR)( - cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source, - void *d3d_object, cl_d3d11_device_set_khr d3d_device_set, - cl_uint num_entries, cl_device_id *devices, - cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11BufferKHR)( - cl_context context, cl_mem_flags flags, ID3D11Buffer *resource, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture2DKHR)( - cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource, - UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture3DKHR)( - cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource, - UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef -cl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D11ObjectsKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -typedef -cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D11ObjectsKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -/* cl_khr_dx9_media_sharing */ -typedef -cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR)( - cl_platform_id platform, cl_uint num_media_adapters, - cl_dx9_media_adapter_type_khr *media_adapters_type, void *media_adapters, - cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries, - cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_mem(CL_API_CALL *cl_api_clCreateFromDX9MediaSurfaceKHR)( - cl_context context, cl_mem_flags flags, - cl_dx9_media_adapter_type_khr adapter_type, void *surface_info, - cl_uint plane, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef -cl_int(CL_API_CALL *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -typedef -cl_int(CL_API_CALL *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -/* cl_khr_d3d11_sharing */ -extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromD3D11KHR( - cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source, - void *d3d_object, cl_d3d11_device_set_khr d3d_device_set, - cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices); - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromD3D11BufferKHR(cl_context context, cl_mem_flags flags, - ID3D11Buffer *resource, cl_int *errcode_ret); - -extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11Texture2DKHR( - cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource, - UINT subresource, cl_int *errcode_ret); - -extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11Texture3DKHR( - cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource, - UINT subresource, cl_int *errcode_ret); - -extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireD3D11ObjectsKHR( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); - -extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D11ObjectsKHR( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); - -/* cl_khr_dx9_media_sharing */ -extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromDX9MediaAdapterKHR( - cl_platform_id platform, cl_uint num_media_adapters, - cl_dx9_media_adapter_type_khr *media_adapter_type, void *media_adapters, - cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries, - cl_device_id *devices, cl_uint *num_devices); - -extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromDX9MediaSurfaceKHR( - cl_context context, cl_mem_flags flags, - cl_dx9_media_adapter_type_khr adapter_type, void *surface_info, - cl_uint plane, cl_int *errcode_ret); - -extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireDX9MediaSurfacesKHR( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); - -extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseDX9MediaSurfacesKHR( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); - -#else - -/* cl_khr_d3d10_sharing */ -typedef void *cl_api_clGetDeviceIDsFromD3D10KHR; -typedef void *cl_api_clCreateFromD3D10BufferKHR; -typedef void *cl_api_clCreateFromD3D10Texture2DKHR; -typedef void *cl_api_clCreateFromD3D10Texture3DKHR; -typedef void *cl_api_clEnqueueAcquireD3D10ObjectsKHR; -typedef void *cl_api_clEnqueueReleaseD3D10ObjectsKHR; - -/* cl_khr_d3d11_sharing */ -typedef void *cl_api_clGetDeviceIDsFromD3D11KHR; -typedef void *cl_api_clCreateFromD3D11BufferKHR; -typedef void *cl_api_clCreateFromD3D11Texture2DKHR; -typedef void *cl_api_clCreateFromD3D11Texture3DKHR; -typedef void *cl_api_clEnqueueAcquireD3D11ObjectsKHR; -typedef void *cl_api_clEnqueueReleaseD3D11ObjectsKHR; - -/* cl_khr_dx9_media_sharing */ -typedef void *cl_api_clCreateFromDX9MediaSurfaceKHR; -typedef void *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR; -typedef void *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR; -typedef void *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR; - -#endif - -/* OpenCL 1.1 */ - -#ifdef CL_VERSION_1_1 - -typedef cl_int(CL_API_CALL *cl_api_clSetEventCallback)( - cl_event /* event */, cl_int /* command_exec_callback_type */, - void(CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), - void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; - -typedef cl_mem(CL_API_CALL *cl_api_clCreateSubBuffer)( - cl_mem /* buffer */, cl_mem_flags /* flags */, - cl_buffer_create_type /* buffer_create_type */, - const void * /* buffer_create_info */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; - -typedef -cl_int(CL_API_CALL *cl_api_clSetMemObjectDestructorCallback)( - cl_mem /* memobj */, - void(CL_CALLBACK * /*pfn_notify*/)(cl_mem /* memobj */, - void * /*user_data*/), - void * /*user_data */) CL_API_SUFFIX__VERSION_1_1; - -typedef cl_event(CL_API_CALL *cl_api_clCreateUserEvent)( - cl_context /* context */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; - -typedef cl_int(CL_API_CALL *cl_api_clSetUserEventStatus)( - cl_event /* event */, - cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; - -#else - -typedef void *cl_api_clSetEventCallback; -typedef void *cl_api_clCreateSubBuffer; -typedef void *cl_api_clSetMemObjectDestructorCallback; -typedef void *cl_api_clCreateUserEvent; -typedef void *cl_api_clSetUserEventStatus; - -#endif - -typedef cl_int(CL_API_CALL *cl_api_clCreateSubDevicesEXT)( - cl_device_id in_device, - const cl_device_partition_property_ext *partition_properties, - cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices); - -typedef cl_int(CL_API_CALL *cl_api_clRetainDeviceEXT)( - cl_device_id device) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int(CL_API_CALL *cl_api_clReleaseDeviceEXT)( - cl_device_id device) CL_API_SUFFIX__VERSION_1_0; - -/* cl_khr_egl_image */ -typedef cl_mem(CL_API_CALL *cl_api_clCreateFromEGLImageKHR)( - cl_context context, CLeglDisplayKHR display, CLeglImageKHR image, - cl_mem_flags flags, const cl_egl_image_properties_khr *properties, - cl_int *errcode_ret); - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueAcquireEGLObjectsKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueReleaseEGLObjectsKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem *mem_objects, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); - -/* cl_khr_egl_event */ -typedef cl_event(CL_API_CALL *cl_api_clCreateEventFromEGLSyncKHR)( - cl_context context, CLeglSyncKHR sync, CLeglDisplayKHR display, - cl_int *errcode_ret); - -#ifdef CL_VERSION_2_1 - -typedef cl_int(CL_API_CALL *cl_api_clSetDefaultDeviceCommandQueue)( - cl_context context, cl_device_id device, - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_2_1; - -typedef cl_program(CL_API_CALL *cl_api_clCreateProgramWithIL)( - cl_context context, const void *il, size_t length, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1; - -typedef cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfo)( - cl_kernel kernel, cl_device_id device, cl_kernel_sub_group_info param_name, - size_t input_value_size, const void *input_value, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_2_1; - -typedef cl_kernel(CL_API_CALL *cl_api_clCloneKernel)( - cl_kernel source_kernel, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1; - -typedef cl_int(CL_API_CALL *cl_api_clEnqueueSVMMigrateMem)( - cl_command_queue command_queue, cl_uint num_svm_pointers, - const void **svm_pointers, const size_t *sizes, - cl_mem_migration_flags flags, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_2_1; - -typedef cl_int(CL_API_CALL *cl_api_clGetDeviceAndHostTimer)( - cl_device_id device, cl_ulong *device_timestamp, - cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1; - -typedef cl_int(CL_API_CALL *cl_api_clGetHostTimer)( - cl_device_id device, cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1; - -#else - -typedef void *cl_api_clSetDefaultDeviceCommandQueue; -typedef void *cl_api_clCreateProgramWithIL; -typedef void *cl_api_clGetKernelSubGroupInfo; -typedef void *cl_api_clCloneKernel; -typedef void *cl_api_clEnqueueSVMMigrateMem; -typedef void *cl_api_clGetDeviceAndHostTimer; -typedef void *cl_api_clGetHostTimer; - -#endif - -/* Vendor dispatch table structure */ - -typedef struct _cl_icd_dispatch { - /* OpenCL 1.0 */ - cl_api_clGetPlatformIDs clGetPlatformIDs; - cl_api_clGetPlatformInfo clGetPlatformInfo; - cl_api_clGetDeviceIDs clGetDeviceIDs; - cl_api_clGetDeviceInfo clGetDeviceInfo; - cl_api_clCreateContext clCreateContext; - cl_api_clCreateContextFromType clCreateContextFromType; - cl_api_clRetainContext clRetainContext; - cl_api_clReleaseContext clReleaseContext; - cl_api_clGetContextInfo clGetContextInfo; - cl_api_clCreateCommandQueue clCreateCommandQueue; - cl_api_clRetainCommandQueue clRetainCommandQueue; - cl_api_clReleaseCommandQueue clReleaseCommandQueue; - cl_api_clGetCommandQueueInfo clGetCommandQueueInfo; - cl_api_clSetCommandQueueProperty clSetCommandQueueProperty; - cl_api_clCreateBuffer clCreateBuffer; - cl_api_clCreateImage2D clCreateImage2D; - cl_api_clCreateImage3D clCreateImage3D; - cl_api_clRetainMemObject clRetainMemObject; - cl_api_clReleaseMemObject clReleaseMemObject; - cl_api_clGetSupportedImageFormats clGetSupportedImageFormats; - cl_api_clGetMemObjectInfo clGetMemObjectInfo; - cl_api_clGetImageInfo clGetImageInfo; - cl_api_clCreateSampler clCreateSampler; - cl_api_clRetainSampler clRetainSampler; - cl_api_clReleaseSampler clReleaseSampler; - cl_api_clGetSamplerInfo clGetSamplerInfo; - cl_api_clCreateProgramWithSource clCreateProgramWithSource; - cl_api_clCreateProgramWithBinary clCreateProgramWithBinary; - cl_api_clRetainProgram clRetainProgram; - cl_api_clReleaseProgram clReleaseProgram; - cl_api_clBuildProgram clBuildProgram; - cl_api_clUnloadCompiler clUnloadCompiler; - cl_api_clGetProgramInfo clGetProgramInfo; - cl_api_clGetProgramBuildInfo clGetProgramBuildInfo; - cl_api_clCreateKernel clCreateKernel; - cl_api_clCreateKernelsInProgram clCreateKernelsInProgram; - cl_api_clRetainKernel clRetainKernel; - cl_api_clReleaseKernel clReleaseKernel; - cl_api_clSetKernelArg clSetKernelArg; - cl_api_clGetKernelInfo clGetKernelInfo; - cl_api_clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo; - cl_api_clWaitForEvents clWaitForEvents; - cl_api_clGetEventInfo clGetEventInfo; - cl_api_clRetainEvent clRetainEvent; - cl_api_clReleaseEvent clReleaseEvent; - cl_api_clGetEventProfilingInfo clGetEventProfilingInfo; - cl_api_clFlush clFlush; - cl_api_clFinish clFinish; - cl_api_clEnqueueReadBuffer clEnqueueReadBuffer; - cl_api_clEnqueueWriteBuffer clEnqueueWriteBuffer; - cl_api_clEnqueueCopyBuffer clEnqueueCopyBuffer; - cl_api_clEnqueueReadImage clEnqueueReadImage; - cl_api_clEnqueueWriteImage clEnqueueWriteImage; - cl_api_clEnqueueCopyImage clEnqueueCopyImage; - cl_api_clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer; - cl_api_clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage; - cl_api_clEnqueueMapBuffer clEnqueueMapBuffer; - cl_api_clEnqueueMapImage clEnqueueMapImage; - cl_api_clEnqueueUnmapMemObject clEnqueueUnmapMemObject; - cl_api_clEnqueueNDRangeKernel clEnqueueNDRangeKernel; - cl_api_clEnqueueTask clEnqueueTask; - cl_api_clEnqueueNativeKernel clEnqueueNativeKernel; - cl_api_clEnqueueMarker clEnqueueMarker; - cl_api_clEnqueueWaitForEvents clEnqueueWaitForEvents; - cl_api_clEnqueueBarrier clEnqueueBarrier; - cl_api_clGetExtensionFunctionAddress clGetExtensionFunctionAddress; - cl_api_clCreateFromGLBuffer clCreateFromGLBuffer; - cl_api_clCreateFromGLTexture2D clCreateFromGLTexture2D; - cl_api_clCreateFromGLTexture3D clCreateFromGLTexture3D; - cl_api_clCreateFromGLRenderbuffer clCreateFromGLRenderbuffer; - cl_api_clGetGLObjectInfo clGetGLObjectInfo; - cl_api_clGetGLTextureInfo clGetGLTextureInfo; - cl_api_clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects; - cl_api_clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects; - cl_api_clGetGLContextInfoKHR clGetGLContextInfoKHR; - - /* cl_khr_d3d10_sharing */ - cl_api_clGetDeviceIDsFromD3D10KHR clGetDeviceIDsFromD3D10KHR; - cl_api_clCreateFromD3D10BufferKHR clCreateFromD3D10BufferKHR; - cl_api_clCreateFromD3D10Texture2DKHR clCreateFromD3D10Texture2DKHR; - cl_api_clCreateFromD3D10Texture3DKHR clCreateFromD3D10Texture3DKHR; - cl_api_clEnqueueAcquireD3D10ObjectsKHR clEnqueueAcquireD3D10ObjectsKHR; - cl_api_clEnqueueReleaseD3D10ObjectsKHR clEnqueueReleaseD3D10ObjectsKHR; - - /* OpenCL 1.1 */ - cl_api_clSetEventCallback clSetEventCallback; - cl_api_clCreateSubBuffer clCreateSubBuffer; - cl_api_clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback; - cl_api_clCreateUserEvent clCreateUserEvent; - cl_api_clSetUserEventStatus clSetUserEventStatus; - cl_api_clEnqueueReadBufferRect clEnqueueReadBufferRect; - cl_api_clEnqueueWriteBufferRect clEnqueueWriteBufferRect; - cl_api_clEnqueueCopyBufferRect clEnqueueCopyBufferRect; - - /* cl_ext_device_fission */ - cl_api_clCreateSubDevicesEXT clCreateSubDevicesEXT; - cl_api_clRetainDeviceEXT clRetainDeviceEXT; - cl_api_clReleaseDeviceEXT clReleaseDeviceEXT; - - /* cl_khr_gl_event */ - cl_api_clCreateEventFromGLsyncKHR clCreateEventFromGLsyncKHR; - - /* OpenCL 1.2 */ - cl_api_clCreateSubDevices clCreateSubDevices; - cl_api_clRetainDevice clRetainDevice; - cl_api_clReleaseDevice clReleaseDevice; - cl_api_clCreateImage clCreateImage; - cl_api_clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels; - cl_api_clCompileProgram clCompileProgram; - cl_api_clLinkProgram clLinkProgram; - cl_api_clUnloadPlatformCompiler clUnloadPlatformCompiler; - cl_api_clGetKernelArgInfo clGetKernelArgInfo; - cl_api_clEnqueueFillBuffer clEnqueueFillBuffer; - cl_api_clEnqueueFillImage clEnqueueFillImage; - cl_api_clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects; - cl_api_clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList; - cl_api_clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList; - cl_api_clGetExtensionFunctionAddressForPlatform - clGetExtensionFunctionAddressForPlatform; - cl_api_clCreateFromGLTexture clCreateFromGLTexture; - - /* cl_khr_d3d11_sharing */ - cl_api_clGetDeviceIDsFromD3D11KHR clGetDeviceIDsFromD3D11KHR; - cl_api_clCreateFromD3D11BufferKHR clCreateFromD3D11BufferKHR; - cl_api_clCreateFromD3D11Texture2DKHR clCreateFromD3D11Texture2DKHR; - cl_api_clCreateFromD3D11Texture3DKHR clCreateFromD3D11Texture3DKHR; - cl_api_clCreateFromDX9MediaSurfaceKHR clCreateFromDX9MediaSurfaceKHR; - cl_api_clEnqueueAcquireD3D11ObjectsKHR clEnqueueAcquireD3D11ObjectsKHR; - cl_api_clEnqueueReleaseD3D11ObjectsKHR clEnqueueReleaseD3D11ObjectsKHR; - - /* cl_khr_dx9_media_sharing */ - cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR - clGetDeviceIDsFromDX9MediaAdapterKHR; - cl_api_clEnqueueAcquireDX9MediaSurfacesKHR - clEnqueueAcquireDX9MediaSurfacesKHR; - cl_api_clEnqueueReleaseDX9MediaSurfacesKHR - clEnqueueReleaseDX9MediaSurfacesKHR; - - /* cl_khr_egl_image */ - cl_api_clCreateFromEGLImageKHR clCreateFromEGLImageKHR; - cl_api_clEnqueueAcquireEGLObjectsKHR clEnqueueAcquireEGLObjectsKHR; - cl_api_clEnqueueReleaseEGLObjectsKHR clEnqueueReleaseEGLObjectsKHR; - - /* cl_khr_egl_event */ - cl_api_clCreateEventFromEGLSyncKHR clCreateEventFromEGLSyncKHR; - - /* OpenCL 2.0 */ - cl_api_clCreateCommandQueueWithProperties clCreateCommandQueueWithProperties; - cl_api_clCreatePipe clCreatePipe; - cl_api_clGetPipeInfo clGetPipeInfo; - cl_api_clSVMAlloc clSVMAlloc; - cl_api_clSVMFree clSVMFree; - cl_api_clEnqueueSVMFree clEnqueueSVMFree; - cl_api_clEnqueueSVMMemcpy clEnqueueSVMMemcpy; - cl_api_clEnqueueSVMMemFill clEnqueueSVMMemFill; - cl_api_clEnqueueSVMMap clEnqueueSVMMap; - cl_api_clEnqueueSVMUnmap clEnqueueSVMUnmap; - cl_api_clCreateSamplerWithProperties clCreateSamplerWithProperties; - cl_api_clSetKernelArgSVMPointer clSetKernelArgSVMPointer; - cl_api_clSetKernelExecInfo clSetKernelExecInfo; - - /* cl_khr_sub_groups */ - cl_api_clGetKernelSubGroupInfoKHR clGetKernelSubGroupInfoKHR; - - /* OpenCL 2.1 */ - cl_api_clCloneKernel clCloneKernel; - cl_api_clCreateProgramWithIL clCreateProgramWithIL; - cl_api_clEnqueueSVMMigrateMem clEnqueueSVMMigrateMem; - cl_api_clGetDeviceAndHostTimer clGetDeviceAndHostTimer; - cl_api_clGetHostTimer clGetHostTimer; - cl_api_clGetKernelSubGroupInfo clGetKernelSubGroupInfo; - cl_api_clSetDefaultDeviceCommandQueue clSetDefaultDeviceCommandQueue; - - /* OpenCL 2.2 */ - cl_api_clSetProgramReleaseCallback clSetProgramReleaseCallback; - cl_api_clSetProgramSpecializationConstant clSetProgramSpecializationConstant; - - /* OpenCL 3.0 */ - cl_api_clCreateBufferWithProperties clCreateBufferWithProperties; - cl_api_clCreateImageWithProperties clCreateImageWithProperties; - cl_api_clSetContextDestructorCallback clSetContextDestructorCallback; - -} cl_icd_dispatch; - -#ifdef __cplusplus -} -#endif - -#endif /* #ifndef OPENCL_CL_ICD_H */ diff --git a/include/CL/cl_layer.h b/include/CL/cl_layer.h deleted file mode 100644 index fe628bf8a..000000000 --- a/include/CL/cl_layer.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * OpenCL is a trademark of Apple Inc. used under license by Khronos. - */ - -#ifndef OPENCL_CL_LAYER_H -#define OPENCL_CL_LAYER_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef cl_uint cl_layer_info; -typedef cl_uint cl_layer_api_version; -#define CL_LAYER_API_VERSION 0x4240 -#define CL_LAYER_NAME 0x4241 -#define CL_LAYER_API_VERSION_100 100 - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetLayerInfo(cl_layer_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - -typedef cl_int -(CL_API_CALL *pfn_clGetLayerInfo)(cl_layer_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - -extern CL_API_ENTRY cl_int CL_API_CALL -clInitLayer(cl_uint num_entries, - const cl_icd_dispatch *target_dispatch, - cl_uint *num_entries_ret, - const cl_icd_dispatch **layer_dispatch_ret); - -typedef cl_int -(CL_API_CALL *pfn_clInitLayer)(cl_uint num_entries, - const cl_icd_dispatch *target_dispatch, - cl_uint *num_entries_ret, - const cl_icd_dispatch **layer_dispatch_ret); - -#ifdef __cplusplus -} -#endif - -#endif /* OPENCL_CL_LAYER_H */ diff --git a/include/CL/cl_platform.h b/include/CL/cl_platform.h deleted file mode 100644 index e7a0d6f47..000000000 --- a/include/CL/cl_platform.h +++ /dev/null @@ -1,1412 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef __CL_PLATFORM_H -#define __CL_PLATFORM_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#if defined(_WIN32) - #if !defined(CL_API_ENTRY) - #define CL_API_ENTRY - #endif - #if !defined(CL_API_CALL) - #define CL_API_CALL __stdcall - #endif - #if !defined(CL_CALLBACK) - #define CL_CALLBACK __stdcall - #endif -#else - #if !defined(CL_API_ENTRY) - #define CL_API_ENTRY - #endif - #if !defined(CL_API_CALL) - #define CL_API_CALL - #endif - #if !defined(CL_CALLBACK) - #define CL_CALLBACK - #endif -#endif - -/* - * Deprecation flags refer to the last version of the header in which the - * feature was not deprecated. - * - * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without - * deprecation but is deprecated in versions later than 1.1. - */ - -#ifndef CL_API_SUFFIX_USER -#define CL_API_SUFFIX_USER -#endif - -#ifndef CL_API_PREFIX_USER -#define CL_API_PREFIX_USER -#endif - -#define CL_API_SUFFIX_COMMON CL_API_SUFFIX_USER -#define CL_API_PREFIX_COMMON CL_API_PREFIX_USER - -#define CL_API_SUFFIX__VERSION_1_0 CL_API_SUFFIX_COMMON -#define CL_API_SUFFIX__VERSION_1_1 CL_API_SUFFIX_COMMON -#define CL_API_SUFFIX__VERSION_1_2 CL_API_SUFFIX_COMMON -#define CL_API_SUFFIX__VERSION_2_0 CL_API_SUFFIX_COMMON -#define CL_API_SUFFIX__VERSION_2_1 CL_API_SUFFIX_COMMON -#define CL_API_SUFFIX__VERSION_2_2 CL_API_SUFFIX_COMMON -#define CL_API_SUFFIX__VERSION_3_0 CL_API_SUFFIX_COMMON -#define CL_API_SUFFIX__EXPERIMENTAL CL_API_SUFFIX_COMMON - - -#ifdef __GNUC__ - #define CL_API_SUFFIX_DEPRECATED __attribute__((deprecated)) - #define CL_API_PREFIX_DEPRECATED -#elif defined(_WIN32) - #define CL_API_SUFFIX_DEPRECATED - #define CL_API_PREFIX_DEPRECATED __declspec(deprecated) -#else - #define CL_API_SUFFIX_DEPRECATED - #define CL_API_PREFIX_DEPRECATED -#endif - -#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS - #define CL_API_SUFFIX__VERSION_1_0_DEPRECATED CL_API_SUFFIX_COMMON - #define CL_API_PREFIX__VERSION_1_0_DEPRECATED CL_API_PREFIX_COMMON -#else - #define CL_API_SUFFIX__VERSION_1_0_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED - #define CL_API_PREFIX__VERSION_1_0_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED -#endif - -#ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS - #define CL_API_SUFFIX__VERSION_1_1_DEPRECATED CL_API_SUFFIX_COMMON - #define CL_API_PREFIX__VERSION_1_1_DEPRECATED CL_API_PREFIX_COMMON -#else - #define CL_API_SUFFIX__VERSION_1_1_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED - #define CL_API_PREFIX__VERSION_1_1_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED -#endif - -#ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS - #define CL_API_SUFFIX__VERSION_1_2_DEPRECATED CL_API_SUFFIX_COMMON - #define CL_API_PREFIX__VERSION_1_2_DEPRECATED CL_API_PREFIX_COMMON -#else - #define CL_API_SUFFIX__VERSION_1_2_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED - #define CL_API_PREFIX__VERSION_1_2_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED - #endif - -#ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS - #define CL_API_SUFFIX__VERSION_2_0_DEPRECATED CL_API_SUFFIX_COMMON - #define CL_API_PREFIX__VERSION_2_0_DEPRECATED CL_API_PREFIX_COMMON -#else - #define CL_API_SUFFIX__VERSION_2_0_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED - #define CL_API_PREFIX__VERSION_2_0_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED -#endif - -#ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS - #define CL_API_SUFFIX__VERSION_2_1_DEPRECATED CL_API_SUFFIX_COMMON - #define CL_API_PREFIX__VERSION_2_1_DEPRECATED CL_API_PREFIX_COMMON -#else - #define CL_API_SUFFIX__VERSION_2_1_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED - #define CL_API_PREFIX__VERSION_2_1_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED -#endif - -#ifdef CL_USE_DEPRECATED_OPENCL_2_2_APIS - #define CL_API_SUFFIX__VERSION_2_2_DEPRECATED CL_API_SUFFIX_COMMON - #define CL_API_PREFIX__VERSION_2_2_DEPRECATED CL_API_PREFIX_COMMON -#else - #define CL_API_SUFFIX__VERSION_2_2_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED - #define CL_API_PREFIX__VERSION_2_2_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED -#endif - -#if (defined (_WIN32) && defined(_MSC_VER)) - -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wlanguage-extension-token" -#endif - -/* intptr_t is used in cl.h and provided by stddef.h in Visual C++, but not in clang */ -/* stdint.h was missing before Visual Studio 2010, include it for later versions and for clang */ -#if defined(__clang__) || _MSC_VER >= 1600 - #include -#endif - -/* scalar types */ -typedef signed __int8 cl_char; -typedef unsigned __int8 cl_uchar; -typedef signed __int16 cl_short; -typedef unsigned __int16 cl_ushort; -typedef signed __int32 cl_int; -typedef unsigned __int32 cl_uint; -typedef signed __int64 cl_long; -typedef unsigned __int64 cl_ulong; - -typedef unsigned __int16 cl_half; -typedef float cl_float; -typedef double cl_double; - -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - -/* Macro names and corresponding values defined by OpenCL */ -#define CL_CHAR_BIT 8 -#define CL_SCHAR_MAX 127 -#define CL_SCHAR_MIN (-127-1) -#define CL_CHAR_MAX CL_SCHAR_MAX -#define CL_CHAR_MIN CL_SCHAR_MIN -#define CL_UCHAR_MAX 255 -#define CL_SHRT_MAX 32767 -#define CL_SHRT_MIN (-32767-1) -#define CL_USHRT_MAX 65535 -#define CL_INT_MAX 2147483647 -#define CL_INT_MIN (-2147483647-1) -#define CL_UINT_MAX 0xffffffffU -#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) -#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) -#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) - -#define CL_FLT_DIG 6 -#define CL_FLT_MANT_DIG 24 -#define CL_FLT_MAX_10_EXP +38 -#define CL_FLT_MAX_EXP +128 -#define CL_FLT_MIN_10_EXP -37 -#define CL_FLT_MIN_EXP -125 -#define CL_FLT_RADIX 2 -#define CL_FLT_MAX 340282346638528859811704183484516925440.0f -#define CL_FLT_MIN 1.175494350822287507969e-38f -#define CL_FLT_EPSILON 1.1920928955078125e-7f - -#define CL_HALF_DIG 3 -#define CL_HALF_MANT_DIG 11 -#define CL_HALF_MAX_10_EXP +4 -#define CL_HALF_MAX_EXP +16 -#define CL_HALF_MIN_10_EXP -4 -#define CL_HALF_MIN_EXP -13 -#define CL_HALF_RADIX 2 -#define CL_HALF_MAX 65504.0f -#define CL_HALF_MIN 6.103515625e-05f -#define CL_HALF_EPSILON 9.765625e-04f - -#define CL_DBL_DIG 15 -#define CL_DBL_MANT_DIG 53 -#define CL_DBL_MAX_10_EXP +308 -#define CL_DBL_MAX_EXP +1024 -#define CL_DBL_MIN_10_EXP -307 -#define CL_DBL_MIN_EXP -1021 -#define CL_DBL_RADIX 2 -#define CL_DBL_MAX 1.7976931348623158e+308 -#define CL_DBL_MIN 2.225073858507201383090e-308 -#define CL_DBL_EPSILON 2.220446049250313080847e-16 - -#define CL_M_E 2.7182818284590452354 -#define CL_M_LOG2E 1.4426950408889634074 -#define CL_M_LOG10E 0.43429448190325182765 -#define CL_M_LN2 0.69314718055994530942 -#define CL_M_LN10 2.30258509299404568402 -#define CL_M_PI 3.14159265358979323846 -#define CL_M_PI_2 1.57079632679489661923 -#define CL_M_PI_4 0.78539816339744830962 -#define CL_M_1_PI 0.31830988618379067154 -#define CL_M_2_PI 0.63661977236758134308 -#define CL_M_2_SQRTPI 1.12837916709551257390 -#define CL_M_SQRT2 1.41421356237309504880 -#define CL_M_SQRT1_2 0.70710678118654752440 - -#define CL_M_E_F 2.718281828f -#define CL_M_LOG2E_F 1.442695041f -#define CL_M_LOG10E_F 0.434294482f -#define CL_M_LN2_F 0.693147181f -#define CL_M_LN10_F 2.302585093f -#define CL_M_PI_F 3.141592654f -#define CL_M_PI_2_F 1.570796327f -#define CL_M_PI_4_F 0.785398163f -#define CL_M_1_PI_F 0.318309886f -#define CL_M_2_PI_F 0.636619772f -#define CL_M_2_SQRTPI_F 1.128379167f -#define CL_M_SQRT2_F 1.414213562f -#define CL_M_SQRT1_2_F 0.707106781f - -#define CL_NAN (CL_INFINITY - CL_INFINITY) -#define CL_HUGE_VALF ((cl_float) 1e50) -#define CL_HUGE_VAL ((cl_double) 1e500) -#define CL_MAXFLOAT CL_FLT_MAX -#define CL_INFINITY CL_HUGE_VALF - -#else - -#include - -/* scalar types */ -typedef int8_t cl_char; -typedef uint8_t cl_uchar; -typedef int16_t cl_short; -typedef uint16_t cl_ushort; -typedef int32_t cl_int; -typedef uint32_t cl_uint; -typedef int64_t cl_long; -typedef uint64_t cl_ulong; - -typedef uint16_t cl_half; -typedef float cl_float; -typedef double cl_double; - -/* Macro names and corresponding values defined by OpenCL */ -#define CL_CHAR_BIT 8 -#define CL_SCHAR_MAX 127 -#define CL_SCHAR_MIN (-127-1) -#define CL_CHAR_MAX CL_SCHAR_MAX -#define CL_CHAR_MIN CL_SCHAR_MIN -#define CL_UCHAR_MAX 255 -#define CL_SHRT_MAX 32767 -#define CL_SHRT_MIN (-32767-1) -#define CL_USHRT_MAX 65535 -#define CL_INT_MAX 2147483647 -#define CL_INT_MIN (-2147483647-1) -#define CL_UINT_MAX 0xffffffffU -#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) -#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) -#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) - -#define CL_FLT_DIG 6 -#define CL_FLT_MANT_DIG 24 -#define CL_FLT_MAX_10_EXP +38 -#define CL_FLT_MAX_EXP +128 -#define CL_FLT_MIN_10_EXP -37 -#define CL_FLT_MIN_EXP -125 -#define CL_FLT_RADIX 2 -#define CL_FLT_MAX 340282346638528859811704183484516925440.0f -#define CL_FLT_MIN 1.175494350822287507969e-38f -#define CL_FLT_EPSILON 1.1920928955078125e-7f - -#define CL_HALF_DIG 3 -#define CL_HALF_MANT_DIG 11 -#define CL_HALF_MAX_10_EXP +4 -#define CL_HALF_MAX_EXP +16 -#define CL_HALF_MIN_10_EXP -4 -#define CL_HALF_MIN_EXP -13 -#define CL_HALF_RADIX 2 -#define CL_HALF_MAX 65504.0f -#define CL_HALF_MIN 6.103515625e-05f -#define CL_HALF_EPSILON 9.765625e-04f - -#define CL_DBL_DIG 15 -#define CL_DBL_MANT_DIG 53 -#define CL_DBL_MAX_10_EXP +308 -#define CL_DBL_MAX_EXP +1024 -#define CL_DBL_MIN_10_EXP -307 -#define CL_DBL_MIN_EXP -1021 -#define CL_DBL_RADIX 2 -#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 -#define CL_DBL_MIN 2.225073858507201383090e-308 -#define CL_DBL_EPSILON 2.220446049250313080847e-16 - -#define CL_M_E 2.7182818284590452354 -#define CL_M_LOG2E 1.4426950408889634074 -#define CL_M_LOG10E 0.43429448190325182765 -#define CL_M_LN2 0.69314718055994530942 -#define CL_M_LN10 2.30258509299404568402 -#define CL_M_PI 3.14159265358979323846 -#define CL_M_PI_2 1.57079632679489661923 -#define CL_M_PI_4 0.78539816339744830962 -#define CL_M_1_PI 0.31830988618379067154 -#define CL_M_2_PI 0.63661977236758134308 -#define CL_M_2_SQRTPI 1.12837916709551257390 -#define CL_M_SQRT2 1.41421356237309504880 -#define CL_M_SQRT1_2 0.70710678118654752440 - -#define CL_M_E_F 2.718281828f -#define CL_M_LOG2E_F 1.442695041f -#define CL_M_LOG10E_F 0.434294482f -#define CL_M_LN2_F 0.693147181f -#define CL_M_LN10_F 2.302585093f -#define CL_M_PI_F 3.141592654f -#define CL_M_PI_2_F 1.570796327f -#define CL_M_PI_4_F 0.785398163f -#define CL_M_1_PI_F 0.318309886f -#define CL_M_2_PI_F 0.636619772f -#define CL_M_2_SQRTPI_F 1.128379167f -#define CL_M_SQRT2_F 1.414213562f -#define CL_M_SQRT1_2_F 0.707106781f - -#if defined( __GNUC__ ) - #define CL_HUGE_VALF __builtin_huge_valf() - #define CL_HUGE_VAL __builtin_huge_val() - #define CL_NAN __builtin_nanf( "" ) -#else - #define CL_HUGE_VALF ((cl_float) 1e50) - #define CL_HUGE_VAL ((cl_double) 1e500) - float nanf( const char * ); - #define CL_NAN nanf( "" ) -#endif -#define CL_MAXFLOAT CL_FLT_MAX -#define CL_INFINITY CL_HUGE_VALF - -#endif - -#include - -/* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */ -typedef unsigned int cl_GLuint; -typedef int cl_GLint; -typedef unsigned int cl_GLenum; - -/* - * Vector types - * - * Note: OpenCL requires that all types be naturally aligned. - * This means that vector types must be naturally aligned. - * For example, a vector of four floats must be aligned to - * a 16 byte boundary (calculated as 4 * the natural 4-byte - * alignment of the float). The alignment qualifiers here - * will only function properly if your compiler supports them - * and if you don't actively work to defeat them. For example, - * in order for a cl_float4 to be 16 byte aligned in a struct, - * the start of the struct must itself be 16-byte aligned. - * - * Maintaining proper alignment is the user's responsibility. - */ - -/* Define basic vector types */ -#if defined( __VEC__ ) - #if !defined(__clang__) - #include /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */ - #endif - typedef __vector unsigned char __cl_uchar16; - typedef __vector signed char __cl_char16; - typedef __vector unsigned short __cl_ushort8; - typedef __vector signed short __cl_short8; - typedef __vector unsigned int __cl_uint4; - typedef __vector signed int __cl_int4; - typedef __vector float __cl_float4; - #define __CL_UCHAR16__ 1 - #define __CL_CHAR16__ 1 - #define __CL_USHORT8__ 1 - #define __CL_SHORT8__ 1 - #define __CL_UINT4__ 1 - #define __CL_INT4__ 1 - #define __CL_FLOAT4__ 1 -#endif - -#if defined( __SSE__ ) - #if defined( __MINGW64__ ) - #include - #else - #include - #endif - #if defined( __GNUC__ ) - typedef float __cl_float4 __attribute__((vector_size(16))); - #else - typedef __m128 __cl_float4; - #endif - #define __CL_FLOAT4__ 1 -#endif - -#if defined( __SSE2__ ) - #if defined( __MINGW64__ ) - #include - #else - #include - #endif - #if defined( __GNUC__ ) - typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); - typedef cl_char __cl_char16 __attribute__((vector_size(16))); - typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); - typedef cl_short __cl_short8 __attribute__((vector_size(16))); - typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); - typedef cl_int __cl_int4 __attribute__((vector_size(16))); - typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); - typedef cl_long __cl_long2 __attribute__((vector_size(16))); - typedef cl_double __cl_double2 __attribute__((vector_size(16))); - #else - typedef __m128i __cl_uchar16; - typedef __m128i __cl_char16; - typedef __m128i __cl_ushort8; - typedef __m128i __cl_short8; - typedef __m128i __cl_uint4; - typedef __m128i __cl_int4; - typedef __m128i __cl_ulong2; - typedef __m128i __cl_long2; - typedef __m128d __cl_double2; - #endif - #define __CL_UCHAR16__ 1 - #define __CL_CHAR16__ 1 - #define __CL_USHORT8__ 1 - #define __CL_SHORT8__ 1 - #define __CL_INT4__ 1 - #define __CL_UINT4__ 1 - #define __CL_ULONG2__ 1 - #define __CL_LONG2__ 1 - #define __CL_DOUBLE2__ 1 -#endif - -#if defined( __MMX__ ) - #include - #if defined( __GNUC__ ) - typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); - typedef cl_char __cl_char8 __attribute__((vector_size(8))); - typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); - typedef cl_short __cl_short4 __attribute__((vector_size(8))); - typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); - typedef cl_int __cl_int2 __attribute__((vector_size(8))); - typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); - typedef cl_long __cl_long1 __attribute__((vector_size(8))); - typedef cl_float __cl_float2 __attribute__((vector_size(8))); - #else - typedef __m64 __cl_uchar8; - typedef __m64 __cl_char8; - typedef __m64 __cl_ushort4; - typedef __m64 __cl_short4; - typedef __m64 __cl_uint2; - typedef __m64 __cl_int2; - typedef __m64 __cl_ulong1; - typedef __m64 __cl_long1; - typedef __m64 __cl_float2; - #endif - #define __CL_UCHAR8__ 1 - #define __CL_CHAR8__ 1 - #define __CL_USHORT4__ 1 - #define __CL_SHORT4__ 1 - #define __CL_INT2__ 1 - #define __CL_UINT2__ 1 - #define __CL_ULONG1__ 1 - #define __CL_LONG1__ 1 - #define __CL_FLOAT2__ 1 -#endif - -#if defined( __AVX__ ) - #if defined( __MINGW64__ ) - #include - #else - #include - #endif - #if defined( __GNUC__ ) - typedef cl_float __cl_float8 __attribute__((vector_size(32))); - typedef cl_double __cl_double4 __attribute__((vector_size(32))); - #else - typedef __m256 __cl_float8; - typedef __m256d __cl_double4; - #endif - #define __CL_FLOAT8__ 1 - #define __CL_DOUBLE4__ 1 -#endif - -/* Define capabilities for anonymous struct members. */ -#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L -#define __CL_HAS_ANON_STRUCT__ 1 -#define __CL_ANON_STRUCT__ -#elif defined(_WIN32) && defined(_MSC_VER) && !defined(__STDC__) -#define __CL_HAS_ANON_STRUCT__ 1 -#define __CL_ANON_STRUCT__ -#elif defined(__GNUC__) && ! defined(__STRICT_ANSI__) -#define __CL_HAS_ANON_STRUCT__ 1 -#define __CL_ANON_STRUCT__ __extension__ -#elif defined(__clang__) -#define __CL_HAS_ANON_STRUCT__ 1 -#define __CL_ANON_STRUCT__ __extension__ -#else -#define __CL_HAS_ANON_STRUCT__ 0 -#define __CL_ANON_STRUCT__ -#endif - -#if defined(_WIN32) && defined(_MSC_VER) && __CL_HAS_ANON_STRUCT__ - /* Disable warning C4201: nonstandard extension used : nameless struct/union */ - #pragma warning( push ) - #pragma warning( disable : 4201 ) -#endif - -/* Define alignment keys */ -#if defined( __GNUC__ ) || defined(__INTEGRITY) - #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) -#elif defined( _WIN32) && (_MSC_VER) - /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ - /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ - /* #include */ - /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ - #define CL_ALIGNED(_x) -#else - #warning Need to implement some method to align data here - #define CL_ALIGNED(_x) -#endif - -/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ -#if __CL_HAS_ANON_STRUCT__ - /* .xyzw and .s0123...{f|F} are supported */ - #define CL_HAS_NAMED_VECTOR_FIELDS 1 - /* .hi and .lo are supported */ - #define CL_HAS_HI_LO_VECTOR_FIELDS 1 -#endif - -/* Define cl_vector types */ - -/* ---- cl_charn ---- */ -typedef union -{ - cl_char CL_ALIGNED(2) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_char x, y; }; - __CL_ANON_STRUCT__ struct{ cl_char s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_char lo, hi; }; -#endif -#if defined( __CL_CHAR2__) - __cl_char2 v2; -#endif -}cl_char2; - -typedef union -{ - cl_char CL_ALIGNED(4) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; }; -#endif -#if defined( __CL_CHAR2__) - __cl_char2 v2[2]; -#endif -#if defined( __CL_CHAR4__) - __cl_char4 v4; -#endif -}cl_char4; - -/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */ -typedef cl_char4 cl_char3; - -typedef union -{ - cl_char CL_ALIGNED(8) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; }; -#endif -#if defined( __CL_CHAR2__) - __cl_char2 v2[4]; -#endif -#if defined( __CL_CHAR4__) - __cl_char4 v4[2]; -#endif -#if defined( __CL_CHAR8__ ) - __cl_char8 v8; -#endif -}cl_char8; - -typedef union -{ - cl_char CL_ALIGNED(16) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; }; -#endif -#if defined( __CL_CHAR2__) - __cl_char2 v2[8]; -#endif -#if defined( __CL_CHAR4__) - __cl_char4 v4[4]; -#endif -#if defined( __CL_CHAR8__ ) - __cl_char8 v8[2]; -#endif -#if defined( __CL_CHAR16__ ) - __cl_char16 v16; -#endif -}cl_char16; - - -/* ---- cl_ucharn ---- */ -typedef union -{ - cl_uchar CL_ALIGNED(2) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_uchar x, y; }; - __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; }; -#endif -#if defined( __cl_uchar2__) - __cl_uchar2 v2; -#endif -}cl_uchar2; - -typedef union -{ - cl_uchar CL_ALIGNED(4) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; }; -#endif -#if defined( __CL_UCHAR2__) - __cl_uchar2 v2[2]; -#endif -#if defined( __CL_UCHAR4__) - __cl_uchar4 v4; -#endif -}cl_uchar4; - -/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */ -typedef cl_uchar4 cl_uchar3; - -typedef union -{ - cl_uchar CL_ALIGNED(8) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; }; -#endif -#if defined( __CL_UCHAR2__) - __cl_uchar2 v2[4]; -#endif -#if defined( __CL_UCHAR4__) - __cl_uchar4 v4[2]; -#endif -#if defined( __CL_UCHAR8__ ) - __cl_uchar8 v8; -#endif -}cl_uchar8; - -typedef union -{ - cl_uchar CL_ALIGNED(16) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; }; -#endif -#if defined( __CL_UCHAR2__) - __cl_uchar2 v2[8]; -#endif -#if defined( __CL_UCHAR4__) - __cl_uchar4 v4[4]; -#endif -#if defined( __CL_UCHAR8__ ) - __cl_uchar8 v8[2]; -#endif -#if defined( __CL_UCHAR16__ ) - __cl_uchar16 v16; -#endif -}cl_uchar16; - - -/* ---- cl_shortn ---- */ -typedef union -{ - cl_short CL_ALIGNED(4) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_short x, y; }; - __CL_ANON_STRUCT__ struct{ cl_short s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_short lo, hi; }; -#endif -#if defined( __CL_SHORT2__) - __cl_short2 v2; -#endif -}cl_short2; - -typedef union -{ - cl_short CL_ALIGNED(8) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; }; -#endif -#if defined( __CL_SHORT2__) - __cl_short2 v2[2]; -#endif -#if defined( __CL_SHORT4__) - __cl_short4 v4; -#endif -}cl_short4; - -/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */ -typedef cl_short4 cl_short3; - -typedef union -{ - cl_short CL_ALIGNED(16) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; }; -#endif -#if defined( __CL_SHORT2__) - __cl_short2 v2[4]; -#endif -#if defined( __CL_SHORT4__) - __cl_short4 v4[2]; -#endif -#if defined( __CL_SHORT8__ ) - __cl_short8 v8; -#endif -}cl_short8; - -typedef union -{ - cl_short CL_ALIGNED(32) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; }; -#endif -#if defined( __CL_SHORT2__) - __cl_short2 v2[8]; -#endif -#if defined( __CL_SHORT4__) - __cl_short4 v4[4]; -#endif -#if defined( __CL_SHORT8__ ) - __cl_short8 v8[2]; -#endif -#if defined( __CL_SHORT16__ ) - __cl_short16 v16; -#endif -}cl_short16; - - -/* ---- cl_ushortn ---- */ -typedef union -{ - cl_ushort CL_ALIGNED(4) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_ushort x, y; }; - __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; }; -#endif -#if defined( __CL_USHORT2__) - __cl_ushort2 v2; -#endif -}cl_ushort2; - -typedef union -{ - cl_ushort CL_ALIGNED(8) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; }; -#endif -#if defined( __CL_USHORT2__) - __cl_ushort2 v2[2]; -#endif -#if defined( __CL_USHORT4__) - __cl_ushort4 v4; -#endif -}cl_ushort4; - -/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */ -typedef cl_ushort4 cl_ushort3; - -typedef union -{ - cl_ushort CL_ALIGNED(16) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; }; -#endif -#if defined( __CL_USHORT2__) - __cl_ushort2 v2[4]; -#endif -#if defined( __CL_USHORT4__) - __cl_ushort4 v4[2]; -#endif -#if defined( __CL_USHORT8__ ) - __cl_ushort8 v8; -#endif -}cl_ushort8; - -typedef union -{ - cl_ushort CL_ALIGNED(32) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; }; -#endif -#if defined( __CL_USHORT2__) - __cl_ushort2 v2[8]; -#endif -#if defined( __CL_USHORT4__) - __cl_ushort4 v4[4]; -#endif -#if defined( __CL_USHORT8__ ) - __cl_ushort8 v8[2]; -#endif -#if defined( __CL_USHORT16__ ) - __cl_ushort16 v16; -#endif -}cl_ushort16; - - -/* ---- cl_halfn ---- */ -typedef union -{ - cl_half CL_ALIGNED(4) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_half x, y; }; - __CL_ANON_STRUCT__ struct{ cl_half s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_half lo, hi; }; -#endif -#if defined( __CL_HALF2__) - __cl_half2 v2; -#endif -}cl_half2; - -typedef union -{ - cl_half CL_ALIGNED(8) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; }; -#endif -#if defined( __CL_HALF2__) - __cl_half2 v2[2]; -#endif -#if defined( __CL_HALF4__) - __cl_half4 v4; -#endif -}cl_half4; - -/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */ -typedef cl_half4 cl_half3; - -typedef union -{ - cl_half CL_ALIGNED(16) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; }; -#endif -#if defined( __CL_HALF2__) - __cl_half2 v2[4]; -#endif -#if defined( __CL_HALF4__) - __cl_half4 v4[2]; -#endif -#if defined( __CL_HALF8__ ) - __cl_half8 v8; -#endif -}cl_half8; - -typedef union -{ - cl_half CL_ALIGNED(32) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; }; -#endif -#if defined( __CL_HALF2__) - __cl_half2 v2[8]; -#endif -#if defined( __CL_HALF4__) - __cl_half4 v4[4]; -#endif -#if defined( __CL_HALF8__ ) - __cl_half8 v8[2]; -#endif -#if defined( __CL_HALF16__ ) - __cl_half16 v16; -#endif -}cl_half16; - -/* ---- cl_intn ---- */ -typedef union -{ - cl_int CL_ALIGNED(8) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_int x, y; }; - __CL_ANON_STRUCT__ struct{ cl_int s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_int lo, hi; }; -#endif -#if defined( __CL_INT2__) - __cl_int2 v2; -#endif -}cl_int2; - -typedef union -{ - cl_int CL_ALIGNED(16) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; }; -#endif -#if defined( __CL_INT2__) - __cl_int2 v2[2]; -#endif -#if defined( __CL_INT4__) - __cl_int4 v4; -#endif -}cl_int4; - -/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */ -typedef cl_int4 cl_int3; - -typedef union -{ - cl_int CL_ALIGNED(32) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; }; -#endif -#if defined( __CL_INT2__) - __cl_int2 v2[4]; -#endif -#if defined( __CL_INT4__) - __cl_int4 v4[2]; -#endif -#if defined( __CL_INT8__ ) - __cl_int8 v8; -#endif -}cl_int8; - -typedef union -{ - cl_int CL_ALIGNED(64) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; }; -#endif -#if defined( __CL_INT2__) - __cl_int2 v2[8]; -#endif -#if defined( __CL_INT4__) - __cl_int4 v4[4]; -#endif -#if defined( __CL_INT8__ ) - __cl_int8 v8[2]; -#endif -#if defined( __CL_INT16__ ) - __cl_int16 v16; -#endif -}cl_int16; - - -/* ---- cl_uintn ---- */ -typedef union -{ - cl_uint CL_ALIGNED(8) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_uint x, y; }; - __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; }; -#endif -#if defined( __CL_UINT2__) - __cl_uint2 v2; -#endif -}cl_uint2; - -typedef union -{ - cl_uint CL_ALIGNED(16) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; }; -#endif -#if defined( __CL_UINT2__) - __cl_uint2 v2[2]; -#endif -#if defined( __CL_UINT4__) - __cl_uint4 v4; -#endif -}cl_uint4; - -/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */ -typedef cl_uint4 cl_uint3; - -typedef union -{ - cl_uint CL_ALIGNED(32) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; }; -#endif -#if defined( __CL_UINT2__) - __cl_uint2 v2[4]; -#endif -#if defined( __CL_UINT4__) - __cl_uint4 v4[2]; -#endif -#if defined( __CL_UINT8__ ) - __cl_uint8 v8; -#endif -}cl_uint8; - -typedef union -{ - cl_uint CL_ALIGNED(64) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; }; -#endif -#if defined( __CL_UINT2__) - __cl_uint2 v2[8]; -#endif -#if defined( __CL_UINT4__) - __cl_uint4 v4[4]; -#endif -#if defined( __CL_UINT8__ ) - __cl_uint8 v8[2]; -#endif -#if defined( __CL_UINT16__ ) - __cl_uint16 v16; -#endif -}cl_uint16; - -/* ---- cl_longn ---- */ -typedef union -{ - cl_long CL_ALIGNED(16) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_long x, y; }; - __CL_ANON_STRUCT__ struct{ cl_long s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_long lo, hi; }; -#endif -#if defined( __CL_LONG2__) - __cl_long2 v2; -#endif -}cl_long2; - -typedef union -{ - cl_long CL_ALIGNED(32) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; }; -#endif -#if defined( __CL_LONG2__) - __cl_long2 v2[2]; -#endif -#if defined( __CL_LONG4__) - __cl_long4 v4; -#endif -}cl_long4; - -/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */ -typedef cl_long4 cl_long3; - -typedef union -{ - cl_long CL_ALIGNED(64) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; }; -#endif -#if defined( __CL_LONG2__) - __cl_long2 v2[4]; -#endif -#if defined( __CL_LONG4__) - __cl_long4 v4[2]; -#endif -#if defined( __CL_LONG8__ ) - __cl_long8 v8; -#endif -}cl_long8; - -typedef union -{ - cl_long CL_ALIGNED(128) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; }; -#endif -#if defined( __CL_LONG2__) - __cl_long2 v2[8]; -#endif -#if defined( __CL_LONG4__) - __cl_long4 v4[4]; -#endif -#if defined( __CL_LONG8__ ) - __cl_long8 v8[2]; -#endif -#if defined( __CL_LONG16__ ) - __cl_long16 v16; -#endif -}cl_long16; - - -/* ---- cl_ulongn ---- */ -typedef union -{ - cl_ulong CL_ALIGNED(16) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_ulong x, y; }; - __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; }; -#endif -#if defined( __CL_ULONG2__) - __cl_ulong2 v2; -#endif -}cl_ulong2; - -typedef union -{ - cl_ulong CL_ALIGNED(32) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; }; -#endif -#if defined( __CL_ULONG2__) - __cl_ulong2 v2[2]; -#endif -#if defined( __CL_ULONG4__) - __cl_ulong4 v4; -#endif -}cl_ulong4; - -/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */ -typedef cl_ulong4 cl_ulong3; - -typedef union -{ - cl_ulong CL_ALIGNED(64) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; }; -#endif -#if defined( __CL_ULONG2__) - __cl_ulong2 v2[4]; -#endif -#if defined( __CL_ULONG4__) - __cl_ulong4 v4[2]; -#endif -#if defined( __CL_ULONG8__ ) - __cl_ulong8 v8; -#endif -}cl_ulong8; - -typedef union -{ - cl_ulong CL_ALIGNED(128) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; }; -#endif -#if defined( __CL_ULONG2__) - __cl_ulong2 v2[8]; -#endif -#if defined( __CL_ULONG4__) - __cl_ulong4 v4[4]; -#endif -#if defined( __CL_ULONG8__ ) - __cl_ulong8 v8[2]; -#endif -#if defined( __CL_ULONG16__ ) - __cl_ulong16 v16; -#endif -}cl_ulong16; - - -/* --- cl_floatn ---- */ - -typedef union -{ - cl_float CL_ALIGNED(8) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_float x, y; }; - __CL_ANON_STRUCT__ struct{ cl_float s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_float lo, hi; }; -#endif -#if defined( __CL_FLOAT2__) - __cl_float2 v2; -#endif -}cl_float2; - -typedef union -{ - cl_float CL_ALIGNED(16) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; }; -#endif -#if defined( __CL_FLOAT2__) - __cl_float2 v2[2]; -#endif -#if defined( __CL_FLOAT4__) - __cl_float4 v4; -#endif -}cl_float4; - -/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */ -typedef cl_float4 cl_float3; - -typedef union -{ - cl_float CL_ALIGNED(32) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; }; -#endif -#if defined( __CL_FLOAT2__) - __cl_float2 v2[4]; -#endif -#if defined( __CL_FLOAT4__) - __cl_float4 v4[2]; -#endif -#if defined( __CL_FLOAT8__ ) - __cl_float8 v8; -#endif -}cl_float8; - -typedef union -{ - cl_float CL_ALIGNED(64) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; }; -#endif -#if defined( __CL_FLOAT2__) - __cl_float2 v2[8]; -#endif -#if defined( __CL_FLOAT4__) - __cl_float4 v4[4]; -#endif -#if defined( __CL_FLOAT8__ ) - __cl_float8 v8[2]; -#endif -#if defined( __CL_FLOAT16__ ) - __cl_float16 v16; -#endif -}cl_float16; - -/* --- cl_doublen ---- */ - -typedef union -{ - cl_double CL_ALIGNED(16) s[2]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_double x, y; }; - __CL_ANON_STRUCT__ struct{ cl_double s0, s1; }; - __CL_ANON_STRUCT__ struct{ cl_double lo, hi; }; -#endif -#if defined( __CL_DOUBLE2__) - __cl_double2 v2; -#endif -}cl_double2; - -typedef union -{ - cl_double CL_ALIGNED(32) s[4]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; }; - __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; }; -#endif -#if defined( __CL_DOUBLE2__) - __cl_double2 v2[2]; -#endif -#if defined( __CL_DOUBLE4__) - __cl_double4 v4; -#endif -}cl_double4; - -/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */ -typedef cl_double4 cl_double3; - -typedef union -{ - cl_double CL_ALIGNED(64) s[8]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; - __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; - __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; }; -#endif -#if defined( __CL_DOUBLE2__) - __cl_double2 v2[4]; -#endif -#if defined( __CL_DOUBLE4__) - __cl_double4 v4[2]; -#endif -#if defined( __CL_DOUBLE8__ ) - __cl_double8 v8; -#endif -}cl_double8; - -typedef union -{ - cl_double CL_ALIGNED(128) s[16]; -#if __CL_HAS_ANON_STRUCT__ - __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; }; -#endif -#if defined( __CL_DOUBLE2__) - __cl_double2 v2[8]; -#endif -#if defined( __CL_DOUBLE4__) - __cl_double4 v4[4]; -#endif -#if defined( __CL_DOUBLE8__ ) - __cl_double8 v8[2]; -#endif -#if defined( __CL_DOUBLE16__ ) - __cl_double16 v16; -#endif -}cl_double16; - -/* Macro to facilitate debugging - * Usage: - * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. - * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \" - * Each line thereafter of OpenCL C source must end with: \n\ - * The last line ends in "; - * - * Example: - * - * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\ - * kernel void foo( int a, float * b ) \n\ - * { \n\ - * // my comment \n\ - * *b[ get_global_id(0)] = a; \n\ - * } \n\ - * "; - * - * This should correctly set up the line, (column) and file information for your source - * string so you can do source level debugging. - */ -#define __CL_STRINGIFY( _x ) # _x -#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) -#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" - -#ifdef __cplusplus -} -#endif - -#if defined(_WIN32) && defined(_MSC_VER) && __CL_HAS_ANON_STRUCT__ - #pragma warning( pop ) -#endif - -#endif /* __CL_PLATFORM_H */ diff --git a/include/CL/cl_va_api_media_sharing_intel.h b/include/CL/cl_va_api_media_sharing_intel.h deleted file mode 100644 index 547e90e88..000000000 --- a/include/CL/cl_va_api_media_sharing_intel.h +++ /dev/null @@ -1,163 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H -#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/*************************************************************** -* cl_intel_sharing_format_query_va_api -***************************************************************/ -#define cl_intel_sharing_format_query_va_api 1 - -/* when cl_intel_va_api_media_sharing is supported */ - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetSupportedVA_APIMediaSurfaceFormatsINTEL( - cl_context context, - cl_mem_flags flags, - cl_mem_object_type image_type, - cl_uint plane, - cl_uint num_entries, - VAImageFormat* va_api_formats, - cl_uint* num_surface_formats) ; - -typedef cl_int (CL_API_CALL * -clGetSupportedVA_APIMediaSurfaceFormatsINTEL_fn)( - cl_context context, - cl_mem_flags flags, - cl_mem_object_type image_type, - cl_uint plane, - cl_uint num_entries, - VAImageFormat* va_api_formats, - cl_uint* num_surface_formats) ; - -/****************************************** -* cl_intel_va_api_media_sharing extension * -*******************************************/ - -#define cl_intel_va_api_media_sharing 1 - -/* error codes */ -#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098 -#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099 -#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100 -#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL -1101 - -/* cl_va_api_device_source_intel */ -#define CL_VA_API_DISPLAY_INTEL 0x4094 - -/* cl_va_api_device_set_intel */ -#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL 0x4095 -#define CL_ALL_DEVICES_FOR_VA_API_INTEL 0x4096 - -/* cl_context_info */ -#define CL_CONTEXT_VA_API_DISPLAY_INTEL 0x4097 - -/* cl_mem_info */ -#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL 0x4098 - -/* cl_image_info */ -#define CL_IMAGE_VA_API_PLANE_INTEL 0x4099 - -/* cl_command_type */ -#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A -#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B - -typedef cl_uint cl_va_api_device_source_intel; -typedef cl_uint cl_va_api_device_set_intel; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetDeviceIDsFromVA_APIMediaAdapterINTEL( - cl_platform_id platform, - cl_va_api_device_source_intel media_adapter_type, - void* media_adapter, - cl_va_api_device_set_intel media_adapter_set, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)( - cl_platform_id platform, - cl_va_api_device_source_intel media_adapter_type, - void* media_adapter, - cl_va_api_device_set_intel media_adapter_set, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromVA_APIMediaSurfaceINTEL( - cl_context context, - cl_mem_flags flags, - VASurfaceID* surface, - cl_uint plane, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)( - cl_context context, - cl_mem_flags flags, - VASurfaceID* surface, - cl_uint plane, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueAcquireVA_APIMediaSurfacesINTEL( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReleaseVA_APIMediaSurfacesINTEL( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_2; - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */ - diff --git a/include/CL/cl_version.h b/include/CL/cl_version.h deleted file mode 100644 index 3844938d5..000000000 --- a/include/CL/cl_version.h +++ /dev/null @@ -1,81 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2018-2020 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef __CL_VERSION_H -#define __CL_VERSION_H - -/* Detect which version to target */ -#if !defined(CL_TARGET_OPENCL_VERSION) -#pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 300 (OpenCL 3.0)") -#define CL_TARGET_OPENCL_VERSION 300 -#endif -#if CL_TARGET_OPENCL_VERSION != 100 && \ - CL_TARGET_OPENCL_VERSION != 110 && \ - CL_TARGET_OPENCL_VERSION != 120 && \ - CL_TARGET_OPENCL_VERSION != 200 && \ - CL_TARGET_OPENCL_VERSION != 210 && \ - CL_TARGET_OPENCL_VERSION != 220 && \ - CL_TARGET_OPENCL_VERSION != 300 -#pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220, 300). Defaulting to 300 (OpenCL 3.0)") -#undef CL_TARGET_OPENCL_VERSION -#define CL_TARGET_OPENCL_VERSION 300 -#endif - - -/* OpenCL Version */ -#if CL_TARGET_OPENCL_VERSION >= 300 && !defined(CL_VERSION_3_0) -#define CL_VERSION_3_0 1 -#endif -#if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2) -#define CL_VERSION_2_2 1 -#endif -#if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1) -#define CL_VERSION_2_1 1 -#endif -#if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0) -#define CL_VERSION_2_0 1 -#endif -#if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2) -#define CL_VERSION_1_2 1 -#endif -#if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1) -#define CL_VERSION_1_1 1 -#endif -#if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0) -#define CL_VERSION_1_0 1 -#endif - -/* Allow deprecated APIs for older OpenCL versions. */ -#if CL_TARGET_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS) -#define CL_USE_DEPRECATED_OPENCL_2_2_APIS -#endif -#if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS) -#define CL_USE_DEPRECATED_OPENCL_2_1_APIS -#endif -#if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS) -#define CL_USE_DEPRECATED_OPENCL_2_0_APIS -#endif -#if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) -#define CL_USE_DEPRECATED_OPENCL_1_2_APIS -#endif -#if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) -#define CL_USE_DEPRECATED_OPENCL_1_1_APIS -#endif -#if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS) -#define CL_USE_DEPRECATED_OPENCL_1_0_APIS -#endif - -#endif /* __CL_VERSION_H */ diff --git a/include/CL/license.txt b/include/CL/license.txt deleted file mode 100644 index f49a4e16e..000000000 --- a/include/CL/license.txt +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file diff --git a/include/CL/opencl.h b/include/CL/opencl.h deleted file mode 100644 index ef8dd1e03..000000000 --- a/include/CL/opencl.h +++ /dev/null @@ -1,32 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2021 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef __OPENCL_H -#define __OPENCL_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_H */ diff --git a/include/CL/opencl.hpp b/include/CL/opencl.hpp deleted file mode 100644 index 1e61d7890..000000000 --- a/include/CL/opencl.hpp +++ /dev/null @@ -1,10372 +0,0 @@ -// -// Copyright (c) 2008-2020 The Khronos Group Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -/*! \file - * - * \brief C++ bindings for OpenCL 1.0, OpenCL 1.1, OpenCL 1.2, - * OpenCL 2.0, OpenCL 2.1, OpenCL 2.2, and OpenCL 3.0. - * \author Lee Howes and Bruce Merry - * - * Derived from the OpenCL 1.x C++ bindings written by - * Benedict R. Gaster, Laurent Morichetti and Lee Howes - * With additions and fixes from: - * Brian Cole, March 3rd 2010 and April 2012 - * Matt Gruenke, April 2012. - * Bruce Merry, February 2013. - * Tom Deakin and Simon McIntosh-Smith, July 2013 - * James Price, 2015- - * \version 2.2.0 - * \date 2019-09-18 - * - * Optional extension support - * - * cl_ext_device_fission - * #define CL_HPP_USE_CL_DEVICE_FISSION - * cl_khr_d3d10_sharing - * #define CL_HPP_USE_DX_INTEROP - * cl_khr_sub_groups - * #define CL_HPP_USE_CL_SUB_GROUPS_KHR - * cl_khr_image2d_from_buffer - * #define CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR - * - * Doxygen documentation for this header is available here: - * - * http://khronosgroup.github.io/OpenCL-CLHPP/ - * - * The latest version of this header can be found on the GitHub releases page: - * - * https://github.com/KhronosGroup/OpenCL-CLHPP/releases - * - * Bugs and patches can be submitted to the GitHub repository: - * - * https://github.com/KhronosGroup/OpenCL-CLHPP - */ - -/*! \mainpage - * \section intro Introduction - * For many large applications C++ is the language of choice and so it seems - * reasonable to define C++ bindings for OpenCL. - * - * The interface is contained with a single C++ header file \em opencl.hpp and all - * definitions are contained within the namespace \em cl. There is no additional - * requirement to include \em cl.h and to use either the C++ or original C - * bindings; it is enough to simply include \em opencl.hpp. - * - * The bindings themselves are lightweight and correspond closely to the - * underlying C API. Using the C++ bindings introduces no additional execution - * overhead. - * - * There are numerous compatibility, portability and memory management - * fixes in the new header as well as additional OpenCL 2.0 features. - * As a result the header is not directly backward compatible and for this - * reason we release it as opencl.hpp rather than a new version of cl.hpp. - * - * - * \section compatibility Compatibility - * Due to the evolution of the underlying OpenCL API the 2.0 C++ bindings - * include an updated approach to defining supported feature versions - * and the range of valid underlying OpenCL runtime versions supported. - * - * The combination of preprocessor macros CL_HPP_TARGET_OPENCL_VERSION and - * CL_HPP_MINIMUM_OPENCL_VERSION control this range. These are three digit - * decimal values representing OpenCL runime versions. The default for - * the target is 200, representing OpenCL 2.0 and the minimum is also - * defined as 200. These settings would use 2.0 API calls only. - * If backward compatibility with a 1.2 runtime is required, the minimum - * version may be set to 120. - * - * Note that this is a compile-time setting, and so affects linking against - * a particular SDK version rather than the versioning of the loaded runtime. - * - * The earlier versions of the header included basic vector and string - * classes based loosely on STL versions. These were difficult to - * maintain and very rarely used. For the 2.0 header we now assume - * the presence of the standard library unless requested otherwise. - * We use std::array, std::vector, std::shared_ptr and std::string - * throughout to safely manage memory and reduce the chance of a - * recurrance of earlier memory management bugs. - * - * These classes are used through typedefs in the cl namespace: - * cl::array, cl::vector, cl::pointer and cl::string. - * In addition cl::allocate_pointer forwards to std::allocate_shared - * by default. - * In all cases these standard library classes can be replaced with - * custom interface-compatible versions using the CL_HPP_NO_STD_ARRAY, - * CL_HPP_NO_STD_VECTOR, CL_HPP_NO_STD_UNIQUE_PTR and - * CL_HPP_NO_STD_STRING macros. - * - * The OpenCL 1.x versions of the C++ bindings included a size_t wrapper - * class to interface with kernel enqueue. This caused unpleasant interactions - * with the standard size_t declaration and led to namespacing bugs. - * In the 2.0 version we have replaced this with a std::array-based interface. - * However, the old behaviour can be regained for backward compatibility - * using the CL_HPP_ENABLE_SIZE_T_COMPATIBILITY macro. - * - * Finally, the program construction interface used a clumsy vector-of-pairs - * design in the earlier versions. We have replaced that with a cleaner - * vector-of-vectors and vector-of-strings design. However, for backward - * compatibility old behaviour can be regained with the - * CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY macro. - * - * In OpenCL 2.0 OpenCL C is not entirely backward compatibility with - * earlier versions. As a result a flag must be passed to the OpenCL C - * compiled to request OpenCL 2.0 compilation of kernels with 1.2 as - * the default in the absence of the flag. - * In some cases the C++ bindings automatically compile code for ease. - * For those cases the compilation defaults to OpenCL C 2.0. - * If this is not wanted, the CL_HPP_CL_1_2_DEFAULT_BUILD macro may - * be specified to assume 1.2 compilation. - * If more fine-grained decisions on a per-kernel bases are required - * then explicit build operations that take the flag should be used. - * - * - * \section parameterization Parameters - * This header may be parameterized by a set of preprocessor macros. - * - * - CL_HPP_TARGET_OPENCL_VERSION - * - * Defines the target OpenCL runtime version to build the header - * against. Defaults to 200, representing OpenCL 2.0. - * - * - CL_HPP_NO_STD_STRING - * - * Do not use the standard library string class. cl::string is not - * defined and may be defined by the user before opencl.hpp is - * included. - * - * - CL_HPP_NO_STD_VECTOR - * - * Do not use the standard library vector class. cl::vector is not - * defined and may be defined by the user before opencl.hpp is - * included. - * - * - CL_HPP_NO_STD_ARRAY - * - * Do not use the standard library array class. cl::array is not - * defined and may be defined by the user before opencl.hpp is - * included. - * - * - CL_HPP_NO_STD_UNIQUE_PTR - * - * Do not use the standard library unique_ptr class. cl::pointer and - * the cl::allocate_pointer functions are not defined and may be - * defined by the user before opencl.hpp is included. - * - * - CL_HPP_ENABLE_EXCEPTIONS - * - * Enable exceptions for use in the C++ bindings header. This is the - * preferred error handling mechanism but is not required. - * - * - CL_HPP_ENABLE_SIZE_T_COMPATIBILITY - * - * Backward compatibility option to support cl.hpp-style size_t - * class. Replaces the updated std::array derived version and - * removal of size_t from the namespace. Note that in this case the - * new size_t class is placed in the cl::compatibility namespace and - * thus requires an additional using declaration for direct backward - * compatibility. - * - * - CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY - * - * Enable older vector of pairs interface for construction of - * programs. - * - * - CL_HPP_CL_1_2_DEFAULT_BUILD - * - * Default to OpenCL C 1.2 compilation rather than OpenCL C 2.0 - * applies to use of cl::Program construction and other program - * build variants. - * - * - CL_HPP_USE_CL_DEVICE_FISSION - * - * Enable the cl_ext_device_fission extension. - * - * - CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR - * - * Enable the cl_khr_image2d_from_buffer extension. - * - * - CL_HPP_USE_CL_SUB_GROUPS_KHR - * - * Enable the cl_khr_subgroups extension. - * - * - CL_HPP_USE_DX_INTEROP - * - * Enable the cl_khr_d3d10_sharing extension. - * - * - CL_HPP_USE_IL_KHR - * - * Enable the cl_khr_il_program extension. - * - * - * \section example Example - * - * The following example shows a general use case for the C++ - * bindings, including support for the optional exception feature and - * also the supplied vector and string classes, see following sections for - * decriptions of these features. - * - * Note: the C++ bindings use std::call_once and therefore may need to be - * compiled using special command-line options (such as "-pthread") on some - * platforms! - * - * \code - #define CL_HPP_ENABLE_EXCEPTIONS - #define CL_HPP_TARGET_OPENCL_VERSION 200 - - #include - #include - #include - #include - #include - - const int numElements = 32; - - int main(void) - { - // Filter for a 2.0 or newer platform and set it as the default - std::vector platforms; - cl::Platform::get(&platforms); - cl::Platform plat; - for (auto &p : platforms) { - std::string platver = p.getInfo(); - if (platver.find("OpenCL 2.") != std::string::npos || - platver.find("OpenCL 3.") != std::string::npos) { - // Note: an OpenCL 3.x platform may not support all required features! - plat = p; - } - } - if (plat() == 0) { - std::cout << "No OpenCL 2.0 or newer platform found.\n"; - return -1; - } - - cl::Platform newP = cl::Platform::setDefault(plat); - if (newP != plat) { - std::cout << "Error setting default platform.\n"; - return -1; - } - - // C++11 raw string literal for the first kernel - std::string kernel1{R"CLC( - global int globalA; - kernel void updateGlobal() - { - globalA = 75; - } - )CLC"}; - - // Raw string literal for the second kernel - std::string kernel2{R"CLC( - typedef struct { global int *bar; } Foo; - kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB, - global int *output, int val, write_only pipe int outPipe, queue_t childQueue) - { - output[get_global_id(0)] = inputA[get_global_id(0)] + inputB[get_global_id(0)] + val + *(aNum->bar); - write_pipe(outPipe, &val); - queue_t default_queue = get_default_queue(); - ndrange_t ndrange = ndrange_1D(get_global_size(0)/2, get_global_size(0)/2); - - // Have a child kernel write into third quarter of output - enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, - ^{ - output[get_global_size(0)*2 + get_global_id(0)] = - inputA[get_global_size(0)*2 + get_global_id(0)] + inputB[get_global_size(0)*2 + get_global_id(0)] + globalA; - }); - - // Have a child kernel write into last quarter of output - enqueue_kernel(childQueue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, - ^{ - output[get_global_size(0)*3 + get_global_id(0)] = - inputA[get_global_size(0)*3 + get_global_id(0)] + inputB[get_global_size(0)*3 + get_global_id(0)] + globalA + 2; - }); - } - )CLC"}; - - std::vector programStrings; - programStrings.push_back(kernel1); - programStrings.push_back(kernel2); - - cl::Program vectorAddProgram(programStrings); - try { - vectorAddProgram.build("-cl-std=CL2.0"); - } - catch (...) { - // Print build info for all devices - cl_int buildErr = CL_SUCCESS; - auto buildInfo = vectorAddProgram.getBuildInfo(&buildErr); - for (auto &pair : buildInfo) { - std::cerr << pair.second << std::endl << std::endl; - } - - return 1; - } - - typedef struct { int *bar; } Foo; - - // Get and run kernel that initializes the program-scope global - // A test for kernels that take no arguments - auto program2Kernel = - cl::KernelFunctor<>(vectorAddProgram, "updateGlobal"); - program2Kernel( - cl::EnqueueArgs( - cl::NDRange(1))); - - ////////////////// - // SVM allocations - - auto anSVMInt = cl::allocate_svm>(); - *anSVMInt = 5; - cl::SVMAllocator>> svmAllocReadOnly; - auto fooPointer = cl::allocate_pointer(svmAllocReadOnly); - fooPointer->bar = anSVMInt.get(); - cl::SVMAllocator> svmAlloc; - std::vector>> inputA(numElements, 1, svmAlloc); - cl::coarse_svm_vector inputB(numElements, 2, svmAlloc); - - ////////////// - // Traditional cl_mem allocations - - std::vector output(numElements, 0xdeadbeef); - cl::Buffer outputBuffer(begin(output), end(output), false); - cl::Pipe aPipe(sizeof(cl_int), numElements / 2); - - // Default command queue, also passed in as a parameter - cl::DeviceCommandQueue defaultDeviceQueue = cl::DeviceCommandQueue::makeDefault( - cl::Context::getDefault(), cl::Device::getDefault()); - - auto vectorAddKernel = - cl::KernelFunctor< - decltype(fooPointer)&, - int*, - cl::coarse_svm_vector&, - cl::Buffer, - int, - cl::Pipe&, - cl::DeviceCommandQueue - >(vectorAddProgram, "vectorAdd"); - - // Ensure that the additional SVM pointer is available to the kernel - // This one was not passed as a parameter - vectorAddKernel.setSVMPointers(anSVMInt); - - cl_int error; - vectorAddKernel( - cl::EnqueueArgs( - cl::NDRange(numElements/2), - cl::NDRange(numElements/2)), - fooPointer, - inputA.data(), - inputB, - outputBuffer, - 3, - aPipe, - defaultDeviceQueue, - error - ); - - cl::copy(outputBuffer, begin(output), end(output)); - - cl::Device d = cl::Device::getDefault(); - - std::cout << "Output:\n"; - for (int i = 1; i < numElements; ++i) { - std::cout << "\t" << output[i] << "\n"; - } - std::cout << "\n\n"; - - return 0; - } - * - * \endcode - * - */ -#ifndef CL_HPP_ -#define CL_HPP_ - -/* Handle deprecated preprocessor definitions. In each case, we only check for - * the old name if the new name is not defined, so that user code can define - * both and hence work with either version of the bindings. - */ -#if !defined(CL_HPP_USE_DX_INTEROP) && defined(USE_DX_INTEROP) -# pragma message("opencl.hpp: USE_DX_INTEROP is deprecated. Define CL_HPP_USE_DX_INTEROP instead") -# define CL_HPP_USE_DX_INTEROP -#endif -#if !defined(CL_HPP_USE_CL_DEVICE_FISSION) && defined(USE_CL_DEVICE_FISSION) -# pragma message("opencl.hpp: USE_CL_DEVICE_FISSION is deprecated. Define CL_HPP_USE_CL_DEVICE_FISSION instead") -# define CL_HPP_USE_CL_DEVICE_FISSION -#endif -#if !defined(CL_HPP_ENABLE_EXCEPTIONS) && defined(__CL_ENABLE_EXCEPTIONS) -# pragma message("opencl.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. Define CL_HPP_ENABLE_EXCEPTIONS instead") -# define CL_HPP_ENABLE_EXCEPTIONS -#endif -#if !defined(CL_HPP_NO_STD_VECTOR) && defined(__NO_STD_VECTOR) -# pragma message("opencl.hpp: __NO_STD_VECTOR is deprecated. Define CL_HPP_NO_STD_VECTOR instead") -# define CL_HPP_NO_STD_VECTOR -#endif -#if !defined(CL_HPP_NO_STD_STRING) && defined(__NO_STD_STRING) -# pragma message("opencl.hpp: __NO_STD_STRING is deprecated. Define CL_HPP_NO_STD_STRING instead") -# define CL_HPP_NO_STD_STRING -#endif -#if defined(VECTOR_CLASS) -# pragma message("opencl.hpp: VECTOR_CLASS is deprecated. Alias cl::vector instead") -#endif -#if defined(STRING_CLASS) -# pragma message("opencl.hpp: STRING_CLASS is deprecated. Alias cl::string instead.") -#endif -#if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS) && defined(__CL_USER_OVERRIDE_ERROR_STRINGS) -# pragma message("opencl.hpp: __CL_USER_OVERRIDE_ERROR_STRINGS is deprecated. Define CL_HPP_USER_OVERRIDE_ERROR_STRINGS instead") -# define CL_HPP_USER_OVERRIDE_ERROR_STRINGS -#endif - -/* Warn about features that are no longer supported - */ -#if defined(__USE_DEV_VECTOR) -# pragma message("opencl.hpp: __USE_DEV_VECTOR is no longer supported. Expect compilation errors") -#endif -#if defined(__USE_DEV_STRING) -# pragma message("opencl.hpp: __USE_DEV_STRING is no longer supported. Expect compilation errors") -#endif - -/* Detect which version to target */ -#if !defined(CL_HPP_TARGET_OPENCL_VERSION) -# pragma message("opencl.hpp: CL_HPP_TARGET_OPENCL_VERSION is not defined. It will default to 300 (OpenCL 3.0)") -# define CL_HPP_TARGET_OPENCL_VERSION 300 -#endif -#if CL_HPP_TARGET_OPENCL_VERSION != 100 && \ - CL_HPP_TARGET_OPENCL_VERSION != 110 && \ - CL_HPP_TARGET_OPENCL_VERSION != 120 && \ - CL_HPP_TARGET_OPENCL_VERSION != 200 && \ - CL_HPP_TARGET_OPENCL_VERSION != 210 && \ - CL_HPP_TARGET_OPENCL_VERSION != 220 && \ - CL_HPP_TARGET_OPENCL_VERSION != 300 -# pragma message("opencl.hpp: CL_HPP_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220 or 300). It will be set to 300 (OpenCL 3.0).") -# undef CL_HPP_TARGET_OPENCL_VERSION -# define CL_HPP_TARGET_OPENCL_VERSION 300 -#endif - -/* Forward target OpenCL version to C headers if necessary */ -#if defined(CL_TARGET_OPENCL_VERSION) -/* Warn if prior definition of CL_TARGET_OPENCL_VERSION is lower than - * requested C++ bindings version */ -#if CL_TARGET_OPENCL_VERSION < CL_HPP_TARGET_OPENCL_VERSION -# pragma message("CL_TARGET_OPENCL_VERSION is already defined as is lower than CL_HPP_TARGET_OPENCL_VERSION") -#endif -#else -# define CL_TARGET_OPENCL_VERSION CL_HPP_TARGET_OPENCL_VERSION -#endif - -#if !defined(CL_HPP_MINIMUM_OPENCL_VERSION) -# define CL_HPP_MINIMUM_OPENCL_VERSION 200 -#endif -#if CL_HPP_MINIMUM_OPENCL_VERSION != 100 && \ - CL_HPP_MINIMUM_OPENCL_VERSION != 110 && \ - CL_HPP_MINIMUM_OPENCL_VERSION != 120 && \ - CL_HPP_MINIMUM_OPENCL_VERSION != 200 && \ - CL_HPP_MINIMUM_OPENCL_VERSION != 210 && \ - CL_HPP_MINIMUM_OPENCL_VERSION != 220 && \ - CL_HPP_MINIMUM_OPENCL_VERSION != 300 -# pragma message("opencl.hpp: CL_HPP_MINIMUM_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220 or 300). It will be set to 100") -# undef CL_HPP_MINIMUM_OPENCL_VERSION -# define CL_HPP_MINIMUM_OPENCL_VERSION 100 -#endif -#if CL_HPP_MINIMUM_OPENCL_VERSION > CL_HPP_TARGET_OPENCL_VERSION -# error "CL_HPP_MINIMUM_OPENCL_VERSION must not be greater than CL_HPP_TARGET_OPENCL_VERSION" -#endif - -#if CL_HPP_MINIMUM_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS) -# define CL_USE_DEPRECATED_OPENCL_1_0_APIS -#endif -#if CL_HPP_MINIMUM_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) -# define CL_USE_DEPRECATED_OPENCL_1_1_APIS -#endif -#if CL_HPP_MINIMUM_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) -# define CL_USE_DEPRECATED_OPENCL_1_2_APIS -#endif -#if CL_HPP_MINIMUM_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS) -# define CL_USE_DEPRECATED_OPENCL_2_0_APIS -#endif -#if CL_HPP_MINIMUM_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS) -# define CL_USE_DEPRECATED_OPENCL_2_1_APIS -#endif -#if CL_HPP_MINIMUM_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS) -# define CL_USE_DEPRECATED_OPENCL_2_2_APIS -#endif - -#ifdef _WIN32 - -#include - -#if defined(CL_HPP_USE_DX_INTEROP) -#include -#include -#endif -#endif // _WIN32 - -#if defined(_MSC_VER) -#include -#endif // _MSC_VER - - // Check for a valid C++ version - -// Need to do both tests here because for some reason __cplusplus is not -// updated in visual studio -#if (!defined(_MSC_VER) && __cplusplus < 201103L) || (defined(_MSC_VER) && _MSC_VER < 1700) -#error Visual studio 2013 or another C++11-supporting compiler required -#endif - -// -#if defined(CL_HPP_USE_CL_DEVICE_FISSION) || defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) -#include -#endif - -#if defined(__APPLE__) || defined(__MACOSX) -#include -#else -#include -#endif // !__APPLE__ - -#if (__cplusplus >= 201103L || _MSVC_LANG >= 201103L ) -#define CL_HPP_NOEXCEPT_ noexcept -#else -#define CL_HPP_NOEXCEPT_ -#endif - -#if __cplusplus >= 201703L -# define CL_HPP_DEFINE_STATIC_MEMBER_ inline -#elif defined(_MSC_VER) -# define CL_HPP_DEFINE_STATIC_MEMBER_ __declspec(selectany) -#elif defined(__MINGW32__) -# define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((selectany)) -#else -# define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((weak)) -#endif // !_MSC_VER - -// Define deprecated prefixes and suffixes to ensure compilation -// in case they are not pre-defined -#if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED) -#define CL_API_PREFIX__VERSION_1_1_DEPRECATED -#endif // #if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED) -#if !defined(CL_API_SUFFIX__VERSION_1_1_DEPRECATED) -#define CL_API_SUFFIX__VERSION_1_1_DEPRECATED -#endif // #if !defined(CL_API_SUFFIX__VERSION_1_1_DEPRECATED) - -#if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED) -#define CL_API_PREFIX__VERSION_1_2_DEPRECATED -#endif // #if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED) -#if !defined(CL_API_SUFFIX__VERSION_1_2_DEPRECATED) -#define CL_API_SUFFIX__VERSION_1_2_DEPRECATED -#endif // #if !defined(CL_API_SUFFIX__VERSION_1_2_DEPRECATED) - -#if !defined(CL_API_PREFIX__VERSION_2_2_DEPRECATED) -#define CL_API_PREFIX__VERSION_2_2_DEPRECATED -#endif // #if !defined(CL_API_PREFIX__VERSION_2_2_DEPRECATED) -#if !defined(CL_API_SUFFIX__VERSION_2_2_DEPRECATED) -#define CL_API_SUFFIX__VERSION_2_2_DEPRECATED -#endif // #if !defined(CL_API_SUFFIX__VERSION_2_2_DEPRECATED) - -#if !defined(CL_CALLBACK) -#define CL_CALLBACK -#endif //CL_CALLBACK - -#include -#include -#include -#include -#include -#include - - -// Define a size_type to represent a correctly resolved size_t -#if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY) -namespace cl { - using size_type = ::size_t; -} // namespace cl -#else // #if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY) -namespace cl { - using size_type = size_t; -} // namespace cl -#endif // #if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY) - - -#if defined(CL_HPP_ENABLE_EXCEPTIONS) -#include -#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS) - -#if !defined(CL_HPP_NO_STD_VECTOR) -#include -namespace cl { - template < class T, class Alloc = std::allocator > - using vector = std::vector; -} // namespace cl -#endif // #if !defined(CL_HPP_NO_STD_VECTOR) - -#if !defined(CL_HPP_NO_STD_STRING) -#include -namespace cl { - using string = std::string; -} // namespace cl -#endif // #if !defined(CL_HPP_NO_STD_STRING) - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - -#if !defined(CL_HPP_NO_STD_UNIQUE_PTR) -#include -namespace cl { - // Replace unique_ptr and allocate_pointer for internal use - // to allow user to replace them - template - using pointer = std::unique_ptr; -} // namespace cl -#endif -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 -#if !defined(CL_HPP_NO_STD_ARRAY) -#include -namespace cl { - template < class T, size_type N > - using array = std::array; -} // namespace cl -#endif // #if !defined(CL_HPP_NO_STD_ARRAY) - -// Define size_type appropriately to allow backward-compatibility -// use of the old size_t interface class -#if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY) -namespace cl { - namespace compatibility { - /*! \brief class used to interface between C++ and - * OpenCL C calls that require arrays of size_t values, whose - * size is known statically. - */ - template - class size_t - { - private: - size_type data_[N]; - - public: - //! \brief Initialize size_t to all 0s - size_t() - { - for (int i = 0; i < N; ++i) { - data_[i] = 0; - } - } - - size_t(const array &rhs) - { - for (int i = 0; i < N; ++i) { - data_[i] = rhs[i]; - } - } - - size_type& operator[](int index) - { - return data_[index]; - } - - const size_type& operator[](int index) const - { - return data_[index]; - } - - //! \brief Conversion operator to T*. - operator size_type* () { return data_; } - - //! \brief Conversion operator to const T*. - operator const size_type* () const { return data_; } - - operator array() const - { - array ret; - - for (int i = 0; i < N; ++i) { - ret[i] = data_[i]; - } - return ret; - } - }; - } // namespace compatibility - - template - using size_t = compatibility::size_t; -} // namespace cl -#endif // #if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY) - -// Helper alias to avoid confusing the macros -namespace cl { - namespace detail { - using size_t_array = array; - } // namespace detail -} // namespace cl - - -/*! \namespace cl - * - * \brief The OpenCL C++ bindings are defined within this namespace. - * - */ -namespace cl { - class Memory; - -#define CL_HPP_INIT_CL_EXT_FCN_PTR_(name) \ - if (!pfn_##name) { \ - pfn_##name = (PFN_##name) \ - clGetExtensionFunctionAddress(#name); \ - if (!pfn_##name) { \ - } \ - } - -#define CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, name) \ - if (!pfn_##name) { \ - pfn_##name = (PFN_##name) \ - clGetExtensionFunctionAddressForPlatform(platform, #name); \ - if (!pfn_##name) { \ - } \ - } - - class Program; - class Device; - class Context; - class CommandQueue; - class DeviceCommandQueue; - class Memory; - class Buffer; - class Pipe; - -#if defined(CL_HPP_ENABLE_EXCEPTIONS) - /*! \brief Exception class - * - * This may be thrown by API functions when CL_HPP_ENABLE_EXCEPTIONS is defined. - */ - class Error : public std::exception - { - private: - cl_int err_; - const char * errStr_; - public: - /*! \brief Create a new CL error exception for a given error code - * and corresponding message. - * - * \param err error code value. - * - * \param errStr a descriptive string that must remain in scope until - * handling of the exception has concluded. If set, it - * will be returned by what(). - */ - Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr) - {} - - ~Error() throw() {} - - /*! \brief Get error string associated with exception - * - * \return A memory pointer to the error message string. - */ - virtual const char * what() const throw () - { - if (errStr_ == NULL) { - return "empty"; - } - else { - return errStr_; - } - } - - /*! \brief Get error code associated with exception - * - * \return The error code. - */ - cl_int err(void) const { return err_; } - }; -#define CL_HPP_ERR_STR_(x) #x -#else -#define CL_HPP_ERR_STR_(x) NULL -#endif // CL_HPP_ENABLE_EXCEPTIONS - - -namespace detail -{ -#if defined(CL_HPP_ENABLE_EXCEPTIONS) -static inline cl_int errHandler ( - cl_int err, - const char * errStr = NULL) -{ - if (err != CL_SUCCESS) { - throw Error(err, errStr); - } - return err; -} -#else -static inline cl_int errHandler (cl_int err, const char * errStr = NULL) -{ - (void) errStr; // suppress unused variable warning - return err; -} -#endif // CL_HPP_ENABLE_EXCEPTIONS -} - - - -//! \cond DOXYGEN_DETAIL -#if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS) -#define __GET_DEVICE_INFO_ERR CL_HPP_ERR_STR_(clGetDeviceInfo) -#define __GET_PLATFORM_INFO_ERR CL_HPP_ERR_STR_(clGetPlatformInfo) -#define __GET_DEVICE_IDS_ERR CL_HPP_ERR_STR_(clGetDeviceIDs) -#define __GET_PLATFORM_IDS_ERR CL_HPP_ERR_STR_(clGetPlatformIDs) -#define __GET_CONTEXT_INFO_ERR CL_HPP_ERR_STR_(clGetContextInfo) -#define __GET_EVENT_INFO_ERR CL_HPP_ERR_STR_(clGetEventInfo) -#define __GET_EVENT_PROFILE_INFO_ERR CL_HPP_ERR_STR_(clGetEventProfileInfo) -#define __GET_MEM_OBJECT_INFO_ERR CL_HPP_ERR_STR_(clGetMemObjectInfo) -#define __GET_IMAGE_INFO_ERR CL_HPP_ERR_STR_(clGetImageInfo) -#define __GET_SAMPLER_INFO_ERR CL_HPP_ERR_STR_(clGetSamplerInfo) -#define __GET_KERNEL_INFO_ERR CL_HPP_ERR_STR_(clGetKernelInfo) -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 -#define __GET_KERNEL_ARG_INFO_ERR CL_HPP_ERR_STR_(clGetKernelArgInfo) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 -#define __GET_KERNEL_SUB_GROUP_INFO_ERR CL_HPP_ERR_STR_(clGetKernelSubGroupInfo) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 -#define __GET_KERNEL_WORK_GROUP_INFO_ERR CL_HPP_ERR_STR_(clGetKernelWorkGroupInfo) -#define __GET_PROGRAM_INFO_ERR CL_HPP_ERR_STR_(clGetProgramInfo) -#define __GET_PROGRAM_BUILD_INFO_ERR CL_HPP_ERR_STR_(clGetProgramBuildInfo) -#define __GET_COMMAND_QUEUE_INFO_ERR CL_HPP_ERR_STR_(clGetCommandQueueInfo) - -#define __CREATE_CONTEXT_ERR CL_HPP_ERR_STR_(clCreateContext) -#define __CREATE_CONTEXT_FROM_TYPE_ERR CL_HPP_ERR_STR_(clCreateContextFromType) -#define __GET_SUPPORTED_IMAGE_FORMATS_ERR CL_HPP_ERR_STR_(clGetSupportedImageFormats) - -#define __CREATE_BUFFER_ERR CL_HPP_ERR_STR_(clCreateBuffer) -#define __COPY_ERR CL_HPP_ERR_STR_(cl::copy) -#define __CREATE_SUBBUFFER_ERR CL_HPP_ERR_STR_(clCreateSubBuffer) -#define __CREATE_GL_BUFFER_ERR CL_HPP_ERR_STR_(clCreateFromGLBuffer) -#define __CREATE_GL_RENDER_BUFFER_ERR CL_HPP_ERR_STR_(clCreateFromGLBuffer) -#define __GET_GL_OBJECT_INFO_ERR CL_HPP_ERR_STR_(clGetGLObjectInfo) -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 -#define __CREATE_IMAGE_ERR CL_HPP_ERR_STR_(clCreateImage) -#define __CREATE_GL_TEXTURE_ERR CL_HPP_ERR_STR_(clCreateFromGLTexture) -#define __IMAGE_DIMENSION_ERR CL_HPP_ERR_STR_(Incorrect image dimensions) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 -#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR CL_HPP_ERR_STR_(clSetMemObjectDestructorCallback) - -#define __CREATE_USER_EVENT_ERR CL_HPP_ERR_STR_(clCreateUserEvent) -#define __SET_USER_EVENT_STATUS_ERR CL_HPP_ERR_STR_(clSetUserEventStatus) -#define __SET_EVENT_CALLBACK_ERR CL_HPP_ERR_STR_(clSetEventCallback) -#define __WAIT_FOR_EVENTS_ERR CL_HPP_ERR_STR_(clWaitForEvents) - -#define __CREATE_KERNEL_ERR CL_HPP_ERR_STR_(clCreateKernel) -#define __SET_KERNEL_ARGS_ERR CL_HPP_ERR_STR_(clSetKernelArg) -#define __CREATE_PROGRAM_WITH_SOURCE_ERR CL_HPP_ERR_STR_(clCreateProgramWithSource) -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 -#define __CREATE_PROGRAM_WITH_IL_ERR CL_HPP_ERR_STR_(clCreateProgramWithIL) -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 -#define __CREATE_PROGRAM_WITH_BINARY_ERR CL_HPP_ERR_STR_(clCreateProgramWithBinary) -#if CL_HPP_TARGET_OPENCL_VERSION >= 210 -#define __CREATE_PROGRAM_WITH_IL_ERR CL_HPP_ERR_STR_(clCreateProgramWithIL) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210 -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 -#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR CL_HPP_ERR_STR_(clCreateProgramWithBuiltInKernels) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 -#define __BUILD_PROGRAM_ERR CL_HPP_ERR_STR_(clBuildProgram) -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 -#define __COMPILE_PROGRAM_ERR CL_HPP_ERR_STR_(clCompileProgram) -#define __LINK_PROGRAM_ERR CL_HPP_ERR_STR_(clLinkProgram) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 -#define __CREATE_KERNELS_IN_PROGRAM_ERR CL_HPP_ERR_STR_(clCreateKernelsInProgram) - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 -#define __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR CL_HPP_ERR_STR_(clCreateCommandQueueWithProperties) -#define __CREATE_SAMPLER_WITH_PROPERTIES_ERR CL_HPP_ERR_STR_(clCreateSamplerWithProperties) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 -#define __SET_COMMAND_QUEUE_PROPERTY_ERR CL_HPP_ERR_STR_(clSetCommandQueueProperty) -#define __ENQUEUE_READ_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueReadBuffer) -#define __ENQUEUE_READ_BUFFER_RECT_ERR CL_HPP_ERR_STR_(clEnqueueReadBufferRect) -#define __ENQUEUE_WRITE_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueWriteBuffer) -#define __ENQUEUE_WRITE_BUFFER_RECT_ERR CL_HPP_ERR_STR_(clEnqueueWriteBufferRect) -#define __ENQEUE_COPY_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueCopyBuffer) -#define __ENQEUE_COPY_BUFFER_RECT_ERR CL_HPP_ERR_STR_(clEnqueueCopyBufferRect) -#define __ENQUEUE_FILL_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueFillBuffer) -#define __ENQUEUE_READ_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueReadImage) -#define __ENQUEUE_WRITE_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueWriteImage) -#define __ENQUEUE_COPY_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueCopyImage) -#define __ENQUEUE_FILL_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueFillImage) -#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueCopyImageToBuffer) -#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueCopyBufferToImage) -#define __ENQUEUE_MAP_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueMapBuffer) -#define __ENQUEUE_MAP_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueMapImage) -#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR CL_HPP_ERR_STR_(clEnqueueUnMapMemObject) -#define __ENQUEUE_NDRANGE_KERNEL_ERR CL_HPP_ERR_STR_(clEnqueueNDRangeKernel) -#define __ENQUEUE_NATIVE_KERNEL CL_HPP_ERR_STR_(clEnqueueNativeKernel) -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 -#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR CL_HPP_ERR_STR_(clEnqueueMigrateMemObjects) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 -#if CL_HPP_TARGET_OPENCL_VERSION >= 210 -#define __ENQUEUE_MIGRATE_SVM_ERR CL_HPP_ERR_STR_(clEnqueueSVMMigrateMem) -#define __SET_DEFAULT_DEVICE_COMMAND_QUEUE_ERR CL_HPP_ERR_STR_(clSetDefaultDeviceCommandQueue) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210 - - -#define __ENQUEUE_ACQUIRE_GL_ERR CL_HPP_ERR_STR_(clEnqueueAcquireGLObjects) -#define __ENQUEUE_RELEASE_GL_ERR CL_HPP_ERR_STR_(clEnqueueReleaseGLObjects) - -#define __CREATE_PIPE_ERR CL_HPP_ERR_STR_(clCreatePipe) -#define __GET_PIPE_INFO_ERR CL_HPP_ERR_STR_(clGetPipeInfo) - - -#define __RETAIN_ERR CL_HPP_ERR_STR_(Retain Object) -#define __RELEASE_ERR CL_HPP_ERR_STR_(Release Object) -#define __FLUSH_ERR CL_HPP_ERR_STR_(clFlush) -#define __FINISH_ERR CL_HPP_ERR_STR_(clFinish) -#define __VECTOR_CAPACITY_ERR CL_HPP_ERR_STR_(Vector capacity error) - -#if CL_HPP_TARGET_OPENCL_VERSION >= 210 -#define __GET_HOST_TIMER_ERR CL_HPP_ERR_STR_(clGetHostTimer) -#define __GET_DEVICE_AND_HOST_TIMER_ERR CL_HPP_ERR_STR_(clGetDeviceAndHostTimer) -#endif -#if CL_HPP_TARGET_OPENCL_VERSION >= 220 -#define __SET_PROGRAM_RELEASE_CALLBACK_ERR CL_HPP_ERR_STR_(clSetProgramReleaseCallback) -#define __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR CL_HPP_ERR_STR_(clSetProgramSpecializationConstant) -#endif - - -/** - * CL 1.2 version that uses device fission. - */ -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 -#define __CREATE_SUB_DEVICES_ERR CL_HPP_ERR_STR_(clCreateSubDevices) -#else -#define __CREATE_SUB_DEVICES_ERR CL_HPP_ERR_STR_(clCreateSubDevicesEXT) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 - -/** - * Deprecated APIs for 1.2 - */ -#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) -#define __ENQUEUE_MARKER_ERR CL_HPP_ERR_STR_(clEnqueueMarker) -#define __ENQUEUE_WAIT_FOR_EVENTS_ERR CL_HPP_ERR_STR_(clEnqueueWaitForEvents) -#define __ENQUEUE_BARRIER_ERR CL_HPP_ERR_STR_(clEnqueueBarrier) -#define __UNLOAD_COMPILER_ERR CL_HPP_ERR_STR_(clUnloadCompiler) -#define __CREATE_GL_TEXTURE_2D_ERR CL_HPP_ERR_STR_(clCreateFromGLTexture2D) -#define __CREATE_GL_TEXTURE_3D_ERR CL_HPP_ERR_STR_(clCreateFromGLTexture3D) -#define __CREATE_IMAGE2D_ERR CL_HPP_ERR_STR_(clCreateImage2D) -#define __CREATE_IMAGE3D_ERR CL_HPP_ERR_STR_(clCreateImage3D) -#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - -/** - * Deprecated APIs for 2.0 - */ -#if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) -#define __CREATE_COMMAND_QUEUE_ERR CL_HPP_ERR_STR_(clCreateCommandQueue) -#define __ENQUEUE_TASK_ERR CL_HPP_ERR_STR_(clEnqueueTask) -#define __CREATE_SAMPLER_ERR CL_HPP_ERR_STR_(clCreateSampler) -#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - -/** - * CL 1.2 marker and barrier commands - */ -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 -#define __ENQUEUE_MARKER_WAIT_LIST_ERR CL_HPP_ERR_STR_(clEnqueueMarkerWithWaitList) -#define __ENQUEUE_BARRIER_WAIT_LIST_ERR CL_HPP_ERR_STR_(clEnqueueBarrierWithWaitList) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 - -#if CL_HPP_TARGET_OPENCL_VERSION >= 210 -#define __CLONE_KERNEL_ERR CL_HPP_ERR_STR_(clCloneKernel) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210 - -#endif // CL_HPP_USER_OVERRIDE_ERROR_STRINGS -//! \endcond - - -namespace detail { - -// Generic getInfoHelper. The final parameter is used to guide overload -// resolution: the actual parameter passed is an int, which makes this -// a worse conversion sequence than a specialization that declares the -// parameter as an int. -template -inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long) -{ - return f(name, sizeof(T), param, NULL); -} - -// Specialized for getInfo -// Assumes that the output vector was correctly resized on the way in -template -inline cl_int getInfoHelper(Func f, cl_uint name, vector>* param, int) -{ - if (name != CL_PROGRAM_BINARIES) { - return CL_INVALID_VALUE; - } - if (param) { - // Create array of pointers, calculate total size and pass pointer array in - size_type numBinaries = param->size(); - vector binariesPointers(numBinaries); - - for (size_type i = 0; i < numBinaries; ++i) - { - binariesPointers[i] = (*param)[i].data(); - } - - cl_int err = f(name, numBinaries * sizeof(unsigned char*), binariesPointers.data(), NULL); - - if (err != CL_SUCCESS) { - return err; - } - } - - - return CL_SUCCESS; -} - -// Specialized getInfoHelper for vector params -template -inline cl_int getInfoHelper(Func f, cl_uint name, vector* param, long) -{ - size_type required; - cl_int err = f(name, 0, NULL, &required); - if (err != CL_SUCCESS) { - return err; - } - const size_type elements = required / sizeof(T); - - // Temporary to avoid changing param on an error - vector localData(elements); - err = f(name, required, localData.data(), NULL); - if (err != CL_SUCCESS) { - return err; - } - if (param) { - *param = std::move(localData); - } - - return CL_SUCCESS; -} - -/* Specialization for reference-counted types. This depends on the - * existence of Wrapper::cl_type, and none of the other types having the - * cl_type member. Note that simplify specifying the parameter as Wrapper - * does not work, because when using a derived type (e.g. Context) the generic - * template will provide a better match. - */ -template -inline cl_int getInfoHelper( - Func f, cl_uint name, vector* param, int, typename T::cl_type = 0) -{ - size_type required; - cl_int err = f(name, 0, NULL, &required); - if (err != CL_SUCCESS) { - return err; - } - - const size_type elements = required / sizeof(typename T::cl_type); - - vector value(elements); - err = f(name, required, value.data(), NULL); - if (err != CL_SUCCESS) { - return err; - } - - if (param) { - // Assign to convert CL type to T for each element - param->resize(elements); - - // Assign to param, constructing with retain behaviour - // to correctly capture each underlying CL object - for (size_type i = 0; i < elements; i++) { - (*param)[i] = T(value[i], true); - } - } - return CL_SUCCESS; -} - -// Specialized GetInfoHelper for string params -template -inline cl_int getInfoHelper(Func f, cl_uint name, string* param, long) -{ - size_type required; - cl_int err = f(name, 0, NULL, &required); - if (err != CL_SUCCESS) { - return err; - } - - // std::string has a constant data member - // a char vector does not - if (required > 0) { - vector value(required); - err = f(name, required, value.data(), NULL); - if (err != CL_SUCCESS) { - return err; - } - if (param) { - param->assign(begin(value), prev(end(value))); - } - } - else if (param) { - param->assign(""); - } - return CL_SUCCESS; -} - -// Specialized GetInfoHelper for clsize_t params -template -inline cl_int getInfoHelper(Func f, cl_uint name, array* param, long) -{ - size_type required; - cl_int err = f(name, 0, NULL, &required); - if (err != CL_SUCCESS) { - return err; - } - - size_type elements = required / sizeof(size_type); - vector value(elements, 0); - - err = f(name, required, value.data(), NULL); - if (err != CL_SUCCESS) { - return err; - } - - // Bound the copy with N to prevent overruns - // if passed N > than the amount copied - if (elements > N) { - elements = N; - } - for (size_type i = 0; i < elements; ++i) { - (*param)[i] = value[i]; - } - - return CL_SUCCESS; -} - -template struct ReferenceHandler; - -/* Specialization for reference-counted types. This depends on the - * existence of Wrapper::cl_type, and none of the other types having the - * cl_type member. Note that simplify specifying the parameter as Wrapper - * does not work, because when using a derived type (e.g. Context) the generic - * template will provide a better match. - */ -template -inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0) -{ - typename T::cl_type value; - cl_int err = f(name, sizeof(value), &value, NULL); - if (err != CL_SUCCESS) { - return err; - } - *param = value; - if (value != NULL) - { - err = param->retain(); - if (err != CL_SUCCESS) { - return err; - } - } - return CL_SUCCESS; -} - -#define CL_HPP_PARAM_NAME_INFO_1_0_(F) \ - F(cl_platform_info, CL_PLATFORM_PROFILE, string) \ - F(cl_platform_info, CL_PLATFORM_VERSION, string) \ - F(cl_platform_info, CL_PLATFORM_NAME, string) \ - F(cl_platform_info, CL_PLATFORM_VENDOR, string) \ - F(cl_platform_info, CL_PLATFORM_EXTENSIONS, string) \ - \ - F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \ - F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, size_type) \ - F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, cl::vector) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \ - F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \ - F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, size_type) \ - F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, size_type) \ - F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, size_type) \ - F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, size_type) \ - F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, size_type) \ - F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \ - F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, size_type) \ - F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \ - F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \ - F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \ - F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \ - F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \ - F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \ - F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\ - F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \ - F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \ - F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \ - F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \ - F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \ - F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \ - F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \ - F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, size_type) \ - F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \ - F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \ - F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \ - F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \ - F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \ - F(cl_device_info, CL_DEVICE_NAME, string) \ - F(cl_device_info, CL_DEVICE_VENDOR, string) \ - F(cl_device_info, CL_DRIVER_VERSION, string) \ - F(cl_device_info, CL_DEVICE_PROFILE, string) \ - F(cl_device_info, CL_DEVICE_VERSION, string) \ - F(cl_device_info, CL_DEVICE_EXTENSIONS, string) \ - \ - F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \ - F(cl_context_info, CL_CONTEXT_DEVICES, cl::vector) \ - F(cl_context_info, CL_CONTEXT_PROPERTIES, cl::vector) \ - \ - F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \ - F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \ - F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \ - F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_int) \ - \ - F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \ - F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \ - F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \ - F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \ - \ - F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \ - F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \ - F(cl_mem_info, CL_MEM_SIZE, size_type) \ - F(cl_mem_info, CL_MEM_HOST_PTR, void*) \ - F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \ - F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \ - F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \ - \ - F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \ - F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, size_type) \ - F(cl_image_info, CL_IMAGE_ROW_PITCH, size_type) \ - F(cl_image_info, CL_IMAGE_SLICE_PITCH, size_type) \ - F(cl_image_info, CL_IMAGE_WIDTH, size_type) \ - F(cl_image_info, CL_IMAGE_HEIGHT, size_type) \ - F(cl_image_info, CL_IMAGE_DEPTH, size_type) \ - \ - F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \ - F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \ - F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_bool) \ - F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_addressing_mode) \ - F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_filter_mode) \ - \ - F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \ - F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \ - F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \ - F(cl_program_info, CL_PROGRAM_DEVICES, cl::vector) \ - F(cl_program_info, CL_PROGRAM_SOURCE, string) \ - F(cl_program_info, CL_PROGRAM_BINARY_SIZES, cl::vector) \ - F(cl_program_info, CL_PROGRAM_BINARIES, cl::vector>) \ - \ - F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \ - F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, string) \ - F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, string) \ - \ - F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, string) \ - F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \ - F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \ - F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \ - F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \ - \ - F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, size_type) \ - F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::detail::size_t_array) \ - F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \ - \ - F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \ - F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \ - F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \ - F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties) - - -#define CL_HPP_PARAM_NAME_INFO_1_1_(F) \ - F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \ - F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, string) \ - \ - F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \ - F(cl_mem_info, CL_MEM_OFFSET, size_type) \ - \ - F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, size_type) \ - F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \ - \ - F(cl_event_info, CL_EVENT_CONTEXT, cl::Context) - -#define CL_HPP_PARAM_NAME_INFO_1_2_(F) \ - F(cl_program_info, CL_PROGRAM_NUM_KERNELS, size_type) \ - F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, string) \ - \ - F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \ - \ - F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, string) \ - \ - F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \ - F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \ - F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, string) \ - F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, string) \ - F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_QUALIFIER, cl_kernel_arg_type_qualifier) \ - \ - F(cl_kernel_work_group_info, CL_KERNEL_GLOBAL_WORK_SIZE, cl::detail::size_t_array) \ - \ - F(cl_device_info, CL_DEVICE_LINKER_AVAILABLE, cl_bool) \ - F(cl_device_info, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, size_type) \ - F(cl_device_info, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, size_type) \ - F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl::Device) \ - F(cl_device_info, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, cl_uint) \ - F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, cl::vector) \ - F(cl_device_info, CL_DEVICE_PARTITION_TYPE, cl::vector) \ - F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, cl_bool) \ - F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \ - F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, string) \ - F(cl_device_info, CL_DEVICE_PRINTF_BUFFER_SIZE, size_type) \ - \ - F(cl_image_info, CL_IMAGE_ARRAY_SIZE, size_type) \ - F(cl_image_info, CL_IMAGE_NUM_MIP_LEVELS, cl_uint) \ - F(cl_image_info, CL_IMAGE_NUM_SAMPLES, cl_uint) - -#define CL_HPP_PARAM_NAME_INFO_2_0_(F) \ - F(cl_device_info, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES, cl_command_queue_properties) \ - F(cl_device_info, CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES, cl_command_queue_properties) \ - F(cl_device_info, CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE, cl_uint) \ - F(cl_device_info, CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_ON_DEVICE_QUEUES, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_ON_DEVICE_EVENTS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_PIPE_ARGS, cl_uint) \ - F(cl_device_info, CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS, cl_uint) \ - F(cl_device_info, CL_DEVICE_PIPE_MAX_PACKET_SIZE, cl_uint) \ - F(cl_device_info, CL_DEVICE_SVM_CAPABILITIES, cl_device_svm_capabilities) \ - F(cl_device_info, CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT, cl_uint) \ - F(cl_device_info, CL_DEVICE_IMAGE_PITCH_ALIGNMENT, cl_uint) \ - F(cl_device_info, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS, cl_uint ) \ - F(cl_device_info, CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE, size_type ) \ - F(cl_device_info, CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE, size_type ) \ - F(cl_profiling_info, CL_PROFILING_COMMAND_COMPLETE, cl_ulong) \ - F(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM, cl_bool) \ - F(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_SVM_PTRS, void**) \ - F(cl_command_queue_info, CL_QUEUE_SIZE, cl_uint) \ - F(cl_mem_info, CL_MEM_USES_SVM_POINTER, cl_bool) \ - F(cl_program_build_info, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, size_type) \ - F(cl_pipe_info, CL_PIPE_PACKET_SIZE, cl_uint) \ - F(cl_pipe_info, CL_PIPE_MAX_PACKETS, cl_uint) - -#define CL_HPP_PARAM_NAME_INFO_SUBGROUP_KHR_(F) \ - F(cl_kernel_sub_group_info, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR, size_type) \ - F(cl_kernel_sub_group_info, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR, size_type) - -#define CL_HPP_PARAM_NAME_INFO_IL_KHR_(F) \ - F(cl_device_info, CL_DEVICE_IL_VERSION_KHR, string) \ - F(cl_program_info, CL_PROGRAM_IL_KHR, cl::vector) - -#define CL_HPP_PARAM_NAME_INFO_2_1_(F) \ - F(cl_platform_info, CL_PLATFORM_HOST_TIMER_RESOLUTION, cl_ulong) \ - F(cl_program_info, CL_PROGRAM_IL, cl::vector) \ - F(cl_device_info, CL_DEVICE_MAX_NUM_SUB_GROUPS, cl_uint) \ - F(cl_device_info, CL_DEVICE_IL_VERSION, string) \ - F(cl_device_info, CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS, cl_bool) \ - F(cl_command_queue_info, CL_QUEUE_DEVICE_DEFAULT, cl::DeviceCommandQueue) \ - F(cl_kernel_sub_group_info, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, size_type) \ - F(cl_kernel_sub_group_info, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE, size_type) \ - F(cl_kernel_sub_group_info, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, cl::detail::size_t_array) \ - F(cl_kernel_sub_group_info, CL_KERNEL_MAX_NUM_SUB_GROUPS, size_type) \ - F(cl_kernel_sub_group_info, CL_KERNEL_COMPILE_NUM_SUB_GROUPS, size_type) - -#define CL_HPP_PARAM_NAME_INFO_2_2_(F) \ - F(cl_program_info, CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT, cl_bool) \ - F(cl_program_info, CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT, cl_bool) - -#define CL_HPP_PARAM_NAME_DEVICE_FISSION_(F) \ - F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \ - F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, cl::vector) \ - F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, cl::vector) \ - F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \ - F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, cl::vector) - -#define CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_CL3_SHARED_(F) \ - F(cl_platform_info, CL_PLATFORM_NUMERIC_VERSION_KHR, cl_version_khr) \ - F(cl_platform_info, CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR, cl::vector) \ - \ - F(cl_device_info, CL_DEVICE_NUMERIC_VERSION_KHR, cl_version_khr) \ - F(cl_device_info, CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR, cl::vector) \ - F(cl_device_info, CL_DEVICE_ILS_WITH_VERSION_KHR, cl::vector) \ - F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR, cl::vector) - -#define CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_KHRONLY_(F) \ - F(cl_device_info, CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR, cl_version_khr) - -#define CL_HPP_PARAM_NAME_INFO_3_0_(F) \ - F(cl_platform_info, CL_PLATFORM_NUMERIC_VERSION, cl_version) \ - F(cl_platform_info, CL_PLATFORM_EXTENSIONS_WITH_VERSION, cl::vector) \ - \ - F(cl_device_info, CL_DEVICE_NUMERIC_VERSION, cl_version) \ - F(cl_device_info, CL_DEVICE_EXTENSIONS_WITH_VERSION, cl::vector) \ - F(cl_device_info, CL_DEVICE_ILS_WITH_VERSION, cl::vector) \ - F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION, cl::vector) \ - F(cl_device_info, CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES, cl_device_atomic_capabilities) \ - F(cl_device_info, CL_DEVICE_ATOMIC_FENCE_CAPABILITIES, cl_device_atomic_capabilities) \ - F(cl_device_info, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, cl_bool) \ - F(cl_device_info, CL_DEVICE_OPENCL_C_ALL_VERSIONS, cl::vector) \ - F(cl_device_info, CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, size_type) \ - F(cl_device_info, CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT, cl_bool) \ - F(cl_device_info, CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT, cl_bool) \ - F(cl_device_info, CL_DEVICE_OPENCL_C_FEATURES, cl::vector) \ - F(cl_device_info, CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES, cl_device_device_enqueue_capabilities) \ - F(cl_device_info, CL_DEVICE_PIPE_SUPPORT, cl_bool) \ - F(cl_device_info, CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED, string) \ - \ - F(cl_command_queue_info, CL_QUEUE_PROPERTIES_ARRAY, cl::vector) \ - F(cl_mem_info, CL_MEM_PROPERTIES, cl::vector) \ - F(cl_pipe_info, CL_PIPE_PROPERTIES, cl::vector) \ - F(cl_sampler_info, CL_SAMPLER_PROPERTIES, cl::vector) - -template -struct param_traits {}; - -#define CL_HPP_DECLARE_PARAM_TRAITS_(token, param_name, T) \ -struct token; \ -template<> \ -struct param_traits \ -{ \ - enum { value = param_name }; \ - typedef T param_type; \ -}; - -CL_HPP_PARAM_NAME_INFO_1_0_(CL_HPP_DECLARE_PARAM_TRAITS_) -#if CL_HPP_TARGET_OPENCL_VERSION >= 110 -CL_HPP_PARAM_NAME_INFO_1_1_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 -CL_HPP_PARAM_NAME_INFO_1_2_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 -CL_HPP_PARAM_NAME_INFO_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 -#if CL_HPP_TARGET_OPENCL_VERSION >= 210 -CL_HPP_PARAM_NAME_INFO_2_1_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210 -#if CL_HPP_TARGET_OPENCL_VERSION >= 220 -CL_HPP_PARAM_NAME_INFO_2_2_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 220 -#if CL_HPP_TARGET_OPENCL_VERSION >= 300 -CL_HPP_PARAM_NAME_INFO_3_0_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 300 - -#if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210 -CL_HPP_PARAM_NAME_INFO_SUBGROUP_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // #if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210 - -#if defined(CL_HPP_USE_IL_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210 -CL_HPP_PARAM_NAME_INFO_IL_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // #if defined(CL_HPP_USE_IL_KHR) - - -// Flags deprecated in OpenCL 2.0 -#define CL_HPP_PARAM_NAME_INFO_1_0_DEPRECATED_IN_2_0_(F) \ - F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) - -#define CL_HPP_PARAM_NAME_INFO_1_1_DEPRECATED_IN_2_0_(F) \ - F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) - -#define CL_HPP_PARAM_NAME_INFO_1_2_DEPRECATED_IN_2_0_(F) \ - F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) - -// Include deprecated query flags based on versions -// Only include deprecated 1.0 flags if 2.0 not active as there is an enum clash -#if CL_HPP_TARGET_OPENCL_VERSION > 100 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 && CL_HPP_TARGET_OPENCL_VERSION < 200 -CL_HPP_PARAM_NAME_INFO_1_0_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 110 -#if CL_HPP_TARGET_OPENCL_VERSION > 110 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 -CL_HPP_PARAM_NAME_INFO_1_1_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120 -#if CL_HPP_TARGET_OPENCL_VERSION > 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 -CL_HPP_PARAM_NAME_INFO_1_2_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200 - -#if defined(CL_HPP_USE_CL_DEVICE_FISSION) -CL_HPP_PARAM_NAME_DEVICE_FISSION_(CL_HPP_DECLARE_PARAM_TRAITS_); -#endif // CL_HPP_USE_CL_DEVICE_FISSION - -#if defined(cl_khr_extended_versioning) -#if CL_HPP_TARGET_OPENCL_VERSION < 300 -CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_CL3_SHARED_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // CL_HPP_TARGET_OPENCL_VERSION < 300 -CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_KHRONLY_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // cl_khr_extended_versioning - -#if defined(cl_khr_device_uuid) -using uuid_array = array; -using luid_array = array; -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_UUID_KHR, uuid_array) -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DRIVER_UUID_KHR, uuid_array) -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LUID_VALID_KHR, cl_bool) -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LUID_KHR, luid_array) -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_NODE_MASK_KHR, cl_uint) -#endif - -#if defined(cl_khr_pci_bus_info) -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PCI_BUS_INFO_KHR, cl_device_pci_bus_info_khr) -#endif - -#if defined(cl_khr_integer_dot_product) -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR, cl_device_integer_dot_product_capabilities_khr) -#if defined(CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR) -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR, cl_device_integer_dot_product_acceleration_properties_khr) -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR, cl_device_integer_dot_product_acceleration_properties_khr) -#endif // defined(CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR) -#endif // defined(cl_khr_integer_dot_product) - -#ifdef CL_PLATFORM_ICD_SUFFIX_KHR -CL_HPP_DECLARE_PARAM_TRAITS_(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, string) -#endif - -#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong) -#endif -#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, vector) -#endif -#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_SIMD_WIDTH_AMD -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_BOARD_NAME_AMD -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_BOARD_NAME_AMD, string) -#endif - -#ifdef CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM, cl_ulong) -#endif -#ifdef CL_DEVICE_JOB_SLOTS_ARM -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_JOB_SLOTS_ARM, cl_uint) -#endif -#ifdef CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, cl_bitfield) -#endif -#ifdef CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM, vector) -#endif -#ifdef CL_DEVICE_MAX_WARP_COUNT_ARM -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_MAX_WARP_COUNT_ARM, cl_uint) -#endif -#ifdef CL_KERNEL_MAX_WARP_COUNT_ARM -CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_info, CL_KERNEL_MAX_WARP_COUNT_ARM, cl_uint) -#endif -#ifdef CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM -CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM, cl_uint) -#endif -#ifdef CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM -CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM, cl_int) -#endif -#ifdef CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM -CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM, cl_uint) -#endif -#ifdef CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM -CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM, cl_uint) -#endif - -#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint) -#endif -#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint) -#endif -#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint) -#endif -#ifdef CL_DEVICE_WARP_SIZE_NV -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint) -#endif -#ifdef CL_DEVICE_GPU_OVERLAP_NV -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool) -#endif -#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool) -#endif -#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV -CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool) -#endif - -// Convenience functions - -template -inline cl_int -getInfo(Func f, cl_uint name, T* param) -{ - return getInfoHelper(f, name, param, 0); -} - -template -struct GetInfoFunctor0 -{ - Func f_; const Arg0& arg0_; - cl_int operator ()( - cl_uint param, size_type size, void* value, size_type* size_ret) - { return f_(arg0_, param, size, value, size_ret); } -}; - -template -struct GetInfoFunctor1 -{ - Func f_; const Arg0& arg0_; const Arg1& arg1_; - cl_int operator ()( - cl_uint param, size_type size, void* value, size_type* size_ret) - { return f_(arg0_, arg1_, param, size, value, size_ret); } -}; - -template -inline cl_int -getInfo(Func f, const Arg0& arg0, cl_uint name, T* param) -{ - GetInfoFunctor0 f0 = { f, arg0 }; - return getInfoHelper(f0, name, param, 0); -} - -template -inline cl_int -getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param) -{ - GetInfoFunctor1 f0 = { f, arg0, arg1 }; - return getInfoHelper(f0, name, param, 0); -} - - -template -struct ReferenceHandler -{ }; - -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 -/** - * OpenCL 1.2 devices do have retain/release. - */ -template <> -struct ReferenceHandler -{ - /** - * Retain the device. - * \param device A valid device created using createSubDevices - * \return - * CL_SUCCESS if the function executed successfully. - * CL_INVALID_DEVICE if device was not a valid subdevice - * CL_OUT_OF_RESOURCES - * CL_OUT_OF_HOST_MEMORY - */ - static cl_int retain(cl_device_id device) - { return ::clRetainDevice(device); } - /** - * Retain the device. - * \param device A valid device created using createSubDevices - * \return - * CL_SUCCESS if the function executed successfully. - * CL_INVALID_DEVICE if device was not a valid subdevice - * CL_OUT_OF_RESOURCES - * CL_OUT_OF_HOST_MEMORY - */ - static cl_int release(cl_device_id device) - { return ::clReleaseDevice(device); } -}; -#else // CL_HPP_TARGET_OPENCL_VERSION >= 120 -/** - * OpenCL 1.1 devices do not have retain/release. - */ -template <> -struct ReferenceHandler -{ - // cl_device_id does not have retain(). - static cl_int retain(cl_device_id) - { return CL_SUCCESS; } - // cl_device_id does not have release(). - static cl_int release(cl_device_id) - { return CL_SUCCESS; } -}; -#endif // ! (CL_HPP_TARGET_OPENCL_VERSION >= 120) - -template <> -struct ReferenceHandler -{ - // cl_platform_id does not have retain(). - static cl_int retain(cl_platform_id) - { return CL_SUCCESS; } - // cl_platform_id does not have release(). - static cl_int release(cl_platform_id) - { return CL_SUCCESS; } -}; - -template <> -struct ReferenceHandler -{ - static cl_int retain(cl_context context) - { return ::clRetainContext(context); } - static cl_int release(cl_context context) - { return ::clReleaseContext(context); } -}; - -template <> -struct ReferenceHandler -{ - static cl_int retain(cl_command_queue queue) - { return ::clRetainCommandQueue(queue); } - static cl_int release(cl_command_queue queue) - { return ::clReleaseCommandQueue(queue); } -}; - -template <> -struct ReferenceHandler -{ - static cl_int retain(cl_mem memory) - { return ::clRetainMemObject(memory); } - static cl_int release(cl_mem memory) - { return ::clReleaseMemObject(memory); } -}; - -template <> -struct ReferenceHandler -{ - static cl_int retain(cl_sampler sampler) - { return ::clRetainSampler(sampler); } - static cl_int release(cl_sampler sampler) - { return ::clReleaseSampler(sampler); } -}; - -template <> -struct ReferenceHandler -{ - static cl_int retain(cl_program program) - { return ::clRetainProgram(program); } - static cl_int release(cl_program program) - { return ::clReleaseProgram(program); } -}; - -template <> -struct ReferenceHandler -{ - static cl_int retain(cl_kernel kernel) - { return ::clRetainKernel(kernel); } - static cl_int release(cl_kernel kernel) - { return ::clReleaseKernel(kernel); } -}; - -template <> -struct ReferenceHandler -{ - static cl_int retain(cl_event event) - { return ::clRetainEvent(event); } - static cl_int release(cl_event event) - { return ::clReleaseEvent(event); } -}; - - -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120 -// Extracts version number with major in the upper 16 bits, minor in the lower 16 -static cl_uint getVersion(const vector &versionInfo) -{ - int highVersion = 0; - int lowVersion = 0; - int index = 7; - while(versionInfo[index] != '.' ) { - highVersion *= 10; - highVersion += versionInfo[index]-'0'; - ++index; - } - ++index; - while(versionInfo[index] != ' ' && versionInfo[index] != '\0') { - lowVersion *= 10; - lowVersion += versionInfo[index]-'0'; - ++index; - } - return (highVersion << 16) | lowVersion; -} - -static cl_uint getPlatformVersion(cl_platform_id platform) -{ - size_type size = 0; - clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size); - - vector versionInfo(size); - clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, versionInfo.data(), &size); - return getVersion(versionInfo); -} - -static cl_uint getDevicePlatformVersion(cl_device_id device) -{ - cl_platform_id platform; - clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL); - return getPlatformVersion(platform); -} - -static cl_uint getContextPlatformVersion(cl_context context) -{ - // The platform cannot be queried directly, so we first have to grab a - // device and obtain its context - size_type size = 0; - clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size); - if (size == 0) - return 0; - vector devices(size/sizeof(cl_device_id)); - clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices.data(), NULL); - return getDevicePlatformVersion(devices[0]); -} -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120 - -template -class Wrapper -{ -public: - typedef T cl_type; - -protected: - cl_type object_; - -public: - Wrapper() : object_(NULL) { } - - Wrapper(const cl_type &obj, bool retainObject) : object_(obj) - { - if (retainObject) { - detail::errHandler(retain(), __RETAIN_ERR); - } - } - - ~Wrapper() - { - if (object_ != NULL) { release(); } - } - - Wrapper(const Wrapper& rhs) - { - object_ = rhs.object_; - detail::errHandler(retain(), __RETAIN_ERR); - } - - Wrapper(Wrapper&& rhs) CL_HPP_NOEXCEPT_ - { - object_ = rhs.object_; - rhs.object_ = NULL; - } - - Wrapper& operator = (const Wrapper& rhs) - { - if (this != &rhs) { - detail::errHandler(release(), __RELEASE_ERR); - object_ = rhs.object_; - detail::errHandler(retain(), __RETAIN_ERR); - } - return *this; - } - - Wrapper& operator = (Wrapper&& rhs) - { - if (this != &rhs) { - detail::errHandler(release(), __RELEASE_ERR); - object_ = rhs.object_; - rhs.object_ = NULL; - } - return *this; - } - - Wrapper& operator = (const cl_type &rhs) - { - detail::errHandler(release(), __RELEASE_ERR); - object_ = rhs; - return *this; - } - - const cl_type& operator ()() const { return object_; } - - cl_type& operator ()() { return object_; } - - cl_type get() const { return object_; } - -protected: - template - friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); - - cl_int retain() const - { - if (object_ != nullptr) { - return ReferenceHandler::retain(object_); - } - else { - return CL_SUCCESS; - } - } - - cl_int release() const - { - if (object_ != nullptr) { - return ReferenceHandler::release(object_); - } - else { - return CL_SUCCESS; - } - } -}; - -template <> -class Wrapper -{ -public: - typedef cl_device_id cl_type; - -protected: - cl_type object_; - bool referenceCountable_; - - static bool isReferenceCountable(cl_device_id device) - { - bool retVal = false; -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 -#if CL_HPP_MINIMUM_OPENCL_VERSION < 120 - if (device != NULL) { - int version = getDevicePlatformVersion(device); - if(version > ((1 << 16) + 1)) { - retVal = true; - } - } -#else // CL_HPP_MINIMUM_OPENCL_VERSION < 120 - retVal = true; -#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120 -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 - (void)device; - return retVal; - } - -public: - Wrapper() : object_(NULL), referenceCountable_(false) - { - } - - Wrapper(const cl_type &obj, bool retainObject) : - object_(obj), - referenceCountable_(false) - { - referenceCountable_ = isReferenceCountable(obj); - - if (retainObject) { - detail::errHandler(retain(), __RETAIN_ERR); - } - } - - ~Wrapper() - { - release(); - } - - Wrapper(const Wrapper& rhs) - { - object_ = rhs.object_; - referenceCountable_ = isReferenceCountable(object_); - detail::errHandler(retain(), __RETAIN_ERR); - } - - Wrapper(Wrapper&& rhs) CL_HPP_NOEXCEPT_ - { - object_ = rhs.object_; - referenceCountable_ = rhs.referenceCountable_; - rhs.object_ = NULL; - rhs.referenceCountable_ = false; - } - - Wrapper& operator = (const Wrapper& rhs) - { - if (this != &rhs) { - detail::errHandler(release(), __RELEASE_ERR); - object_ = rhs.object_; - referenceCountable_ = rhs.referenceCountable_; - detail::errHandler(retain(), __RETAIN_ERR); - } - return *this; - } - - Wrapper& operator = (Wrapper&& rhs) - { - if (this != &rhs) { - detail::errHandler(release(), __RELEASE_ERR); - object_ = rhs.object_; - referenceCountable_ = rhs.referenceCountable_; - rhs.object_ = NULL; - rhs.referenceCountable_ = false; - } - return *this; - } - - Wrapper& operator = (const cl_type &rhs) - { - detail::errHandler(release(), __RELEASE_ERR); - object_ = rhs; - referenceCountable_ = isReferenceCountable(object_); - return *this; - } - - const cl_type& operator ()() const { return object_; } - - cl_type& operator ()() { return object_; } - - cl_type get() const { return object_; } - -protected: - template - friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); - - template - friend inline cl_int getInfoHelper(Func, cl_uint, vector*, int, typename U::cl_type); - - cl_int retain() const - { - if( object_ != nullptr && referenceCountable_ ) { - return ReferenceHandler::retain(object_); - } - else { - return CL_SUCCESS; - } - } - - cl_int release() const - { - if (object_ != nullptr && referenceCountable_) { - return ReferenceHandler::release(object_); - } - else { - return CL_SUCCESS; - } - } -}; - -template -inline bool operator==(const Wrapper &lhs, const Wrapper &rhs) -{ - return lhs() == rhs(); -} - -template -inline bool operator!=(const Wrapper &lhs, const Wrapper &rhs) -{ - return !operator==(lhs, rhs); -} - -} // namespace detail -//! \endcond - - - - - -/*! \stuct ImageFormat - * \brief Adds constructors and member functions for cl_image_format. - * - * \see cl_image_format - */ -struct ImageFormat : public cl_image_format -{ - //! \brief Default constructor - performs no initialization. - ImageFormat(){} - - //! \brief Initializing constructor. - ImageFormat(cl_channel_order order, cl_channel_type type) - { - image_channel_order = order; - image_channel_data_type = type; - } - - //! \brief Copy constructor. - ImageFormat(const ImageFormat &other) { *this = other; } - - //! \brief Assignment operator. - ImageFormat& operator = (const ImageFormat& rhs) - { - if (this != &rhs) { - this->image_channel_data_type = rhs.image_channel_data_type; - this->image_channel_order = rhs.image_channel_order; - } - return *this; - } -}; - -/*! \brief Class interface for cl_device_id. - * - * \note Copies of these objects are inexpensive, since they don't 'own' - * any underlying resources or data structures. - * - * \see cl_device_id - */ -class Device : public detail::Wrapper -{ -private: - static std::once_flag default_initialized_; - static Device default_; - static cl_int default_error_; - - /*! \brief Create the default context. - * - * This sets @c default_ and @c default_error_. It does not throw - * @c cl::Error. - */ - static void makeDefault(); - - /*! \brief Create the default platform from a provided platform. - * - * This sets @c default_. It does not throw - * @c cl::Error. - */ - static void makeDefaultProvided(const Device &p) { - default_ = p; - } - -public: -#ifdef CL_HPP_UNIT_TEST_ENABLE - /*! \brief Reset the default. - * - * This sets @c default_ to an empty value to support cleanup in - * the unit test framework. - * This function is not thread safe. - */ - static void unitTestClearDefault() { - default_ = Device(); - } -#endif // #ifdef CL_HPP_UNIT_TEST_ENABLE - - //! \brief Default constructor - initializes to NULL. - Device() : detail::Wrapper() { } - - /*! \brief Constructor from cl_device_id. - * - * This simply copies the device ID value, which is an inexpensive operation. - */ - explicit Device(const cl_device_id &device, bool retainObject = false) : - detail::Wrapper(device, retainObject) { } - - /*! \brief Returns the first device on the default context. - * - * \see Context::getDefault() - */ - static Device getDefault( - cl_int *errResult = NULL) - { - std::call_once(default_initialized_, makeDefault); - detail::errHandler(default_error_); - if (errResult != NULL) { - *errResult = default_error_; - } - return default_; - } - - /** - * Modify the default device to be used by - * subsequent operations. - * Will only set the default if no default was previously created. - * @return updated default device. - * Should be compared to the passed value to ensure that it was updated. - */ - static Device setDefault(const Device &default_device) - { - std::call_once(default_initialized_, makeDefaultProvided, std::cref(default_device)); - detail::errHandler(default_error_); - return default_; - } - - /*! \brief Assignment operator from cl_device_id. - * - * This simply copies the device ID value, which is an inexpensive operation. - */ - Device& operator = (const cl_device_id& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Device(const Device& dev) : detail::Wrapper(dev) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Device& operator = (const Device &dev) - { - detail::Wrapper::operator=(dev); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Device(Device&& dev) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(dev)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Device& operator = (Device &&dev) - { - detail::Wrapper::operator=(std::move(dev)); - return *this; - } - - //! \brief Wrapper for clGetDeviceInfo(). - template - cl_int getInfo(cl_device_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetDeviceInfo, object_, name, param), - __GET_DEVICE_INFO_ERR); - } - - //! \brief Wrapper for clGetDeviceInfo() that returns by value. - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_device_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - -#if CL_HPP_TARGET_OPENCL_VERSION >= 210 - /** - * Return the current value of the host clock as seen by the device. - * The resolution of the device timer may be queried with the - * CL_DEVICE_PROFILING_TIMER_RESOLUTION query. - * @return The host timer value. - */ - cl_ulong getHostTimer(cl_int *error = nullptr) - { - cl_ulong retVal = 0; - cl_int err = - clGetHostTimer(this->get(), &retVal); - detail::errHandler( - err, - __GET_HOST_TIMER_ERR); - if (error) { - *error = err; - } - return retVal; - } - - /** - * Return a synchronized pair of host and device timestamps as seen by device. - * Use to correlate the clocks and get the host timer only using getHostTimer - * as a lower cost mechanism in between calls. - * The resolution of the host timer may be queried with the - * CL_PLATFORM_HOST_TIMER_RESOLUTION query. - * The resolution of the device timer may be queried with the - * CL_DEVICE_PROFILING_TIMER_RESOLUTION query. - * @return A pair of (device timer, host timer) timer values. - */ - std::pair getDeviceAndHostTimer(cl_int *error = nullptr) - { - std::pair retVal; - cl_int err = - clGetDeviceAndHostTimer(this->get(), &(retVal.first), &(retVal.second)); - detail::errHandler( - err, - __GET_DEVICE_AND_HOST_TIMER_ERR); - if (error) { - *error = err; - } - return retVal; - } -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 - - /** - * CL 1.2 version - */ -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 - //! \brief Wrapper for clCreateSubDevices(). - cl_int createSubDevices( - const cl_device_partition_property * properties, - vector* devices) - { - cl_uint n = 0; - cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR); - } - - vector ids(n); - err = clCreateSubDevices(object_, properties, n, ids.data(), NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR); - } - - // Cannot trivially assign because we need to capture intermediates - // with safe construction - if (devices) { - devices->resize(ids.size()); - - // Assign to param, constructing with retain behaviour - // to correctly capture each underlying CL object - for (size_type i = 0; i < ids.size(); i++) { - // We do not need to retain because this device is being created - // by the runtime - (*devices)[i] = Device(ids[i], false); - } - } - - return CL_SUCCESS; - } -#elif defined(CL_HPP_USE_CL_DEVICE_FISSION) - -/** - * CL 1.1 version that uses device fission extension. - */ - cl_int createSubDevices( - const cl_device_partition_property_ext * properties, - vector* devices) - { - typedef CL_API_ENTRY cl_int - ( CL_API_CALL * PFN_clCreateSubDevicesEXT)( - cl_device_id /*in_device*/, - const cl_device_partition_property_ext * /* properties */, - cl_uint /*num_entries*/, - cl_device_id * /*out_devices*/, - cl_uint * /*num_devices*/ ) CL_API_SUFFIX__VERSION_1_1; - - static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL; - CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateSubDevicesEXT); - - cl_uint n = 0; - cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR); - } - - vector ids(n); - err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids.data(), NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR); - } - // Cannot trivially assign because we need to capture intermediates - // with safe construction - if (devices) { - devices->resize(ids.size()); - - // Assign to param, constructing with retain behaviour - // to correctly capture each underlying CL object - for (size_type i = 0; i < ids.size(); i++) { - // We do not need to retain because this device is being created - // by the runtime - (*devices)[i] = Device(ids[i], false); - } - } - return CL_SUCCESS; - } -#endif // defined(CL_HPP_USE_CL_DEVICE_FISSION) -}; - -using BuildLogType = vector::param_type>>; -#if defined(CL_HPP_ENABLE_EXCEPTIONS) -/** -* Exception class for build errors to carry build info -*/ -class BuildError : public Error -{ -private: - BuildLogType buildLogs; -public: - BuildError(cl_int err, const char * errStr, const BuildLogType &vec) : Error(err, errStr), buildLogs(vec) - { - } - - BuildLogType getBuildLog() const - { - return buildLogs; - } -}; -namespace detail { - static inline cl_int buildErrHandler( - cl_int err, - const char * errStr, - const BuildLogType &buildLogs) - { - if (err != CL_SUCCESS) { - throw BuildError(err, errStr, buildLogs); - } - return err; - } -} // namespace detail - -#else -namespace detail { - static inline cl_int buildErrHandler( - cl_int err, - const char * errStr, - const BuildLogType &buildLogs) - { - (void)buildLogs; // suppress unused variable warning - (void)errStr; - return err; - } -} // namespace detail -#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS) - -CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Device::default_initialized_; -CL_HPP_DEFINE_STATIC_MEMBER_ Device Device::default_; -CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Device::default_error_ = CL_SUCCESS; - -/*! \brief Class interface for cl_platform_id. - * - * \note Copies of these objects are inexpensive, since they don't 'own' - * any underlying resources or data structures. - * - * \see cl_platform_id - */ -class Platform : public detail::Wrapper -{ -private: - static std::once_flag default_initialized_; - static Platform default_; - static cl_int default_error_; - - /*! \brief Create the default context. - * - * This sets @c default_ and @c default_error_. It does not throw - * @c cl::Error. - */ - static void makeDefault() { - /* Throwing an exception from a call_once invocation does not do - * what we wish, so we catch it and save the error. - */ -#if defined(CL_HPP_ENABLE_EXCEPTIONS) - try -#endif - { - // If default wasn't passed ,generate one - // Otherwise set it - cl_uint n = 0; - - cl_int err = ::clGetPlatformIDs(0, NULL, &n); - if (err != CL_SUCCESS) { - default_error_ = err; - return; - } - if (n == 0) { - default_error_ = CL_INVALID_PLATFORM; - return; - } - - vector ids(n); - err = ::clGetPlatformIDs(n, ids.data(), NULL); - if (err != CL_SUCCESS) { - default_error_ = err; - return; - } - - default_ = Platform(ids[0]); - } -#if defined(CL_HPP_ENABLE_EXCEPTIONS) - catch (cl::Error &e) { - default_error_ = e.err(); - } -#endif - } - - /*! \brief Create the default platform from a provided platform. - * - * This sets @c default_. It does not throw - * @c cl::Error. - */ - static void makeDefaultProvided(const Platform &p) { - default_ = p; - } - -public: -#ifdef CL_HPP_UNIT_TEST_ENABLE - /*! \brief Reset the default. - * - * This sets @c default_ to an empty value to support cleanup in - * the unit test framework. - * This function is not thread safe. - */ - static void unitTestClearDefault() { - default_ = Platform(); - } -#endif // #ifdef CL_HPP_UNIT_TEST_ENABLE - - //! \brief Default constructor - initializes to NULL. - Platform() : detail::Wrapper() { } - - /*! \brief Constructor from cl_platform_id. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * This simply copies the platform ID value, which is an inexpensive operation. - */ - explicit Platform(const cl_platform_id &platform, bool retainObject = false) : - detail::Wrapper(platform, retainObject) { } - - /*! \brief Assignment operator from cl_platform_id. - * - * This simply copies the platform ID value, which is an inexpensive operation. - */ - Platform& operator = (const cl_platform_id& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - static Platform getDefault( - cl_int *errResult = NULL) - { - std::call_once(default_initialized_, makeDefault); - detail::errHandler(default_error_); - if (errResult != NULL) { - *errResult = default_error_; - } - return default_; - } - - /** - * Modify the default platform to be used by - * subsequent operations. - * Will only set the default if no default was previously created. - * @return updated default platform. - * Should be compared to the passed value to ensure that it was updated. - */ - static Platform setDefault(const Platform &default_platform) - { - std::call_once(default_initialized_, makeDefaultProvided, std::cref(default_platform)); - detail::errHandler(default_error_); - return default_; - } - - //! \brief Wrapper for clGetPlatformInfo(). - template - cl_int getInfo(cl_platform_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetPlatformInfo, object_, name, param), - __GET_PLATFORM_INFO_ERR); - } - - //! \brief Wrapper for clGetPlatformInfo() that returns by value. - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_platform_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - /*! \brief Gets a list of devices for this platform. - * - * Wraps clGetDeviceIDs(). - */ - cl_int getDevices( - cl_device_type type, - vector* devices) const - { - cl_uint n = 0; - if( devices == NULL ) { - return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); - } - cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n); - if (err != CL_SUCCESS && err != CL_DEVICE_NOT_FOUND) { - return detail::errHandler(err, __GET_DEVICE_IDS_ERR); - } - - vector ids(n); - if (n>0) { - err = ::clGetDeviceIDs(object_, type, n, ids.data(), NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_DEVICE_IDS_ERR); - } - } - - // Cannot trivially assign because we need to capture intermediates - // with safe construction - // We must retain things we obtain from the API to avoid releasing - // API-owned objects. - if (devices) { - devices->resize(ids.size()); - - // Assign to param, constructing with retain behaviour - // to correctly capture each underlying CL object - for (size_type i = 0; i < ids.size(); i++) { - (*devices)[i] = Device(ids[i], true); - } - } - return CL_SUCCESS; - } - -#if defined(CL_HPP_USE_DX_INTEROP) - /*! \brief Get the list of available D3D10 devices. - * - * \param d3d_device_source. - * - * \param d3d_object. - * - * \param d3d_device_set. - * - * \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device - * values returned in devices can be used to identify a specific OpenCL - * device. If \a devices argument is NULL, this argument is ignored. - * - * \return One of the following values: - * - CL_SUCCESS if the function is executed successfully. - * - * The application can query specific capabilities of the OpenCL device(s) - * returned by cl::getDevices. This can be used by the application to - * determine which device(s) to use. - * - * \note In the case that exceptions are enabled and a return value - * other than CL_SUCCESS is generated, then cl::Error exception is - * generated. - */ - cl_int getDevices( - cl_d3d10_device_source_khr d3d_device_source, - void * d3d_object, - cl_d3d10_device_set_khr d3d_device_set, - vector* devices) const - { - typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)( - cl_platform_id platform, - cl_d3d10_device_source_khr d3d_device_source, - void * d3d_object, - cl_d3d10_device_set_khr d3d_device_set, - cl_uint num_entries, - cl_device_id * devices, - cl_uint* num_devices); - - if( devices == NULL ) { - return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); - } - - static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL; - CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(object_, clGetDeviceIDsFromD3D10KHR); - - cl_uint n = 0; - cl_int err = pfn_clGetDeviceIDsFromD3D10KHR( - object_, - d3d_device_source, - d3d_object, - d3d_device_set, - 0, - NULL, - &n); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_DEVICE_IDS_ERR); - } - - vector ids(n); - err = pfn_clGetDeviceIDsFromD3D10KHR( - object_, - d3d_device_source, - d3d_object, - d3d_device_set, - n, - ids.data(), - NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_DEVICE_IDS_ERR); - } - - // Cannot trivially assign because we need to capture intermediates - // with safe construction - // We must retain things we obtain from the API to avoid releasing - // API-owned objects. - if (devices) { - devices->resize(ids.size()); - - // Assign to param, constructing with retain behaviour - // to correctly capture each underlying CL object - for (size_type i = 0; i < ids.size(); i++) { - (*devices)[i] = Device(ids[i], true); - } - } - return CL_SUCCESS; - } -#endif - - /*! \brief Gets a list of available platforms. - * - * Wraps clGetPlatformIDs(). - */ - static cl_int get( - vector* platforms) - { - cl_uint n = 0; - - if( platforms == NULL ) { - return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); - } - - cl_int err = ::clGetPlatformIDs(0, NULL, &n); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); - } - - vector ids(n); - err = ::clGetPlatformIDs(n, ids.data(), NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); - } - - if (platforms) { - platforms->resize(ids.size()); - - // Platforms don't reference count - for (size_type i = 0; i < ids.size(); i++) { - (*platforms)[i] = Platform(ids[i]); - } - } - return CL_SUCCESS; - } - - /*! \brief Gets the first available platform. - * - * Wraps clGetPlatformIDs(), returning the first result. - */ - static cl_int get( - Platform * platform) - { - cl_int err; - Platform default_platform = Platform::getDefault(&err); - if (platform) { - *platform = default_platform; - } - return err; - } - - /*! \brief Gets the first available platform, returning it by value. - * - * \return Returns a valid platform if one is available. - * If no platform is available will return a null platform. - * Throws an exception if no platforms are available - * or an error condition occurs. - * Wraps clGetPlatformIDs(), returning the first result. - */ - static Platform get( - cl_int * errResult = NULL) - { - cl_int err; - Platform default_platform = Platform::getDefault(&err); - if (errResult) { - *errResult = err; - } - return default_platform; - } - -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 - //! \brief Wrapper for clUnloadCompiler(). - cl_int - unloadCompiler() - { - return ::clUnloadPlatformCompiler(object_); - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 -}; // class Platform - -CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Platform::default_initialized_; -CL_HPP_DEFINE_STATIC_MEMBER_ Platform Platform::default_; -CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Platform::default_error_ = CL_SUCCESS; - - -/** - * Deprecated APIs for 1.2 - */ -#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) -/** - * Unload the OpenCL compiler. - * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead. - */ -inline CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int -UnloadCompiler() CL_API_SUFFIX__VERSION_1_1_DEPRECATED; -inline cl_int -UnloadCompiler() -{ - return ::clUnloadCompiler(); -} -#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - -/*! \brief Class interface for cl_context. - * - * \note Copies of these objects are shallow, meaning that the copy will refer - * to the same underlying cl_context as the original. For details, see - * clRetainContext() and clReleaseContext(). - * - * \see cl_context - */ -class Context - : public detail::Wrapper -{ -private: - static std::once_flag default_initialized_; - static Context default_; - static cl_int default_error_; - - /*! \brief Create the default context from the default device type in the default platform. - * - * This sets @c default_ and @c default_error_. It does not throw - * @c cl::Error. - */ - static void makeDefault() { - /* Throwing an exception from a call_once invocation does not do - * what we wish, so we catch it and save the error. - */ -#if defined(CL_HPP_ENABLE_EXCEPTIONS) - try -#endif - { -#if !defined(__APPLE__) && !defined(__MACOS) - const Platform &p = Platform::getDefault(); - cl_platform_id defaultPlatform = p(); - cl_context_properties properties[3] = { - CL_CONTEXT_PLATFORM, (cl_context_properties)defaultPlatform, 0 - }; -#else // #if !defined(__APPLE__) && !defined(__MACOS) - cl_context_properties *properties = nullptr; -#endif // #if !defined(__APPLE__) && !defined(__MACOS) - - default_ = Context( - CL_DEVICE_TYPE_DEFAULT, - properties, - NULL, - NULL, - &default_error_); - } -#if defined(CL_HPP_ENABLE_EXCEPTIONS) - catch (cl::Error &e) { - default_error_ = e.err(); - } -#endif - } - - - /*! \brief Create the default context from a provided Context. - * - * This sets @c default_. It does not throw - * @c cl::Error. - */ - static void makeDefaultProvided(const Context &c) { - default_ = c; - } - -public: -#ifdef CL_HPP_UNIT_TEST_ENABLE - /*! \brief Reset the default. - * - * This sets @c default_ to an empty value to support cleanup in - * the unit test framework. - * This function is not thread safe. - */ - static void unitTestClearDefault() { - default_ = Context(); - } -#endif // #ifdef CL_HPP_UNIT_TEST_ENABLE - - /*! \brief Constructs a context including a list of specified devices. - * - * Wraps clCreateContext(). - */ - Context( - const vector& devices, - const cl_context_properties* properties = NULL, - void (CL_CALLBACK * notifyFptr)( - const char *, - const void *, - size_type, - void *) = NULL, - void* data = NULL, - cl_int* err = NULL) - { - cl_int error; - - size_type numDevices = devices.size(); - vector deviceIDs(numDevices); - - for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { - deviceIDs[deviceIndex] = (devices[deviceIndex])(); - } - - object_ = ::clCreateContext( - properties, (cl_uint) numDevices, - deviceIDs.data(), - notifyFptr, data, &error); - - detail::errHandler(error, __CREATE_CONTEXT_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! \brief Constructs a context including a specific device. - * - * Wraps clCreateContext(). - */ - Context( - const Device& device, - const cl_context_properties* properties = NULL, - void (CL_CALLBACK * notifyFptr)( - const char *, - const void *, - size_type, - void *) = NULL, - void* data = NULL, - cl_int* err = NULL) - { - cl_int error; - - cl_device_id deviceID = device(); - - object_ = ::clCreateContext( - properties, 1, - &deviceID, - notifyFptr, data, &error); - - detail::errHandler(error, __CREATE_CONTEXT_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! \brief Constructs a context including all or a subset of devices of a specified type. - * - * Wraps clCreateContextFromType(). - */ - Context( - cl_device_type type, - const cl_context_properties* properties = NULL, - void (CL_CALLBACK * notifyFptr)( - const char *, - const void *, - size_type, - void *) = NULL, - void* data = NULL, - cl_int* err = NULL) - { - cl_int error; - -#if !defined(__APPLE__) && !defined(__MACOS) - cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 }; - - if (properties == NULL) { - // Get a valid platform ID as we cannot send in a blank one - vector platforms; - error = Platform::get(&platforms); - if (error != CL_SUCCESS) { - detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); - if (err != NULL) { - *err = error; - } - return; - } - - // Check the platforms we found for a device of our specified type - cl_context_properties platform_id = 0; - for (unsigned int i = 0; i < platforms.size(); i++) { - - vector devices; - -#if defined(CL_HPP_ENABLE_EXCEPTIONS) - try { -#endif - - error = platforms[i].getDevices(type, &devices); - -#if defined(CL_HPP_ENABLE_EXCEPTIONS) - } catch (cl::Error& e) { - error = e.err(); - } - // Catch if exceptions are enabled as we don't want to exit if first platform has no devices of type - // We do error checking next anyway, and can throw there if needed -#endif - - // Only squash CL_SUCCESS and CL_DEVICE_NOT_FOUND - if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND) { - detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); - if (err != NULL) { - *err = error; - } - } - - if (devices.size() > 0) { - platform_id = (cl_context_properties)platforms[i](); - break; - } - } - - if (platform_id == 0) { - detail::errHandler(CL_DEVICE_NOT_FOUND, __CREATE_CONTEXT_FROM_TYPE_ERR); - if (err != NULL) { - *err = CL_DEVICE_NOT_FOUND; - } - return; - } - - prop[1] = platform_id; - properties = &prop[0]; - } -#endif - object_ = ::clCreateContextFromType( - properties, type, notifyFptr, data, &error); - - detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Context(const Context& ctx) : detail::Wrapper(ctx) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Context& operator = (const Context &ctx) - { - detail::Wrapper::operator=(ctx); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Context(Context&& ctx) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(ctx)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Context& operator = (Context &&ctx) - { - detail::Wrapper::operator=(std::move(ctx)); - return *this; - } - - - /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT. - * - * \note All calls to this function return the same cl_context as the first. - */ - static Context getDefault(cl_int * err = NULL) - { - std::call_once(default_initialized_, makeDefault); - detail::errHandler(default_error_); - if (err != NULL) { - *err = default_error_; - } - return default_; - } - - /** - * Modify the default context to be used by - * subsequent operations. - * Will only set the default if no default was previously created. - * @return updated default context. - * Should be compared to the passed value to ensure that it was updated. - */ - static Context setDefault(const Context &default_context) - { - std::call_once(default_initialized_, makeDefaultProvided, std::cref(default_context)); - detail::errHandler(default_error_); - return default_; - } - - //! \brief Default constructor - initializes to NULL. - Context() : detail::Wrapper() { } - - /*! \brief Constructor from cl_context - takes ownership. - * - * This effectively transfers ownership of a refcount on the cl_context - * into the new Context object. - */ - explicit Context(const cl_context& context, bool retainObject = false) : - detail::Wrapper(context, retainObject) { } - - /*! \brief Assignment operator from cl_context - takes ownership. - * - * This effectively transfers ownership of a refcount on the rhs and calls - * clReleaseContext() on the value previously held by this instance. - */ - Context& operator = (const cl_context& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - //! \brief Wrapper for clGetContextInfo(). - template - cl_int getInfo(cl_context_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetContextInfo, object_, name, param), - __GET_CONTEXT_INFO_ERR); - } - - //! \brief Wrapper for clGetContextInfo() that returns by value. - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_context_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - /*! \brief Gets a list of supported image formats. - * - * Wraps clGetSupportedImageFormats(). - */ - cl_int getSupportedImageFormats( - cl_mem_flags flags, - cl_mem_object_type type, - vector* formats) const - { - cl_uint numEntries; - - if (!formats) { - return CL_SUCCESS; - } - - cl_int err = ::clGetSupportedImageFormats( - object_, - flags, - type, - 0, - NULL, - &numEntries); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); - } - - if (numEntries > 0) { - vector value(numEntries); - err = ::clGetSupportedImageFormats( - object_, - flags, - type, - numEntries, - (cl_image_format*)value.data(), - NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); - } - - formats->assign(begin(value), end(value)); - } - else { - // If no values are being returned, ensure an empty vector comes back - formats->clear(); - } - - return CL_SUCCESS; - } -}; - -inline void Device::makeDefault() -{ - /* Throwing an exception from a call_once invocation does not do - * what we wish, so we catch it and save the error. - */ -#if defined(CL_HPP_ENABLE_EXCEPTIONS) - try -#endif - { - cl_int error = 0; - - Context context = Context::getDefault(&error); - detail::errHandler(error, __CREATE_CONTEXT_ERR); - - if (error != CL_SUCCESS) { - default_error_ = error; - } - else { - default_ = context.getInfo()[0]; - default_error_ = CL_SUCCESS; - } - } -#if defined(CL_HPP_ENABLE_EXCEPTIONS) - catch (cl::Error &e) { - default_error_ = e.err(); - } -#endif -} - -CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Context::default_initialized_; -CL_HPP_DEFINE_STATIC_MEMBER_ Context Context::default_; -CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Context::default_error_ = CL_SUCCESS; - -/*! \brief Class interface for cl_event. - * - * \note Copies of these objects are shallow, meaning that the copy will refer - * to the same underlying cl_event as the original. For details, see - * clRetainEvent() and clReleaseEvent(). - * - * \see cl_event - */ -class Event : public detail::Wrapper -{ -public: - //! \brief Default constructor - initializes to NULL. - Event() : detail::Wrapper() { } - - /*! \brief Constructor from cl_event - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * This effectively transfers ownership of a refcount on the cl_event - * into the new Event object. - */ - explicit Event(const cl_event& event, bool retainObject = false) : - detail::Wrapper(event, retainObject) { } - - /*! \brief Assignment operator from cl_event - takes ownership. - * - * This effectively transfers ownership of a refcount on the rhs and calls - * clReleaseEvent() on the value previously held by this instance. - */ - Event& operator = (const cl_event& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - //! \brief Wrapper for clGetEventInfo(). - template - cl_int getInfo(cl_event_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetEventInfo, object_, name, param), - __GET_EVENT_INFO_ERR); - } - - //! \brief Wrapper for clGetEventInfo() that returns by value. - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_event_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - //! \brief Wrapper for clGetEventProfilingInfo(). - template - cl_int getProfilingInfo(cl_profiling_info name, T* param) const - { - return detail::errHandler(detail::getInfo( - &::clGetEventProfilingInfo, object_, name, param), - __GET_EVENT_PROFILE_INFO_ERR); - } - - //! \brief Wrapper for clGetEventProfilingInfo() that returns by value. - template typename - detail::param_traits::param_type - getProfilingInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_profiling_info, name>::param_type param; - cl_int result = getProfilingInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - /*! \brief Blocks the calling thread until this event completes. - * - * Wraps clWaitForEvents(). - */ - cl_int wait() const - { - return detail::errHandler( - ::clWaitForEvents(1, &object_), - __WAIT_FOR_EVENTS_ERR); - } - -#if CL_HPP_TARGET_OPENCL_VERSION >= 110 - /*! \brief Registers a user callback function for a specific command execution status. - * - * Wraps clSetEventCallback(). - */ - cl_int setCallback( - cl_int type, - void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), - void * user_data = NULL) - { - return detail::errHandler( - ::clSetEventCallback( - object_, - type, - pfn_notify, - user_data), - __SET_EVENT_CALLBACK_ERR); - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 - - /*! \brief Blocks the calling thread until every event specified is complete. - * - * Wraps clWaitForEvents(). - */ - static cl_int - waitForEvents(const vector& events) - { - return detail::errHandler( - ::clWaitForEvents( - (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : NULL), - __WAIT_FOR_EVENTS_ERR); - } -}; - -#if CL_HPP_TARGET_OPENCL_VERSION >= 110 -/*! \brief Class interface for user events (a subset of cl_event's). - * - * See Event for details about copy semantics, etc. - */ -class UserEvent : public Event -{ -public: - /*! \brief Constructs a user event on a given context. - * - * Wraps clCreateUserEvent(). - */ - UserEvent( - const Context& context, - cl_int * err = NULL) - { - cl_int error; - object_ = ::clCreateUserEvent( - context(), - &error); - - detail::errHandler(error, __CREATE_USER_EVENT_ERR); - if (err != NULL) { - *err = error; - } - } - - //! \brief Default constructor - initializes to NULL. - UserEvent() : Event() { } - - /*! \brief Sets the execution status of a user event object. - * - * Wraps clSetUserEventStatus(). - */ - cl_int setStatus(cl_int status) - { - return detail::errHandler( - ::clSetUserEventStatus(object_,status), - __SET_USER_EVENT_STATUS_ERR); - } -}; -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 - -/*! \brief Blocks the calling thread until every event specified is complete. - * - * Wraps clWaitForEvents(). - */ -inline static cl_int -WaitForEvents(const vector& events) -{ - return detail::errHandler( - ::clWaitForEvents( - (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : NULL), - __WAIT_FOR_EVENTS_ERR); -} - -/*! \brief Class interface for cl_mem. - * - * \note Copies of these objects are shallow, meaning that the copy will refer - * to the same underlying cl_mem as the original. For details, see - * clRetainMemObject() and clReleaseMemObject(). - * - * \see cl_mem - */ -class Memory : public detail::Wrapper -{ -public: - //! \brief Default constructor - initializes to NULL. - Memory() : detail::Wrapper() { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * Optionally transfer ownership of a refcount on the cl_mem - * into the new Memory object. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * - * See Memory for further details. - */ - explicit Memory(const cl_mem& memory, bool retainObject) : - detail::Wrapper(memory, retainObject) { } - - /*! \brief Assignment operator from cl_mem - takes ownership. - * - * This effectively transfers ownership of a refcount on the rhs and calls - * clReleaseMemObject() on the value previously held by this instance. - */ - Memory& operator = (const cl_mem& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Memory(const Memory& mem) : detail::Wrapper(mem) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Memory& operator = (const Memory &mem) - { - detail::Wrapper::operator=(mem); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Memory(Memory&& mem) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(mem)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Memory& operator = (Memory &&mem) - { - detail::Wrapper::operator=(std::move(mem)); - return *this; - } - - - //! \brief Wrapper for clGetMemObjectInfo(). - template - cl_int getInfo(cl_mem_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetMemObjectInfo, object_, name, param), - __GET_MEM_OBJECT_INFO_ERR); - } - - //! \brief Wrapper for clGetMemObjectInfo() that returns by value. - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_mem_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - -#if CL_HPP_TARGET_OPENCL_VERSION >= 110 - /*! \brief Registers a callback function to be called when the memory object - * is no longer needed. - * - * Wraps clSetMemObjectDestructorCallback(). - * - * Repeated calls to this function, for a given cl_mem value, will append - * to the list of functions called (in reverse order) when memory object's - * resources are freed and the memory object is deleted. - * - * \note - * The registered callbacks are associated with the underlying cl_mem - * value - not the Memory class instance. - */ - cl_int setDestructorCallback( - void (CL_CALLBACK * pfn_notify)(cl_mem, void *), - void * user_data = NULL) - { - return detail::errHandler( - ::clSetMemObjectDestructorCallback( - object_, - pfn_notify, - user_data), - __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR); - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 - -}; - -// Pre-declare copy functions -class Buffer; -template< typename IteratorType > -cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ); -template< typename IteratorType > -cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ); -template< typename IteratorType > -cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ); -template< typename IteratorType > -cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ); - - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 -namespace detail -{ - class SVMTraitNull - { - public: - static cl_svm_mem_flags getSVMMemFlags() - { - return 0; - } - }; -} // namespace detail - -template -class SVMTraitReadWrite -{ -public: - static cl_svm_mem_flags getSVMMemFlags() - { - return CL_MEM_READ_WRITE | - Trait::getSVMMemFlags(); - } -}; - -template -class SVMTraitReadOnly -{ -public: - static cl_svm_mem_flags getSVMMemFlags() - { - return CL_MEM_READ_ONLY | - Trait::getSVMMemFlags(); - } -}; - -template -class SVMTraitWriteOnly -{ -public: - static cl_svm_mem_flags getSVMMemFlags() - { - return CL_MEM_WRITE_ONLY | - Trait::getSVMMemFlags(); - } -}; - -template> -class SVMTraitCoarse -{ -public: - static cl_svm_mem_flags getSVMMemFlags() - { - return Trait::getSVMMemFlags(); - } -}; - -template> -class SVMTraitFine -{ -public: - static cl_svm_mem_flags getSVMMemFlags() - { - return CL_MEM_SVM_FINE_GRAIN_BUFFER | - Trait::getSVMMemFlags(); - } -}; - -template> -class SVMTraitAtomic -{ -public: - static cl_svm_mem_flags getSVMMemFlags() - { - return - CL_MEM_SVM_FINE_GRAIN_BUFFER | - CL_MEM_SVM_ATOMICS | - Trait::getSVMMemFlags(); - } -}; - -// Pre-declare SVM map function -template -inline cl_int enqueueMapSVM( - T* ptr, - cl_bool blocking, - cl_map_flags flags, - size_type size, - const vector* events = NULL, - Event* event = NULL); - -/** - * STL-like allocator class for managing SVM objects provided for convenience. - * - * Note that while this behaves like an allocator for the purposes of constructing vectors and similar objects, - * care must be taken when using with smart pointers. - * The allocator should not be used to construct a unique_ptr if we are using coarse-grained SVM mode because - * the coarse-grained management behaviour would behave incorrectly with respect to reference counting. - * - * Instead the allocator embeds a Deleter which may be used with unique_ptr and is used - * with the allocate_shared and allocate_ptr supplied operations. - */ -template -class SVMAllocator { -private: - Context context_; - -public: - typedef T value_type; - typedef value_type* pointer; - typedef const value_type* const_pointer; - typedef value_type& reference; - typedef const value_type& const_reference; - typedef std::size_t size_type; - typedef std::ptrdiff_t difference_type; - - template - struct rebind - { - typedef SVMAllocator other; - }; - - template - friend class SVMAllocator; - - SVMAllocator() : - context_(Context::getDefault()) - { - } - - explicit SVMAllocator(cl::Context context) : - context_(context) - { - } - - - SVMAllocator(const SVMAllocator &other) : - context_(other.context_) - { - } - - template - SVMAllocator(const SVMAllocator &other) : - context_(other.context_) - { - } - - ~SVMAllocator() - { - } - - pointer address(reference r) CL_HPP_NOEXCEPT_ - { - return std::addressof(r); - } - - const_pointer address(const_reference r) CL_HPP_NOEXCEPT_ - { - return std::addressof(r); - } - - /** - * Allocate an SVM pointer. - * - * If the allocator is coarse-grained, this will take ownership to allow - * containers to correctly construct data in place. - */ - pointer allocate( - size_type size, - typename cl::SVMAllocator::const_pointer = 0) - { - // Allocate memory with default alignment matching the size of the type - void* voidPointer = - clSVMAlloc( - context_(), - SVMTrait::getSVMMemFlags(), - size*sizeof(T), - 0); - pointer retValue = reinterpret_cast( - voidPointer); -#if defined(CL_HPP_ENABLE_EXCEPTIONS) - if (!retValue) { - std::bad_alloc excep; - throw excep; - } -#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS) - - // If allocation was coarse-grained then map it - if (!(SVMTrait::getSVMMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) { - cl_int err = enqueueMapSVM(retValue, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, size*sizeof(T)); - if (err != CL_SUCCESS) { - std::bad_alloc excep; - throw excep; - } - } - - // If exceptions disabled, return null pointer from allocator - return retValue; - } - - void deallocate(pointer p, size_type) - { - clSVMFree(context_(), p); - } - - /** - * Return the maximum possible allocation size. - * This is the minimum of the maximum sizes of all devices in the context. - */ - size_type max_size() const CL_HPP_NOEXCEPT_ - { - size_type maxSize = std::numeric_limits::max() / sizeof(T); - - for (const Device &d : context_.getInfo()) { - maxSize = std::min( - maxSize, - static_cast(d.getInfo())); - } - - return maxSize; - } - - template< class U, class... Args > - void construct(U* p, Args&&... args) - { - new(p)T(args...); - } - - template< class U > - void destroy(U* p) - { - p->~U(); - } - - /** - * Returns true if the contexts match. - */ - inline bool operator==(SVMAllocator const& rhs) - { - return (context_==rhs.context_); - } - - inline bool operator!=(SVMAllocator const& a) - { - return !operator==(a); - } -}; // class SVMAllocator return cl::pointer(tmp, detail::Deleter{alloc, copies}); - - -template -class SVMAllocator { -public: - typedef void value_type; - typedef value_type* pointer; - typedef const value_type* const_pointer; - - template - struct rebind - { - typedef SVMAllocator other; - }; - - template - friend class SVMAllocator; -}; - -#if !defined(CL_HPP_NO_STD_UNIQUE_PTR) -namespace detail -{ - template - class Deleter { - private: - Alloc alloc_; - size_type copies_; - - public: - typedef typename std::allocator_traits::pointer pointer; - - Deleter(const Alloc &alloc, size_type copies) : alloc_{ alloc }, copies_{ copies } - { - } - - void operator()(pointer ptr) const { - Alloc tmpAlloc{ alloc_ }; - std::allocator_traits::destroy(tmpAlloc, std::addressof(*ptr)); - std::allocator_traits::deallocate(tmpAlloc, ptr, copies_); - } - }; -} // namespace detail - -/** - * Allocation operation compatible with std::allocate_ptr. - * Creates a unique_ptr by default. - * This requirement is to ensure that the control block is not - * allocated in memory inaccessible to the host. - */ -template -cl::pointer> allocate_pointer(const Alloc &alloc_, Args&&... args) -{ - Alloc alloc(alloc_); - static const size_type copies = 1; - - // Ensure that creation of the management block and the - // object are dealt with separately such that we only provide a deleter - - T* tmp = std::allocator_traits::allocate(alloc, copies); - if (!tmp) { - std::bad_alloc excep; - throw excep; - } - try { - std::allocator_traits::construct( - alloc, - std::addressof(*tmp), - std::forward(args)...); - - return cl::pointer>(tmp, detail::Deleter{alloc, copies}); - } - catch (std::bad_alloc&) - { - std::allocator_traits::deallocate(alloc, tmp, copies); - throw; - } -} - -template< class T, class SVMTrait, class... Args > -cl::pointer>> allocate_svm(Args... args) -{ - SVMAllocator alloc; - return cl::allocate_pointer(alloc, args...); -} - -template< class T, class SVMTrait, class... Args > -cl::pointer>> allocate_svm(const cl::Context &c, Args... args) -{ - SVMAllocator alloc(c); - return cl::allocate_pointer(alloc, args...); -} -#endif // #if !defined(CL_HPP_NO_STD_UNIQUE_PTR) - -/*! \brief Vector alias to simplify contruction of coarse-grained SVM containers. - * - */ -template < class T > -using coarse_svm_vector = vector>>; - -/*! \brief Vector alias to simplify contruction of fine-grained SVM containers. -* -*/ -template < class T > -using fine_svm_vector = vector>>; - -/*! \brief Vector alias to simplify contruction of fine-grained SVM containers that support platform atomics. -* -*/ -template < class T > -using atomic_svm_vector = vector>>; - -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 - - -/*! \brief Class interface for Buffer Memory Objects. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ -class Buffer : public Memory -{ -public: - - /*! \brief Constructs a Buffer in a specified context. - * - * Wraps clCreateBuffer(). - * - * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was - * specified. Note alignment & exclusivity requirements. - */ - Buffer( - const Context& context, - cl_mem_flags flags, - size_type size, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); - - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! \brief Constructs a Buffer in the default context. - * - * Wraps clCreateBuffer(). - * - * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was - * specified. Note alignment & exclusivity requirements. - * - * \see Context::getDefault() - */ - Buffer( - cl_mem_flags flags, - size_type size, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - - Context context = Context::getDefault(err); - - object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); - - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! - * \brief Construct a Buffer from a host container via iterators. - * IteratorType must be random access. - * If useHostPtr is specified iterators must represent contiguous data. - */ - template< typename IteratorType > - Buffer( - IteratorType startIterator, - IteratorType endIterator, - bool readOnly, - bool useHostPtr = false, - cl_int* err = NULL) - { - typedef typename std::iterator_traits::value_type DataType; - cl_int error; - - cl_mem_flags flags = 0; - if( readOnly ) { - flags |= CL_MEM_READ_ONLY; - } - else { - flags |= CL_MEM_READ_WRITE; - } - if( useHostPtr ) { - flags |= CL_MEM_USE_HOST_PTR; - } - - size_type size = sizeof(DataType)*(endIterator - startIterator); - - Context context = Context::getDefault(err); - - if( useHostPtr ) { - object_ = ::clCreateBuffer(context(), flags, size, const_cast(&*startIterator), &error); - } else { - object_ = ::clCreateBuffer(context(), flags, size, 0, &error); - } - - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - - if( !useHostPtr ) { - error = cl::copy(startIterator, endIterator, *this); - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } - } - - /*! - * \brief Construct a Buffer from a host container via iterators using a specified context. - * IteratorType must be random access. - * If useHostPtr is specified iterators must represent contiguous data. - */ - template< typename IteratorType > - Buffer(const Context &context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); - - /*! - * \brief Construct a Buffer from a host container via iterators using a specified queue. - * If useHostPtr is specified iterators must be random access. - */ - template< typename IteratorType > - Buffer(const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); - - //! \brief Default constructor - initializes to NULL. - Buffer() : Memory() { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with earlier versions. - * - * See Memory for further details. - */ - explicit Buffer(const cl_mem& buffer, bool retainObject = false) : - Memory(buffer, retainObject) { } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Buffer& operator = (const cl_mem& rhs) - { - Memory::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Buffer(const Buffer& buf) : Memory(buf) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Buffer& operator = (const Buffer &buf) - { - Memory::operator=(buf); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Buffer(Buffer&& buf) CL_HPP_NOEXCEPT_ : Memory(std::move(buf)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Buffer& operator = (Buffer &&buf) - { - Memory::operator=(std::move(buf)); - return *this; - } - -#if CL_HPP_TARGET_OPENCL_VERSION >= 110 - /*! \brief Creates a new buffer object from this. - * - * Wraps clCreateSubBuffer(). - */ - Buffer createSubBuffer( - cl_mem_flags flags, - cl_buffer_create_type buffer_create_type, - const void * buffer_create_info, - cl_int * err = NULL) - { - Buffer result; - cl_int error; - result.object_ = ::clCreateSubBuffer( - object_, - flags, - buffer_create_type, - buffer_create_info, - &error); - - detail::errHandler(error, __CREATE_SUBBUFFER_ERR); - if (err != NULL) { - *err = error; - } - - return result; - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 -}; - -#if defined (CL_HPP_USE_DX_INTEROP) -/*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's. - * - * This is provided to facilitate interoperability with Direct3D. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ -class BufferD3D10 : public Buffer -{ -public: - - - /*! \brief Constructs a BufferD3D10, in a specified context, from a - * given ID3D10Buffer. - * - * Wraps clCreateFromD3D10BufferKHR(). - */ - BufferD3D10( - const Context& context, - cl_mem_flags flags, - ID3D10Buffer* bufobj, - cl_int * err = NULL) : pfn_clCreateFromD3D10BufferKHR(nullptr) - { - typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)( - cl_context context, cl_mem_flags flags, ID3D10Buffer* buffer, - cl_int* errcode_ret); - PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR; -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 - vector props = context.getInfo(); - cl_platform platform = -1; - for( int i = 0; i < props.size(); ++i ) { - if( props[i] == CL_CONTEXT_PLATFORM ) { - platform = props[i+1]; - } - } - CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCreateFromD3D10BufferKHR); -#elif CL_HPP_TARGET_OPENCL_VERSION >= 110 - CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateFromD3D10BufferKHR); -#endif - - cl_int error; - object_ = pfn_clCreateFromD3D10BufferKHR( - context(), - flags, - bufobj, - &error); - - detail::errHandler(error, __CREATE_GL_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } - - //! \brief Default constructor - initializes to NULL. - BufferD3D10() : Buffer() { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * See Memory for further details. - */ - explicit BufferD3D10(const cl_mem& buffer, bool retainObject = false) : - Buffer(buffer, retainObject) { } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - BufferD3D10& operator = (const cl_mem& rhs) - { - Buffer::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - BufferD3D10(const BufferD3D10& buf) : - Buffer(buf) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - BufferD3D10& operator = (const BufferD3D10 &buf) - { - Buffer::operator=(buf); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - BufferD3D10(BufferD3D10&& buf) CL_HPP_NOEXCEPT_ : Buffer(std::move(buf)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - BufferD3D10& operator = (BufferD3D10 &&buf) - { - Buffer::operator=(std::move(buf)); - return *this; - } -}; -#endif - -/*! \brief Class interface for GL Buffer Memory Objects. - * - * This is provided to facilitate interoperability with OpenGL. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ -class BufferGL : public Buffer -{ -public: - /*! \brief Constructs a BufferGL in a specified context, from a given - * GL buffer. - * - * Wraps clCreateFromGLBuffer(). - */ - BufferGL( - const Context& context, - cl_mem_flags flags, - cl_GLuint bufobj, - cl_int * err = NULL) - { - cl_int error; - object_ = ::clCreateFromGLBuffer( - context(), - flags, - bufobj, - &error); - - detail::errHandler(error, __CREATE_GL_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } - - //! \brief Default constructor - initializes to NULL. - BufferGL() : Buffer() { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * See Memory for further details. - */ - explicit BufferGL(const cl_mem& buffer, bool retainObject = false) : - Buffer(buffer, retainObject) { } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - BufferGL& operator = (const cl_mem& rhs) - { - Buffer::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - BufferGL(const BufferGL& buf) : Buffer(buf) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - BufferGL& operator = (const BufferGL &buf) - { - Buffer::operator=(buf); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - BufferGL(BufferGL&& buf) CL_HPP_NOEXCEPT_ : Buffer(std::move(buf)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - BufferGL& operator = (BufferGL &&buf) - { - Buffer::operator=(std::move(buf)); - return *this; - } - - //! \brief Wrapper for clGetGLObjectInfo(). - cl_int getObjectInfo( - cl_gl_object_type *type, - cl_GLuint * gl_object_name) - { - return detail::errHandler( - ::clGetGLObjectInfo(object_,type,gl_object_name), - __GET_GL_OBJECT_INFO_ERR); - } -}; - -/*! \brief Class interface for GL Render Buffer Memory Objects. - * - * This is provided to facilitate interoperability with OpenGL. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ -class BufferRenderGL : public Buffer -{ -public: - /*! \brief Constructs a BufferRenderGL in a specified context, from a given - * GL Renderbuffer. - * - * Wraps clCreateFromGLRenderbuffer(). - */ - BufferRenderGL( - const Context& context, - cl_mem_flags flags, - cl_GLuint bufobj, - cl_int * err = NULL) - { - cl_int error; - object_ = ::clCreateFromGLRenderbuffer( - context(), - flags, - bufobj, - &error); - - detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } - - //! \brief Default constructor - initializes to NULL. - BufferRenderGL() : Buffer() { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * See Memory for further details. - */ - explicit BufferRenderGL(const cl_mem& buffer, bool retainObject = false) : - Buffer(buffer, retainObject) { } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - BufferRenderGL& operator = (const cl_mem& rhs) - { - Buffer::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - BufferRenderGL(const BufferRenderGL& buf) : Buffer(buf) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - BufferRenderGL& operator = (const BufferRenderGL &buf) - { - Buffer::operator=(buf); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - BufferRenderGL(BufferRenderGL&& buf) CL_HPP_NOEXCEPT_ : Buffer(std::move(buf)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - BufferRenderGL& operator = (BufferRenderGL &&buf) - { - Buffer::operator=(std::move(buf)); - return *this; - } - - //! \brief Wrapper for clGetGLObjectInfo(). - cl_int getObjectInfo( - cl_gl_object_type *type, - cl_GLuint * gl_object_name) - { - return detail::errHandler( - ::clGetGLObjectInfo(object_,type,gl_object_name), - __GET_GL_OBJECT_INFO_ERR); - } -}; - -/*! \brief C++ base class for Image Memory objects. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ -class Image : public Memory -{ -protected: - //! \brief Default constructor - initializes to NULL. - Image() : Memory() { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * See Memory for further details. - */ - explicit Image(const cl_mem& image, bool retainObject = false) : - Memory(image, retainObject) { } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Image& operator = (const cl_mem& rhs) - { - Memory::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image(const Image& img) : Memory(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image& operator = (const Image &img) - { - Memory::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image(Image&& img) CL_HPP_NOEXCEPT_ : Memory(std::move(img)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image& operator = (Image &&img) - { - Memory::operator=(std::move(img)); - return *this; - } - - -public: - //! \brief Wrapper for clGetImageInfo(). - template - cl_int getImageInfo(cl_image_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetImageInfo, object_, name, param), - __GET_IMAGE_INFO_ERR); - } - - //! \brief Wrapper for clGetImageInfo() that returns by value. - template typename - detail::param_traits::param_type - getImageInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_image_info, name>::param_type param; - cl_int result = getImageInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } -}; - -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 -/*! \brief Class interface for 1D Image Memory objects. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ -class Image1D : public Image -{ -public: - /*! \brief Constructs a 1D Image in a specified context. - * - * Wraps clCreateImage(). - */ - Image1D( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - size_type width, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - - cl_image_desc desc = {0}; - desc.image_type = CL_MEM_OBJECT_IMAGE1D; - desc.image_width = width; - - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - host_ptr, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } - - //! \brief Default constructor - initializes to NULL. - Image1D() { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * See Memory for further details. - */ - explicit Image1D(const cl_mem& image1D, bool retainObject = false) : - Image(image1D, retainObject) { } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Image1D& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image1D(const Image1D& img) : Image(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image1D& operator = (const Image1D &img) - { - Image::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image1D(Image1D&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image1D& operator = (Image1D &&img) - { - Image::operator=(std::move(img)); - return *this; - } - -}; - -/*! \class Image1DBuffer - * \brief Image interface for 1D buffer images. - */ -class Image1DBuffer : public Image -{ -public: - Image1DBuffer( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - size_type width, - const Buffer &buffer, - cl_int* err = NULL) - { - cl_int error; - - cl_image_desc desc = {0}; - desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; - desc.image_width = width; - desc.buffer = buffer(); - - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - NULL, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } - - Image1DBuffer() { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * See Memory for further details. - */ - explicit Image1DBuffer(const cl_mem& image1D, bool retainObject = false) : - Image(image1D, retainObject) { } - - Image1DBuffer& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image1DBuffer(const Image1DBuffer& img) : Image(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image1DBuffer& operator = (const Image1DBuffer &img) - { - Image::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image1DBuffer(Image1DBuffer&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image1DBuffer& operator = (Image1DBuffer &&img) - { - Image::operator=(std::move(img)); - return *this; - } - -}; - -/*! \class Image1DArray - * \brief Image interface for arrays of 1D images. - */ -class Image1DArray : public Image -{ -public: - Image1DArray( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - size_type arraySize, - size_type width, - size_type rowPitch, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - - cl_image_desc desc = {0}; - desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY; - desc.image_width = width; - desc.image_array_size = arraySize; - desc.image_row_pitch = rowPitch; - - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - host_ptr, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } - - Image1DArray() { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * See Memory for further details. - */ - explicit Image1DArray(const cl_mem& imageArray, bool retainObject = false) : - Image(imageArray, retainObject) { } - - - Image1DArray& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image1DArray(const Image1DArray& img) : Image(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image1DArray& operator = (const Image1DArray &img) - { - Image::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image1DArray(Image1DArray&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image1DArray& operator = (Image1DArray &&img) - { - Image::operator=(std::move(img)); - return *this; - } - -}; -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 120 - - -/*! \brief Class interface for 2D Image Memory objects. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ -class Image2D : public Image -{ -public: - /*! \brief Constructs a 2D Image in a specified context. - * - * Wraps clCreateImage(). - */ - Image2D( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - size_type width, - size_type height, - size_type row_pitch = 0, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - bool useCreateImage; - -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120 - // Run-time decision based on the actual platform - { - cl_uint version = detail::getContextPlatformVersion(context()); - useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above - } -#elif CL_HPP_TARGET_OPENCL_VERSION >= 120 - useCreateImage = true; -#else - useCreateImage = false; -#endif - -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 - if (useCreateImage) - { - cl_image_desc desc = {0}; - desc.image_type = CL_MEM_OBJECT_IMAGE2D; - desc.image_width = width; - desc.image_height = height; - desc.image_row_pitch = row_pitch; - - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - host_ptr, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 -#if CL_HPP_MINIMUM_OPENCL_VERSION < 120 - if (!useCreateImage) - { - object_ = ::clCreateImage2D( - context(), flags,&format, width, height, row_pitch, host_ptr, &error); - - detail::errHandler(error, __CREATE_IMAGE2D_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120 - } - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 || defined(CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR) - /*! \brief Constructs a 2D Image from a buffer. - * \note This will share storage with the underlying buffer. - * - * Wraps clCreateImage(). - */ - Image2D( - const Context& context, - ImageFormat format, - const Buffer &sourceBuffer, - size_type width, - size_type height, - size_type row_pitch = 0, - cl_int* err = nullptr) - { - cl_int error; - - cl_image_desc desc = {0}; - desc.image_type = CL_MEM_OBJECT_IMAGE2D; - desc.image_width = width; - desc.image_height = height; - desc.image_row_pitch = row_pitch; - desc.buffer = sourceBuffer(); - - object_ = ::clCreateImage( - context(), - 0, // flags inherited from buffer - &format, - &desc, - nullptr, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != nullptr) { - *err = error; - } - } -#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 200 || defined(CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR) - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - /*! \brief Constructs a 2D Image from an image. - * \note This will share storage with the underlying image but may - * reinterpret the channel order and type. - * - * The image will be created matching with a descriptor matching the source. - * - * \param order is the channel order to reinterpret the image data as. - * The channel order may differ as described in the OpenCL - * 2.0 API specification. - * - * Wraps clCreateImage(). - */ - Image2D( - const Context& context, - cl_channel_order order, - const Image &sourceImage, - cl_int* err = nullptr) - { - cl_int error; - - // Descriptor fields have to match source image - size_type sourceWidth = - sourceImage.getImageInfo(); - size_type sourceHeight = - sourceImage.getImageInfo(); - size_type sourceRowPitch = - sourceImage.getImageInfo(); - cl_uint sourceNumMIPLevels = - sourceImage.getImageInfo(); - cl_uint sourceNumSamples = - sourceImage.getImageInfo(); - cl_image_format sourceFormat = - sourceImage.getImageInfo(); - - // Update only the channel order. - // Channel format inherited from source. - sourceFormat.image_channel_order = order; - - cl_image_desc desc = {0}; - desc.image_type = CL_MEM_OBJECT_IMAGE2D; - desc.image_width = sourceWidth; - desc.image_height = sourceHeight; - desc.image_row_pitch = sourceRowPitch; - desc.num_mip_levels = sourceNumMIPLevels; - desc.num_samples = sourceNumSamples; - desc.buffer = sourceImage(); - - object_ = ::clCreateImage( - context(), - 0, // flags should be inherited from mem_object - &sourceFormat, - &desc, - nullptr, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != nullptr) { - *err = error; - } - } -#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - - //! \brief Default constructor - initializes to NULL. - Image2D() { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * See Memory for further details. - */ - explicit Image2D(const cl_mem& image2D, bool retainObject = false) : - Image(image2D, retainObject) { } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Image2D& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image2D(const Image2D& img) : Image(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image2D& operator = (const Image2D &img) - { - Image::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image2D(Image2D&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image2D& operator = (Image2D &&img) - { - Image::operator=(std::move(img)); - return *this; - } - -}; - - -#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) -/*! \brief Class interface for GL 2D Image Memory objects. - * - * This is provided to facilitate interoperability with OpenGL. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - * \note Deprecated for OpenCL 1.2. Please use ImageGL instead. - */ -class CL_API_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D -{ -public: - /*! \brief Constructs an Image2DGL in a specified context, from a given - * GL Texture. - * - * Wraps clCreateFromGLTexture2D(). - */ - Image2DGL( - const Context& context, - cl_mem_flags flags, - cl_GLenum target, - cl_GLint miplevel, - cl_GLuint texobj, - cl_int * err = NULL) - { - cl_int error; - object_ = ::clCreateFromGLTexture2D( - context(), - flags, - target, - miplevel, - texobj, - &error); - - detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR); - if (err != NULL) { - *err = error; - } - - } - - //! \brief Default constructor - initializes to NULL. - Image2DGL() : Image2D() { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * See Memory for further details. - */ - explicit Image2DGL(const cl_mem& image, bool retainObject = false) : - Image2D(image, retainObject) { } - - /*! \brief Assignment from cl_mem - performs shallow copy. - *c - * See Memory for further details. - */ - Image2DGL& operator = (const cl_mem& rhs) - { - Image2D::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image2DGL(const Image2DGL& img) : Image2D(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image2DGL& operator = (const Image2DGL &img) - { - Image2D::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image2DGL(Image2DGL&& img) CL_HPP_NOEXCEPT_ : Image2D(std::move(img)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image2DGL& operator = (Image2DGL &&img) - { - Image2D::operator=(std::move(img)); - return *this; - } - -} CL_API_SUFFIX__VERSION_1_1_DEPRECATED; -#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS - -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 -/*! \class Image2DArray - * \brief Image interface for arrays of 2D images. - */ -class Image2DArray : public Image -{ -public: - Image2DArray( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - size_type arraySize, - size_type width, - size_type height, - size_type rowPitch, - size_type slicePitch, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - - cl_image_desc desc = {0}; - desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY; - desc.image_width = width; - desc.image_height = height; - desc.image_array_size = arraySize; - desc.image_row_pitch = rowPitch; - desc.image_slice_pitch = slicePitch; - - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - host_ptr, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } - - Image2DArray() { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * See Memory for further details. - */ - explicit Image2DArray(const cl_mem& imageArray, bool retainObject = false) : Image(imageArray, retainObject) { } - - Image2DArray& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image2DArray(const Image2DArray& img) : Image(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image2DArray& operator = (const Image2DArray &img) - { - Image::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image2DArray(Image2DArray&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image2DArray& operator = (Image2DArray &&img) - { - Image::operator=(std::move(img)); - return *this; - } -}; -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 120 - -/*! \brief Class interface for 3D Image Memory objects. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ -class Image3D : public Image -{ -public: - /*! \brief Constructs a 3D Image in a specified context. - * - * Wraps clCreateImage(). - */ - Image3D( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - size_type width, - size_type height, - size_type depth, - size_type row_pitch = 0, - size_type slice_pitch = 0, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - bool useCreateImage; - -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120 - // Run-time decision based on the actual platform - { - cl_uint version = detail::getContextPlatformVersion(context()); - useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above - } -#elif CL_HPP_TARGET_OPENCL_VERSION >= 120 - useCreateImage = true; -#else - useCreateImage = false; -#endif - -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 - if (useCreateImage) - { - cl_image_desc desc = {0}; - desc.image_type = CL_MEM_OBJECT_IMAGE3D; - desc.image_width = width; - desc.image_height = height; - desc.image_depth = depth; - desc.image_row_pitch = row_pitch; - desc.image_slice_pitch = slice_pitch; - - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - host_ptr, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 -#if CL_HPP_MINIMUM_OPENCL_VERSION < 120 - if (!useCreateImage) - { - object_ = ::clCreateImage3D( - context(), flags, &format, width, height, depth, row_pitch, - slice_pitch, host_ptr, &error); - - detail::errHandler(error, __CREATE_IMAGE3D_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120 - } - - //! \brief Default constructor - initializes to NULL. - Image3D() : Image() { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * See Memory for further details. - */ - explicit Image3D(const cl_mem& image3D, bool retainObject = false) : - Image(image3D, retainObject) { } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Image3D& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image3D(const Image3D& img) : Image(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image3D& operator = (const Image3D &img) - { - Image::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image3D(Image3D&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image3D& operator = (Image3D &&img) - { - Image::operator=(std::move(img)); - return *this; - } -}; - -#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) -/*! \brief Class interface for GL 3D Image Memory objects. - * - * This is provided to facilitate interoperability with OpenGL. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ -class Image3DGL : public Image3D -{ -public: - /*! \brief Constructs an Image3DGL in a specified context, from a given - * GL Texture. - * - * Wraps clCreateFromGLTexture3D(). - */ - Image3DGL( - const Context& context, - cl_mem_flags flags, - cl_GLenum target, - cl_GLint miplevel, - cl_GLuint texobj, - cl_int * err = NULL) - { - cl_int error; - object_ = ::clCreateFromGLTexture3D( - context(), - flags, - target, - miplevel, - texobj, - &error); - - detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR); - if (err != NULL) { - *err = error; - } - } - - //! \brief Default constructor - initializes to NULL. - Image3DGL() : Image3D() { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * See Memory for further details. - */ - explicit Image3DGL(const cl_mem& image, bool retainObject = false) : - Image3D(image, retainObject) { } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Image3DGL& operator = (const cl_mem& rhs) - { - Image3D::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image3DGL(const Image3DGL& img) : Image3D(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image3DGL& operator = (const Image3DGL &img) - { - Image3D::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image3DGL(Image3DGL&& img) CL_HPP_NOEXCEPT_ : Image3D(std::move(img)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image3DGL& operator = (Image3DGL &&img) - { - Image3D::operator=(std::move(img)); - return *this; - } -}; -#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS - -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 -/*! \class ImageGL - * \brief general image interface for GL interop. - * We abstract the 2D and 3D GL images into a single instance here - * that wraps all GL sourced images on the grounds that setup information - * was performed by OpenCL anyway. - */ -class ImageGL : public Image -{ -public: - ImageGL( - const Context& context, - cl_mem_flags flags, - cl_GLenum target, - cl_GLint miplevel, - cl_GLuint texobj, - cl_int * err = NULL) - { - cl_int error; - object_ = ::clCreateFromGLTexture( - context(), - flags, - target, - miplevel, - texobj, - &error); - - detail::errHandler(error, __CREATE_GL_TEXTURE_ERR); - if (err != NULL) { - *err = error; - } - } - - ImageGL() : Image() { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * See Memory for further details. - */ - explicit ImageGL(const cl_mem& image, bool retainObject = false) : - Image(image, retainObject) { } - - ImageGL& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - ImageGL(const ImageGL& img) : Image(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - ImageGL& operator = (const ImageGL &img) - { - Image::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - ImageGL(ImageGL&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - ImageGL& operator = (ImageGL &&img) - { - Image::operator=(std::move(img)); - return *this; - } -}; -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 - - - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 -/*! \brief Class interface for Pipe Memory Objects. -* -* See Memory for details about copy semantics, etc. -* -* \see Memory -*/ -class Pipe : public Memory -{ -public: - - /*! \brief Constructs a Pipe in a specified context. - * - * Wraps clCreatePipe(). - * @param context Context in which to create the pipe. - * @param flags Bitfield. Only CL_MEM_READ_WRITE and CL_MEM_HOST_NO_ACCESS are valid. - * @param packet_size Size in bytes of a single packet of the pipe. - * @param max_packets Number of packets that may be stored in the pipe. - * - */ - Pipe( - const Context& context, - cl_uint packet_size, - cl_uint max_packets, - cl_int* err = NULL) - { - cl_int error; - - cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS; - object_ = ::clCreatePipe(context(), flags, packet_size, max_packets, nullptr, &error); - - detail::errHandler(error, __CREATE_PIPE_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! \brief Constructs a Pipe in a the default context. - * - * Wraps clCreatePipe(). - * @param flags Bitfield. Only CL_MEM_READ_WRITE and CL_MEM_HOST_NO_ACCESS are valid. - * @param packet_size Size in bytes of a single packet of the pipe. - * @param max_packets Number of packets that may be stored in the pipe. - * - */ - Pipe( - cl_uint packet_size, - cl_uint max_packets, - cl_int* err = NULL) - { - cl_int error; - - Context context = Context::getDefault(err); - - cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS; - object_ = ::clCreatePipe(context(), flags, packet_size, max_packets, nullptr, &error); - - detail::errHandler(error, __CREATE_PIPE_ERR); - if (err != NULL) { - *err = error; - } - } - - //! \brief Default constructor - initializes to NULL. - Pipe() : Memory() { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with earlier versions. - * - * See Memory for further details. - */ - explicit Pipe(const cl_mem& pipe, bool retainObject = false) : - Memory(pipe, retainObject) { } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Pipe& operator = (const cl_mem& rhs) - { - Memory::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Pipe(const Pipe& pipe) : Memory(pipe) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Pipe& operator = (const Pipe &pipe) - { - Memory::operator=(pipe); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Pipe(Pipe&& pipe) CL_HPP_NOEXCEPT_ : Memory(std::move(pipe)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Pipe& operator = (Pipe &&pipe) - { - Memory::operator=(std::move(pipe)); - return *this; - } - - //! \brief Wrapper for clGetMemObjectInfo(). - template - cl_int getInfo(cl_pipe_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetPipeInfo, object_, name, param), - __GET_PIPE_INFO_ERR); - } - - //! \brief Wrapper for clGetMemObjectInfo() that returns by value. - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_pipe_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } -}; // class Pipe -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 - - -/*! \brief Class interface for cl_sampler. - * - * \note Copies of these objects are shallow, meaning that the copy will refer - * to the same underlying cl_sampler as the original. For details, see - * clRetainSampler() and clReleaseSampler(). - * - * \see cl_sampler - */ -class Sampler : public detail::Wrapper -{ -public: - //! \brief Default constructor - initializes to NULL. - Sampler() { } - - /*! \brief Constructs a Sampler in a specified context. - * - * Wraps clCreateSampler(). - */ - Sampler( - const Context& context, - cl_bool normalized_coords, - cl_addressing_mode addressing_mode, - cl_filter_mode filter_mode, - cl_int* err = NULL) - { - cl_int error; - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - cl_sampler_properties sampler_properties[] = { - CL_SAMPLER_NORMALIZED_COORDS, normalized_coords, - CL_SAMPLER_ADDRESSING_MODE, addressing_mode, - CL_SAMPLER_FILTER_MODE, filter_mode, - 0 }; - object_ = ::clCreateSamplerWithProperties( - context(), - sampler_properties, - &error); - - detail::errHandler(error, __CREATE_SAMPLER_WITH_PROPERTIES_ERR); - if (err != NULL) { - *err = error; - } -#else - object_ = ::clCreateSampler( - context(), - normalized_coords, - addressing_mode, - filter_mode, - &error); - - detail::errHandler(error, __CREATE_SAMPLER_ERR); - if (err != NULL) { - *err = error; - } -#endif - } - - /*! \brief Constructor from cl_sampler - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * This effectively transfers ownership of a refcount on the cl_sampler - * into the new Sampler object. - */ - explicit Sampler(const cl_sampler& sampler, bool retainObject = false) : - detail::Wrapper(sampler, retainObject) { } - - /*! \brief Assignment operator from cl_sampler - takes ownership. - * - * This effectively transfers ownership of a refcount on the rhs and calls - * clReleaseSampler() on the value previously held by this instance. - */ - Sampler& operator = (const cl_sampler& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Sampler(const Sampler& sam) : detail::Wrapper(sam) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Sampler& operator = (const Sampler &sam) - { - detail::Wrapper::operator=(sam); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Sampler(Sampler&& sam) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(sam)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Sampler& operator = (Sampler &&sam) - { - detail::Wrapper::operator=(std::move(sam)); - return *this; - } - - //! \brief Wrapper for clGetSamplerInfo(). - template - cl_int getInfo(cl_sampler_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetSamplerInfo, object_, name, param), - __GET_SAMPLER_INFO_ERR); - } - - //! \brief Wrapper for clGetSamplerInfo() that returns by value. - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_sampler_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } -}; - -class Program; -class CommandQueue; -class DeviceCommandQueue; -class Kernel; - -//! \brief Class interface for specifying NDRange values. -class NDRange -{ -private: - size_type sizes_[3]; - cl_uint dimensions_; - -public: - //! \brief Default constructor - resulting range has zero dimensions. - NDRange() - : dimensions_(0) - { - sizes_[0] = 0; - sizes_[1] = 0; - sizes_[2] = 0; - } - - //! \brief Constructs one-dimensional range. - NDRange(size_type size0) - : dimensions_(1) - { - sizes_[0] = size0; - sizes_[1] = 1; - sizes_[2] = 1; - } - - //! \brief Constructs two-dimensional range. - NDRange(size_type size0, size_type size1) - : dimensions_(2) - { - sizes_[0] = size0; - sizes_[1] = size1; - sizes_[2] = 1; - } - - //! \brief Constructs three-dimensional range. - NDRange(size_type size0, size_type size1, size_type size2) - : dimensions_(3) - { - sizes_[0] = size0; - sizes_[1] = size1; - sizes_[2] = size2; - } - - /*! \brief Conversion operator to const size_type *. - * - * \returns a pointer to the size of the first dimension. - */ - operator const size_type*() const { - return sizes_; - } - - //! \brief Queries the number of dimensions in the range. - size_type dimensions() const - { - return dimensions_; - } - - //! \brief Returns the size of the object in bytes based on the - // runtime number of dimensions - size_type size() const - { - return dimensions_*sizeof(size_type); - } - - size_type* get() - { - return sizes_; - } - - const size_type* get() const - { - return sizes_; - } -}; - -//! \brief A zero-dimensional range. -static const NDRange NullRange; - -//! \brief Local address wrapper for use with Kernel::setArg -struct LocalSpaceArg -{ - size_type size_; -}; - -namespace detail { - -template -struct KernelArgumentHandler; - -// Enable for objects that are not subclasses of memory -// Pointers, constants etc -template -struct KernelArgumentHandler::value>::type> -{ - static size_type size(const T&) { return sizeof(T); } - static const T* ptr(const T& value) { return &value; } -}; - -// Enable for subclasses of memory where we want to get a reference to the cl_mem out -// and pass that in for safety -template -struct KernelArgumentHandler::value>::type> -{ - static size_type size(const T&) { return sizeof(cl_mem); } - static const cl_mem* ptr(const T& value) { return &(value()); } -}; - -// Specialization for DeviceCommandQueue defined later - -template <> -struct KernelArgumentHandler -{ - static size_type size(const LocalSpaceArg& value) { return value.size_; } - static const void* ptr(const LocalSpaceArg&) { return NULL; } -}; - -} -//! \endcond - -/*! Local - * \brief Helper function for generating LocalSpaceArg objects. - */ -inline LocalSpaceArg -Local(size_type size) -{ - LocalSpaceArg ret = { size }; - return ret; -} - -/*! \brief Class interface for cl_kernel. - * - * \note Copies of these objects are shallow, meaning that the copy will refer - * to the same underlying cl_kernel as the original. For details, see - * clRetainKernel() and clReleaseKernel(). - * - * \see cl_kernel - */ -class Kernel : public detail::Wrapper -{ -public: - inline Kernel(const Program& program, const char* name, cl_int* err = NULL); - - //! \brief Default constructor - initializes to NULL. - Kernel() { } - - /*! \brief Constructor from cl_kernel - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - * This effectively transfers ownership of a refcount on the cl_kernel - * into the new Kernel object. - */ - explicit Kernel(const cl_kernel& kernel, bool retainObject = false) : - detail::Wrapper(kernel, retainObject) { } - - /*! \brief Assignment operator from cl_kernel - takes ownership. - * - * This effectively transfers ownership of a refcount on the rhs and calls - * clReleaseKernel() on the value previously held by this instance. - */ - Kernel& operator = (const cl_kernel& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Kernel(const Kernel& kernel) : detail::Wrapper(kernel) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Kernel& operator = (const Kernel &kernel) - { - detail::Wrapper::operator=(kernel); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Kernel(Kernel&& kernel) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(kernel)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Kernel& operator = (Kernel &&kernel) - { - detail::Wrapper::operator=(std::move(kernel)); - return *this; - } - - template - cl_int getInfo(cl_kernel_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetKernelInfo, object_, name, param), - __GET_KERNEL_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_kernel_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 - template - cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param), - __GET_KERNEL_ARG_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getArgInfo(cl_uint argIndex, cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_kernel_arg_info, name>::param_type param; - cl_int result = getArgInfo(argIndex, name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 - - template - cl_int getWorkGroupInfo( - const Device& device, cl_kernel_work_group_info name, T* param) const - { - return detail::errHandler( - detail::getInfo( - &::clGetKernelWorkGroupInfo, object_, device(), name, param), - __GET_KERNEL_WORK_GROUP_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getWorkGroupInfo(const Device& device, cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_kernel_work_group_info, name>::param_type param; - cl_int result = getWorkGroupInfo(device, name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - -#if (CL_HPP_TARGET_OPENCL_VERSION >= 200 && defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)) || CL_HPP_TARGET_OPENCL_VERSION >= 210 - cl_int getSubGroupInfo(const cl::Device &dev, cl_kernel_sub_group_info name, const cl::NDRange &range, size_type* param) const - { -#if CL_HPP_TARGET_OPENCL_VERSION >= 210 - - return detail::errHandler( - clGetKernelSubGroupInfo(object_, dev(), name, range.size(), range.get(), sizeof(size_type), param, nullptr), - __GET_KERNEL_SUB_GROUP_INFO_ERR); - -#else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 - - typedef clGetKernelSubGroupInfoKHR_fn PFN_clGetKernelSubGroupInfoKHR; - static PFN_clGetKernelSubGroupInfoKHR pfn_clGetKernelSubGroupInfoKHR = NULL; - CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetKernelSubGroupInfoKHR); - - return detail::errHandler( - pfn_clGetKernelSubGroupInfoKHR(object_, dev(), name, range.size(), range.get(), sizeof(size_type), param, nullptr), - __GET_KERNEL_SUB_GROUP_INFO_ERR); - -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 - } - - template - size_type getSubGroupInfo(const cl::Device &dev, const cl::NDRange &range, cl_int* err = NULL) const - { - size_type param; - cl_int result = getSubGroupInfo(dev, name, range, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - /*! \brief setArg overload taking a shared_ptr type - */ - template - cl_int setArg(cl_uint index, const cl::pointer &argPtr) - { - return detail::errHandler( - ::clSetKernelArgSVMPointer(object_, index, argPtr.get()), - __SET_KERNEL_ARGS_ERR); - } - - /*! \brief setArg overload taking a vector type. - */ - template - cl_int setArg(cl_uint index, const cl::vector &argPtr) - { - return detail::errHandler( - ::clSetKernelArgSVMPointer(object_, index, argPtr.data()), - __SET_KERNEL_ARGS_ERR); - } - - /*! \brief setArg overload taking a pointer type - */ - template - typename std::enable_if::value, cl_int>::type - setArg(cl_uint index, const T argPtr) - { - return detail::errHandler( - ::clSetKernelArgSVMPointer(object_, index, argPtr), - __SET_KERNEL_ARGS_ERR); - } -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 - - /*! \brief setArg overload taking a POD type - */ - template - typename std::enable_if::value, cl_int>::type - setArg(cl_uint index, const T &value) - { - return detail::errHandler( - ::clSetKernelArg( - object_, - index, - detail::KernelArgumentHandler::size(value), - detail::KernelArgumentHandler::ptr(value)), - __SET_KERNEL_ARGS_ERR); - } - - cl_int setArg(cl_uint index, size_type size, const void* argPtr) - { - return detail::errHandler( - ::clSetKernelArg(object_, index, size, argPtr), - __SET_KERNEL_ARGS_ERR); - } - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - /*! - * Specify a vector of SVM pointers that the kernel may access in - * addition to its arguments. - */ - cl_int setSVMPointers(const vector &pointerList) - { - return detail::errHandler( - ::clSetKernelExecInfo( - object_, - CL_KERNEL_EXEC_INFO_SVM_PTRS, - sizeof(void*)*pointerList.size(), - pointerList.data())); - } - - /*! - * Specify a std::array of SVM pointers that the kernel may access in - * addition to its arguments. - */ - template - cl_int setSVMPointers(const std::array &pointerList) - { - return detail::errHandler( - ::clSetKernelExecInfo( - object_, - CL_KERNEL_EXEC_INFO_SVM_PTRS, - sizeof(void*)*pointerList.size(), - pointerList.data())); - } - - /*! \brief Enable fine-grained system SVM. - * - * \note It is only possible to enable fine-grained system SVM if all devices - * in the context associated with kernel support it. - * - * \param svmEnabled True if fine-grained system SVM is requested. False otherwise. - * \return CL_SUCCESS if the function was executed succesfully. CL_INVALID_OPERATION - * if no devices in the context support fine-grained system SVM. - * - * \see clSetKernelExecInfo - */ - cl_int enableFineGrainedSystemSVM(bool svmEnabled) - { - cl_bool svmEnabled_ = svmEnabled ? CL_TRUE : CL_FALSE; - return detail::errHandler( - ::clSetKernelExecInfo( - object_, - CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM, - sizeof(cl_bool), - &svmEnabled_ - ) - ); - } - - template - void setSVMPointersHelper(std::array &pointerList, const pointer &t0, const pointer &t1, Ts & ... ts) - { - pointerList[index] = static_cast(t0.get()); - setSVMPointersHelper(pointerList, t1, ts...); - } - - template - typename std::enable_if::value, void>::type - setSVMPointersHelper(std::array &pointerList, T0 t0, T1 t1, Ts... ts) - { - pointerList[index] = static_cast(t0); - setSVMPointersHelper(pointerList, t1, ts...); - } - - template - void setSVMPointersHelper(std::array &pointerList, const pointer &t0) - { - pointerList[index] = static_cast(t0.get()); - } - - - template - typename std::enable_if::value, void>::type - setSVMPointersHelper(std::array &pointerList, T0 t0) - { - pointerList[index] = static_cast(t0); - } - - template - cl_int setSVMPointers(const T0 &t0, Ts & ... ts) - { - std::array pointerList; - - setSVMPointersHelper<0, 1 + sizeof...(Ts)>(pointerList, t0, ts...); - return detail::errHandler( - ::clSetKernelExecInfo( - object_, - CL_KERNEL_EXEC_INFO_SVM_PTRS, - sizeof(void*)*(1 + sizeof...(Ts)), - pointerList.data())); - } - - template - cl_int setExecInfo(cl_kernel_exec_info param_name, const T& val) - { - return detail::errHandler( - ::clSetKernelExecInfo( - object_, - param_name, - sizeof(T), - &val)); - } - - template - cl_int setExecInfo(typename detail::param_traits::param_type& val) - { - return setExecInfo(name, val); - } -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 - -#if CL_HPP_TARGET_OPENCL_VERSION >= 210 - /** - * Make a deep copy of the kernel object including its arguments. - * @return A new kernel object with internal state entirely separate from that - * of the original but with any arguments set on the original intact. - */ - Kernel clone() - { - cl_int error; - Kernel retValue(clCloneKernel(this->get(), &error)); - - detail::errHandler(error, __CLONE_KERNEL_ERR); - return retValue; - } -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 -}; - -/*! \class Program - * \brief Program interface that implements cl_program. - */ -class Program : public detail::Wrapper -{ -public: -#if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) - typedef vector> Binaries; - typedef vector Sources; -#else // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) - typedef vector > Binaries; - typedef vector > Sources; -#endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) - - Program( - const string& source, - bool build = false, - cl_int* err = NULL) - { - cl_int error; - - const char * strings = source.c_str(); - const size_type length = source.size(); - - Context context = Context::getDefault(err); - - object_ = ::clCreateProgramWithSource( - context(), (cl_uint)1, &strings, &length, &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); - - if (error == CL_SUCCESS && build) { - - error = ::clBuildProgram( - object_, - 0, - NULL, -#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) - "-cl-std=CL2.0", -#else - "", -#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) - NULL, - NULL); - - detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo()); - } - - if (err != NULL) { - *err = error; - } - } - - Program( - const Context& context, - const string& source, - bool build = false, - cl_int* err = NULL) - { - cl_int error; - - const char * strings = source.c_str(); - const size_type length = source.size(); - - object_ = ::clCreateProgramWithSource( - context(), (cl_uint)1, &strings, &length, &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); - - if (error == CL_SUCCESS && build) { - error = ::clBuildProgram( - object_, - 0, - NULL, -#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) - "-cl-std=CL2.0", -#else - "", -#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) - NULL, - NULL); - - detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo()); - } - - if (err != NULL) { - *err = error; - } - } - - /** - * Create a program from a vector of source strings and the default context. - * Does not compile or link the program. - */ - Program( - const Sources& sources, - cl_int* err = NULL) - { - cl_int error; - Context context = Context::getDefault(err); - - const size_type n = (size_type)sources.size(); - - vector lengths(n); - vector strings(n); - - for (size_type i = 0; i < n; ++i) { -#if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) - strings[i] = sources[(int)i].data(); - lengths[i] = sources[(int)i].length(); -#else // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) - strings[i] = sources[(int)i].first; - lengths[i] = sources[(int)i].second; -#endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) - } - - object_ = ::clCreateProgramWithSource( - context(), (cl_uint)n, strings.data(), lengths.data(), &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); - if (err != NULL) { - *err = error; - } - } - - /** - * Create a program from a vector of source strings and a provided context. - * Does not compile or link the program. - */ - Program( - const Context& context, - const Sources& sources, - cl_int* err = NULL) - { - cl_int error; - - const size_type n = (size_type)sources.size(); - - vector lengths(n); - vector strings(n); - - for (size_type i = 0; i < n; ++i) { -#if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) - strings[i] = sources[(int)i].data(); - lengths[i] = sources[(int)i].length(); -#else // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) - strings[i] = sources[(int)i].first; - lengths[i] = sources[(int)i].second; -#endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) - } - - object_ = ::clCreateProgramWithSource( - context(), (cl_uint)n, strings.data(), lengths.data(), &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); - if (err != NULL) { - *err = error; - } - } - - -#if CL_HPP_TARGET_OPENCL_VERSION >= 210 || (CL_HPP_TARGET_OPENCL_VERSION==200 && defined(CL_HPP_USE_IL_KHR)) - /** - * Program constructor to allow construction of program from SPIR-V or another IL. - * Valid for either OpenCL >= 2.1 or when CL_HPP_USE_IL_KHR is defined. - */ - Program( - const vector& IL, - bool build = false, - cl_int* err = NULL) - { - cl_int error; - - Context context = Context::getDefault(err); - -#if CL_HPP_TARGET_OPENCL_VERSION >= 210 - - object_ = ::clCreateProgramWithIL( - context(), static_cast(IL.data()), IL.size(), &error); - -#else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 - - typedef clCreateProgramWithILKHR_fn PFN_clCreateProgramWithILKHR; - static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = NULL; - CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR); - - object_ = pfn_clCreateProgramWithILKHR( - context(), static_cast(IL.data()), IL.size(), &error); - -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 - - detail::errHandler(error, __CREATE_PROGRAM_WITH_IL_ERR); - - if (error == CL_SUCCESS && build) { - - error = ::clBuildProgram( - object_, - 0, - NULL, -#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) - "-cl-std=CL2.0", -#else - "", -#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) - NULL, - NULL); - - detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo()); - } - - if (err != NULL) { - *err = error; - } - } - - /** - * Program constructor to allow construction of program from SPIR-V or another IL - * for a specific context. - * Valid for either OpenCL >= 2.1 or when CL_HPP_USE_IL_KHR is defined. - */ - Program( - const Context& context, - const vector& IL, - bool build = false, - cl_int* err = NULL) - { - cl_int error; - -#if CL_HPP_TARGET_OPENCL_VERSION >= 210 - - object_ = ::clCreateProgramWithIL( - context(), static_cast(IL.data()), IL.size(), &error); - -#else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 - - typedef clCreateProgramWithILKHR_fn PFN_clCreateProgramWithILKHR; - static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = NULL; - CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR); - - object_ = pfn_clCreateProgramWithILKHR( - context(), static_cast(IL.data()), IL.size(), &error); - -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 - - detail::errHandler(error, __CREATE_PROGRAM_WITH_IL_ERR); - - if (error == CL_SUCCESS && build) { - error = ::clBuildProgram( - object_, - 0, - NULL, -#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) - "-cl-std=CL2.0", -#else - "", -#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) - NULL, - NULL); - - detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo()); - } - - if (err != NULL) { - *err = error; - } - } -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 - - /** - * Construct a program object from a list of devices and a per-device list of binaries. - * \param context A valid OpenCL context in which to construct the program. - * \param devices A vector of OpenCL device objects for which the program will be created. - * \param binaries A vector of pairs of a pointer to a binary object and its length. - * \param binaryStatus An optional vector that on completion will be resized to - * match the size of binaries and filled with values to specify if each binary - * was successfully loaded. - * Set to CL_SUCCESS if the binary was successfully loaded. - * Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL. - * Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device. - * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors: - * CL_INVALID_CONTEXT if context is not a valid context. - * CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices; - * or if any entry in binaries is NULL or has length 0. - * CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context. - * CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device. - * CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host. - */ - Program( - const Context& context, - const vector& devices, - const Binaries& binaries, - vector* binaryStatus = NULL, - cl_int* err = NULL) - { - cl_int error; - - const size_type numDevices = devices.size(); - - // Catch size mismatch early and return - if(binaries.size() != numDevices) { - error = CL_INVALID_VALUE; - detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); - if (err != NULL) { - *err = error; - } - return; - } - - - vector lengths(numDevices); - vector images(numDevices); -#if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) - for (size_type i = 0; i < numDevices; ++i) { - images[i] = binaries[i].data(); - lengths[i] = binaries[(int)i].size(); - } -#else // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) - for (size_type i = 0; i < numDevices; ++i) { - images[i] = (const unsigned char*)binaries[i].first; - lengths[i] = binaries[(int)i].second; - } -#endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) - - vector deviceIDs(numDevices); - for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { - deviceIDs[deviceIndex] = (devices[deviceIndex])(); - } - - if(binaryStatus) { - binaryStatus->resize(numDevices); - } - - object_ = ::clCreateProgramWithBinary( - context(), (cl_uint) devices.size(), - deviceIDs.data(), - lengths.data(), images.data(), (binaryStatus != NULL && numDevices > 0) - ? &binaryStatus->front() - : NULL, &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); - if (err != NULL) { - *err = error; - } - } - - -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 - /** - * Create program using builtin kernels. - * \param kernelNames Semi-colon separated list of builtin kernel names - */ - Program( - const Context& context, - const vector& devices, - const string& kernelNames, - cl_int* err = NULL) - { - cl_int error; - - - size_type numDevices = devices.size(); - vector deviceIDs(numDevices); - for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { - deviceIDs[deviceIndex] = (devices[deviceIndex])(); - } - - object_ = ::clCreateProgramWithBuiltInKernels( - context(), - (cl_uint) devices.size(), - deviceIDs.data(), - kernelNames.c_str(), - &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 - - Program() { } - - - /*! \brief Constructor from cl_program - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - */ - explicit Program(const cl_program& program, bool retainObject = false) : - detail::Wrapper(program, retainObject) { } - - Program& operator = (const cl_program& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Program(const Program& program) : detail::Wrapper(program) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Program& operator = (const Program &program) - { - detail::Wrapper::operator=(program); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Program(Program&& program) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(program)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Program& operator = (Program &&program) - { - detail::Wrapper::operator=(std::move(program)); - return *this; - } - - cl_int build( - const vector& devices, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL) const - { - size_type numDevices = devices.size(); - vector deviceIDs(numDevices); - - for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { - deviceIDs[deviceIndex] = (devices[deviceIndex])(); - } - - cl_int buildError = ::clBuildProgram( - object_, - (cl_uint) - devices.size(), - deviceIDs.data(), - options, - notifyFptr, - data); - - return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, getBuildInfo()); - } - - cl_int build( - const Device& device, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL) const - { - cl_device_id deviceID = device(); - - cl_int buildError = ::clBuildProgram( - object_, - 1, - &deviceID, - options, - notifyFptr, - data); - - BuildLogType buildLog(0); - buildLog.push_back(std::make_pair(device, getBuildInfo(device))); - return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, buildLog); - } - - cl_int build( - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL) const - { - cl_int buildError = ::clBuildProgram( - object_, - 0, - NULL, - options, - notifyFptr, - data); - - return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, getBuildInfo()); - } - -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 - cl_int compile( - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL) const - { - cl_int error = ::clCompileProgram( - object_, - 0, - NULL, - options, - 0, - NULL, - NULL, - notifyFptr, - data); - return detail::buildErrHandler(error, __COMPILE_PROGRAM_ERR, getBuildInfo()); - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 - - template - cl_int getInfo(cl_program_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetProgramInfo, object_, name, param), - __GET_PROGRAM_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_program_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - template - cl_int getBuildInfo( - const Device& device, cl_program_build_info name, T* param) const - { - return detail::errHandler( - detail::getInfo( - &::clGetProgramBuildInfo, object_, device(), name, param), - __GET_PROGRAM_BUILD_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getBuildInfo(const Device& device, cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_program_build_info, name>::param_type param; - cl_int result = getBuildInfo(device, name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - /** - * Build info function that returns a vector of device/info pairs for the specified - * info type and for all devices in the program. - * On an error reading the info for any device, an empty vector of info will be returned. - */ - template - vector::param_type>> - getBuildInfo(cl_int *err = NULL) const - { - cl_int result = CL_SUCCESS; - - auto devs = getInfo(&result); - vector::param_type>> - devInfo; - - // If there was an initial error from getInfo return the error - if (result != CL_SUCCESS) { - if (err != NULL) { - *err = result; - } - return devInfo; - } - - for (const cl::Device &d : devs) { - typename detail::param_traits< - detail::cl_program_build_info, name>::param_type param; - result = getBuildInfo(d, name, ¶m); - devInfo.push_back( - std::pair::param_type> - (d, param)); - if (result != CL_SUCCESS) { - // On error, leave the loop and return the error code - break; - } - } - if (err != NULL) { - *err = result; - } - if (result != CL_SUCCESS) { - devInfo.clear(); - } - return devInfo; - } - - cl_int createKernels(vector* kernels) - { - cl_uint numKernels; - cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); - } - - vector value(numKernels); - - err = ::clCreateKernelsInProgram( - object_, numKernels, value.data(), NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); - } - - if (kernels) { - kernels->resize(value.size()); - - // Assign to param, constructing with retain behaviour - // to correctly capture each underlying CL object - for (size_type i = 0; i < value.size(); i++) { - // We do not need to retain because this kernel is being created - // by the runtime - (*kernels)[i] = Kernel(value[i], false); - } - } - return CL_SUCCESS; - } - -#if CL_HPP_TARGET_OPENCL_VERSION >= 220 -#if defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS) - /*! \brief Registers a callback function to be called when destructors for - * program scope global variables are complete and before the - * program is released. - * - * Wraps clSetProgramReleaseCallback(). - * - * Each call to this function registers the specified user callback function - * on a callback stack associated with program. The registered user callback - * functions are called in the reverse order in which they were registered. - */ - CL_API_PREFIX__VERSION_2_2_DEPRECATED cl_int setReleaseCallback( - void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data), - void * user_data = NULL) CL_API_SUFFIX__VERSION_2_2_DEPRECATED - { - return detail::errHandler( - ::clSetProgramReleaseCallback( - object_, - pfn_notify, - user_data), - __SET_PROGRAM_RELEASE_CALLBACK_ERR); - } -#endif // #if defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS) - - /*! \brief Sets a SPIR-V specialization constant. - * - * Wraps clSetProgramSpecializationConstant(). - */ - template - typename std::enable_if::value, cl_int>::type - setSpecializationConstant(cl_uint index, const T &value) - { - return detail::errHandler( - ::clSetProgramSpecializationConstant( - object_, - index, - sizeof(value), - &value), - __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR); - } - - /*! \brief Sets a SPIR-V specialization constant. - * - * Wraps clSetProgramSpecializationConstant(). - */ - cl_int setSpecializationConstant(cl_uint index, size_type size, const void* value) - { - return detail::errHandler( - ::clSetProgramSpecializationConstant( - object_, - index, - size, - value), - __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR); - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 220 -}; - -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 -inline Program linkProgram( - Program input1, - Program input2, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL, - cl_int* err = NULL) -{ - cl_int error_local = CL_SUCCESS; - - cl_program programs[2] = { input1(), input2() }; - - Context ctx = input1.getInfo(&error_local); - if(error_local!=CL_SUCCESS) { - detail::errHandler(error_local, __LINK_PROGRAM_ERR); - } - - cl_program prog = ::clLinkProgram( - ctx(), - 0, - NULL, - options, - 2, - programs, - notifyFptr, - data, - &error_local); - - detail::errHandler(error_local,__COMPILE_PROGRAM_ERR); - if (err != NULL) { - *err = error_local; - } - - return Program(prog); -} - -inline Program linkProgram( - vector inputPrograms, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL, - cl_int* err = NULL) -{ - cl_int error_local = CL_SUCCESS; - - vector programs(inputPrograms.size()); - - for (unsigned int i = 0; i < inputPrograms.size(); i++) { - programs[i] = inputPrograms[i](); - } - - Context ctx; - if(inputPrograms.size() > 0) { - ctx = inputPrograms[0].getInfo(&error_local); - if(error_local!=CL_SUCCESS) { - detail::errHandler(error_local, __LINK_PROGRAM_ERR); - } - } - cl_program prog = ::clLinkProgram( - ctx(), - 0, - NULL, - options, - (cl_uint)inputPrograms.size(), - programs.data(), - notifyFptr, - data, - &error_local); - - detail::errHandler(error_local,__COMPILE_PROGRAM_ERR); - if (err != NULL) { - *err = error_local; - } - - return Program(prog, false); -} -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 - -// Template specialization for CL_PROGRAM_BINARIES -template <> -inline cl_int cl::Program::getInfo(cl_program_info name, vector>* param) const -{ - if (name != CL_PROGRAM_BINARIES) { - return CL_INVALID_VALUE; - } - if (param) { - // Resize the parameter array appropriately for each allocation - // and pass down to the helper - - vector sizes = getInfo(); - size_type numBinaries = sizes.size(); - - // Resize the parameter array and constituent arrays - param->resize(numBinaries); - for (size_type i = 0; i < numBinaries; ++i) { - (*param)[i].resize(sizes[i]); - } - - return detail::errHandler( - detail::getInfo(&::clGetProgramInfo, object_, name, param), - __GET_PROGRAM_INFO_ERR); - } - - return CL_SUCCESS; -} - -template<> -inline vector> cl::Program::getInfo(cl_int* err) const -{ - vector> binariesVectors; - - cl_int result = getInfo(CL_PROGRAM_BINARIES, &binariesVectors); - if (err != NULL) { - *err = result; - } - return binariesVectors; -} - -#if CL_HPP_TARGET_OPENCL_VERSION >= 220 -// Template specialization for clSetProgramSpecializationConstant -template <> -inline cl_int cl::Program::setSpecializationConstant(cl_uint index, const bool &value) -{ - cl_uchar ucValue = value ? CL_UCHAR_MAX : 0; - return detail::errHandler( - ::clSetProgramSpecializationConstant( - object_, - index, - sizeof(ucValue), - &ucValue), - __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR); -} -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 220 - -inline Kernel::Kernel(const Program& program, const char* name, cl_int* err) -{ - cl_int error; - - object_ = ::clCreateKernel(program(), name, &error); - detail::errHandler(error, __CREATE_KERNEL_ERR); - - if (err != NULL) { - *err = error; - } - -} - -enum class QueueProperties : cl_command_queue_properties -{ - None = 0, - Profiling = CL_QUEUE_PROFILING_ENABLE, - OutOfOrder = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, -}; - -inline QueueProperties operator|(QueueProperties lhs, QueueProperties rhs) -{ - return static_cast(static_cast(lhs) | static_cast(rhs)); -} - -inline QueueProperties operator&(QueueProperties lhs, QueueProperties rhs) -{ - return static_cast(static_cast(lhs) & static_cast(rhs)); -} - -/*! \class CommandQueue - * \brief CommandQueue interface for cl_command_queue. - */ -class CommandQueue : public detail::Wrapper -{ -private: - static std::once_flag default_initialized_; - static CommandQueue default_; - static cl_int default_error_; - - /*! \brief Create the default command queue returned by @ref getDefault. - * - * It sets default_error_ to indicate success or failure. It does not throw - * @c cl::Error. - */ - static void makeDefault() - { - /* We don't want to throw an error from this function, so we have to - * catch and set the error flag. - */ -#if defined(CL_HPP_ENABLE_EXCEPTIONS) - try -#endif - { - int error; - Context context = Context::getDefault(&error); - - if (error != CL_SUCCESS) { - default_error_ = error; - } - else { - Device device = Device::getDefault(); - default_ = CommandQueue(context, device, 0, &default_error_); - } - } -#if defined(CL_HPP_ENABLE_EXCEPTIONS) - catch (cl::Error &e) { - default_error_ = e.err(); - } -#endif - } - - /*! \brief Create the default command queue. - * - * This sets @c default_. It does not throw - * @c cl::Error. - */ - static void makeDefaultProvided(const CommandQueue &c) { - default_ = c; - } - -public: -#ifdef CL_HPP_UNIT_TEST_ENABLE - /*! \brief Reset the default. - * - * This sets @c default_ to an empty value to support cleanup in - * the unit test framework. - * This function is not thread safe. - */ - static void unitTestClearDefault() { - default_ = CommandQueue(); - } -#endif // #ifdef CL_HPP_UNIT_TEST_ENABLE - - - /*! - * \brief Constructs a CommandQueue based on passed properties. - * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified. - */ - CommandQueue( - cl_command_queue_properties properties, - cl_int* err = NULL) - { - cl_int error; - - Context context = Context::getDefault(&error); - detail::errHandler(error, __CREATE_CONTEXT_ERR); - - if (error != CL_SUCCESS) { - if (err != NULL) { - *err = error; - } - } - else { - Device device = context.getInfo()[0]; - bool useWithProperties; - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 - // Run-time decision based on the actual platform - { - cl_uint version = detail::getContextPlatformVersion(context()); - useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above - } -#elif CL_HPP_TARGET_OPENCL_VERSION >= 200 - useWithProperties = true; -#else - useWithProperties = false; -#endif - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - if (useWithProperties) { - cl_queue_properties queue_properties[] = { - CL_QUEUE_PROPERTIES, properties, 0 }; - if ((properties & CL_QUEUE_ON_DEVICE) == 0) { - object_ = ::clCreateCommandQueueWithProperties( - context(), device(), queue_properties, &error); - } - else { - error = CL_INVALID_QUEUE_PROPERTIES; - } - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 -#if CL_HPP_MINIMUM_OPENCL_VERSION < 200 - if (!useWithProperties) { - object_ = ::clCreateCommandQueue( - context(), device(), properties, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200 - } - } - - /*! - * \brief Constructs a CommandQueue based on passed properties. - * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified. - */ - CommandQueue( - QueueProperties properties, - cl_int* err = NULL) - { - cl_int error; - - Context context = Context::getDefault(&error); - detail::errHandler(error, __CREATE_CONTEXT_ERR); - - if (error != CL_SUCCESS) { - if (err != NULL) { - *err = error; - } - } - else { - Device device = context.getInfo()[0]; - bool useWithProperties; - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 - // Run-time decision based on the actual platform - { - cl_uint version = detail::getContextPlatformVersion(context()); - useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above - } -#elif CL_HPP_TARGET_OPENCL_VERSION >= 200 - useWithProperties = true; -#else - useWithProperties = false; -#endif - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - if (useWithProperties) { - cl_queue_properties queue_properties[] = { - CL_QUEUE_PROPERTIES, static_cast(properties), 0 }; - - object_ = ::clCreateCommandQueueWithProperties( - context(), device(), queue_properties, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 -#if CL_HPP_MINIMUM_OPENCL_VERSION < 200 - if (!useWithProperties) { - object_ = ::clCreateCommandQueue( - context(), device(), static_cast(properties), &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200 - - } - } - - /*! - * \brief Constructs a CommandQueue for an implementation defined device in the given context - * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified. - */ - explicit CommandQueue( - const Context& context, - cl_command_queue_properties properties = 0, - cl_int* err = NULL) - { - cl_int error; - bool useWithProperties; - vector devices; - error = context.getInfo(CL_CONTEXT_DEVICES, &devices); - - detail::errHandler(error, __CREATE_CONTEXT_ERR); - - if (error != CL_SUCCESS) - { - if (err != NULL) { - *err = error; - } - return; - } - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 - // Run-time decision based on the actual platform - { - cl_uint version = detail::getContextPlatformVersion(context()); - useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above - } -#elif CL_HPP_TARGET_OPENCL_VERSION >= 200 - useWithProperties = true; -#else - useWithProperties = false; -#endif - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - if (useWithProperties) { - cl_queue_properties queue_properties[] = { - CL_QUEUE_PROPERTIES, properties, 0 }; - if ((properties & CL_QUEUE_ON_DEVICE) == 0) { - object_ = ::clCreateCommandQueueWithProperties( - context(), devices[0](), queue_properties, &error); - } - else { - error = CL_INVALID_QUEUE_PROPERTIES; - } - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 -#if CL_HPP_MINIMUM_OPENCL_VERSION < 200 - if (!useWithProperties) { - object_ = ::clCreateCommandQueue( - context(), devices[0](), properties, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200 - } - - /*! - * \brief Constructs a CommandQueue for an implementation defined device in the given context - * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified. - */ - explicit CommandQueue( - const Context& context, - QueueProperties properties, - cl_int* err = NULL) - { - cl_int error; - bool useWithProperties; - vector devices; - error = context.getInfo(CL_CONTEXT_DEVICES, &devices); - - detail::errHandler(error, __CREATE_CONTEXT_ERR); - - if (error != CL_SUCCESS) - { - if (err != NULL) { - *err = error; - } - return; - } - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 - // Run-time decision based on the actual platform - { - cl_uint version = detail::getContextPlatformVersion(context()); - useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above - } -#elif CL_HPP_TARGET_OPENCL_VERSION >= 200 - useWithProperties = true; -#else - useWithProperties = false; -#endif - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - if (useWithProperties) { - cl_queue_properties queue_properties[] = { - CL_QUEUE_PROPERTIES, static_cast(properties), 0 }; - object_ = ::clCreateCommandQueueWithProperties( - context(), devices[0](), queue_properties, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 -#if CL_HPP_MINIMUM_OPENCL_VERSION < 200 - if (!useWithProperties) { - object_ = ::clCreateCommandQueue( - context(), devices[0](), static_cast(properties), &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200 - } - - /*! - * \brief Constructs a CommandQueue for a passed device and context - * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified. - */ - CommandQueue( - const Context& context, - const Device& device, - cl_command_queue_properties properties = 0, - cl_int* err = NULL) - { - cl_int error; - bool useWithProperties; - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 - // Run-time decision based on the actual platform - { - cl_uint version = detail::getContextPlatformVersion(context()); - useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above - } -#elif CL_HPP_TARGET_OPENCL_VERSION >= 200 - useWithProperties = true; -#else - useWithProperties = false; -#endif - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - if (useWithProperties) { - cl_queue_properties queue_properties[] = { - CL_QUEUE_PROPERTIES, properties, 0 }; - object_ = ::clCreateCommandQueueWithProperties( - context(), device(), queue_properties, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 -#if CL_HPP_MINIMUM_OPENCL_VERSION < 200 - if (!useWithProperties) { - object_ = ::clCreateCommandQueue( - context(), device(), properties, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200 - } - - /*! - * \brief Constructs a CommandQueue for a passed device and context - * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified. - */ - CommandQueue( - const Context& context, - const Device& device, - QueueProperties properties, - cl_int* err = NULL) - { - cl_int error; - bool useWithProperties; - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 - // Run-time decision based on the actual platform - { - cl_uint version = detail::getContextPlatformVersion(context()); - useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above - } -#elif CL_HPP_TARGET_OPENCL_VERSION >= 200 - useWithProperties = true; -#else - useWithProperties = false; -#endif - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - if (useWithProperties) { - cl_queue_properties queue_properties[] = { - CL_QUEUE_PROPERTIES, static_cast(properties), 0 }; - object_ = ::clCreateCommandQueueWithProperties( - context(), device(), queue_properties, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 -#if CL_HPP_MINIMUM_OPENCL_VERSION < 200 - if (!useWithProperties) { - object_ = ::clCreateCommandQueue( - context(), device(), static_cast(properties), &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200 - } - - static CommandQueue getDefault(cl_int * err = NULL) - { - std::call_once(default_initialized_, makeDefault); -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - detail::errHandler(default_error_, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); -#else // CL_HPP_TARGET_OPENCL_VERSION >= 200 - detail::errHandler(default_error_, __CREATE_COMMAND_QUEUE_ERR); -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 - if (err != NULL) { - *err = default_error_; - } - return default_; - } - - /** - * Modify the default command queue to be used by - * subsequent operations. - * Will only set the default if no default was previously created. - * @return updated default command queue. - * Should be compared to the passed value to ensure that it was updated. - */ - static CommandQueue setDefault(const CommandQueue &default_queue) - { - std::call_once(default_initialized_, makeDefaultProvided, std::cref(default_queue)); - detail::errHandler(default_error_); - return default_; - } - - CommandQueue() { } - - - /*! \brief Constructor from cl_command_queue - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - */ - explicit CommandQueue(const cl_command_queue& commandQueue, bool retainObject = false) : - detail::Wrapper(commandQueue, retainObject) { } - - CommandQueue& operator = (const cl_command_queue& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - CommandQueue(const CommandQueue& queue) : detail::Wrapper(queue) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - CommandQueue& operator = (const CommandQueue &queue) - { - detail::Wrapper::operator=(queue); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - CommandQueue(CommandQueue&& queue) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(queue)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - CommandQueue& operator = (CommandQueue &&queue) - { - detail::Wrapper::operator=(std::move(queue)); - return *this; - } - - template - cl_int getInfo(cl_command_queue_info name, T* param) const - { - return detail::errHandler( - detail::getInfo( - &::clGetCommandQueueInfo, object_, name, param), - __GET_COMMAND_QUEUE_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_command_queue_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - cl_int enqueueReadBuffer( - const Buffer& buffer, - cl_bool blocking, - size_type offset, - size_type size, - void* ptr, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueReadBuffer( - object_, buffer(), blocking, offset, size, - ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_READ_BUFFER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueWriteBuffer( - const Buffer& buffer, - cl_bool blocking, - size_type offset, - size_type size, - const void* ptr, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueWriteBuffer( - object_, buffer(), blocking, offset, size, - ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_WRITE_BUFFER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueCopyBuffer( - const Buffer& src, - const Buffer& dst, - size_type src_offset, - size_type dst_offset, - size_type size, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueCopyBuffer( - object_, src(), dst(), src_offset, dst_offset, size, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQEUE_COPY_BUFFER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } -#if CL_HPP_TARGET_OPENCL_VERSION >= 110 - cl_int enqueueReadBufferRect( - const Buffer& buffer, - cl_bool blocking, - const array& buffer_offset, - const array& host_offset, - const array& region, - size_type buffer_row_pitch, - size_type buffer_slice_pitch, - size_type host_row_pitch, - size_type host_slice_pitch, - void *ptr, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueReadBufferRect( - object_, - buffer(), - blocking, - buffer_offset.data(), - host_offset.data(), - region.data(), - buffer_row_pitch, - buffer_slice_pitch, - host_row_pitch, - host_slice_pitch, - ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_READ_BUFFER_RECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueWriteBufferRect( - const Buffer& buffer, - cl_bool blocking, - const array& buffer_offset, - const array& host_offset, - const array& region, - size_type buffer_row_pitch, - size_type buffer_slice_pitch, - size_type host_row_pitch, - size_type host_slice_pitch, - const void *ptr, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueWriteBufferRect( - object_, - buffer(), - blocking, - buffer_offset.data(), - host_offset.data(), - region.data(), - buffer_row_pitch, - buffer_slice_pitch, - host_row_pitch, - host_slice_pitch, - ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_WRITE_BUFFER_RECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueCopyBufferRect( - const Buffer& src, - const Buffer& dst, - const array& src_origin, - const array& dst_origin, - const array& region, - size_type src_row_pitch, - size_type src_slice_pitch, - size_type dst_row_pitch, - size_type dst_slice_pitch, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueCopyBufferRect( - object_, - src(), - dst(), - src_origin.data(), - dst_origin.data(), - region.data(), - src_row_pitch, - src_slice_pitch, - dst_row_pitch, - dst_slice_pitch, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQEUE_COPY_BUFFER_RECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 - /** - * Enqueue a command to fill a buffer object with a pattern - * of a given size. The pattern is specified as a vector type. - * \tparam PatternType The datatype of the pattern field. - * The pattern type must be an accepted OpenCL data type. - * \tparam offset Is the offset in bytes into the buffer at - * which to start filling. This must be a multiple of - * the pattern size. - * \tparam size Is the size in bytes of the region to fill. - * This must be a multiple of the pattern size. - */ - template - cl_int enqueueFillBuffer( - const Buffer& buffer, - PatternType pattern, - size_type offset, - size_type size, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueFillBuffer( - object_, - buffer(), - static_cast(&pattern), - sizeof(PatternType), - offset, - size, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_FILL_BUFFER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 - - cl_int enqueueReadImage( - const Image& image, - cl_bool blocking, - const array& origin, - const array& region, - size_type row_pitch, - size_type slice_pitch, - void* ptr, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueReadImage( - object_, - image(), - blocking, - origin.data(), - region.data(), - row_pitch, - slice_pitch, - ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_READ_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueWriteImage( - const Image& image, - cl_bool blocking, - const array& origin, - const array& region, - size_type row_pitch, - size_type slice_pitch, - const void* ptr, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueWriteImage( - object_, - image(), - blocking, - origin.data(), - region.data(), - row_pitch, - slice_pitch, - ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_WRITE_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueCopyImage( - const Image& src, - const Image& dst, - const array& src_origin, - const array& dst_origin, - const array& region, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueCopyImage( - object_, - src(), - dst(), - src_origin.data(), - dst_origin.data(), - region.data(), - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_COPY_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 - /** - * Enqueue a command to fill an image object with a specified color. - * \param fillColor is the color to use to fill the image. - * This is a four component RGBA floating-point color value if - * the image channel data type is not an unnormalized signed or - * unsigned data type. - */ - cl_int enqueueFillImage( - const Image& image, - cl_float4 fillColor, - const array& origin, - const array& region, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueFillImage( - object_, - image(), - static_cast(&fillColor), - origin.data(), - region.data(), - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_FILL_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * Enqueue a command to fill an image object with a specified color. - * \param fillColor is the color to use to fill the image. - * This is a four component RGBA signed integer color value if - * the image channel data type is an unnormalized signed integer - * type. - */ - cl_int enqueueFillImage( - const Image& image, - cl_int4 fillColor, - const array& origin, - const array& region, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueFillImage( - object_, - image(), - static_cast(&fillColor), - origin.data(), - region.data(), - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_FILL_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * Enqueue a command to fill an image object with a specified color. - * \param fillColor is the color to use to fill the image. - * This is a four component RGBA unsigned integer color value if - * the image channel data type is an unnormalized unsigned integer - * type. - */ - cl_int enqueueFillImage( - const Image& image, - cl_uint4 fillColor, - const array& origin, - const array& region, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueFillImage( - object_, - image(), - static_cast(&fillColor), - origin.data(), - region.data(), - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_FILL_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 - - cl_int enqueueCopyImageToBuffer( - const Image& src, - const Buffer& dst, - const array& src_origin, - const array& region, - size_type dst_offset, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueCopyImageToBuffer( - object_, - src(), - dst(), - src_origin.data(), - region.data(), - dst_offset, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueCopyBufferToImage( - const Buffer& src, - const Image& dst, - size_type src_offset, - const array& dst_origin, - const array& region, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueCopyBufferToImage( - object_, - src(), - dst(), - src_offset, - dst_origin.data(), - region.data(), - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - void* enqueueMapBuffer( - const Buffer& buffer, - cl_bool blocking, - cl_map_flags flags, - size_type offset, - size_type size, - const vector* events = NULL, - Event* event = NULL, - cl_int* err = NULL) const - { - cl_event tmp; - cl_int error; - void * result = ::clEnqueueMapBuffer( - object_, buffer(), blocking, flags, offset, size, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL, - &error); - - detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - if (event != NULL && error == CL_SUCCESS) - *event = tmp; - - return result; - } - - void* enqueueMapImage( - const Image& buffer, - cl_bool blocking, - cl_map_flags flags, - const array& origin, - const array& region, - size_type * row_pitch, - size_type * slice_pitch, - const vector* events = NULL, - Event* event = NULL, - cl_int* err = NULL) const - { - cl_event tmp; - cl_int error; - void * result = ::clEnqueueMapImage( - object_, buffer(), blocking, flags, - origin.data(), - region.data(), - row_pitch, slice_pitch, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL, - &error); - - detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - if (event != NULL && error == CL_SUCCESS) - *event = tmp; - return result; - } - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - /** - * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer. - * This variant takes a raw SVM pointer. - */ - template - cl_int enqueueMapSVM( - T* ptr, - cl_bool blocking, - cl_map_flags flags, - size_type size, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler(::clEnqueueSVMMap( - object_, blocking, flags, static_cast(ptr), size, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_MAP_BUFFER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - - /** - * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer. - * This variant takes a cl::pointer instance. - */ - template - cl_int enqueueMapSVM( - cl::pointer &ptr, - cl_bool blocking, - cl_map_flags flags, - size_type size, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler(::clEnqueueSVMMap( - object_, blocking, flags, static_cast(ptr.get()), size, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_MAP_BUFFER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer. - * This variant takes a cl::vector instance. - */ - template - cl_int enqueueMapSVM( - cl::vector &container, - cl_bool blocking, - cl_map_flags flags, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler(::clEnqueueSVMMap( - object_, blocking, flags, static_cast(container.data()), container.size()*sizeof(T), - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_MAP_BUFFER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 - - cl_int enqueueUnmapMemObject( - const Memory& memory, - void* mapped_ptr, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueUnmapMemObject( - object_, memory(), mapped_ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - /** - * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime. - * This variant takes a raw SVM pointer. - */ - template - cl_int enqueueUnmapSVM( - T* ptr, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueSVMUnmap( - object_, static_cast(ptr), - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime. - * This variant takes a cl::pointer instance. - */ - template - cl_int enqueueUnmapSVM( - cl::pointer &ptr, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueSVMUnmap( - object_, static_cast(ptr.get()), - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime. - * This variant takes a cl::vector instance. - */ - template - cl_int enqueueUnmapSVM( - cl::vector &container, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueSVMUnmap( - object_, static_cast(container.data()), - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 - -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 - /** - * Enqueues a marker command which waits for either a list of events to complete, - * or all previously enqueued commands to complete. - * - * Enqueues a marker command which waits for either a list of events to complete, - * or if the list is empty it waits for all commands previously enqueued in command_queue - * to complete before it completes. This command returns an event which can be waited on, - * i.e. this event can be waited on to insure that all events either in the event_wait_list - * or all previously enqueued commands, queued before this command to command_queue, - * have completed. - */ - cl_int enqueueMarkerWithWaitList( - const vector *events = 0, - Event *event = 0) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueMarkerWithWaitList( - object_, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_MARKER_WAIT_LIST_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * A synchronization point that enqueues a barrier operation. - * - * Enqueues a barrier command which waits for either a list of events to complete, - * or if the list is empty it waits for all commands previously enqueued in command_queue - * to complete before it completes. This command blocks command execution, that is, any - * following commands enqueued after it do not execute until it completes. This command - * returns an event which can be waited on, i.e. this event can be waited on to insure that - * all events either in the event_wait_list or all previously enqueued commands, queued - * before this command to command_queue, have completed. - */ - cl_int enqueueBarrierWithWaitList( - const vector *events = 0, - Event *event = 0) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueBarrierWithWaitList( - object_, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_BARRIER_WAIT_LIST_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * Enqueues a command to indicate with which device a set of memory objects - * should be associated. - */ - cl_int enqueueMigrateMemObjects( - const vector &memObjects, - cl_mem_migration_flags flags, - const vector* events = NULL, - Event* event = NULL - ) const - { - cl_event tmp; - - vector localMemObjects(memObjects.size()); - - for( int i = 0; i < (int)memObjects.size(); ++i ) { - localMemObjects[i] = memObjects[i](); - } - - cl_int err = detail::errHandler( - ::clEnqueueMigrateMemObjects( - object_, - (cl_uint)memObjects.size(), - localMemObjects.data(), - flags, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 - - -#if CL_HPP_TARGET_OPENCL_VERSION >= 210 - /** - * Enqueues a command that will allow the host associate ranges within a set of - * SVM allocations with a device. - * @param sizes - The length from each pointer to migrate. - */ - template - cl_int enqueueMigrateSVM( - const cl::vector &svmRawPointers, - const cl::vector &sizes, - cl_mem_migration_flags flags = 0, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler(::clEnqueueSVMMigrateMem( - object_, - svmRawPointers.size(), static_cast(svmRawPointers.data()), - sizes.data(), // array of sizes not passed - flags, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_MIGRATE_SVM_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * Enqueues a command that will allow the host associate a set of SVM allocations with - * a device. - */ - template - cl_int enqueueMigrateSVM( - const cl::vector &svmRawPointers, - cl_mem_migration_flags flags = 0, - const vector* events = NULL, - Event* event = NULL) const - { - return enqueueMigrateSVM(svmRawPointers, cl::vector(svmRawPointers.size()), flags, events, event); - } - - - /** - * Enqueues a command that will allow the host associate ranges within a set of - * SVM allocations with a device. - * @param sizes - The length from each pointer to migrate. - */ - template - cl_int enqueueMigrateSVM( - const cl::vector> &svmPointers, - const cl::vector &sizes, - cl_mem_migration_flags flags = 0, - const vector* events = NULL, - Event* event = NULL) const - { - cl::vector svmRawPointers; - svmRawPointers.reserve(svmPointers.size()); - for (auto p : svmPointers) { - svmRawPointers.push_back(static_cast(p.get())); - } - - return enqueueMigrateSVM(svmRawPointers, sizes, flags, events, event); - } - - - /** - * Enqueues a command that will allow the host associate a set of SVM allocations with - * a device. - */ - template - cl_int enqueueMigrateSVM( - const cl::vector> &svmPointers, - cl_mem_migration_flags flags = 0, - const vector* events = NULL, - Event* event = NULL) const - { - return enqueueMigrateSVM(svmPointers, cl::vector(svmPointers.size()), flags, events, event); - } - - /** - * Enqueues a command that will allow the host associate ranges within a set of - * SVM allocations with a device. - * @param sizes - The length from the beginning of each container to migrate. - */ - template - cl_int enqueueMigrateSVM( - const cl::vector> &svmContainers, - const cl::vector &sizes, - cl_mem_migration_flags flags = 0, - const vector* events = NULL, - Event* event = NULL) const - { - cl::vector svmRawPointers; - svmRawPointers.reserve(svmContainers.size()); - for (auto p : svmContainers) { - svmRawPointers.push_back(static_cast(p.data())); - } - - return enqueueMigrateSVM(svmRawPointers, sizes, flags, events, event); - } - - /** - * Enqueues a command that will allow the host associate a set of SVM allocations with - * a device. - */ - template - cl_int enqueueMigrateSVM( - const cl::vector> &svmContainers, - cl_mem_migration_flags flags = 0, - const vector* events = NULL, - Event* event = NULL) const - { - return enqueueMigrateSVM(svmContainers, cl::vector(svmContainers.size()), flags, events, event); - } - -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 - - cl_int enqueueNDRangeKernel( - const Kernel& kernel, - const NDRange& offset, - const NDRange& global, - const NDRange& local = NullRange, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueNDRangeKernel( - object_, kernel(), (cl_uint) global.dimensions(), - offset.dimensions() != 0 ? (const size_type*) offset : NULL, - (const size_type*) global, - local.dimensions() != 0 ? (const size_type*) local : NULL, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_NDRANGE_KERNEL_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - -#if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) - CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask( - const Kernel& kernel, - const vector* events = NULL, - Event* event = NULL) const CL_API_SUFFIX__VERSION_1_2_DEPRECATED - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueTask( - object_, kernel(), - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_TASK_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } -#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) - - cl_int enqueueNativeKernel( - void (CL_CALLBACK *userFptr)(void *), - std::pair args, - const vector* mem_objects = NULL, - const vector* mem_locs = NULL, - const vector* events = NULL, - Event* event = NULL) const - { - size_type elements = 0; - if (mem_objects != NULL) { - elements = mem_objects->size(); - } - vector mems(elements); - for (unsigned int i = 0; i < elements; i++) { - mems[i] = ((*mem_objects)[i])(); - } - - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueNativeKernel( - object_, userFptr, args.first, args.second, - (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, - mems.data(), - (mem_locs != NULL && mem_locs->size() > 0) ? (const void **) &mem_locs->front() : NULL, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_NATIVE_KERNEL); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - -/** - * Deprecated APIs for 1.2 - */ -#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - CL_API_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueMarker(Event* event = NULL) const CL_API_SUFFIX__VERSION_1_1_DEPRECATED - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueMarker( - object_, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_MARKER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - CL_API_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueWaitForEvents(const vector& events) const CL_API_SUFFIX__VERSION_1_1_DEPRECATED - { - return detail::errHandler( - ::clEnqueueWaitForEvents( - object_, - (cl_uint) events.size(), - events.size() > 0 ? (const cl_event*) &events.front() : NULL), - __ENQUEUE_WAIT_FOR_EVENTS_ERR); - } -#endif // defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - - cl_int enqueueAcquireGLObjects( - const vector* mem_objects = NULL, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueAcquireGLObjects( - object_, - (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, - (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_ACQUIRE_GL_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueReleaseGLObjects( - const vector* mem_objects = NULL, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueReleaseGLObjects( - object_, - (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, - (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_RELEASE_GL_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - -#if defined (CL_HPP_USE_DX_INTEROP) -typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem* mem_objects, cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, cl_event* event); -typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem* mem_objects, cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, cl_event* event); - - cl_int enqueueAcquireD3D10Objects( - const vector* mem_objects = NULL, - const vector* events = NULL, - Event* event = NULL) const - { - static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL; -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 - cl_context context = getInfo(); - cl::Device device(getInfo()); - cl_platform_id platform = device.getInfo(); - CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueAcquireD3D10ObjectsKHR); -#endif -#if CL_HPP_TARGET_OPENCL_VERSION >= 110 - CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueAcquireD3D10ObjectsKHR); -#endif - - cl_event tmp; - cl_int err = detail::errHandler( - pfn_clEnqueueAcquireD3D10ObjectsKHR( - object_, - (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, - (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_ACQUIRE_GL_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueReleaseD3D10Objects( - const vector* mem_objects = NULL, - const vector* events = NULL, - Event* event = NULL) const - { - static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL; -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 - cl_context context = getInfo(); - cl::Device device(getInfo()); - cl_platform_id platform = device.getInfo(); - CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueReleaseD3D10ObjectsKHR); -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 -#if CL_HPP_TARGET_OPENCL_VERSION >= 110 - CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueReleaseD3D10ObjectsKHR); -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 - - cl_event tmp; - cl_int err = detail::errHandler( - pfn_clEnqueueReleaseD3D10ObjectsKHR( - object_, - (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, - (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_RELEASE_GL_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } -#endif - -/** - * Deprecated APIs for 1.2 - */ -#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - CL_API_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueBarrier() const CL_API_SUFFIX__VERSION_1_1_DEPRECATED - { - return detail::errHandler( - ::clEnqueueBarrier(object_), - __ENQUEUE_BARRIER_ERR); - } -#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS - - cl_int flush() const - { - return detail::errHandler(::clFlush(object_), __FLUSH_ERR); - } - - cl_int finish() const - { - return detail::errHandler(::clFinish(object_), __FINISH_ERR); - } -}; // CommandQueue - -CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag CommandQueue::default_initialized_; -CL_HPP_DEFINE_STATIC_MEMBER_ CommandQueue CommandQueue::default_; -CL_HPP_DEFINE_STATIC_MEMBER_ cl_int CommandQueue::default_error_ = CL_SUCCESS; - - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 -enum class DeviceQueueProperties : cl_command_queue_properties -{ - None = 0, - Profiling = CL_QUEUE_PROFILING_ENABLE, -}; - -inline DeviceQueueProperties operator|(DeviceQueueProperties lhs, DeviceQueueProperties rhs) -{ - return static_cast(static_cast(lhs) | static_cast(rhs)); -} - -/*! \class DeviceCommandQueue - * \brief DeviceCommandQueue interface for device cl_command_queues. - */ -class DeviceCommandQueue : public detail::Wrapper -{ -public: - - /*! - * Trivial empty constructor to create a null queue. - */ - DeviceCommandQueue() { } - - /*! - * Default construct device command queue on default context and device - */ - DeviceCommandQueue(DeviceQueueProperties properties, cl_int* err = NULL) - { - cl_int error; - cl::Context context = cl::Context::getDefault(); - cl::Device device = cl::Device::getDefault(); - - cl_command_queue_properties mergedProperties = - CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | static_cast(properties); - - cl_queue_properties queue_properties[] = { - CL_QUEUE_PROPERTIES, mergedProperties, 0 }; - object_ = ::clCreateCommandQueueWithProperties( - context(), device(), queue_properties, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! - * Create a device command queue for a specified device in the passed context. - */ - DeviceCommandQueue( - const Context& context, - const Device& device, - DeviceQueueProperties properties = DeviceQueueProperties::None, - cl_int* err = NULL) - { - cl_int error; - - cl_command_queue_properties mergedProperties = - CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | static_cast(properties); - cl_queue_properties queue_properties[] = { - CL_QUEUE_PROPERTIES, mergedProperties, 0 }; - object_ = ::clCreateCommandQueueWithProperties( - context(), device(), queue_properties, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! - * Create a device command queue for a specified device in the passed context. - */ - DeviceCommandQueue( - const Context& context, - const Device& device, - cl_uint queueSize, - DeviceQueueProperties properties = DeviceQueueProperties::None, - cl_int* err = NULL) - { - cl_int error; - - cl_command_queue_properties mergedProperties = - CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | static_cast(properties); - cl_queue_properties queue_properties[] = { - CL_QUEUE_PROPERTIES, mergedProperties, - CL_QUEUE_SIZE, queueSize, - 0 }; - object_ = ::clCreateCommandQueueWithProperties( - context(), device(), queue_properties, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! \brief Constructor from cl_command_queue - takes ownership. - * - * \param retainObject will cause the constructor to retain its cl object. - * Defaults to false to maintain compatibility with - * earlier versions. - */ - explicit DeviceCommandQueue(const cl_command_queue& commandQueue, bool retainObject = false) : - detail::Wrapper(commandQueue, retainObject) { } - - DeviceCommandQueue& operator = (const cl_command_queue& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - DeviceCommandQueue(const DeviceCommandQueue& queue) : detail::Wrapper(queue) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - DeviceCommandQueue& operator = (const DeviceCommandQueue &queue) - { - detail::Wrapper::operator=(queue); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - DeviceCommandQueue(DeviceCommandQueue&& queue) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(queue)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - DeviceCommandQueue& operator = (DeviceCommandQueue &&queue) - { - detail::Wrapper::operator=(std::move(queue)); - return *this; - } - - template - cl_int getInfo(cl_command_queue_info name, T* param) const - { - return detail::errHandler( - detail::getInfo( - &::clGetCommandQueueInfo, object_, name, param), - __GET_COMMAND_QUEUE_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_command_queue_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - /*! - * Create a new default device command queue for the default device, - * in the default context and of the default size. - * If there is already a default queue for the specified device this - * function will return the pre-existing queue. - */ - static DeviceCommandQueue makeDefault( - cl_int *err = nullptr) - { - cl_int error; - cl::Context context = cl::Context::getDefault(); - cl::Device device = cl::Device::getDefault(); - - cl_command_queue_properties properties = - CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT; - cl_queue_properties queue_properties[] = { - CL_QUEUE_PROPERTIES, properties, - 0 }; - DeviceCommandQueue deviceQueue( - ::clCreateCommandQueueWithProperties( - context(), device(), queue_properties, &error)); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { - *err = error; - } - - return deviceQueue; - } - - /*! - * Create a new default device command queue for the specified device - * and of the default size. - * If there is already a default queue for the specified device this - * function will return the pre-existing queue. - */ - static DeviceCommandQueue makeDefault( - const Context &context, const Device &device, cl_int *err = nullptr) - { - cl_int error; - - cl_command_queue_properties properties = - CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT; - cl_queue_properties queue_properties[] = { - CL_QUEUE_PROPERTIES, properties, - 0 }; - DeviceCommandQueue deviceQueue( - ::clCreateCommandQueueWithProperties( - context(), device(), queue_properties, &error)); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { - *err = error; - } - - return deviceQueue; - } - - /*! - * Create a new default device command queue for the specified device - * and of the requested size in bytes. - * If there is already a default queue for the specified device this - * function will return the pre-existing queue. - */ - static DeviceCommandQueue makeDefault( - const Context &context, const Device &device, cl_uint queueSize, cl_int *err = nullptr) - { - cl_int error; - - cl_command_queue_properties properties = - CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT; - cl_queue_properties queue_properties[] = { - CL_QUEUE_PROPERTIES, properties, - CL_QUEUE_SIZE, queueSize, - 0 }; - DeviceCommandQueue deviceQueue( - ::clCreateCommandQueueWithProperties( - context(), device(), queue_properties, &error)); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { - *err = error; - } - - return deviceQueue; - } - - - -#if CL_HPP_TARGET_OPENCL_VERSION >= 210 - /*! - * Modify the default device command queue to be used for subsequent kernels. - * This can update the default command queue for a device repeatedly to account - * for kernels that rely on the default. - * @return updated default device command queue. - */ - static DeviceCommandQueue updateDefault(const Context &context, const Device &device, const DeviceCommandQueue &default_queue, cl_int *err = nullptr) - { - cl_int error; - error = clSetDefaultDeviceCommandQueue(context.get(), device.get(), default_queue.get()); - - detail::errHandler(error, __SET_DEFAULT_DEVICE_COMMAND_QUEUE_ERR); - if (err != NULL) { - *err = error; - } - return default_queue; - } - - /*! - * Return the current default command queue for the specified command queue - */ - static DeviceCommandQueue getDefault(const CommandQueue &queue, cl_int * err = NULL) - { - return queue.getInfo(err); - } - -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 -}; // DeviceCommandQueue - -namespace detail -{ - // Specialization for device command queue - template <> - struct KernelArgumentHandler - { - static size_type size(const cl::DeviceCommandQueue&) { return sizeof(cl_command_queue); } - static const cl_command_queue* ptr(const cl::DeviceCommandQueue& value) { return &(value()); } - }; -} // namespace detail - -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 - - -template< typename IteratorType > -Buffer::Buffer( - const Context &context, - IteratorType startIterator, - IteratorType endIterator, - bool readOnly, - bool useHostPtr, - cl_int* err) -{ - typedef typename std::iterator_traits::value_type DataType; - cl_int error; - - cl_mem_flags flags = 0; - if( readOnly ) { - flags |= CL_MEM_READ_ONLY; - } - else { - flags |= CL_MEM_READ_WRITE; - } - if( useHostPtr ) { - flags |= CL_MEM_USE_HOST_PTR; - } - - size_type size = sizeof(DataType)*(endIterator - startIterator); - - if( useHostPtr ) { - object_ = ::clCreateBuffer(context(), flags, size, const_cast(&*startIterator), &error); - } else { - object_ = ::clCreateBuffer(context(), flags, size, 0, &error); - } - - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - - if( !useHostPtr ) { - CommandQueue queue(context, 0, &error); - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - - error = cl::copy(queue, startIterator, endIterator, *this); - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } -} - -template< typename IteratorType > -Buffer::Buffer( - const CommandQueue &queue, - IteratorType startIterator, - IteratorType endIterator, - bool readOnly, - bool useHostPtr, - cl_int* err) -{ - typedef typename std::iterator_traits::value_type DataType; - cl_int error; - - cl_mem_flags flags = 0; - if (readOnly) { - flags |= CL_MEM_READ_ONLY; - } - else { - flags |= CL_MEM_READ_WRITE; - } - if (useHostPtr) { - flags |= CL_MEM_USE_HOST_PTR; - } - - size_type size = sizeof(DataType)*(endIterator - startIterator); - - Context context = queue.getInfo(); - - if (useHostPtr) { - object_ = ::clCreateBuffer(context(), flags, size, const_cast(&*startIterator), &error); - } - else { - object_ = ::clCreateBuffer(context(), flags, size, 0, &error); - } - - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - - if (!useHostPtr) { - error = cl::copy(queue, startIterator, endIterator, *this); - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } -} - -inline cl_int enqueueReadBuffer( - const Buffer& buffer, - cl_bool blocking, - size_type offset, - size_type size, - void* ptr, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event); -} - -inline cl_int enqueueWriteBuffer( - const Buffer& buffer, - cl_bool blocking, - size_type offset, - size_type size, - const void* ptr, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event); -} - -inline void* enqueueMapBuffer( - const Buffer& buffer, - cl_bool blocking, - cl_map_flags flags, - size_type offset, - size_type size, - const vector* events = NULL, - Event* event = NULL, - cl_int* err = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - - void * result = ::clEnqueueMapBuffer( - queue(), buffer(), blocking, flags, offset, size, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (cl_event*) event, - &error); - - detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - return result; -} - - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 -/** - * Enqueues to the default queue a command that will allow the host to - * update a region of a coarse-grained SVM buffer. - * This variant takes a raw SVM pointer. - */ -template -inline cl_int enqueueMapSVM( - T* ptr, - cl_bool blocking, - cl_map_flags flags, - size_type size, - const vector* events, - Event* event) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - if (error != CL_SUCCESS) { - return detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - } - - return queue.enqueueMapSVM( - ptr, blocking, flags, size, events, event); -} - -/** - * Enqueues to the default queue a command that will allow the host to - * update a region of a coarse-grained SVM buffer. - * This variant takes a cl::pointer instance. - */ -template -inline cl_int enqueueMapSVM( - cl::pointer &ptr, - cl_bool blocking, - cl_map_flags flags, - size_type size, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - if (error != CL_SUCCESS) { - return detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - } - - return queue.enqueueMapSVM( - ptr, blocking, flags, size, events, event); -} - -/** - * Enqueues to the default queue a command that will allow the host to - * update a region of a coarse-grained SVM buffer. - * This variant takes a cl::vector instance. - */ -template -inline cl_int enqueueMapSVM( - cl::vector &container, - cl_bool blocking, - cl_map_flags flags, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - if (error != CL_SUCCESS) { - return detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - } - - return queue.enqueueMapSVM( - container, blocking, flags, events, event); -} - -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 - -inline cl_int enqueueUnmapMemObject( - const Memory& memory, - void* mapped_ptr, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - if (error != CL_SUCCESS) { - return error; - } - - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueUnmapMemObject( - queue(), memory(), mapped_ptr, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; -} - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 -/** - * Enqueues to the default queue a command that will release a coarse-grained - * SVM buffer back to the OpenCL runtime. - * This variant takes a raw SVM pointer. - */ -template -inline cl_int enqueueUnmapSVM( - T* ptr, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - if (error != CL_SUCCESS) { - return detail::errHandler(error, __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - } - - return detail::errHandler(queue.enqueueUnmapSVM(ptr, events, event), - __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - -} - -/** - * Enqueues to the default queue a command that will release a coarse-grained - * SVM buffer back to the OpenCL runtime. - * This variant takes a cl::pointer instance. - */ -template -inline cl_int enqueueUnmapSVM( - cl::pointer &ptr, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - if (error != CL_SUCCESS) { - return detail::errHandler(error, __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - } - - return detail::errHandler(queue.enqueueUnmapSVM(ptr, events, event), - __ENQUEUE_UNMAP_MEM_OBJECT_ERR); -} - -/** - * Enqueues to the default queue a command that will release a coarse-grained - * SVM buffer back to the OpenCL runtime. - * This variant takes a cl::vector instance. - */ -template -inline cl_int enqueueUnmapSVM( - cl::vector &container, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - if (error != CL_SUCCESS) { - return detail::errHandler(error, __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - } - - return detail::errHandler(queue.enqueueUnmapSVM(container, events, event), - __ENQUEUE_UNMAP_MEM_OBJECT_ERR); -} - -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 - -inline cl_int enqueueCopyBuffer( - const Buffer& src, - const Buffer& dst, - size_type src_offset, - size_type dst_offset, - size_type size, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event); -} - -/** - * Blocking copy operation between iterators and a buffer. - * Host to Device. - * Uses default command queue. - */ -template< typename IteratorType > -inline cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - if (error != CL_SUCCESS) - return error; - - return cl::copy(queue, startIterator, endIterator, buffer); -} - -/** - * Blocking copy operation between iterators and a buffer. - * Device to Host. - * Uses default command queue. - */ -template< typename IteratorType > -inline cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - if (error != CL_SUCCESS) - return error; - - return cl::copy(queue, buffer, startIterator, endIterator); -} - -/** - * Blocking copy operation between iterators and a buffer. - * Host to Device. - * Uses specified queue. - */ -template< typename IteratorType > -inline cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ) -{ - typedef typename std::iterator_traits::value_type DataType; - cl_int error; - - size_type length = endIterator-startIterator; - size_type byteLength = length*sizeof(DataType); - - DataType *pointer = - static_cast(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error)); - // if exceptions enabled, enqueueMapBuffer will throw - if( error != CL_SUCCESS ) { - return error; - } -#if defined(_MSC_VER) - std::copy( - startIterator, - endIterator, - stdext::checked_array_iterator( - pointer, length)); -#else - std::copy(startIterator, endIterator, pointer); -#endif - Event endEvent; - error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent); - // if exceptions enabled, enqueueUnmapMemObject will throw - if( error != CL_SUCCESS ) { - return error; - } - endEvent.wait(); - return CL_SUCCESS; -} - -/** - * Blocking copy operation between iterators and a buffer. - * Device to Host. - * Uses specified queue. - */ -template< typename IteratorType > -inline cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ) -{ - typedef typename std::iterator_traits::value_type DataType; - cl_int error; - - size_type length = endIterator-startIterator; - size_type byteLength = length*sizeof(DataType); - - DataType *pointer = - static_cast(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error)); - // if exceptions enabled, enqueueMapBuffer will throw - if( error != CL_SUCCESS ) { - return error; - } - std::copy(pointer, pointer + length, startIterator); - Event endEvent; - error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent); - // if exceptions enabled, enqueueUnmapMemObject will throw - if( error != CL_SUCCESS ) { - return error; - } - endEvent.wait(); - return CL_SUCCESS; -} - - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 -/** - * Blocking SVM map operation - performs a blocking map underneath. - */ -template -inline cl_int mapSVM(cl::vector &container) -{ - return enqueueMapSVM(container, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE); -} - -/** -* Blocking SVM map operation - performs a blocking map underneath. -*/ -template -inline cl_int unmapSVM(cl::vector &container) -{ - return enqueueUnmapSVM(container); -} - -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 - -#if CL_HPP_TARGET_OPENCL_VERSION >= 110 -inline cl_int enqueueReadBufferRect( - const Buffer& buffer, - cl_bool blocking, - const array& buffer_offset, - const array& host_offset, - const array& region, - size_type buffer_row_pitch, - size_type buffer_slice_pitch, - size_type host_row_pitch, - size_type host_slice_pitch, - void *ptr, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueReadBufferRect( - buffer, - blocking, - buffer_offset, - host_offset, - region, - buffer_row_pitch, - buffer_slice_pitch, - host_row_pitch, - host_slice_pitch, - ptr, - events, - event); -} - -inline cl_int enqueueWriteBufferRect( - const Buffer& buffer, - cl_bool blocking, - const array& buffer_offset, - const array& host_offset, - const array& region, - size_type buffer_row_pitch, - size_type buffer_slice_pitch, - size_type host_row_pitch, - size_type host_slice_pitch, - const void *ptr, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueWriteBufferRect( - buffer, - blocking, - buffer_offset, - host_offset, - region, - buffer_row_pitch, - buffer_slice_pitch, - host_row_pitch, - host_slice_pitch, - ptr, - events, - event); -} - -inline cl_int enqueueCopyBufferRect( - const Buffer& src, - const Buffer& dst, - const array& src_origin, - const array& dst_origin, - const array& region, - size_type src_row_pitch, - size_type src_slice_pitch, - size_type dst_row_pitch, - size_type dst_slice_pitch, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueCopyBufferRect( - src, - dst, - src_origin, - dst_origin, - region, - src_row_pitch, - src_slice_pitch, - dst_row_pitch, - dst_slice_pitch, - events, - event); -} -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 - -inline cl_int enqueueReadImage( - const Image& image, - cl_bool blocking, - const array& origin, - const array& region, - size_type row_pitch, - size_type slice_pitch, - void* ptr, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueReadImage( - image, - blocking, - origin, - region, - row_pitch, - slice_pitch, - ptr, - events, - event); -} - -inline cl_int enqueueWriteImage( - const Image& image, - cl_bool blocking, - const array& origin, - const array& region, - size_type row_pitch, - size_type slice_pitch, - const void* ptr, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueWriteImage( - image, - blocking, - origin, - region, - row_pitch, - slice_pitch, - ptr, - events, - event); -} - -inline cl_int enqueueCopyImage( - const Image& src, - const Image& dst, - const array& src_origin, - const array& dst_origin, - const array& region, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueCopyImage( - src, - dst, - src_origin, - dst_origin, - region, - events, - event); -} - -inline cl_int enqueueCopyImageToBuffer( - const Image& src, - const Buffer& dst, - const array& src_origin, - const array& region, - size_type dst_offset, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueCopyImageToBuffer( - src, - dst, - src_origin, - region, - dst_offset, - events, - event); -} - -inline cl_int enqueueCopyBufferToImage( - const Buffer& src, - const Image& dst, - size_type src_offset, - const array& dst_origin, - const array& region, - const vector* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueCopyBufferToImage( - src, - dst, - src_offset, - dst_origin, - region, - events, - event); -} - - -inline cl_int flush(void) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.flush(); -} - -inline cl_int finish(void) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - - return queue.finish(); -} - -class EnqueueArgs -{ -private: - CommandQueue queue_; - const NDRange offset_; - const NDRange global_; - const NDRange local_; - vector events_; - - template - friend class KernelFunctor; - -public: - EnqueueArgs(NDRange global) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(NullRange) - { - - } - - EnqueueArgs(NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(local) - { - - } - - EnqueueArgs(NDRange offset, NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(offset), - global_(global), - local_(local) - { - - } - - EnqueueArgs(Event e, NDRange global) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(NullRange) - { - events_.push_back(e); - } - - EnqueueArgs(Event e, NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(local) - { - events_.push_back(e); - } - - EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(offset), - global_(global), - local_(local) - { - events_.push_back(e); - } - - EnqueueArgs(const vector &events, NDRange global) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(NullRange), - events_(events) - { - - } - - EnqueueArgs(const vector &events, NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(local), - events_(events) - { - - } - - EnqueueArgs(const vector &events, NDRange offset, NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(offset), - global_(global), - local_(local), - events_(events) - { - - } - - EnqueueArgs(CommandQueue &queue, NDRange global) : - queue_(queue), - offset_(NullRange), - global_(global), - local_(NullRange) - { - - } - - EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) : - queue_(queue), - offset_(NullRange), - global_(global), - local_(local) - { - - } - - EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) : - queue_(queue), - offset_(offset), - global_(global), - local_(local) - { - - } - - EnqueueArgs(CommandQueue &queue, Event e, NDRange global) : - queue_(queue), - offset_(NullRange), - global_(global), - local_(NullRange) - { - events_.push_back(e); - } - - EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) : - queue_(queue), - offset_(NullRange), - global_(global), - local_(local) - { - events_.push_back(e); - } - - EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) : - queue_(queue), - offset_(offset), - global_(global), - local_(local) - { - events_.push_back(e); - } - - EnqueueArgs(CommandQueue &queue, const vector &events, NDRange global) : - queue_(queue), - offset_(NullRange), - global_(global), - local_(NullRange), - events_(events) - { - - } - - EnqueueArgs(CommandQueue &queue, const vector &events, NDRange global, NDRange local) : - queue_(queue), - offset_(NullRange), - global_(global), - local_(local), - events_(events) - { - - } - - EnqueueArgs(CommandQueue &queue, const vector &events, NDRange offset, NDRange global, NDRange local) : - queue_(queue), - offset_(offset), - global_(global), - local_(local), - events_(events) - { - - } -}; - - -//---------------------------------------------------------------------------------------------- - - -/** - * Type safe kernel functor. - * - */ -template -class KernelFunctor -{ -private: - Kernel kernel_; - - template - void setArgs(T0&& t0, T1s&&... t1s) - { - kernel_.setArg(index, t0); - setArgs(std::forward(t1s)...); - } - - template - void setArgs(T0&& t0) - { - kernel_.setArg(index, t0); - } - - template - void setArgs() - { - } - - -public: - KernelFunctor(Kernel kernel) : kernel_(kernel) - {} - - KernelFunctor( - const Program& program, - const string name, - cl_int * err = NULL) : - kernel_(program, name.c_str(), err) - {} - - //! \brief Return type of the functor - typedef Event result_type; - - /** - * Enqueue kernel. - * @param args Launch parameters of the kernel. - * @param t0... List of kernel arguments based on the template type of the functor. - */ - Event operator() ( - const EnqueueArgs& args, - Ts... ts) - { - Event event; - setArgs<0>(std::forward(ts)...); - - args.queue_.enqueueNDRangeKernel( - kernel_, - args.offset_, - args.global_, - args.local_, - &args.events_, - &event); - - return event; - } - - /** - * Enqueue kernel with support for error code. - * @param args Launch parameters of the kernel. - * @param t0... List of kernel arguments based on the template type of the functor. - * @param error Out parameter returning the error code from the execution. - */ - Event operator() ( - const EnqueueArgs& args, - Ts... ts, - cl_int &error) - { - Event event; - setArgs<0>(std::forward(ts)...); - - error = args.queue_.enqueueNDRangeKernel( - kernel_, - args.offset_, - args.global_, - args.local_, - &args.events_, - &event); - - return event; - } - -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - cl_int setSVMPointers(const vector &pointerList) - { - return kernel_.setSVMPointers(pointerList); - } - - template - cl_int setSVMPointers(const T0 &t0, T1s &... ts) - { - return kernel_.setSVMPointers(t0, ts...); - } -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 - - Kernel getKernel() - { - return kernel_; - } -}; - -namespace compatibility { - /** - * Backward compatibility class to ensure that cl.hpp code works with opencl.hpp. - * Please use KernelFunctor directly. - */ - template - struct make_kernel - { - typedef KernelFunctor FunctorType; - - FunctorType functor_; - - make_kernel( - const Program& program, - const string name, - cl_int * err = NULL) : - functor_(FunctorType(program, name, err)) - {} - - make_kernel( - const Kernel kernel) : - functor_(FunctorType(kernel)) - {} - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - Ts...); - - Event operator()( - const EnqueueArgs& enqueueArgs, - Ts... args) - { - return functor_( - enqueueArgs, args...); - } - }; -} // namespace compatibility - - -//---------------------------------------------------------------------------------------------------------------------- - -#undef CL_HPP_ERR_STR_ -#if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS) -#undef __GET_DEVICE_INFO_ERR -#undef __GET_PLATFORM_INFO_ERR -#undef __GET_DEVICE_IDS_ERR -#undef __GET_PLATFORM_IDS_ERR -#undef __GET_CONTEXT_INFO_ERR -#undef __GET_EVENT_INFO_ERR -#undef __GET_EVENT_PROFILE_INFO_ERR -#undef __GET_MEM_OBJECT_INFO_ERR -#undef __GET_IMAGE_INFO_ERR -#undef __GET_SAMPLER_INFO_ERR -#undef __GET_KERNEL_INFO_ERR -#undef __GET_KERNEL_ARG_INFO_ERR -#undef __GET_KERNEL_SUB_GROUP_INFO_ERR -#undef __GET_KERNEL_WORK_GROUP_INFO_ERR -#undef __GET_PROGRAM_INFO_ERR -#undef __GET_PROGRAM_BUILD_INFO_ERR -#undef __GET_COMMAND_QUEUE_INFO_ERR -#undef __CREATE_CONTEXT_ERR -#undef __CREATE_CONTEXT_FROM_TYPE_ERR -#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR -#undef __CREATE_BUFFER_ERR -#undef __COPY_ERR -#undef __CREATE_SUBBUFFER_ERR -#undef __CREATE_GL_BUFFER_ERR -#undef __CREATE_GL_RENDER_BUFFER_ERR -#undef __GET_GL_OBJECT_INFO_ERR -#undef __CREATE_IMAGE_ERR -#undef __CREATE_GL_TEXTURE_ERR -#undef __IMAGE_DIMENSION_ERR -#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR -#undef __CREATE_USER_EVENT_ERR -#undef __SET_USER_EVENT_STATUS_ERR -#undef __SET_EVENT_CALLBACK_ERR -#undef __WAIT_FOR_EVENTS_ERR -#undef __CREATE_KERNEL_ERR -#undef __SET_KERNEL_ARGS_ERR -#undef __CREATE_PROGRAM_WITH_SOURCE_ERR -#undef __CREATE_PROGRAM_WITH_IL_ERR -#undef __CREATE_PROGRAM_WITH_BINARY_ERR -#undef __CREATE_PROGRAM_WITH_IL_ERR -#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR -#undef __BUILD_PROGRAM_ERR -#undef __COMPILE_PROGRAM_ERR -#undef __LINK_PROGRAM_ERR -#undef __CREATE_KERNELS_IN_PROGRAM_ERR -#undef __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR -#undef __CREATE_SAMPLER_WITH_PROPERTIES_ERR -#undef __SET_COMMAND_QUEUE_PROPERTY_ERR -#undef __ENQUEUE_READ_BUFFER_ERR -#undef __ENQUEUE_READ_BUFFER_RECT_ERR -#undef __ENQUEUE_WRITE_BUFFER_ERR -#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR -#undef __ENQEUE_COPY_BUFFER_ERR -#undef __ENQEUE_COPY_BUFFER_RECT_ERR -#undef __ENQUEUE_FILL_BUFFER_ERR -#undef __ENQUEUE_READ_IMAGE_ERR -#undef __ENQUEUE_WRITE_IMAGE_ERR -#undef __ENQUEUE_COPY_IMAGE_ERR -#undef __ENQUEUE_FILL_IMAGE_ERR -#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR -#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR -#undef __ENQUEUE_MAP_BUFFER_ERR -#undef __ENQUEUE_MAP_IMAGE_ERR -#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR -#undef __ENQUEUE_NDRANGE_KERNEL_ERR -#undef __ENQUEUE_NATIVE_KERNEL -#undef __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR -#undef __ENQUEUE_MIGRATE_SVM_ERR -#undef __ENQUEUE_ACQUIRE_GL_ERR -#undef __ENQUEUE_RELEASE_GL_ERR -#undef __CREATE_PIPE_ERR -#undef __GET_PIPE_INFO_ERR -#undef __RETAIN_ERR -#undef __RELEASE_ERR -#undef __FLUSH_ERR -#undef __FINISH_ERR -#undef __VECTOR_CAPACITY_ERR -#undef __CREATE_SUB_DEVICES_ERR -#undef __CREATE_SUB_DEVICES_ERR -#undef __ENQUEUE_MARKER_ERR -#undef __ENQUEUE_WAIT_FOR_EVENTS_ERR -#undef __ENQUEUE_BARRIER_ERR -#undef __UNLOAD_COMPILER_ERR -#undef __CREATE_GL_TEXTURE_2D_ERR -#undef __CREATE_GL_TEXTURE_3D_ERR -#undef __CREATE_IMAGE2D_ERR -#undef __CREATE_IMAGE3D_ERR -#undef __CREATE_COMMAND_QUEUE_ERR -#undef __ENQUEUE_TASK_ERR -#undef __CREATE_SAMPLER_ERR -#undef __ENQUEUE_MARKER_WAIT_LIST_ERR -#undef __ENQUEUE_BARRIER_WAIT_LIST_ERR -#undef __CLONE_KERNEL_ERR -#undef __GET_HOST_TIMER_ERR -#undef __GET_DEVICE_AND_HOST_TIMER_ERR - -#endif //CL_HPP_USER_OVERRIDE_ERROR_STRINGS - -// Extensions -#undef CL_HPP_INIT_CL_EXT_FCN_PTR_ -#undef CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_ - -#if defined(CL_HPP_USE_CL_DEVICE_FISSION) -#undef CL_HPP_PARAM_NAME_DEVICE_FISSION_ -#endif // CL_HPP_USE_CL_DEVICE_FISSION - -#undef CL_HPP_NOEXCEPT_ -#undef CL_HPP_DEFINE_STATIC_MEMBER_ - -} // namespace cl - -#endif // CL_HPP_ diff --git a/include/cblas.h b/include/cblas.h deleted file mode 100644 index 48b47059d..000000000 --- a/include/cblas.h +++ /dev/null @@ -1,413 +0,0 @@ -#pragma once - -#ifndef CBLAS_H -#define CBLAS_H - -#include -#include - -#ifdef __cplusplus -extern "C" { - /* Assume C declarations for C++ */ -#endif /* __cplusplus */ - -/*Set the number of threads on runtime.*/ -void openblas_set_num_threads(int num_threads); -void goto_set_num_threads(int num_threads); - -/*Get the number of threads on runtime.*/ -int openblas_get_num_threads(void); - -/*Get the number of physical processors (cores).*/ -int openblas_get_num_procs(void); - -/*Get the build configure on runtime.*/ -char* openblas_get_config(void); - -/*Get the CPU corename on runtime.*/ -char* openblas_get_corename(void); - -#ifdef OPENBLAS_OS_LINUX -/* Sets thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */ -int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set); -/* Queries thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */ -int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set); -#endif - -/* Get the parallelization type which is used by OpenBLAS */ -int openblas_get_parallel(void); -/* OpenBLAS is compiled for sequential use */ -#define OPENBLAS_SEQUENTIAL 0 -/* OpenBLAS is compiled using normal threading model */ -#define OPENBLAS_THREAD 1 -/* OpenBLAS is compiled using OpenMP threading model */ -#define OPENBLAS_OPENMP 2 - - -/* - * Since all of GotoBlas was written without const, - * we disable it at build time. - */ -#ifndef OPENBLAS_CONST -# define OPENBLAS_CONST const -#endif - - -#define CBLAS_INDEX size_t - -typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; -typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; -typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; -typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; -typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; -typedef CBLAS_ORDER CBLAS_LAYOUT; - -float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); -double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); -float cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); -double cblas_ddot(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double *y, OPENBLAS_CONST blasint incy); - -openblas_complex_float cblas_cdotu(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy); -openblas_complex_float cblas_cdotc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy); -openblas_complex_double cblas_zdotu(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy); -openblas_complex_double cblas_zdotc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy); - -void cblas_cdotu_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy, void *ret); -void cblas_cdotc_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy, void *ret); -void cblas_zdotu_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy, void *ret); -void cblas_zdotc_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy, void *ret); - -float cblas_sasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); -double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); -float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); -double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); - -float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); -double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); -float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); -double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); - -float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX); -double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX); -float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX); -double cblas_dznrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX); - -CBLAS_INDEX cblas_isamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); - -CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); - -CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); - -CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); - -void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); -void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); -void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); -void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); - -void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); -void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); -void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); -void cblas_zcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); - -void cblas_sswap(OPENBLAS_CONST blasint n, float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); -void cblas_dswap(OPENBLAS_CONST blasint n, double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); -void cblas_cswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); -void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); - -void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); -void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); -void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); -void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); - -void cblas_srotg(float *a, float *b, float *c, float *s); -void cblas_drotg(double *a, double *b, double *c, double *s); -void cblas_crotg(void *a, void *b, float *c, void *s); -void cblas_zrotg(void *a, void *b, double *c, void *s); - - -void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P); -void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P); - -void cblas_srotmg(float *d1, float *d2, float *b1, OPENBLAS_CONST float b2, float *P); -void cblas_drotmg(double *d1, double *d2, double *b1, OPENBLAS_CONST double b2, double *P); - -void cblas_sscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, float *X, OPENBLAS_CONST blasint incX); -void cblas_dscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, double *X, OPENBLAS_CONST blasint incX); -void cblas_cscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, void *X, OPENBLAS_CONST blasint incX); -void cblas_zscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, void *X, OPENBLAS_CONST blasint incX); -void cblas_csscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, void *X, OPENBLAS_CONST blasint incX); -void cblas_zdscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, void *X, OPENBLAS_CONST blasint incX); - -void cblas_sgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, - OPENBLAS_CONST float alpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); -void cblas_dgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, - OPENBLAS_CONST double alpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double beta, double *y, OPENBLAS_CONST blasint incy); -void cblas_cgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, - OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *beta, void *y, OPENBLAS_CONST blasint incy); -void cblas_zgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, - OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *beta, void *y, OPENBLAS_CONST blasint incy); - -void cblas_sger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A, OPENBLAS_CONST blasint lda); -void cblas_dger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda); -void cblas_cgeru(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda); -void cblas_cgerc(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda); -void cblas_zgeru(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda); -void cblas_zgerc(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda); - -void cblas_strsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX); -void cblas_dtrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX); -void cblas_ctrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX); -void cblas_ztrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX); - -void cblas_strmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX); -void cblas_dtrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX); -void cblas_ctrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX); -void cblas_ztrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX); - -void cblas_ssyr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, float *A, OPENBLAS_CONST blasint lda); -void cblas_dsyr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, double *A, OPENBLAS_CONST blasint lda); -void cblas_cher(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, void *A, OPENBLAS_CONST blasint lda); -void cblas_zher(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, void *A, OPENBLAS_CONST blasint lda); - -void cblas_ssyr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo,OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, - OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A, OPENBLAS_CONST blasint lda); -void cblas_dsyr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, - OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda); -void cblas_cher2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, - OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda); -void cblas_zher2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, - OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda); - -void cblas_sgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, - OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY); -void cblas_dgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, - OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY); -void cblas_cgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, - OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY); -void cblas_zgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, - OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY); - -void cblas_ssbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, - OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY); -void cblas_dsbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, - OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY); - - -void cblas_stbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, - OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX); -void cblas_dtbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, - OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX); -void cblas_ctbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, - OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX); -void cblas_ztbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, - OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX); - -void cblas_stbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, - OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX); -void cblas_dtbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, - OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX); -void cblas_ctbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, - OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX); -void cblas_ztbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, - OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX); - -void cblas_stpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, - OPENBLAS_CONST blasint N, OPENBLAS_CONST float *Ap, float *X, OPENBLAS_CONST blasint incX); -void cblas_dtpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, - OPENBLAS_CONST blasint N, OPENBLAS_CONST double *Ap, double *X, OPENBLAS_CONST blasint incX); -void cblas_ctpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, - OPENBLAS_CONST blasint N, OPENBLAS_CONST void *Ap, void *X, OPENBLAS_CONST blasint incX); -void cblas_ztpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, - OPENBLAS_CONST blasint N, OPENBLAS_CONST void *Ap, void *X, OPENBLAS_CONST blasint incX); - -void cblas_stpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, - OPENBLAS_CONST blasint N, OPENBLAS_CONST float *Ap, float *X, OPENBLAS_CONST blasint incX); -void cblas_dtpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, - OPENBLAS_CONST blasint N, OPENBLAS_CONST double *Ap, double *X, OPENBLAS_CONST blasint incX); -void cblas_ctpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, - OPENBLAS_CONST blasint N, OPENBLAS_CONST void *Ap, void *X, OPENBLAS_CONST blasint incX); -void cblas_ztpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, - OPENBLAS_CONST blasint N, OPENBLAS_CONST void *Ap, void *X, OPENBLAS_CONST blasint incX); - -void cblas_ssymv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, - OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY); -void cblas_dsymv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, - OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY); -void cblas_chemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, - OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY); -void cblas_zhemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, - OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY); - - -void cblas_sspmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *Ap, - OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY); -void cblas_dspmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *Ap, - OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY); - -void cblas_sspr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, float *Ap); -void cblas_dspr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, double *Ap); - -void cblas_chpr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, void *A); -void cblas_zhpr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST void *X,OPENBLAS_CONST blasint incX, void *A); - -void cblas_sspr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A); -void cblas_dspr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A); -void cblas_chpr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *Ap); -void cblas_zhpr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *Ap); - -void cblas_chbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, - OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY); -void cblas_zhbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, - OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY); - -void cblas_chpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, - OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *Ap, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY); -void cblas_zhpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, - OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *Ap, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY); - -void cblas_sgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, - OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); -void cblas_dgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, - OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); -void cblas_cgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, - OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); -void cblas_cgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, - OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); -void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, - OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); -void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, - OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); - - -void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, - OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); -void cblas_dsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, - OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); -void cblas_csymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, - OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); -void cblas_zsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, - OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); - -void cblas_ssyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, - OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); -void cblas_dsyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, - OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); -void cblas_csyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, - OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); -void cblas_zsyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, - OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); - -void cblas_ssyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, - OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); -void cblas_dsyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, - OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); -void cblas_csyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, - OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); -void cblas_zsyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, - OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); - -void cblas_strmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, - OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *B, OPENBLAS_CONST blasint ldb); -void cblas_dtrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, - OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *B, OPENBLAS_CONST blasint ldb); -void cblas_ctrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, - OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *B, OPENBLAS_CONST blasint ldb); -void cblas_ztrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, - OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *B, OPENBLAS_CONST blasint ldb); - -void cblas_strsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, - OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *B, OPENBLAS_CONST blasint ldb); -void cblas_dtrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, - OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *B, OPENBLAS_CONST blasint ldb); -void cblas_ctrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, - OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *B, OPENBLAS_CONST blasint ldb); -void cblas_ztrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, - OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *B, OPENBLAS_CONST blasint ldb); - -void cblas_chemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, - OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); -void cblas_zhemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, - OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); - -void cblas_cherk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, - OPENBLAS_CONST float alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float beta, void *C, OPENBLAS_CONST blasint ldc); -void cblas_zherk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, - OPENBLAS_CONST double alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc); - -void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, - OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, void *C, OPENBLAS_CONST blasint ldc); -void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, - OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc); - -void cblas_xerbla(blasint p, char *rout, char *form, ...); - -/*** BLAS extensions ***/ - -void cblas_saxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); - -void cblas_daxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST double beta, double *y, OPENBLAS_CONST blasint incy); - -void cblas_caxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST void *beta, void *y, OPENBLAS_CONST blasint incy); - -void cblas_zaxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST void *beta, void *y, OPENBLAS_CONST blasint incy); - -void cblas_somatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a, - OPENBLAS_CONST blasint clda, float *b, OPENBLAS_CONST blasint cldb); -void cblas_domatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a, - OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb); -void cblas_comatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, OPENBLAS_CONST float* a, - OPENBLAS_CONST blasint clda, float*b, OPENBLAS_CONST blasint cldb); -void cblas_zomatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, OPENBLAS_CONST double* a, - OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb); - -void cblas_simatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, - OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); -void cblas_dimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, - OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); -void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, float* a, - OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); -void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a, - OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); - -void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, - float *c, OPENBLAS_CONST blasint cldc); -void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, - double *c, OPENBLAS_CONST blasint cldc); -void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, - float *c, OPENBLAS_CONST blasint cldc); -void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, - double *c, OPENBLAS_CONST blasint cldc); - -/*** BFLOAT16 and INT8 extensions ***/ -/* convert float array to BFLOAT16 array by rounding */ -void cblas_sbstobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout); -/* convert double array to BFLOAT16 array by rounding */ -void cblas_sbdtobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout); -/* convert BFLOAT16 array to float array */ -void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, float *out, OPENBLAS_CONST blasint incout); -/* convert BFLOAT16 array to double array */ -void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout); -/* dot production of BFLOAT16 input arrays, and output as float */ -float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); -void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); - -void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, - OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); -#ifdef __cplusplus -} -#endif /* __cplusplus */ - -#endif diff --git a/include/clblast.h b/include/clblast.h deleted file mode 100644 index 6c7481af0..000000000 --- a/include/clblast.h +++ /dev/null @@ -1,792 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains the interface to the CLBlast BLAS routines. It also contains the definitions -// of the returned status codes and the layout and transpose types. This is the only header users -// of CLBlast should include and use. -// -// ================================================================================================= - -#ifndef CLBLAST_CLBLAST_H_ -#define CLBLAST_CLBLAST_H_ - -#include // For size_t -#include // For OverrideParameters function -#include // For OverrideParameters function - -// Includes the normal OpenCL C header -#if defined(__APPLE__) || defined(__MACOSX) - #include -#else - #include -#endif - -// Exports library functions under Windows when building a DLL. See also: -// https://msdn.microsoft.com/en-us/library/a90k134d.aspx -#if defined(_WIN32) && defined(CLBLAST_DLL) - #if defined(COMPILING_DLL) - #define PUBLIC_API __declspec(dllexport) - #else - #define PUBLIC_API __declspec(dllimport) - #endif -#else - #define PUBLIC_API -#endif - -// Version numbering (v1.6.0) -#define CLBLAST_VERSION_MAJOR 1 -#define CLBLAST_VERSION_MINOR 6 -#define CLBLAST_VERSION_PATCH 0 - -namespace clblast { -// ================================================================================================= - -// Status codes. These codes can be returned by functions declared in this header file. The error -// codes match either the standard OpenCL error codes or the clBLAS error codes. -enum class StatusCode { - - // Status codes in common with the OpenCL standard - kSuccess = 0, // CL_SUCCESS - kOpenCLCompilerNotAvailable= -3, // CL_COMPILER_NOT_AVAILABLE - kTempBufferAllocFailure = -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE - kOpenCLOutOfResources = -5, // CL_OUT_OF_RESOURCES - kOpenCLOutOfHostMemory = -6, // CL_OUT_OF_HOST_MEMORY - kOpenCLBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error - kInvalidValue = -30, // CL_INVALID_VALUE - kInvalidCommandQueue = -36, // CL_INVALID_COMMAND_QUEUE - kInvalidMemObject = -38, // CL_INVALID_MEM_OBJECT - kInvalidBinary = -42, // CL_INVALID_BINARY - kInvalidBuildOptions = -43, // CL_INVALID_BUILD_OPTIONS - kInvalidProgram = -44, // CL_INVALID_PROGRAM - kInvalidProgramExecutable = -45, // CL_INVALID_PROGRAM_EXECUTABLE - kInvalidKernelName = -46, // CL_INVALID_KERNEL_NAME - kInvalidKernelDefinition = -47, // CL_INVALID_KERNEL_DEFINITION - kInvalidKernel = -48, // CL_INVALID_KERNEL - kInvalidArgIndex = -49, // CL_INVALID_ARG_INDEX - kInvalidArgValue = -50, // CL_INVALID_ARG_VALUE - kInvalidArgSize = -51, // CL_INVALID_ARG_SIZE - kInvalidKernelArgs = -52, // CL_INVALID_KERNEL_ARGS - kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions - kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total - kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension - kInvalidGlobalOffset = -56, // CL_INVALID_GLOBAL_OFFSET - kInvalidEventWaitList = -57, // CL_INVALID_EVENT_WAIT_LIST - kInvalidEvent = -58, // CL_INVALID_EVENT - kInvalidOperation = -59, // CL_INVALID_OPERATION - kInvalidBufferSize = -61, // CL_INVALID_BUFFER_SIZE - kInvalidGlobalWorkSize = -63, // CL_INVALID_GLOBAL_WORK_SIZE - - // Status codes in common with the clBLAS library - kNotImplemented = -1024, // Routine or functionality not implemented yet - kInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer - kInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer - kInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer - kInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer - kInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer - kInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero - kInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension - kInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension - kInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension - kInvalidIncrementX = -1013, // Increment of vector X cannot be zero - kInvalidIncrementY = -1012, // Increment of vector Y cannot be zero - kInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small - kInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small - kInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small - kInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small - kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small - - // Custom additional status codes for CLBlast - kInsufficientMemoryTemp = -2050, // Temporary buffer provided to GEMM routine is too small - kInvalidBatchCount = -2049, // The batch count needs to be positive - kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel - kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel - kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device - kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device - kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device - kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer - kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small - kDatabaseError = -2041, // Entry for the device was not found in the database - kUnknownError = -2040, // A catch-all error code representing an unspecified error - kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception -}; - -// Matrix layout and transpose types -enum class Layout { kRowMajor = 101, kColMajor = 102 }; -enum class Transpose { kNo = 111, kYes = 112, kConjugate = 113 }; -enum class Triangle { kUpper = 121, kLower = 122 }; -enum class Diagonal { kNonUnit = 131, kUnit = 132 }; -enum class Side { kLeft = 141, kRight = 142 }; -enum class KernelMode { kCrossCorrelation = 151, kConvolution = 152 }; - -// Precision scoped enum (values in bits) -enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64, - kComplexSingle = 3232, kComplexDouble = 6464, kAny = -1 }; - -// ================================================================================================= -// BLAS level-1 (vector-vector) routines -// ================================================================================================= - -// Generate givens plane rotation: SROTG/DROTG -template -StatusCode Rotg(cl_mem sa_buffer, const size_t sa_offset, - cl_mem sb_buffer, const size_t sb_offset, - cl_mem sc_buffer, const size_t sc_offset, - cl_mem ss_buffer, const size_t ss_offset, - cl_command_queue* queue, cl_event* event = nullptr); - -// Generate modified givens plane rotation: SROTMG/DROTMG -template -StatusCode Rotmg(cl_mem sd1_buffer, const size_t sd1_offset, - cl_mem sd2_buffer, const size_t sd2_offset, - cl_mem sx1_buffer, const size_t sx1_offset, - const cl_mem sy1_buffer, const size_t sy1_offset, - cl_mem sparam_buffer, const size_t sparam_offset, - cl_command_queue* queue, cl_event* event = nullptr); - -// Apply givens plane rotation: SROT/DROT -template -StatusCode Rot(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const T cos, - const T sin, - cl_command_queue* queue, cl_event* event = nullptr); - -// Apply modified givens plane rotation: SROTM/DROTM -template -StatusCode Rotm(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem sparam_buffer, const size_t sparam_offset, - cl_command_queue* queue, cl_event* event = nullptr); - -// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP -template -StatusCode Swap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL -template -StatusCode Scal(const size_t n, - const T alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY -template -StatusCode Copy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY -template -StatusCode Axpy(const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Dot product of two vectors: SDOT/DDOT/HDOT -template -StatusCode Dot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Dot product of two complex vectors: CDOTU/ZDOTU -template -StatusCode Dotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC -template -StatusCode Dotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 -template -StatusCode Nrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM -template -StatusCode Asum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM -template -StatusCode Sum(const size_t n, - cl_mem sum_buffer, const size_t sum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX -template -StatusCode Amax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN -template -StatusCode Amin(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX -template -StatusCode Max(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN -template -StatusCode Min(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// ================================================================================================= -// BLAS level-2 (matrix-vector) routines -// ================================================================================================= - -// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV -template -StatusCode Gemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV -template -StatusCode Gbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Hermitian matrix-vector multiplication: CHEMV/ZHEMV -template -StatusCode Hemv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV -template -StatusCode Hbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV -template -StatusCode Hpmv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV -template -StatusCode Symv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV -template -StatusCode Sbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV -template -StatusCode Spmv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV -template -StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV -template -StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV -template -StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV -template -StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV -template -StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV -template -StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// General rank-1 matrix update: SGER/DGER/HGER -template -StatusCode Ger(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event = nullptr); - -// General rank-1 complex matrix update: CGERU/ZGERU -template -StatusCode Geru(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event = nullptr); - -// General rank-1 complex conjugated matrix update: CGERC/ZGERC -template -StatusCode Gerc(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event = nullptr); - -// Hermitian rank-1 matrix update: CHER/ZHER -template -StatusCode Her(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event = nullptr); - -// Hermitian packed rank-1 matrix update: CHPR/ZHPR -template -StatusCode Hpr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event = nullptr); - -// Hermitian rank-2 matrix update: CHER2/ZHER2 -template -StatusCode Her2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event = nullptr); - -// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 -template -StatusCode Hpr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event = nullptr); - -// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR -template -StatusCode Syr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event = nullptr); - -// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR -template -StatusCode Spr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event = nullptr); - -// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 -template -StatusCode Syr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event = nullptr); - -// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 -template -StatusCode Spr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event = nullptr); - -// ================================================================================================= -// BLAS level-3 (matrix-matrix) routines -// ================================================================================================= - -// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM -template -StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event = nullptr, - cl_mem temp_buffer = nullptr); - -// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM -template -StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event = nullptr); - -// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM -template -StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event = nullptr); - -// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK -template -StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const T beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event = nullptr); - -// Rank-K update of a hermitian matrix: CHERK/ZHERK -template -StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const T beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event = nullptr); - -// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K -template -StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event = nullptr); - -// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K -template -StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const U beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event = nullptr); - -// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM -template -StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event = nullptr); - -// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM -template -StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event = nullptr); - -// ================================================================================================= -// Extra non-BLAS routines (level-X) -// ================================================================================================= - -// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD -template -StatusCode Had(const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const T beta, - cl_mem z_buffer, const size_t z_offset, const size_t z_inc, - cl_command_queue* queue, cl_event* event = nullptr); - -// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY -template -StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event = nullptr); - -// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL -template -StatusCode Im2col(const KernelMode kernel_mode, - const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, - const cl_mem im_buffer, const size_t im_offset, - cl_mem col_buffer, const size_t col_offset, - cl_command_queue* queue, cl_event* event = nullptr); - -// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM -template -StatusCode Col2im(const KernelMode kernel_mode, - const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, - const cl_mem col_buffer, const size_t col_offset, - cl_mem im_buffer, const size_t im_offset, - cl_command_queue* queue, cl_event* event = nullptr); - -// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM -template -StatusCode Convgemm(const KernelMode kernel_mode, - const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, - const cl_mem im_buffer, const size_t im_offset, - const cl_mem kernel_buffer, const size_t kernel_offset, - cl_mem result_buffer, const size_t result_offset, - cl_command_queue* queue, cl_event* event = nullptr); - -// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED -template -StatusCode AxpyBatched(const size_t n, - const T *alphas, - const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, - cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, - const size_t batch_count, - cl_command_queue* queue, cl_event* event = nullptr); - -// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED -template -StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const T *alphas, - const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, - const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, - const T *betas, - cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, - const size_t batch_count, - cl_command_queue* queue, cl_event* event = nullptr); - -// StridedBatched version of GEMM: SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED -template -StatusCode GemmStridedBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, - const T beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, - const size_t batch_count, - cl_command_queue* queue, cl_event* event = nullptr); - -// ================================================================================================= - -// Retrieves the required size of the temporary buffer for the GEMM kernel (optional) -template -StatusCode GemmTempBufferSize(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const size_t a_offset, const size_t a_ld, - const size_t b_offset, const size_t b_ld, - const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, size_t& temp_buffer_size); - -// ================================================================================================= - -// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on -// for the same device. This cache can be cleared to free up system memory or in case of debugging. -StatusCode PUBLIC_API ClearCache(); - -// The cache can also be pre-initialized for a specific device with all possible CLBlast kernels. -// Further CLBlast routine calls will then run at maximum speed. -StatusCode PUBLIC_API FillCache(const cl_device_id device); - -// ================================================================================================= - -// Retrieves current tuning parameters for a specific device-precision-kernel combination -StatusCode PUBLIC_API RetrieveParameters(const cl_device_id device, const std::string &kernel_name, - const Precision precision, - std::unordered_map ¶meters); - -// Overrides tuning parameters for a specific device-precision-kernel combination. The next time -// the target routine is called it will re-compile and use the new parameters from then on. -StatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const std::string &kernel_name, - const Precision precision, - const std::unordered_map ¶meters); - -// ================================================================================================= - -// Tunes the "Xaxpy" kernel, used for many level-1 routines such as XAXPY, XCOPY, and XSWAP -template -StatusCode TuneXaxpy(cl_command_queue* queue, const size_t n, - const double fraction, std::unordered_map ¶meters); - -// Tunes the "Xdot" kernel, used for level-1 reduction routines such as XDOT, XMAX, and XSUM -template -StatusCode TuneXdot(cl_command_queue* queue, const size_t n, - const double fraction, std::unordered_map ¶meters); - -// Tunes the "Xgemv" kernel, used for matrix-vector level-2 routines such as XGEMV, XGBMV, and XHEMV -template -StatusCode TuneXgemv(cl_command_queue* queue, const size_t m, const size_t n, - const double fraction, std::unordered_map ¶meters); - -// Tunes the "Xger" kernel, used for matrix update level-2 routines such as XGER, XHER, and XSYR2 -template -StatusCode TuneXger(cl_command_queue* queue, const size_t m, const size_t n, - const double fraction, std::unordered_map ¶meters); - -// Tunes the "Xgemm" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K -template -StatusCode TuneXgemm(cl_command_queue* queue, const size_t m, const size_t n, const size_t k, - const double fraction, std::unordered_map ¶meters); - -// Tunes the "XgemmDiret" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K -template -StatusCode TuneXgemmDirect(cl_command_queue* queue, const size_t m, const size_t n, const size_t k, - const double fraction, std::unordered_map ¶meters); - -// Tunes the "Copy" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K -template -StatusCode TuneCopy(cl_command_queue* queue, const size_t m, const size_t n, - const double fraction, std::unordered_map ¶meters); - -// Tunes the "Pad" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K -template -StatusCode TunePad(cl_command_queue* queue, const size_t m, const size_t n, - const double fraction, std::unordered_map ¶meters); - -// Tunes the "Transpose" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K -template -StatusCode TuneTranspose(cl_command_queue* queue, const size_t m, const size_t n, - const double fraction, std::unordered_map ¶meters); - -// Tunes the "Padtranspose" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K -template -StatusCode TunePadtranspose(cl_command_queue* queue, const size_t m, const size_t n, - const double fraction, std::unordered_map ¶meters); - -// Tunes the "Xgemm" kernel, used for the level-3 routine XTRSM -template -StatusCode TuneInvert(cl_command_queue* queue, const size_t m, const size_t n, const size_t k, - const double fraction, std::unordered_map ¶meters); - -// ================================================================================================= - -} // namespace clblast - -// CLBLAST_CLBLAST_H_ -#endif diff --git a/include/clblast_c.h b/include/clblast_c.h deleted file mode 100644 index c779c556e..000000000 --- a/include/clblast_c.h +++ /dev/null @@ -1,1707 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains the plain C interface to the CLBlast BLAS routines, the counter-part of the -// normal 'clblast.h' C++ header. -// -// ================================================================================================= - -#ifndef CLBLAST_CLBLAST_C_H_ -#define CLBLAST_CLBLAST_C_H_ - -// Includes the normal OpenCL C header -#if defined(__APPLE__) || defined(__MACOSX) - #include -#else - #include -#endif - -// Exports library functions under Windows when building a DLL. See also: -// https://msdn.microsoft.com/en-us/library/a90k134d.aspx -#if defined(_WIN32) && defined(CLBLAST_DLL) - #if defined(COMPILING_DLL) - #define PUBLIC_API __declspec(dllexport) - #else - #define PUBLIC_API __declspec(dllimport) - #endif -#else - #define PUBLIC_API -#endif - -// Version numbering (v1.6.0) -#define CLBLAST_VERSION_MAJOR 1 -#define CLBLAST_VERSION_MINOR 6 -#define CLBLAST_VERSION_PATCH 0 - -// The C interface -#ifdef __cplusplus -extern "C" { -#endif - -// ================================================================================================= - -// Status codes. These codes can be returned by functions declared in this header file. The error -// codes match either the standard OpenCL error codes or the clBLAS error codes. -typedef enum CLBlastStatusCode_ { - - // Status codes in common with the OpenCL standard - CLBlastSuccess = 0, // CL_SUCCESS - CLBlastOpenCLCompilerNotAvailable= -3, // CL_COMPILER_NOT_AVAILABLE - CLBlastTempBufferAllocFailure = -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE - CLBlastOpenCLOutOfResources = -5, // CL_OUT_OF_RESOURCES - CLBlastOpenCLOutOfHostMemory = -6, // CL_OUT_OF_HOST_MEMORY - CLBlastOpenCLBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error - CLBlastInvalidValue = -30, // CL_INVALID_VALUE - CLBlastInvalidCommandQueue = -36, // CL_INVALID_COMMAND_QUEUE - CLBlastInvalidMemObject = -38, // CL_INVALID_MEM_OBJECT - CLBlastInvalidBinary = -42, // CL_INVALID_BINARY - CLBlastInvalidBuildOptions = -43, // CL_INVALID_BUILD_OPTIONS - CLBlastInvalidProgram = -44, // CL_INVALID_PROGRAM - CLBlastInvalidProgramExecutable = -45, // CL_INVALID_PROGRAM_EXECUTABLE - CLBlastInvalidKernelName = -46, // CL_INVALID_KERNEL_NAME - CLBlastInvalidKernelDefinition = -47, // CL_INVALID_KERNEL_DEFINITION - CLBlastInvalidKernel = -48, // CL_INVALID_KERNEL - CLBlastInvalidArgIndex = -49, // CL_INVALID_ARG_INDEX - CLBlastInvalidArgValue = -50, // CL_INVALID_ARG_VALUE - CLBlastInvalidArgSize = -51, // CL_INVALID_ARG_SIZE - CLBlastInvalidKernelArgs = -52, // CL_INVALID_KERNEL_ARGS - CLBlastInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions - CLBlastInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total - CLBlastInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension - CLBlastInvalidGlobalOffset = -56, // CL_INVALID_GLOBAL_OFFSET - CLBlastInvalidEventWaitList = -57, // CL_INVALID_EVENT_WAIT_LIST - CLBlastInvalidEvent = -58, // CL_INVALID_EVENT - CLBlastInvalidOperation = -59, // CL_INVALID_OPERATION - CLBlastInvalidBufferSize = -61, // CL_INVALID_BUFFER_SIZE - CLBlastInvalidGlobalWorkSize = -63, // CL_INVALID_GLOBAL_WORK_SIZE - - // Status codes in common with the clBLAS library - CLBlastNotImplemented = -1024, // Routine or functionality not implemented yet - CLBlastInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer - CLBlastInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer - CLBlastInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer - CLBlastInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer - CLBlastInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer - CLBlastInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero - CLBlastInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension - CLBlastInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension - CLBlastInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension - CLBlastInvalidIncrementX = -1013, // Increment of vector X cannot be zero - CLBlastInvalidIncrementY = -1012, // Increment of vector Y cannot be zero - CLBlastInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small - CLBlastInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small - CLBlastInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small - CLBlastInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small - CLBlastInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small - - // Custom additional status codes for CLBlast - CLBlastInsufficientMemoryTemp = -2050, // Temporary buffer provided to GEMM routine is too small - CLBlastInvalidBatchCount = -2049, // The batch count needs to be positive - CLBlastInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel - CLBlastMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel - CLBlastInvalidLocalMemUsage = -2046, // Not enough local memory available on this device - CLBlastNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device - CLBlastNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device - CLBlastInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer - CLBlastInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small - CLBlastDatabaseError = -2041, // Entry for the device was not found in the database - CLBlastUnknownError = -2040, // A catch-all error code representing an unspecified error - CLBlastUnexpectedError = -2039, // A catch-all error code representing an unexpected exception -} CLBlastStatusCode; - -// Matrix layout and transpose types -typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101, - CLBlastLayoutColMajor = 102 } CLBlastLayout; -typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112, - CLBlastTransposeConjugate = 113 } CLBlastTranspose; -typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121, - CLBlastTriangleLower = 122 } CLBlastTriangle; -typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, - CLBlastDiagonalUnit = 132 } CLBlastDiagonal; -typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; -typedef enum CLBlastKernelMode_ { CLBlastKernelModeCrossCorrelation = 151, CLBlastKernelModeConvolution = 152 } CLBlastKernelMode; - -// Precision enum (values in bits) -typedef enum CLBlastPrecision_ { CLBlastPrecisionHalf = 16, CLBlastPrecisionSingle = 32, - CLBlastPrecisionDouble = 64, CLBlastPrecisionComplexSingle = 3232, - CLBlastPrecisionComplexDouble = 6464 } CLBlastPrecision; - -// ================================================================================================= -// BLAS level-1 (vector-vector) routines -// ================================================================================================= - -// Generate givens plane rotation: SROTG/DROTG -CLBlastStatusCode PUBLIC_API CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset, - cl_mem sb_buffer, const size_t sb_offset, - cl_mem sc_buffer, const size_t sc_offset, - cl_mem ss_buffer, const size_t ss_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset, - cl_mem sb_buffer, const size_t sb_offset, - cl_mem sc_buffer, const size_t sc_offset, - cl_mem ss_buffer, const size_t ss_offset, - cl_command_queue* queue, cl_event* event); - -// Generate modified givens plane rotation: SROTMG/DROTMG -CLBlastStatusCode PUBLIC_API CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, - cl_mem sd2_buffer, const size_t sd2_offset, - cl_mem sx1_buffer, const size_t sx1_offset, - const cl_mem sy1_buffer, const size_t sy1_offset, - cl_mem sparam_buffer, const size_t sparam_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset, - cl_mem sd2_buffer, const size_t sd2_offset, - cl_mem sx1_buffer, const size_t sx1_offset, - const cl_mem sy1_buffer, const size_t sy1_offset, - cl_mem sparam_buffer, const size_t sparam_offset, - cl_command_queue* queue, cl_event* event); - -// Apply givens plane rotation: SROT/DROT -CLBlastStatusCode PUBLIC_API CLBlastSrot(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const float cos, - const float sin, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDrot(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const double cos, - const double sin, - cl_command_queue* queue, cl_event* event); - -// Apply modified givens plane rotation: SROTM/DROTM -CLBlastStatusCode PUBLIC_API CLBlastSrotm(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem sparam_buffer, const size_t sparam_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDrotm(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem sparam_buffer, const size_t sparam_offset, - cl_command_queue* queue, cl_event* event); - -// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP -CLBlastStatusCode PUBLIC_API CLBlastSswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); - -// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL -CLBlastStatusCode PUBLIC_API CLBlastSscal(const size_t n, - const float alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDscal(const size_t n, - const double alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCscal(const size_t n, - const cl_float2 alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZscal(const size_t n, - const cl_double2 alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHscal(const size_t n, - const cl_half alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); - -// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY -CLBlastStatusCode PUBLIC_API CLBlastScopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); - -// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY -CLBlastStatusCode PUBLIC_API CLBlastSaxpy(const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDaxpy(const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCaxpy(const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZaxpy(const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHaxpy(const size_t n, - const cl_half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); - -// Dot product of two vectors: SDOT/DDOT/HDOT -CLBlastStatusCode PUBLIC_API CLBlastSdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); - -// Dot product of two complex vectors: CDOTU/ZDOTU -CLBlastStatusCode PUBLIC_API CLBlastCdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); - -// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC -CLBlastStatusCode PUBLIC_API CLBlastCdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); - -// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 -CLBlastStatusCode PUBLIC_API CLBlastSnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastScnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDznrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); - -// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM -CLBlastStatusCode PUBLIC_API CLBlastSasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastScasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDzasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); - -// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM -CLBlastStatusCode PUBLIC_API CLBlastSsum(const size_t n, - cl_mem sum_buffer, const size_t sum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDsum(const size_t n, - cl_mem sum_buffer, const size_t sum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastScsum(const size_t n, - cl_mem sum_buffer, const size_t sum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDzsum(const size_t n, - cl_mem sum_buffer, const size_t sum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHsum(const size_t n, - cl_mem sum_buffer, const size_t sum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); - -// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX -CLBlastStatusCode PUBLIC_API CLBlastiSamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastiDamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastiCamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastiZamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastiHamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); - -// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN -CLBlastStatusCode PUBLIC_API CLBlastiSamin(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastiDamin(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastiCamin(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastiZamin(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastiHamin(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); - -// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX -CLBlastStatusCode PUBLIC_API CLBlastiSmax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastiDmax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastiCmax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastiZmax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastiHmax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); - -// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN -CLBlastStatusCode PUBLIC_API CLBlastiSmin(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastiDmin(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastiCmin(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastiZmin(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastiHmin(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); - -// ================================================================================================= -// BLAS level-2 (matrix-vector) routines -// ================================================================================================= - -// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV -CLBlastStatusCode PUBLIC_API CLBlastSgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const size_t m, const size_t n, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_half beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); - -// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV -CLBlastStatusCode PUBLIC_API CLBlastSgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_half beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); - -// Hermitian matrix-vector multiplication: CHEMV/ZHEMV -CLBlastStatusCode PUBLIC_API CLBlastChemv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); - -// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV -CLBlastStatusCode PUBLIC_API CLBlastChbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); - -// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV -CLBlastStatusCode PUBLIC_API CLBlastChpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); - -// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV -CLBlastStatusCode PUBLIC_API CLBlastSsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_half beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); - -// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV -CLBlastStatusCode PUBLIC_API CLBlastSsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, const size_t k, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_half beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); - -// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV -CLBlastStatusCode PUBLIC_API CLBlastSspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const float alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const double alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const cl_half alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_half beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); - -// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV -CLBlastStatusCode PUBLIC_API CLBlastStrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); - -// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV -CLBlastStatusCode PUBLIC_API CLBlastStbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); - -// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV -CLBlastStatusCode PUBLIC_API CLBlastStpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); - -// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV -CLBlastStatusCode PUBLIC_API CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); - -// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV -CLBlastStatusCode PUBLIC_API CLBlastStbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); - -// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV -CLBlastStatusCode PUBLIC_API CLBlastStpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); - -// General rank-1 matrix update: SGER/DGER/HGER -CLBlastStatusCode PUBLIC_API CLBlastSger(const CLBlastLayout layout, - const size_t m, const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDger(const CLBlastLayout layout, - const size_t m, const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHger(const CLBlastLayout layout, - const size_t m, const size_t n, - const cl_half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); - -// General rank-1 complex matrix update: CGERU/ZGERU -CLBlastStatusCode PUBLIC_API CLBlastCgeru(const CLBlastLayout layout, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZgeru(const CLBlastLayout layout, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); - -// General rank-1 complex conjugated matrix update: CGERC/ZGERC -CLBlastStatusCode PUBLIC_API CLBlastCgerc(const CLBlastLayout layout, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZgerc(const CLBlastLayout layout, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); - -// Hermitian rank-1 matrix update: CHER/ZHER -CLBlastStatusCode PUBLIC_API CLBlastCher(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZher(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); - -// Hermitian packed rank-1 matrix update: CHPR/ZHPR -CLBlastStatusCode PUBLIC_API CLBlastChpr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); - -// Hermitian rank-2 matrix update: CHER2/ZHER2 -CLBlastStatusCode PUBLIC_API CLBlastCher2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZher2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); - -// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 -CLBlastStatusCode PUBLIC_API CLBlastChpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); - -// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR -CLBlastStatusCode PUBLIC_API CLBlastSsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const cl_half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); - -// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR -CLBlastStatusCode PUBLIC_API CLBlastSspr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDspr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHspr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const cl_half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); - -// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 -CLBlastStatusCode PUBLIC_API CLBlastSsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const cl_half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); - -// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 -CLBlastStatusCode PUBLIC_API CLBlastSspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const size_t n, - const cl_half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); - -// ================================================================================================= -// BLAS level-3 (matrix-matrix) routines -// ================================================================================================= - -// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM -CLBlastStatusCode PUBLIC_API CLBlastSgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_half beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); - -// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM -CLBlastStatusCode PUBLIC_API CLBlastSsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const size_t m, const size_t n, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_half beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); - -// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM -CLBlastStatusCode PUBLIC_API CLBlastChemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); - -// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK -CLBlastStatusCode PUBLIC_API CLBlastSsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const size_t n, const size_t k, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_half beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); - -// Rank-K update of a hermitian matrix: CHERK/ZHERK -CLBlastStatusCode PUBLIC_API CLBlastCherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); - -// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K -CLBlastStatusCode PUBLIC_API CLBlastSsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const size_t n, const size_t k, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_half beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); - -// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K -CLBlastStatusCode PUBLIC_API CLBlastCher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); - -// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM -CLBlastStatusCode PUBLIC_API CLBlastStrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t m, const size_t n, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); - -// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM -CLBlastStatusCode PUBLIC_API CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); - -// ================================================================================================= -// Extra non-BLAS routines (level-X) -// ================================================================================================= - -// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD -CLBlastStatusCode PUBLIC_API CLBlastShad(const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const float beta, - cl_mem z_buffer, const size_t z_offset, const size_t z_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDhad(const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const double beta, - cl_mem z_buffer, const size_t z_offset, const size_t z_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastChad(const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const cl_float2 beta, - cl_mem z_buffer, const size_t z_offset, const size_t z_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZhad(const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const cl_double2 beta, - cl_mem z_buffer, const size_t z_offset, const size_t z_inc, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHhad(const size_t n, - const cl_half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const cl_half beta, - cl_mem z_buffer, const size_t z_offset, const size_t z_inc, - cl_command_queue* queue, cl_event* event); - -// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY -CLBlastStatusCode PUBLIC_API CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastComatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const size_t m, const size_t n, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); - -// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL -CLBlastStatusCode PUBLIC_API CLBlastSim2col(const CLBlastKernelMode kernel_mode, - const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, - const cl_mem im_buffer, const size_t im_offset, - cl_mem col_buffer, const size_t col_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDim2col(const CLBlastKernelMode kernel_mode, - const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, - const cl_mem im_buffer, const size_t im_offset, - cl_mem col_buffer, const size_t col_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCim2col(const CLBlastKernelMode kernel_mode, - const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, - const cl_mem im_buffer, const size_t im_offset, - cl_mem col_buffer, const size_t col_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZim2col(const CLBlastKernelMode kernel_mode, - const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, - const cl_mem im_buffer, const size_t im_offset, - cl_mem col_buffer, const size_t col_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHim2col(const CLBlastKernelMode kernel_mode, - const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, - const cl_mem im_buffer, const size_t im_offset, - cl_mem col_buffer, const size_t col_offset, - cl_command_queue* queue, cl_event* event); - -// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM -CLBlastStatusCode PUBLIC_API CLBlastScol2im(const CLBlastKernelMode kernel_mode, - const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, - const cl_mem col_buffer, const size_t col_offset, - cl_mem im_buffer, const size_t im_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDcol2im(const CLBlastKernelMode kernel_mode, - const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, - const cl_mem col_buffer, const size_t col_offset, - cl_mem im_buffer, const size_t im_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCcol2im(const CLBlastKernelMode kernel_mode, - const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, - const cl_mem col_buffer, const size_t col_offset, - cl_mem im_buffer, const size_t im_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZcol2im(const CLBlastKernelMode kernel_mode, - const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, - const cl_mem col_buffer, const size_t col_offset, - cl_mem im_buffer, const size_t im_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHcol2im(const CLBlastKernelMode kernel_mode, - const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, - const cl_mem col_buffer, const size_t col_offset, - cl_mem im_buffer, const size_t im_offset, - cl_command_queue* queue, cl_event* event); - -// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM -CLBlastStatusCode PUBLIC_API CLBlastSconvgemm(const CLBlastKernelMode kernel_mode, - const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, - const cl_mem im_buffer, const size_t im_offset, - const cl_mem kernel_buffer, const size_t kernel_offset, - cl_mem result_buffer, const size_t result_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDconvgemm(const CLBlastKernelMode kernel_mode, - const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, - const cl_mem im_buffer, const size_t im_offset, - const cl_mem kernel_buffer, const size_t kernel_offset, - cl_mem result_buffer, const size_t result_offset, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHconvgemm(const CLBlastKernelMode kernel_mode, - const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, - const cl_mem im_buffer, const size_t im_offset, - const cl_mem kernel_buffer, const size_t kernel_offset, - cl_mem result_buffer, const size_t result_offset, - cl_command_queue* queue, cl_event* event); - -// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED -CLBlastStatusCode PUBLIC_API CLBlastSaxpyBatched(const size_t n, - const float *alphas, - const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, - cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, - const size_t batch_count, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDaxpyBatched(const size_t n, - const double *alphas, - const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, - cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, - const size_t batch_count, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCaxpyBatched(const size_t n, - const cl_float2 *alphas, - const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, - cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, - const size_t batch_count, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZaxpyBatched(const size_t n, - const cl_double2 *alphas, - const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, - cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, - const size_t batch_count, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHaxpyBatched(const size_t n, - const cl_half *alphas, - const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, - cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, - const size_t batch_count, - cl_command_queue* queue, cl_event* event); - -// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED -CLBlastStatusCode PUBLIC_API CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const float *alphas, - const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, - const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, - const float *betas, - cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, - const size_t batch_count, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const double *alphas, - const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, - const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, - const double *betas, - cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, - const size_t batch_count, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_float2 *alphas, - const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, - const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, - const cl_float2 *betas, - cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, - const size_t batch_count, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_double2 *alphas, - const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, - const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, - const cl_double2 *betas, - cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, - const size_t batch_count, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_half *alphas, - const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, - const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, - const cl_half *betas, - cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, - const size_t batch_count, - cl_command_queue* queue, cl_event* event); - -// StridedBatched version of GEMM: SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED -CLBlastStatusCode PUBLIC_API CLBlastSgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, - const size_t batch_count, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastDgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, - const size_t batch_count, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastCgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, - const size_t batch_count, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastZgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, - const size_t batch_count, - cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, - const cl_half beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, - const size_t batch_count, - cl_command_queue* queue, cl_event* event); - -// ================================================================================================= -// General matrix-matrix multiplication with temporary buffer from user (optional, for advanced users): SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM -CLBlastStatusCode PUBLIC_API CLBlastSgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event, cl_mem temp_buffer); -CLBlastStatusCode PUBLIC_API CLBlastDgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event, cl_mem temp_buffer); -CLBlastStatusCode PUBLIC_API CLBlastCgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event, cl_mem temp_buffer); -CLBlastStatusCode PUBLIC_API CLBlastZgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event, cl_mem temp_buffer); -CLBlastStatusCode PUBLIC_API CLBlastHgemmWithTempBuffer(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_half beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event, cl_mem temp_buffer); - -// ================================================================================================= -// Retrieves the required size of the temporary buffer for the GEMM kernel: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM (optional) -CLBlastStatusCode PUBLIC_API CLBlastSGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const size_t a_offset, const size_t a_ld, - const size_t b_offset, const size_t b_ld, - const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, - size_t* temp_buffer_size); - -CLBlastStatusCode PUBLIC_API CLBlastDGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const size_t a_offset, const size_t a_ld, - const size_t b_offset, const size_t b_ld, - const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, - size_t* temp_buffer_size); - -CLBlastStatusCode PUBLIC_API CLBlastCGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const size_t a_offset, const size_t a_ld, - const size_t b_offset, const size_t b_ld, - const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, - size_t* temp_buffer_size); - -CLBlastStatusCode PUBLIC_API CLBlastZGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const size_t a_offset, const size_t a_ld, - const size_t b_offset, const size_t b_ld, - const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, - size_t* temp_buffer_size); - -CLBlastStatusCode PUBLIC_API CLBlastHGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const size_t m, const size_t n, const size_t k, - const size_t a_offset, const size_t a_ld, - const size_t b_offset, const size_t b_ld, - const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, - size_t* temp_buffer_size); - -// ================================================================================================= - -// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on -// for the same device. This cache can be cleared to free up system memory or in case of debugging. -CLBlastStatusCode PUBLIC_API CLBlastClearCache(); - -// The cache can also be pre-initialized for a specific device with all possible CLBlast kernels. -// Further CLBlast routine calls will then run at maximum speed. -CLBlastStatusCode PUBLIC_API CLBlastFillCache(const cl_device_id device); - -// ================================================================================================= - -// Overrides tuning parameters for a specific device-precision-kernel combination. The next time -// the target routine is called it will re-compile and use the new parameters from then on. -CLBlastStatusCode PUBLIC_API CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name, - const CLBlastPrecision precision, const size_t num_parameters, - const char** parameters_names, const size_t* parameters_values); - -// ================================================================================================= - -#ifdef __cplusplus -} // extern "C" -#endif - -// CLBLAST_CLBLAST_C_H_ -#endif diff --git a/include/clblast_half.h b/include/clblast_half.h deleted file mode 100644 index cbea1723a..000000000 --- a/include/clblast_half.h +++ /dev/null @@ -1,249 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file provides simple conversion operations between fp16 (half) and fp32 (float). These -// conversion functions are based on ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf and -// are also part of the C++ half-precision header (http://half.sourceforge.net/). -// -// This file is pure C99. -// -// ================================================================================================= - -#ifndef CLBLAST_HALF_H_ -#define CLBLAST_HALF_H_ - -// ================================================================================================= - -// The host data-type for half-precision floating-point (16-bit) is based on the `cl_half` OpenCL -// type, which is a typedef for unsigned short. -typedef unsigned short half; - -// 32-bit union for conversions -typedef union ConversionBits_ { - unsigned int i32; - float f32; -} ConversionBits; - -// ================================================================================================= - -// Converts a IEEE-compliant single-precision value to half-precision floating-point. This function -// applies simple truncation (round toward zero, but with overflows set to infinity) as rounding -// mode. -static half FloatToHalf(const float value) { - static const unsigned short base_table[512] = { - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, - 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, - 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, - 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, - 0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 - }; - static const unsigned char shift_table[512] = { - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 - }; - ConversionBits bits; - bits.f32 = value; - const unsigned short halfbits = base_table[bits.i32 >> 23] + - (unsigned short)((bits.i32 & 0x7FFFFF) >> shift_table[bits.i32 >> 23]); - return halfbits; -} - -// Converts a half-precision value to IEEE-compliant single-precision floating-point -static float HalfToFloat(const half value) { - static const unsigned int mantissa_table[2048] = { - 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, - 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, - 0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, - 0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, - 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, - 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, - 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, - 0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, - 0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, - 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, - 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, - 0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, - 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, - 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, - 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, - 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, - 0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, - 0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, - 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, - 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, - 0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, - 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, - 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, - 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, - 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, - 0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, - 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, - 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, - 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, - 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, - 0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, - 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, - 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, - 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, - 0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, - 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, - 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, - 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, - 0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, - 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, - 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, - 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, - 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, - 0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, - 0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, - 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, - 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, - 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, - 0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, - 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, - 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, - 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, - 0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, - 0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, - 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, - 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, - 0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, - 0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, - 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, - 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, - 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, - 0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, - 0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, - 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, - 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, - 0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, - 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, - 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, - 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, - 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, - 0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, - 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, - 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, - 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, - 0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, - 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, - 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, - 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, - 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, - 0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, - 0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, - 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, - 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, - 0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, - 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, - 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, - 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, - 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, - 0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, - 0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, - 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, - 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, - 0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, - 0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, - 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, - 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, - 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, - 0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, - 0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, - 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, - 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, - 0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, - 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, - 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, - 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, - 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, - 0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, - 0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, - 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, - 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, - 0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, - 0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, - 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, - 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, - 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, - 0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, - 0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, - 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, - 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, - 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, - 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, - 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, - 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, - 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, - 0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, - 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, - 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, - 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 - }; - static const unsigned int exponent_table[64] = { - 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, - 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, - 0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, - 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 - }; - static const unsigned short offset_table[64] = { - 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, - 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 - }; - ConversionBits bits; - bits.i32 = mantissa_table[offset_table[value >> 10] + (value & 0x3FF)] + - exponent_table[value >> 10]; - return bits.f32; -} - -// ================================================================================================= - -// CLBLAST_HALF_H_ -#endif diff --git a/include/clblast_netlib_c.h b/include/clblast_netlib_c.h deleted file mode 100644 index 4c54fb188..000000000 --- a/include/clblast_netlib_c.h +++ /dev/null @@ -1,993 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains the Netlib CBLAS interface to the CLBlast BLAS routines, performing all buffer -// copies automatically and running on the default OpenCL platform and device. For full control over -// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead. -// -// ================================================================================================= - -#ifndef CLBLAST_CLBLAST_NETLIB_C_H_ -#define CLBLAST_CLBLAST_NETLIB_C_H_ - -// Exports library functions under Windows when building a DLL. See also: -// https://msdn.microsoft.com/en-us/library/a90k134d.aspx -#if defined(_WIN32) && defined(CLBLAST_DLL) - #if defined(COMPILING_DLL) - #define PUBLIC_API __declspec(dllexport) - #else - #define PUBLIC_API __declspec(dllimport) - #endif -#else - #define PUBLIC_API -#endif - -// The C interface -#ifdef __cplusplus -extern "C" { -#endif - -// ================================================================================================= - -// Matrix layout and transpose types -typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101, - CLBlastLayoutColMajor = 102 } CLBlastLayout; -typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112, - CLBlastTransposeConjugate = 113 } CLBlastTranspose; -typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121, - CLBlastTriangleLower = 122 } CLBlastTriangle; -typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, - CLBlastDiagonalUnit = 132 } CLBlastDiagonal; -typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; -typedef enum CLBlastKernelMode_ { CLBlastKernelModeCrossCorrelation = 141, CLBlastKernelModeConvolution = 152 } CLBlastKernelMode; - -// For full compatibility with CBLAS -typedef CLBlastLayout CBLAS_ORDER; -typedef CLBlastTranspose CBLAS_TRANSPOSE; -typedef CLBlastTriangle CBLAS_UPLO; -typedef CLBlastDiagonal CBLAS_DIAG; -typedef CLBlastSide CBLAS_SIDE; -#define CblasRowMajor CLBlastLayoutRowMajor -#define CblasColMajor CLBlastLayoutColMajor -#define CblasNoTrans CLBlastTransposeNo -#define CblasTrans CLBlastTransposeYes -#define CblasConjTrans CLBlastTransposeConjugate -#define CblasUpper CLBlastTriangleUpper -#define CblasLower CLBlastTriangleLower -#define CblasNonUnit CLBlastDiagonalNonUnit -#define CblasUnit CLBlastDiagonalUnit -#define CblasLeft CLBlastSideLeft -#define CblasRight CLBlastSideRight - -// ================================================================================================= -// BLAS level-1 (vector-vector) routines -// ================================================================================================= - -// Generate givens plane rotation: SROTG/DROTG -void PUBLIC_API cblas_srotg(float* sa, - float* sb, - float* sc, - float* ss); -void PUBLIC_API cblas_drotg(double* sa, - double* sb, - double* sc, - double* ss); - -// Generate modified givens plane rotation: SROTMG/DROTMG -void PUBLIC_API cblas_srotmg(float* sd1, - float* sd2, - float* sx1, - const float sy1, - float* sparam); -void PUBLIC_API cblas_drotmg(double* sd1, - double* sd2, - double* sx1, - const double sy1, - double* sparam); - -// Apply givens plane rotation: SROT/DROT -void PUBLIC_API cblas_srot(const int n, - float* x, const int x_inc, - float* y, const int y_inc, - const float cos, - const float sin); -void PUBLIC_API cblas_drot(const int n, - double* x, const int x_inc, - double* y, const int y_inc, - const double cos, - const double sin); - -// Apply modified givens plane rotation: SROTM/DROTM -void PUBLIC_API cblas_srotm(const int n, - float* x, const int x_inc, - float* y, const int y_inc, - float* sparam); -void PUBLIC_API cblas_drotm(const int n, - double* x, const int x_inc, - double* y, const int y_inc, - double* sparam); - -// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP -void PUBLIC_API cblas_sswap(const int n, - float* x, const int x_inc, - float* y, const int y_inc); -void PUBLIC_API cblas_dswap(const int n, - double* x, const int x_inc, - double* y, const int y_inc); -void PUBLIC_API cblas_cswap(const int n, - void* x, const int x_inc, - void* y, const int y_inc); -void PUBLIC_API cblas_zswap(const int n, - void* x, const int x_inc, - void* y, const int y_inc); - -// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL -void PUBLIC_API cblas_sscal(const int n, - const float alpha, - float* x, const int x_inc); -void PUBLIC_API cblas_dscal(const int n, - const double alpha, - double* x, const int x_inc); -void PUBLIC_API cblas_cscal(const int n, - const void* alpha, - void* x, const int x_inc); -void PUBLIC_API cblas_zscal(const int n, - const void* alpha, - void* x, const int x_inc); - -// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY -void PUBLIC_API cblas_scopy(const int n, - const float* x, const int x_inc, - float* y, const int y_inc); -void PUBLIC_API cblas_dcopy(const int n, - const double* x, const int x_inc, - double* y, const int y_inc); -void PUBLIC_API cblas_ccopy(const int n, - const void* x, const int x_inc, - void* y, const int y_inc); -void PUBLIC_API cblas_zcopy(const int n, - const void* x, const int x_inc, - void* y, const int y_inc); - -// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY -void PUBLIC_API cblas_saxpy(const int n, - const float alpha, - const float* x, const int x_inc, - float* y, const int y_inc); -void PUBLIC_API cblas_daxpy(const int n, - const double alpha, - const double* x, const int x_inc, - double* y, const int y_inc); -void PUBLIC_API cblas_caxpy(const int n, - const void* alpha, - const void* x, const int x_inc, - void* y, const int y_inc); -void PUBLIC_API cblas_zaxpy(const int n, - const void* alpha, - const void* x, const int x_inc, - void* y, const int y_inc); - -// Dot product of two vectors: SDOT/DDOT/HDOT -float PUBLIC_API cblas_sdot(const int n, - const float* x, const int x_inc, - const float* y, const int y_inc); -double PUBLIC_API cblas_ddot(const int n, - const double* x, const int x_inc, - const double* y, const int y_inc); - -// Dot product of two complex vectors: CDOTU/ZDOTU -void PUBLIC_API cblas_cdotu_sub(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* dot); -void PUBLIC_API cblas_zdotu_sub(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* dot); - -// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC -void PUBLIC_API cblas_cdotc_sub(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* dot); -void PUBLIC_API cblas_zdotc_sub(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* dot); - -// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 -float PUBLIC_API cblas_snrm2(const int n, - const float* x, const int x_inc); -double PUBLIC_API cblas_dnrm2(const int n, - const double* x, const int x_inc); -float PUBLIC_API cblas_scnrm2(const int n, - const void* x, const int x_inc); -double PUBLIC_API cblas_dznrm2(const int n, - const void* x, const int x_inc); - -// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM -float PUBLIC_API cblas_sasum(const int n, - const float* x, const int x_inc); -double PUBLIC_API cblas_dasum(const int n, - const double* x, const int x_inc); -float PUBLIC_API cblas_scasum(const int n, - const void* x, const int x_inc); -double PUBLIC_API cblas_dzasum(const int n, - const void* x, const int x_inc); - -// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM -float PUBLIC_API cblas_ssum(const int n, - const float* x, const int x_inc); -double PUBLIC_API cblas_dsum(const int n, - const double* x, const int x_inc); -float PUBLIC_API cblas_scsum(const int n, - const void* x, const int x_inc); -double PUBLIC_API cblas_dzsum(const int n, - const void* x, const int x_inc); - -// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX -int PUBLIC_API cblas_isamax(const int n, - const float* x, const int x_inc); -int PUBLIC_API cblas_idamax(const int n, - const double* x, const int x_inc); -int PUBLIC_API cblas_icamax(const int n, - const void* x, const int x_inc); -int PUBLIC_API cblas_izamax(const int n, - const void* x, const int x_inc); - -// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN -int PUBLIC_API cblas_isamin(const int n, - const float* x, const int x_inc); -int PUBLIC_API cblas_idamin(const int n, - const double* x, const int x_inc); -int PUBLIC_API cblas_icamin(const int n, - const void* x, const int x_inc); -int PUBLIC_API cblas_izamin(const int n, - const void* x, const int x_inc); - -// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX -int PUBLIC_API cblas_ismax(const int n, - const float* x, const int x_inc); -int PUBLIC_API cblas_idmax(const int n, - const double* x, const int x_inc); -int PUBLIC_API cblas_icmax(const int n, - const void* x, const int x_inc); -int PUBLIC_API cblas_izmax(const int n, - const void* x, const int x_inc); - -// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN -int PUBLIC_API cblas_ismin(const int n, - const float* x, const int x_inc); -int PUBLIC_API cblas_idmin(const int n, - const double* x, const int x_inc); -int PUBLIC_API cblas_icmin(const int n, - const void* x, const int x_inc); -int PUBLIC_API cblas_izmin(const int n, - const void* x, const int x_inc); - -// ================================================================================================= -// BLAS level-2 (matrix-vector) routines -// ================================================================================================= - -// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV -void PUBLIC_API cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const float alpha, - const float* a, const int a_ld, - const float* x, const int x_inc, - const float beta, - float* y, const int y_inc); -void PUBLIC_API cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const double alpha, - const double* a, const int a_ld, - const double* x, const int x_inc, - const double beta, - double* y, const int y_inc); -void PUBLIC_API cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); -void PUBLIC_API cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); - -// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV -void PUBLIC_API cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, const int kl, const int ku, - const float alpha, - const float* a, const int a_ld, - const float* x, const int x_inc, - const float beta, - float* y, const int y_inc); -void PUBLIC_API cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, const int kl, const int ku, - const double alpha, - const double* a, const int a_ld, - const double* x, const int x_inc, - const double beta, - double* y, const int y_inc); -void PUBLIC_API cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, const int kl, const int ku, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); -void PUBLIC_API cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, const int kl, const int ku, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); - -// Hermitian matrix-vector multiplication: CHEMV/ZHEMV -void PUBLIC_API cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); -void PUBLIC_API cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); - -// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV -void PUBLIC_API cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); -void PUBLIC_API cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); - -// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV -void PUBLIC_API cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* ap, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); -void PUBLIC_API cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* ap, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); - -// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV -void PUBLIC_API cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* a, const int a_ld, - const float* x, const int x_inc, - const float beta, - float* y, const int y_inc); -void PUBLIC_API cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* a, const int a_ld, - const double* x, const int x_inc, - const double beta, - double* y, const int y_inc); - -// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV -void PUBLIC_API cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, const int k, - const float alpha, - const float* a, const int a_ld, - const float* x, const int x_inc, - const float beta, - float* y, const int y_inc); -void PUBLIC_API cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, const int k, - const double alpha, - const double* a, const int a_ld, - const double* x, const int x_inc, - const double beta, - double* y, const int y_inc); - -// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV -void PUBLIC_API cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* ap, - const float* x, const int x_inc, - const float beta, - float* y, const int y_inc); -void PUBLIC_API cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* ap, - const double* x, const int x_inc, - const double beta, - double* y, const int y_inc); - -// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV -void PUBLIC_API cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const float* a, const int a_ld, - float* x, const int x_inc); -void PUBLIC_API cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const double* a, const int a_ld, - double* x, const int x_inc); -void PUBLIC_API cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* a, const int a_ld, - void* x, const int x_inc); -void PUBLIC_API cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* a, const int a_ld, - void* x, const int x_inc); - -// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV -void PUBLIC_API cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const float* a, const int a_ld, - float* x, const int x_inc); -void PUBLIC_API cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const double* a, const int a_ld, - double* x, const int x_inc); -void PUBLIC_API cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const void* a, const int a_ld, - void* x, const int x_inc); -void PUBLIC_API cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const void* a, const int a_ld, - void* x, const int x_inc); - -// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV -void PUBLIC_API cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const float* ap, - float* x, const int x_inc); -void PUBLIC_API cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const double* ap, - double* x, const int x_inc); -void PUBLIC_API cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* ap, - void* x, const int x_inc); -void PUBLIC_API cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* ap, - void* x, const int x_inc); - -// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV -void PUBLIC_API cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const float* a, const int a_ld, - float* x, const int x_inc); -void PUBLIC_API cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const double* a, const int a_ld, - double* x, const int x_inc); -void PUBLIC_API cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* a, const int a_ld, - void* x, const int x_inc); -void PUBLIC_API cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* a, const int a_ld, - void* x, const int x_inc); - -// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV -void PUBLIC_API cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const float* a, const int a_ld, - float* x, const int x_inc); -void PUBLIC_API cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const double* a, const int a_ld, - double* x, const int x_inc); -void PUBLIC_API cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const void* a, const int a_ld, - void* x, const int x_inc); -void PUBLIC_API cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const void* a, const int a_ld, - void* x, const int x_inc); - -// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV -void PUBLIC_API cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const float* ap, - float* x, const int x_inc); -void PUBLIC_API cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const double* ap, - double* x, const int x_inc); -void PUBLIC_API cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* ap, - void* x, const int x_inc); -void PUBLIC_API cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* ap, - void* x, const int x_inc); - -// General rank-1 matrix update: SGER/DGER/HGER -void PUBLIC_API cblas_sger(const CLBlastLayout layout, - const int m, const int n, - const float alpha, - const float* x, const int x_inc, - const float* y, const int y_inc, - float* a, const int a_ld); -void PUBLIC_API cblas_dger(const CLBlastLayout layout, - const int m, const int n, - const double alpha, - const double* x, const int x_inc, - const double* y, const int y_inc, - double* a, const int a_ld); - -// General rank-1 complex matrix update: CGERU/ZGERU -void PUBLIC_API cblas_cgeru(const CLBlastLayout layout, - const int m, const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); -void PUBLIC_API cblas_zgeru(const CLBlastLayout layout, - const int m, const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); - -// General rank-1 complex conjugated matrix update: CGERC/ZGERC -void PUBLIC_API cblas_cgerc(const CLBlastLayout layout, - const int m, const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); -void PUBLIC_API cblas_zgerc(const CLBlastLayout layout, - const int m, const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); - -// Hermitian rank-1 matrix update: CHER/ZHER -void PUBLIC_API cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const void* x, const int x_inc, - void* a, const int a_ld); -void PUBLIC_API cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const void* x, const int x_inc, - void* a, const int a_ld); - -// Hermitian packed rank-1 matrix update: CHPR/ZHPR -void PUBLIC_API cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const void* x, const int x_inc, - void* ap); -void PUBLIC_API cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const void* x, const int x_inc, - void* ap); - -// Hermitian rank-2 matrix update: CHER2/ZHER2 -void PUBLIC_API cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); -void PUBLIC_API cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); - -// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 -void PUBLIC_API cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* ap); -void PUBLIC_API cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* ap); - -// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR -void PUBLIC_API cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* x, const int x_inc, - float* a, const int a_ld); -void PUBLIC_API cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* x, const int x_inc, - double* a, const int a_ld); - -// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR -void PUBLIC_API cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* x, const int x_inc, - float* ap); -void PUBLIC_API cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* x, const int x_inc, - double* ap); - -// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 -void PUBLIC_API cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* x, const int x_inc, - const float* y, const int y_inc, - float* a, const int a_ld); -void PUBLIC_API cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* x, const int x_inc, - const double* y, const int y_inc, - double* a, const int a_ld); - -// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 -void PUBLIC_API cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* x, const int x_inc, - const float* y, const int y_inc, - float* ap); -void PUBLIC_API cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* x, const int x_inc, - const double* y, const int y_inc, - double* ap); - -// ================================================================================================= -// BLAS level-3 (matrix-matrix) routines -// ================================================================================================= - -// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM -void PUBLIC_API cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const int m, const int n, const int k, - const float alpha, - const float* a, const int a_ld, - const float* b, const int b_ld, - const float beta, - float* c, const int c_ld); -void PUBLIC_API cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const int m, const int n, const int k, - const double alpha, - const double* a, const int a_ld, - const double* b, const int b_ld, - const double beta, - double* c, const int c_ld); -void PUBLIC_API cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const int m, const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); -void PUBLIC_API cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const int m, const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); - -// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM -void PUBLIC_API cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const float alpha, - const float* a, const int a_ld, - const float* b, const int b_ld, - const float beta, - float* c, const int c_ld); -void PUBLIC_API cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const double alpha, - const double* a, const int a_ld, - const double* b, const int b_ld, - const double beta, - double* c, const int c_ld); -void PUBLIC_API cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); -void PUBLIC_API cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); - -// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM -void PUBLIC_API cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); -void PUBLIC_API cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); - -// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK -void PUBLIC_API cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const float alpha, - const float* a, const int a_ld, - const float beta, - float* c, const int c_ld); -void PUBLIC_API cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const double alpha, - const double* a, const int a_ld, - const double beta, - double* c, const int c_ld); -void PUBLIC_API cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* beta, - void* c, const int c_ld); -void PUBLIC_API cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* beta, - void* c, const int c_ld); - -// Rank-K update of a hermitian matrix: CHERK/ZHERK -void PUBLIC_API cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const float alpha, - const void* a, const int a_ld, - const float beta, - void* c, const int c_ld); -void PUBLIC_API cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const double alpha, - const void* a, const int a_ld, - const double beta, - void* c, const int c_ld); - -// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K -void PUBLIC_API cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const float alpha, - const float* a, const int a_ld, - const float* b, const int b_ld, - const float beta, - float* c, const int c_ld); -void PUBLIC_API cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const double alpha, - const double* a, const int a_ld, - const double* b, const int b_ld, - const double beta, - double* c, const int c_ld); -void PUBLIC_API cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); -void PUBLIC_API cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); - -// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K -void PUBLIC_API cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const float beta, - void* c, const int c_ld); -void PUBLIC_API cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const double beta, - void* c, const int c_ld); - -// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM -void PUBLIC_API cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const float alpha, - const float* a, const int a_ld, - float* b, const int b_ld); -void PUBLIC_API cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const double alpha, - const double* a, const int a_ld, - double* b, const int b_ld); -void PUBLIC_API cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); -void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); - -// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM -void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const float alpha, - const float* a, const int a_ld, - float* b, const int b_ld); -void PUBLIC_API cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const double alpha, - const double* a, const int a_ld, - double* b, const int b_ld); -void PUBLIC_API cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); -void PUBLIC_API cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); - -// ================================================================================================= -// Extra non-BLAS routines (level-X) -// ================================================================================================= - -// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD -void PUBLIC_API cblas_shad(const int n, - const float alpha, - const float* x, const int x_inc, - const float* y, const int y_inc, - const float beta, - float* z, const int z_inc); -void PUBLIC_API cblas_dhad(const int n, - const double alpha, - const double* x, const int x_inc, - const double* y, const int y_inc, - const double beta, - double* z, const int z_inc); -void PUBLIC_API cblas_chad(const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - const void* beta, - void* z, const int z_inc); -void PUBLIC_API cblas_zhad(const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - const void* beta, - void* z, const int z_inc); - -// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY -void PUBLIC_API cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const float alpha, - const float* a, const int a_ld, - float* b, const int b_ld); -void PUBLIC_API cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const double alpha, - const double* a, const int a_ld, - double* b, const int b_ld); -void PUBLIC_API cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); -void PUBLIC_API cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); - -// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL -void PUBLIC_API cblas_sim2col(const CLBlastKernelMode kernel_mode, - const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, - const float* im, - float* col); -void PUBLIC_API cblas_dim2col(const CLBlastKernelMode kernel_mode, - const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, - const double* im, - double* col); -void PUBLIC_API cblas_cim2col(const CLBlastKernelMode kernel_mode, - const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, - const void* im, - void* col); -void PUBLIC_API cblas_zim2col(const CLBlastKernelMode kernel_mode, - const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, - const void* im, - void* col); - -// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM -void PUBLIC_API cblas_scol2im(const CLBlastKernelMode kernel_mode, - const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, - const float* col, - float* im); -void PUBLIC_API cblas_dcol2im(const CLBlastKernelMode kernel_mode, - const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, - const double* col, - double* im); -void PUBLIC_API cblas_ccol2im(const CLBlastKernelMode kernel_mode, - const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, - const void* col, - void* im); -void PUBLIC_API cblas_zcol2im(const CLBlastKernelMode kernel_mode, - const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, - const void* col, - void* im); - -// ================================================================================================= - -#ifdef __cplusplus -} // extern "C" -#endif - -// CLBLAST_CLBLAST_NETLIB_C_H_ -#endif diff --git a/include/openblas_config.h b/include/openblas_config.h deleted file mode 100644 index 1a783e1c0..000000000 --- a/include/openblas_config.h +++ /dev/null @@ -1,133 +0,0 @@ -#ifndef OPENBLAS_CONFIG_H -#define OPENBLAS_CONFIG_H -#define OPENBLAS_OS_WINNT 1 -#define OPENBLAS_ARCH_X86_64 1 -#define OPENBLAS_C_GCC 1 -#define OPENBLAS___64BIT__ 1 -#define OPENBLAS_HAVE_C11 1 -#define OPENBLAS_PTHREAD_CREATE_FUNC pthread_create -#define OPENBLAS_BUNDERSCORE _ -#define OPENBLAS_NEEDBUNDERSCORE 1 -#define OPENBLAS_GENERIC -#define OPENBLAS_L1_DATA_SIZE 32768 -#define OPENBLAS_L1_DATA_LINESIZE 128 -#define OPENBLAS_L2_SIZE 512488 -#define OPENBLAS_L2_LINESIZE 128 -#define OPENBLAS_DTB_DEFAULT_ENTRIES 128 -#define OPENBLAS_DTB_SIZE 4096 -#define OPENBLAS_L2_ASSOCIATIVE 8 -#define OPENBLAS_CORE_generic -#define OPENBLAS_CHAR_CORENAME "generic" -#define OPENBLAS_SLOCAL_BUFFER_SIZE 4096 -#define OPENBLAS_DLOCAL_BUFFER_SIZE 4096 -#define OPENBLAS_CLOCAL_BUFFER_SIZE 8192 -#define OPENBLAS_ZLOCAL_BUFFER_SIZE 8192 -#define OPENBLAS_GEMM_MULTITHREAD_THRESHOLD 4 -#define OPENBLAS_VERSION " OpenBLAS 0.3.22 " -/*This is only for "make install" target.*/ - -#if defined(OPENBLAS_OS_WINNT) || defined(OPENBLAS_OS_CYGWIN_NT) || defined(OPENBLAS_OS_INTERIX) -#define OPENBLAS_WINDOWS_ABI -#define OPENBLAS_OS_WINDOWS - -#ifdef DOUBLE -#define DOUBLE_DEFINED DOUBLE -#undef DOUBLE -#endif -#endif - -#ifdef OPENBLAS_NEEDBUNDERSCORE -#define BLASFUNC(FUNC) FUNC##_ -#else -#define BLASFUNC(FUNC) FUNC -#endif - -#ifdef OPENBLAS_QUAD_PRECISION -typedef struct { - unsigned long x[2]; -} xdouble; -#elif defined OPENBLAS_EXPRECISION -#define xdouble long double -#else -#define xdouble double -#endif - -#if defined(OPENBLAS_OS_WINDOWS) && defined(OPENBLAS___64BIT__) -typedef long long BLASLONG; -typedef unsigned long long BLASULONG; -#else -typedef long BLASLONG; -typedef unsigned long BLASULONG; -#endif - -#ifndef BFLOAT16 -#include -typedef uint16_t bfloat16; -#endif - -#ifdef OPENBLAS_USE64BITINT -typedef BLASLONG blasint; -#else -typedef int blasint; -#endif - -#if defined(XDOUBLE) || defined(DOUBLE) -#define FLOATRET FLOAT -#else -#ifdef NEED_F2CCONV -#define FLOATRET double -#else -#define FLOATRET float -#endif -#endif - -/* Inclusion of a standard header file is needed for definition of __STDC_* - predefined macros with some compilers (e.g. GCC 4.7 on Linux). This occurs - as a side effect of including either or . */ -#include - -/* C99 supports complex floating numbers natively, which GCC also offers as an - extension since version 3.0. If neither are available, use a compatible - structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ -#if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ - (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) && !defined(_MSC_VER) - #define OPENBLAS_COMPLEX_C99 -#ifndef __cplusplus - #include -#endif - typedef float _Complex openblas_complex_float; - typedef double _Complex openblas_complex_double; - typedef xdouble _Complex openblas_complex_xdouble; - #define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I)) - #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I)) - #define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I)) - #define openblas_complex_float_real(z) (creal(z)) - #define openblas_complex_float_imag(z) (cimag(z)) - #define openblas_complex_double_real(z) (creal(z)) - #define openblas_complex_double_imag(z) (cimag(z)) - #define openblas_complex_xdouble_real(z) (creal(z)) - #define openblas_complex_xdouble_imag(z) (cimag(z)) -#else - #define OPENBLAS_COMPLEX_STRUCT - typedef struct { float real, imag; } openblas_complex_float; - typedef struct { double real, imag; } openblas_complex_double; - typedef struct { xdouble real, imag; } openblas_complex_xdouble; - #define openblas_make_complex_float(real, imag) {(real), (imag)} - #define openblas_make_complex_double(real, imag) {(real), (imag)} - #define openblas_make_complex_xdouble(real, imag) {(real), (imag)} - #define openblas_complex_float_real(z) ((z).real) - #define openblas_complex_float_imag(z) ((z).imag) - #define openblas_complex_double_real(z) ((z).real) - #define openblas_complex_double_imag(z) ((z).imag) - #define openblas_complex_xdouble_real(z) ((z).real) - #define openblas_complex_xdouble_imag(z) ((z).imag) -#endif - -/* Inclusion of Linux-specific header is needed for definition of cpu_set_t. */ -#ifdef OPENBLAS_OS_LINUX -#ifndef _GNU_SOURCE - #define _GNU_SOURCE -#endif -#include -#endif -#endif /* OPENBLAS_CONFIG_H */ diff --git a/lib/OpenCL.lib b/lib/OpenCL.lib deleted file mode 100644 index 60ab6e0b0..000000000 Binary files a/lib/OpenCL.lib and /dev/null differ diff --git a/lib/clblast.lib b/lib/clblast.lib deleted file mode 100644 index c913737ab..000000000 Binary files a/lib/clblast.lib and /dev/null differ diff --git a/otherarch/ggml_v2-opencl-legacy.c b/otherarch/ggml_v2-opencl-legacy.c deleted file mode 100644 index 50ade6de3..000000000 --- a/otherarch/ggml_v2-opencl-legacy.c +++ /dev/null @@ -1,427 +0,0 @@ -#include "ggml_v2-opencl-legacy.h" - -#define CL_TARGET_OPENCL_VERSION 110 -#include - -#include -#include -#include - -#include "ggml_v2.h" - -#define MULTILINE_QUOTE(...) #__VA_ARGS__ -const char * clblast_dequant_legacy = MULTILINE_QUOTE( - -struct block_q4_0 -{ - float d; - uchar qs[16]; -}; - -__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - const float d = blocks[i].d; - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*32 + l*2; - result[index + 0] = ((vi & 0xf) - 8)*d; - result[index + 1] = ((vi >> 4) - 8)*d; -} - -struct block_q4_1 -{ - float d; - float m; - uchar qs[16]; -}; - -__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - const float d = blocks[i].d; - const float m = blocks[i].m; - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*32 + l*2; - result[index + 0] = (vi & 0xf) * d + m; - result[index + 1] = (vi >> 4) * d + m; -} - -struct block_q4_2 -{ - ushort d; - uchar qs[8]; -}; - -__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) { - const uint i = get_global_id(0) / 16; - const uint l = get_local_id(0); - - const float d = vload_half(0, (__global half*) &blocks[i].d); - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*16 + l*2; - result[index + 0] = ((vi & 0xf) - 8)*d; - result[index + 1] = ((vi >> 4) - 8)*d; -} - -struct block_q4_3 -{ - ushort d; - ushort m; - uchar qs[8]; -}; - -__kernel void dequantize_row_q4_3(__global struct block_q4_3* blocks, __global float* result) { - const uint i = get_global_id(0) / 16; - const uint l = get_local_id(0); - - const float d = vload_half(0, (__global half*) &(blocks[i].d)); - const float m = vload_half(0, (__global half*) &(blocks[i].m)); - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*16 + l*2; - result[index + 0] = (vi & 0xf) * d + m; - result[index + 1] = (vi >> 4) * d + m; -} - -struct block_q5_0 -{ - float d; - uint qh; - uchar qs[16]; -}; - -__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - const float d = blocks[i].d; - - const uchar vi = blocks[i].qs[l]; - - const uint l2 = l * 2; - - const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4; - const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4; - - const uint index = i*32 + l2; - result[index + 0] = (((vi & 0xf) | vh0) - 16)*d; - result[index + 1] = (((vi >> 4) | vh1) - 16)*d; -} - -struct block_q5_1 -{ - ushort d; - ushort m; - uint qh; - uchar qs[16]; -}; - -__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - const float d = vload_half(0, (__global half*) &blocks[i].d); - const float m = vload_half(0, (__global half*) &blocks[i].m); - - const uchar vi = blocks[i].qs[l]; - - const uint l2 = l * 2; - - const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4; - const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4; - - const uint index = i*32 + l2; - result[index + 0] = ((vi & 0xf) | vh0)*d + m; - result[index + 1] = ((vi >> 4) | vh1)*d + m; -} - -struct block_q8_0 -{ - float d; - char qs[32]; -}; - -__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - result[i*32 + l] = blocks[i].qs[l] * blocks[i].d; -} - -); - -#define CL_CHECK(err, name) \ - do { \ - cl_int err_ = (err); \ - if (err_ != CL_SUCCESS) { \ - fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \ - fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n"); \ - exit(1); \ - } \ - } while (0) - -#define QK5_0 32 -typedef struct { - ggml_v2_fp16_t d; // delta - uint8_t qh[4]; // 5-th bit of quants - uint8_t qs[QK5_0 / 2]; // nibbles / quants -} block_q5_0; - - -typedef struct { - float d; // delta - uint32_t qh; // 5-th bit of quants - uint8_t qs[QK5_0 / 2]; // nibbles / quants -} cl_block_q5_0; - -static cl_platform_id platform; -static cl_device_id device; -static cl_context context; -static cl_command_queue queue; -static cl_program program; -static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q4_3, kernel_q5_0, kernel_q5_1, kernel_q8_0; -static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c; -static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0; - -static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { - cl_program p; - char *program_log; - size_t program_size, log_size; - int err; - - program_size = strlen(program_buffer); - - p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); - if(err < 0) { - fprintf(stderr, "OpenCL error creating program"); - exit(1); - } - - err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL); - if(err < 0) { - - clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - program_log = (char*) malloc(log_size + 1); - program_log[log_size] = '\0'; - clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); - printf("%s\n", program_log); - free(program_log); - exit(1); - } - - return p; -} - -void ggml_v2_cl_init_legacy(void) { - cl_int err = 0; - char * GGML_V2_CLBLAST_PLATFORM = getenv("GGML_OPENCL_PLATFORM"); - char * GGML_V2_CLBLAST_DEVICE = getenv("GGML_OPENCL_DEVICE"); - int plat_num = (GGML_V2_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_V2_CLBLAST_PLATFORM)); - int dev_num = (GGML_V2_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_V2_CLBLAST_DEVICE)); - printf("\nInitializing LEGACY CLBlast (First Run)..."); - printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num); - cl_uint num_platforms; - clGetPlatformIDs(0, NULL, &num_platforms); - cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); - clGetPlatformIDs(num_platforms, platforms, NULL); - platform = platforms[plat_num]; - char platform_buffer[1024]; - clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL); - cl_uint num_devices; - clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); - cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); - clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); - device = devices[dev_num]; - char device_buffer[1024]; - clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL); - printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer); - context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); - CL_CHECK(err, "clCreateContext"); - queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err); - CL_CHECK(err, "clCreateCommandQueue"); - - free(platforms); - free(devices); - - program = build_program_from_source(context, device, clblast_dequant_legacy); - - // Prepare dequantize kernels - kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err); - CL_CHECK(err, "clCreateKernel"); - kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err); - CL_CHECK(err, "clCreateKernel"); - kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err); - CL_CHECK(err, "clCreateKernel"); - kernel_q4_3 = clCreateKernel(program, "dequantize_row_q4_3", &err); - CL_CHECK(err, "clCreateKernel"); - kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err); - CL_CHECK(err, "clCreateKernel"); - kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err); - CL_CHECK(err, "clCreateKernel"); - kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err); - CL_CHECK(err, "clCreateKernel"); -} - -static void ggml_v2_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) { - if (req_size <= *cur_size) { - return; - } - - // Reallocate buffer with enough space - if (*cur_size > 0) { - clReleaseMemObject(*buf); - } - cl_int err; - *buf = clCreateBuffer(context, flags, req_size, NULL, &err); - *cur_size = req_size; - CL_CHECK(err, "clCreateBuffer"); -} - -void ggml_v2_cl_sgemm_wrapper_legacy( - const enum ggml_v2_blas_order order, const enum ggml_v2_blas_op trans_a, const enum ggml_v2_blas_op trans_b, - const int m, const int n, const int k, - const float alpha, const void *host_a, const int lda, - const float *host_b, const int ldb, const float beta, - float *host_c, const int ldc, const int btype) { - cl_int err = 0; - - cl_kernel kernel; - size_t global = n * k, local, size_qb; - bool dequant; - cl_block_q5_0* cl_host_b; - - switch (btype) { - case GGML_V2_TYPE_F32: - dequant = false; - break; - case GGML_V2_TYPE_Q4_0: - dequant = true; - kernel = kernel_q4_0; - local = 16; - size_qb = global * (sizeof(float) + local) / 32; - break; - case GGML_V2_TYPE_Q4_1: - dequant = true; - kernel = kernel_q4_1; - local = 16; - size_qb = global * (sizeof(float) * 2 + local) / 32; - break; - case GGML_V2_TYPE_Q4_2: - dequant = true; - kernel = kernel_q4_2; - local = 8; - size_qb = global * (sizeof(ggml_v2_fp16_t) + local) / 16; - break; - case GGML_V2_TYPE_Q4_3: - dequant = true; - kernel = kernel_q4_3; - local = 8; - size_qb = global * (sizeof(short) * 2 + local) / 16; - break; - case GGML_V2_TYPE_Q5_0: - dequant = true; - kernel = kernel_q5_0; - local = 16; - // For some reason OpenCL seems to be incapable of working with structs of size 22. - // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU... - // TODO Find the reason, fix and remove workaround. - const block_q5_0* b = (const block_q5_0*) host_b; - cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32); - for (size_t i = 0; i < global / 32; i++) { - cl_host_b[i].d = ggml_v2_fp16_to_fp32(b[i].d); - memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t)); - memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2); - } - host_b = (const float*) cl_host_b; - size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32; - break; - case GGML_V2_TYPE_Q5_1: - dequant = true; - kernel = kernel_q5_1; - local = 16; - size_qb = global * (sizeof(ggml_v2_fp16_t) * 2 + sizeof(uint32_t) + local) / 32; - break; - case GGML_V2_TYPE_Q8_0: - dequant = true; - kernel = kernel_q8_0; - local = 32; - size_qb = global * (sizeof(float) + local) / 32; - break; - default: - fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype); - abort(); - } - - const size_t size_a = m * k * sizeof(float); - const size_t size_b = n * k * sizeof(float); - const size_t size_c = m * n * sizeof(float); - - // Prepare buffers - ggml_v2_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a); - if (dequant) { - ggml_v2_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb); - } - ggml_v2_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b); - ggml_v2_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c); - - cl_event ev_a, ev_qb, ev_b; - - if (dequant) { - err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb); - err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b); - CL_CHECK(err, "clSetKernelArg"); - err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb); - CL_CHECK(err, "clEnqueueWriteBuffer qb"); - } else { - err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b); - CL_CHECK(err, "clEnqueueWriteBuffer b"); - } - - err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a); - CL_CHECK(err, "clEnqueueWriteBuffer a"); - if (dequant) { - err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b); - CL_CHECK(err, "clEnqueueNDRangeKernel"); - clReleaseEvent(ev_qb); - } - clWaitForEvents(1, &ev_a); - clWaitForEvents(1, &ev_b); - clReleaseEvent(ev_a); - clReleaseEvent(ev_b); - - cl_event ev_sgemm; - CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order, - (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b, - m, n, k, - alpha, - cl_buffer_a, 0, lda, - cl_buffer_b, 0, ldb, - beta, - cl_buffer_c, 0, ldc, - &queue, &ev_sgemm); - - if (status != CLBlastSuccess) { - fprintf(stderr, "Error: CLBlast SGEMM %d\n", status); - abort(); - } - - cl_event ev_c; - clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c); - - // Wait for completion - clWaitForEvents(1, &ev_c); - clReleaseEvent(ev_sgemm); - clReleaseEvent(ev_c); - if (btype == GGML_V2_TYPE_Q5_0) { - free((void*) cl_host_b); - } -} diff --git a/otherarch/ggml_v2-opencl-legacy.h b/otherarch/ggml_v2-opencl-legacy.h deleted file mode 100644 index bcfe670c9..000000000 --- a/otherarch/ggml_v2-opencl-legacy.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -#include "ggml_v2-opencl.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void ggml_v2_cl_init_legacy(void); - -void ggml_v2_cl_sgemm_wrapper_legacy(const enum ggml_v2_blas_order order, const enum ggml_v2_blas_op trans_a, const enum ggml_v2_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype); - -#ifdef __cplusplus -} -#endif diff --git a/otherarch/ggml_v2-opencl.cpp b/otherarch/ggml_v2-opencl.cpp deleted file mode 100644 index 660d93baa..000000000 --- a/otherarch/ggml_v2-opencl.cpp +++ /dev/null @@ -1,1005 +0,0 @@ -#include "ggml_v2-opencl.h" - -#include -#include -#include - -#define CL_TARGET_OPENCL_VERSION 110 -#include -#include - -#include -#include -#include - -#include "ggml_v2.h" - -#define CL_DMMV_BLOCK_SIZE 32; - -#define MULTILINE_QUOTE(...) #__VA_ARGS__ -static std::string program_source = MULTILINE_QUOTE( - -typedef char int8_t; -typedef uchar uint8_t; -typedef int int32_t; -typedef uint uint32_t; - -struct block_q4_0 -{ - float d; - uint8_t qs[16]; -}; - -struct block_q4_1 -{ - float d; - float m; - uint8_t qs[16]; -}; - -struct __attribute__ ((packed)) block_q5_0 -{ - half d; - uint32_t qh; - uint8_t qs[16]; -}; - -struct block_q5_1 -{ - half d; - half m; - uint32_t qh; - uint8_t qs[16]; -}; - -struct block_q8_0 -{ - float d; - uint8_t qs[32]; -}; - - -__kernel void convert_fp16_to_fp32(__global half* x, __global float* y) { - const uint i = get_global_id(0); - - y[i] = vload_half(0, &x[i]); -} - -void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = x[ib].d; - - const uint8_t vui = x[ib].qs[iqs]; - - const int8_t vi0 = vui & 0xF; - const int8_t vi1 = vui >> 4; - - *v0 = (vi0 - 8)*d; - *v1 = (vi1 - 8)*d; -} -void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = x[ib].d; - const float m = x[ib].m; - - const uint8_t vui = x[ib].qs[iqs]; - - const int8_t vi0 = vui & 0xF; - const int8_t vi1 = vui >> 4; - - *v0 = vi0*d + m; - *v1 = vi1*d + m; -} -void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, (__global half*) &x[ib].d); - - uint32_t qh = x[ib].qh; - - const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16; - const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16; - - *v0 = x0*d; - *v1 = x1*d; -} -void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, (__global half*) &x[ib].d); - const float m = vload_half(0, (__global half*) &x[ib].m); - - uint32_t qh = x[ib].qh; - - const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0); - const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1); - - *v0 = x0*d + m; - *v1 = x1*d + m; -} -void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = x[ib].d; - - const int8_t vi0 = x[ib].qs[iqs + 0]; - const int8_t vi1 = x[ib].qs[iqs + 1]; - - *v0 = vi0*d; - *v1 = vi1*d; -} -static void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){ - *v0 = vload_half(0, &x[ib + 0]); - *v1 = vload_half(0, &x[ib + 1]); -} -); - -static std::string dequant_template = MULTILINE_QUOTE( -__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { - const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2; - - if (i >= get_global_size(0)) { - return; - } - - const uint qk = QUANT_K; - const uint qr = QUANT_R; - - const int ib = i/qk; // block index - const int iqs = (i%qk)/qr; // quant index - const int iybs = i - i%qk; // y block start index - const int y_offset = qr == 1 ? 1 : qk/2; - - // dequantize - float v0, v1; - DEQUANT_FUNC(x, ib, iqs, &v0, &v1); - y[iybs + iqs + 0] = v0; - y[iybs + iqs + y_offset] = v1; -} -); - -static std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE( -__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { - const int block_size = get_local_size(0); - const int row = get_global_id(0) / block_size; - const int tid = get_local_id(0); - - const uint qk = QUANT_K; - const uint qr = QUANT_R; - - const int y_offset = qr == 1 ? 1 : qk/2; - - tmp[tid] = 0; - - for (int i = 0; i < ncols/block_size; i += 2) { - const int col = i*block_size + 2*tid; - const int ib = (row*ncols + col)/qk; // block index - const int iqs = (col%qk)/qr; // quant index - const int iybs = col - col%qk; // y block start index - - // dequantize - float v0, v1; - DEQUANT_FUNC(x, ib, iqs, &v0, &v1); - - // matrix multiplication - tmp[tid] += v0 * y[iybs + iqs + 0]; - tmp[tid] += v1 * y[iybs + iqs + y_offset]; - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=block_size/2; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} -); - -static std::array dequant_str_keys = { - "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC" -}; - -static std::array dequant_str_values = { - "dequantize_row_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0", - "dequantize_row_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1", - "dequantize_row_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0", - "dequantize_row_q5_1", "struct block_q5_1", "32", "2", "dequantize_q5_1", - "dequantize_row_q8_0", "struct block_q8_0", "32", "1", "dequantize_q8_0", - "convert_row_f16", "half", "1", "1", "convert_f16" -}; - -static std::array dequant_mul_mat_vec_str_values = { - "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0", - "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1", - "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0", - "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "32", "2", "dequantize_q5_1", - "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "32", "1", "dequantize_q8_0", - "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16" -}; - -static std::string& sreplace2(std::string& s, const std::string& from, const std::string& to) { - size_t pos = 0; - while ((pos = s.find(from, pos)) != std::string::npos) { - s.replace(pos, from.length(), to); - pos += to.length(); - } - return s; -} - -static std::string generate_kernels() { - std::stringstream src; - src << program_source << '\n'; - for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) { - std::string dequant_kernel = dequant_template; - std::string dmmv_kernel = dequant_mul_mat_vec_template; - for (size_t j = 0; j < dequant_str_keys.size(); j++) { - sreplace2(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]); - sreplace2(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]); - } - src << dequant_kernel << '\n'; - src << dmmv_kernel << '\n'; - } - return src.str(); -} - -#define CL_CHECK(err, name) \ - do { \ - cl_int err_ = (err); \ - if (err_ != CL_SUCCESS) { \ - fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \ - fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n"); \ - exit(1); \ - } \ - } while (0) - -static cl_platform_id platform; -static cl_device_id device; -static cl_context context; -static cl_command_queue queue; -static cl_program program; -static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c; -static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0; -static cl_kernel convert_row_f16_cl; -static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; -static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; -static bool fp16_support = false; - -static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { - cl_program p; - char *program_log; - size_t program_size, log_size; - int err; - - program_size = strlen(program_buffer); - - p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); - if(err < 0) { - fprintf(stderr, "OpenCL error creating program"); - exit(1); - } - - err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL); - if(err < 0) { - - clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - program_log = (char*) malloc(log_size + 1); - program_log[log_size] = '\0'; - clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); - printf("%s\n", program_log); - free(program_log); - exit(1); - } - - return p; -} - -void ggml_v2_cl_init(void) { - cl_int err = 0; - char * GGML_V2_CLBLAST_PLATFORM = getenv("GGML_OPENCL_PLATFORM"); - char * GGML_V2_CLBLAST_DEVICE = getenv("GGML_OPENCL_DEVICE"); - int plat_num = (GGML_V2_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_V2_CLBLAST_PLATFORM)); - int dev_num = (GGML_V2_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_V2_CLBLAST_DEVICE)); - printf("\nInitializing LEGACY v2 CLBlast (First Run)..."); - printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num); - cl_uint num_platforms; - clGetPlatformIDs(0, NULL, &num_platforms); - cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); - clGetPlatformIDs(num_platforms, platforms, NULL); - platform = platforms[plat_num]; - char platform_buffer[1024]; - clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL); - cl_uint num_devices; - clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); - cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); - clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); - device = devices[dev_num]; - char device_buffer[1024]; - clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL); - size_t ext_str_size; - clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size); - char* ext_buffer = (char*) malloc(sizeof(char) * ext_str_size); - clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL); - // Check if ext_buffer contains cl_khr_fp16 - for (size_t i = 0; i < ext_str_size - 12; i++) { - if (memcmp(ext_buffer + i, "cl_khr_fp16", 11) == 0) { - fp16_support = true; - break; - } - } - free(ext_buffer); - printf("Using Platform: %s Device: %s FP16: %d\n", platform_buffer, device_buffer, fp16_support); - fp16_support = false; - printf("CL FP16 temporarily disabled pending further optimization.\n"); - context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); - CL_CHECK(err, "clCreateContext"); - queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err); - CL_CHECK(err, "clCreateCommandQueue"); - - free(platforms); - free(devices); - - std::string kernel_src = generate_kernels(); - - program = build_program_from_source(context, device, kernel_src.c_str()); - - // FP16 to FP32 kernel - convert_row_f16_cl = clCreateKernel(program, "convert_row_f16", &err); - CL_CHECK(err, "clCreateKernel"); - - // Dequantize kernels - dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err); - CL_CHECK(err, "clCreateKernel"); - dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err); - CL_CHECK(err, "clCreateKernel"); - dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err); - CL_CHECK(err, "clCreateKernel"); - dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err); - CL_CHECK(err, "clCreateKernel"); - dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err); - CL_CHECK(err, "clCreateKernel"); - - // dequant mul mat kernel - dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err); - CL_CHECK(err, "clCreateKernel"); - dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err); - CL_CHECK(err, "clCreateKernel"); - dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err); - CL_CHECK(err, "clCreateKernel"); - dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err); - CL_CHECK(err, "clCreateKernel"); - dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err); - CL_CHECK(err, "clCreateKernel"); - convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err); - CL_CHECK(err, "clCreateKernel"); -} - -static void ggml_v2_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) { - if (req_size <= *cur_size) { - return; - } - - // Reallocate buffer with enough space - if (*cur_size > 0) { - clReleaseMemObject(*buf); - } - cl_int err; - *buf = clCreateBuffer(context, flags, req_size, NULL, &err); - *cur_size = req_size; - CL_CHECK(err, "clCreateBuffer"); -} - -static cl_kernel* ggml_v2_get_to_fp32_cl(ggml_v2_type type) { - switch (type) { - case GGML_V2_TYPE_Q4_0: - return &dequantize_row_q4_0_cl; - case GGML_V2_TYPE_Q4_1: - return &dequantize_row_q4_1_cl; - case GGML_V2_TYPE_Q5_0: - return &dequantize_row_q5_0_cl; - case GGML_V2_TYPE_Q5_1: - return &dequantize_row_q5_1_cl; - case GGML_V2_TYPE_Q8_0: - return &dequantize_row_q8_0_cl; - case GGML_V2_TYPE_F16: - return &convert_row_f16_cl; - default: - return nullptr; - } -} - -static cl_kernel* ggml_v2_get_dequantize_mul_mat_vec_cl(ggml_v2_type type) { - switch (type) { - case GGML_V2_TYPE_Q4_0: - return &dequantize_mul_mat_vec_q4_0_cl; - case GGML_V2_TYPE_Q4_1: - return &dequantize_mul_mat_vec_q4_1_cl; - case GGML_V2_TYPE_Q5_0: - return &dequantize_mul_mat_vec_q5_0_cl; - case GGML_V2_TYPE_Q5_1: - return &dequantize_mul_mat_vec_q5_1_cl; - case GGML_V2_TYPE_Q8_0: - return &dequantize_mul_mat_vec_q8_0_cl; - case GGML_V2_TYPE_F16: - return &convert_mul_mat_vec_f16_cl; - default: - return nullptr; - } -} - -// buffer pool for cl -#define MAX_CL_BUFFERS 256 - -struct scoped_spin_lock { - std::atomic_flag& lock; - scoped_spin_lock(std::atomic_flag& lock) : lock(lock) { - while (lock.test_and_set(std::memory_order_acquire)) { - ; // spin - } - } - ~scoped_spin_lock() { - lock.clear(std::memory_order_release); - } - scoped_spin_lock(const scoped_spin_lock&) = delete; - scoped_spin_lock& operator=(const scoped_spin_lock&) = delete; -}; - -struct cl_buffer { - cl_mem mem; - size_t size = 0; -}; - -static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS]; -static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT; - -static cl_mem ggml_v2_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) { - scoped_spin_lock lock(g_cl_pool_lock); - cl_int err; - - for (int i = 0; i < MAX_CL_BUFFERS; ++i) { - cl_buffer& b = g_cl_buffer_pool[i]; - if (b.size > 0 && b.size >= size) { - cl_mem mem = b.mem; - *actual_size = b.size; - b.size = 0; - return mem; - } - } - cl_mem mem = clCreateBuffer(context, flags, size, NULL, &err); - CL_CHECK(err, "clCreateBuffer"); - *actual_size = size; - return mem; -} - -static void ggml_v2_cl_pool_free(cl_mem mem, size_t size) { - scoped_spin_lock lock(g_cl_pool_lock); - - for (int i = 0; i < MAX_CL_BUFFERS; ++i) { - cl_buffer& b = g_cl_buffer_pool[i]; - if (b.size == 0) { - b.mem = mem; - b.size = size; - return; - } - } - fprintf(stderr, "WARNING: cl buffer pool full, increase MAX_CL_BUFFERS\n"); - clReleaseMemObject(mem); -} - -static cl_int ggml_v2_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_v2_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) { - cl_int err; - const uint64_t ne0 = src->ne[0]; - const uint64_t ne1 = src->ne[1]; - const uint64_t nb0 = src->nb[0]; - const uint64_t nb1 = src->nb[1]; - const uint64_t nb2 = src->nb[2]; - const uint64_t nb3 = src->nb[3]; - const enum ggml_v2_type type = src->type; - const size_t ts = ggml_v2_type_size(type); - const size_t bs = ggml_v2_blck_size(type); - - const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3); - if (nb0 == ts && nb1 == ts*ne0/bs) { - err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev); - return err; - } - if (nb0 == ts) { - const size_t buffer_origin[3] = { offset, 0, 0 }; - const size_t host_origin[3] = { 0, 0, 0 }; - const size_t region[3] = { ts*ne0/bs, ne1, 1 }; - err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev); - return err; - } - for (uint64_t i1 = 0; i1 < ne1; i1++) { - // pretend the row is a matrix with cols=1 - const size_t buffer_origin[3] = { offset, i1, 0 }; - const size_t host_origin[3] = { 0, 0, 0 }; - const size_t region[3] = { ts/bs, ne0, 1 }; - err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev); - if (err != CL_SUCCESS) { - break; - } - } - return err; -} - -static void ggml_v2_cl_mul_mat_f32(const ggml_v2_tensor * src0, const ggml_v2_tensor * src1, ggml_v2_tensor * dst) { - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size, y_size, d_size; - cl_mem d_X = ggml_v2_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_ONLY); - cl_mem d_Y = ggml_v2_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY); - cl_mem d_D = ggml_v2_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY); - - cl_int err; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - // copy data to device - err = ggml_v2_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL); - err |= ggml_v2_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL); - CL_CHECK(err, "ggml_v2_cl_h2d_tensor_2d"); - - CL_CHECK(clFinish(queue), "clFinish"); - - // compute - cl_event ev_sgemm; - - clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor, - (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, 0, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, &ev_sgemm); - - if (status != clblast::StatusCode::kSuccess) { - printf("\nF32 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast(status),ne00,ne01,ne10,ne11); - GGML_V2_ASSERT(false); - } - - // copy dst to host - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - err = clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL); - CL_CHECK(err, "clEnqueueReadBuffer"); - } - } - - ggml_v2_cl_pool_free(d_X, x_size); - ggml_v2_cl_pool_free(d_Y, y_size); - ggml_v2_cl_pool_free(d_D, d_size); -} - -static void ggml_v2_cl_mul_mat_f16(const ggml_v2_tensor * src0, const ggml_v2_tensor * src1, ggml_v2_tensor * dst, void * wdata, size_t /* wsize */) { - GGML_V2_ASSERT(fp16_support); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const ggml_v2_fp16_t alpha = ggml_v2_fp32_to_fp16(1.0f); - const ggml_v2_fp16_t beta = ggml_v2_fp32_to_fp16(0.0f); - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size, y_size, d_size; - cl_mem d_X = ggml_v2_cl_pool_malloc(sizeof(ggml_v2_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY); - cl_mem d_Y = ggml_v2_cl_pool_malloc(sizeof(ggml_v2_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY); - cl_mem d_D = ggml_v2_cl_pool_malloc(sizeof(ggml_v2_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY); - - cl_int err; - - bool src1_cont_rows = nb10 == sizeof(float); - bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - // copy src0 to device - err = ggml_v2_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL); - CL_CHECK(err, "ggml_v2_cl_h2d_tensor_2d"); - - // convert src1 to fp16 - // TODO: use multiple threads - ggml_v2_fp16_t * const tmp = (ggml_v2_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02); - char * src1i = (char *) src1->data + i03*nb13 + i02*nb12; - if (src1_cont_rows) { - if (src1_cont_cols) { - ggml_v2_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); - } - else { - for (int64_t i01 = 0; i01 < ne11; i01++) { - ggml_v2_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10); - } - } - } - else { - for (int64_t i01 = 0; i01 < ne11; i01++) { - for (int64_t i00 = 0; i00 < ne10; i00++) { - // very slow due to no inlining - tmp[i01*ne10 + i00] = ggml_v2_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10)); - } - } - } - - // copy src1 to device - err |= clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_v2_fp16_t) * y_ne, tmp, 0, NULL, NULL); - CL_CHECK(err, "ggml_v2_cl_h2d_tensor_2d"); - - CL_CHECK(clFinish(queue), "clFinish"); - - // compute - cl_event ev_sgemm; - clblast::StatusCode status = (clblast::StatusCode)CLBlastHgemm((CLBlastLayout)clblast::Layout::kColMajor, - (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, 0, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, &ev_sgemm); - - if (status != clblast::StatusCode::kSuccess) { - printf("\nF16 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast(status),ne00,ne01,ne10,ne11); - GGML_V2_ASSERT(false); - } - - // copy dst to host, then convert to float - err = clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_v2_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL); - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - ggml_v2_fp16_to_fp32_row(tmp, d, d_ne); - } - } - - ggml_v2_cl_pool_free(d_X, x_size); - ggml_v2_cl_pool_free(d_Y, y_size); - ggml_v2_cl_pool_free(d_D, d_size); -} - -static void ggml_v2_cl_mul_mat_q_f32(const ggml_v2_tensor * src0, const ggml_v2_tensor * src1, ggml_v2_tensor * dst) { - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - const ggml_v2_type type = src0->type; - const bool mul_mat_vec = ne11 == 1; - - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - const size_t q_sz = ggml_v2_type_size(type) * x_ne / ggml_v2_blck_size(type); - - size_t x_size, y_size, d_size, q_size; - cl_mem d_X; - if (!mul_mat_vec) { - d_X = ggml_v2_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE); - } - cl_mem d_Y = ggml_v2_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY); - cl_mem d_D = ggml_v2_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY); - cl_mem d_Q; - if (src0->backend == GGML_V2_BACKEND_CPU) { - d_Q = ggml_v2_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY); - } - - cl_kernel* to_fp32_cl = ggml_v2_get_to_fp32_cl(type); - cl_kernel* dmmv = ggml_v2_get_dequantize_mul_mat_vec_cl(type); - GGML_V2_ASSERT(to_fp32_cl != nullptr); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - cl_event ev_sgemm; - - // copy src0 to device if necessary - if (src0->backend == GGML_V2_BACKEND_CPU) { - CL_CHECK(ggml_v2_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL), "ggml_v2_cl_h2d_tensor_2d"); - } else if (src0->backend == GGML_V2_BACKEND_CL) { - d_Q = *(cl_mem*) src0->data; - } else { - GGML_V2_ASSERT(false); - } - if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel - // copy src1 to device - CL_CHECK(ggml_v2_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_v2_cl_h2d_tensor_2d"); - - // compute - const size_t global = ne01 * CL_DMMV_BLOCK_SIZE; - const size_t local = CL_DMMV_BLOCK_SIZE; - const cl_int ncols = ne00; - CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q), "clSetKernelArg"); - CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL), "clSetKernelArg"); - CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y), "clSetKernelArg"); - CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D), "clSetKernelArg"); - CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols), "clSetKernelArg"); - CL_CHECK(clFinish(queue), "clFinish"); - CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm), "clEnqueueNDRangeKernel"); - } else { // general dequantization kernel + CLBlast matrix matrix multiplication - // convert src0 to fp32 on device - const size_t global = x_ne; - CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q), "clSetKernelArg"); - CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X), "clSetKernelArg"); - CL_CHECK(clFinish(queue), "clFinish"); - CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL), "clEnqueueNDRangeKernel"); - - // copy src1 to device - CL_CHECK(ggml_v2_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_v2_cl_h2d_tensor_2d"); - - // wait for conversion - CL_CHECK(clFinish(queue), "clFinish"); - - // compute - clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor, - (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, 0, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, &ev_sgemm); - - if (status != clblast::StatusCode::kSuccess) { - printf("\nQF32 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast(status),ne00,ne01,ne10,ne11); - GGML_V2_ASSERT(false); - } - } - - // copy dst to host - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL), "clEnqueueReadBuffer"); - clReleaseEvent(ev_sgemm); - } - } - - if (!mul_mat_vec) { - ggml_v2_cl_pool_free(d_X, x_size); - } - ggml_v2_cl_pool_free(d_Y, y_size); - ggml_v2_cl_pool_free(d_D, d_size); - if (src0->backend == GGML_V2_BACKEND_CPU) { - ggml_v2_cl_pool_free(d_Q, q_size); - } -} - - -bool ggml_v2_cl_can_mul_mat(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst) { - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these - if ((src0->type == GGML_V2_TYPE_F32 || src0->type == GGML_V2_TYPE_F16 || ggml_v2_is_quantized(src0->type)) && - src1->type == GGML_V2_TYPE_F32 && - dst->type == GGML_V2_TYPE_F32 && - ((GetQuantsUnshuffled() && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_V2_BACKEND_CL)) { - return true; - } - - return false; -} - -bool ggml_v2_cl_mul_mat_use_f16(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * /* dst */) { - // If device doesn't support FP16 - if (!fp16_support) { - return false; - } - - size_t src0_sz = ggml_v2_nbytes(src0); - size_t src1_sz = ggml_v2_nbytes(src1); - - // mul_mat_q: src0 is converted to fp32 on device - size_t mul_mat_q_transfer = src0_sz + src1_sz; - - // mul_mat_f16: src1 is converted to fp16 on cpu - size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_v2_fp16_t) * ggml_v2_nelements(src1); - - // choose the smaller one to transfer to the device - // TODO: this is not always the best choice due to the overhead of converting to fp16 - return mul_mat_f16_transfer < mul_mat_q_transfer; -} - -void ggml_v2_cl_mul_mat(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst, void * wdata, size_t wsize) { - GGML_V2_ASSERT(ggml_v2_cl_can_mul_mat(src0, src1, dst)); - - if (src0->type == GGML_V2_TYPE_F32) { - ggml_v2_cl_mul_mat_f32(src0, src1, dst); - } - else if (src0->type == GGML_V2_TYPE_F16) { - if (ggml_v2_cl_mul_mat_use_f16(src0, src1, dst)) { - ggml_v2_cl_mul_mat_f16(src0, src1, dst, wdata, wsize); - } - else { - ggml_v2_cl_mul_mat_q_f32(src0, src1, dst); - } - } - else if (ggml_v2_is_quantized(src0->type)) { - ggml_v2_cl_mul_mat_q_f32(src0, src1, dst); - } - else { - GGML_V2_ASSERT(false); - } -} - -size_t ggml_v2_cl_mul_mat_get_wsize(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst) { - if (ggml_v2_cl_mul_mat_use_f16(src0, src1, dst)) { - return ggml_v2_nelements(src1) * sizeof(ggml_v2_fp16_t); - } - return 0; -} - -void ggml_v2_cl_transform_tensor(ggml_v2_tensor * tensor) { - const int64_t ne0 = tensor->ne[0]; - const int64_t ne1 = tensor->ne[1]; - const int64_t ne2 = tensor->ne[2]; - const int64_t ne3 = tensor->ne[3]; - - const ggml_v2_type type = tensor->type; - const size_t q_sz = ggml_v2_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_v2_blck_size(type); - - size_t q_size; - cl_mem* dst = (cl_mem*) malloc(sizeof(cl_mem)); - *dst = ggml_v2_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY); - - // copy tensor to device - for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = 0; i2 < ne2; i2++) { - int i = i3*ne2 + i2; - CL_CHECK(ggml_v2_cl_h2d_tensor_2d(queue, *dst, i*ne0*ne1, tensor, i3, i2, NULL), "ggml_v2_cl_h2d_tensor_2d"); - } - } - - CL_CHECK(clFinish(queue), "clFinish"); - - tensor->data = dst; - tensor->backend = GGML_V2_BACKEND_CL; -} - -void ggml_v2_cl_sgemm_wrapper( - const enum ggml_v2_blas_order order, const enum ggml_v2_blas_op trans_a, const enum ggml_v2_blas_op trans_b, - const int m, const int n, const int k, - const float alpha, const void *host_a, const int lda, - const float *host_b, const int ldb, const float beta, - float *host_c, const int ldc, const int btype) { - cl_int err = 0; - - cl_kernel * kernel = ggml_v2_get_to_fp32_cl((ggml_v2_type)btype); - size_t global = n * k, local, size_qb; - bool dequant; - - switch (btype) { - case GGML_V2_TYPE_F32: - dequant = false; - break; - case GGML_V2_TYPE_Q4_0: - dequant = true; - local = 16; - size_qb = global * (sizeof(float) + local) / 32; - break; - case GGML_V2_TYPE_Q4_1: - dequant = true; - local = 16; - size_qb = global * (sizeof(float) * 2 + local) / 32; - break; - case GGML_V2_TYPE_Q5_0: - dequant = true; - local = 16; - size_qb = global * (sizeof(ggml_v2_fp16_t) + sizeof(uint32_t) + local) / 32; - break; - case GGML_V2_TYPE_Q5_1: - dequant = true; - local = 16; - size_qb = global * (sizeof(ggml_v2_fp16_t) * 2 + sizeof(uint32_t) + local) / 32; - break; - case GGML_V2_TYPE_Q8_0: - dequant = true; - local = 32; - size_qb = global * (sizeof(float) + local) / 32; - break; - default: - fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype); - abort(); - } - - const size_t size_a = m * k * sizeof(float); - const size_t size_b = n * k * sizeof(float); - const size_t size_c = m * n * sizeof(float); - - // Prepare buffers - ggml_v2_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a); - if (dequant) { - ggml_v2_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb); - } - ggml_v2_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b); - ggml_v2_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c); - - cl_event ev_a, ev_qb, ev_b; - - if (dequant) { - err = clSetKernelArg(*kernel, 0, sizeof(cl_mem), &cl_buffer_qb); - err |= clSetKernelArg(*kernel, 1, sizeof(cl_mem), &cl_buffer_b); - CL_CHECK(err, "clSetKernelArg"); - err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb); - CL_CHECK(err, "clEnqueueWriteBuffer qb"); - } else { - err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b); - CL_CHECK(err, "clEnqueueWriteBuffer b"); - } - - err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a); - CL_CHECK(err, "clEnqueueWriteBuffer a"); - if (dequant) { - err = clEnqueueNDRangeKernel(queue, *kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b); - CL_CHECK(err, "clEnqueueNDRangeKernel"); - clReleaseEvent(ev_qb); - } - clWaitForEvents(1, &ev_a); - clWaitForEvents(1, &ev_b); - clReleaseEvent(ev_a); - clReleaseEvent(ev_b); - - cl_event ev_sgemm; - CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order, - (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b, - m, n, k, - alpha, - cl_buffer_a, 0, lda, - cl_buffer_b, 0, ldb, - beta, - cl_buffer_c, 0, ldc, - &queue, &ev_sgemm); - - if (status != CLBlastSuccess) { - fprintf(stderr, "Error: CLBlast SGEMM %d\n", status); - abort(); - } - - cl_event ev_c; - clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c); - - // Wait for completion - clWaitForEvents(1, &ev_c); - clReleaseEvent(ev_sgemm); - clReleaseEvent(ev_c); -} diff --git a/otherarch/ggml_v2-opencl.h b/otherarch/ggml_v2-opencl.h deleted file mode 100644 index c21de9186..000000000 --- a/otherarch/ggml_v2-opencl.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include "ggml_v2.h" - -#ifdef __cplusplus -extern "C" { -#endif - -enum ggml_v2_blas_order { - GGML_V2_BLAS_ORDER_ROW_MAJOR = 101, - GGML_V2_BLAS_ORDER_COLUMN_MAJOR = 102, -}; - -enum ggml_v2_blas_op { - GGML_V2_BLAS_OP_N = 111, - GGML_V2_BLAS_OP_T = 112, - GGML_V2_BLAS_OP_C = 113, -}; - -void ggml_v2_cl_init(void); - -bool ggml_v2_cl_can_mul_mat(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst); -size_t ggml_v2_cl_mul_mat_get_wsize(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst); -void ggml_v2_cl_mul_mat(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst, void * wdata, size_t wsize); - -void * ggml_v2_cl_host_malloc(size_t size); -void ggml_v2_cl_host_free(void * ptr); - -void ggml_v2_cl_transform_tensor(struct ggml_v2_tensor * tensor); - -void ggml_v2_cl_sgemm_wrapper(const enum ggml_v2_blas_order order, const enum ggml_v2_blas_op trans_a, const enum ggml_v2_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype); - -#ifdef __cplusplus -} -#endif diff --git a/otherarch/ggml_v2.c b/otherarch/ggml_v2.c index bcb04b991..bb9a309a6 100644 --- a/otherarch/ggml_v2.c +++ b/otherarch/ggml_v2.c @@ -140,10 +140,6 @@ inline static void* ggml_v2_aligned_malloc(size_t size) { #include "ggml_v2-cuda.h" #include "ggml_v2-cuda-legacy.h" #endif -#if defined(GGML_USE_CLBLAST) -#include "ggml_v2-opencl.h" -#include "ggml_v2-opencl-legacy.h" -#endif #undef MIN #undef MAX @@ -3904,15 +3900,6 @@ struct ggml_v2_context * ggml_v2_init(struct ggml_v2_init_params params) { { ggml_v2_init_cublas_legacy(); } -#elif defined(GGML_USE_CLBLAST) - if(quants_unshuffled) - { - ggml_v2_cl_init(); - } - else - { - ggml_v2_cl_init_legacy(); - } #endif is_first_call = false; diff --git a/otherarch/ggml_v3-opencl.cpp b/otherarch/ggml_v3-opencl.cpp deleted file mode 100644 index ac218bb0e..000000000 --- a/otherarch/ggml_v3-opencl.cpp +++ /dev/null @@ -1,1908 +0,0 @@ -#include "ggml_v3.h" -#include "ggml_v3-opencl.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#define CL_TARGET_OPENCL_VERSION 110 -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -#define CL_DMMV_LOCAL_SIZE 32 - -#ifndef K_QUANTS_PER_ITERATION -#define K_QUANTS_PER_ITERATION 1 -#else -static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); -#endif - -#define MULTILINE_QUOTE(...) #__VA_ARGS__ -static std::string program_source = MULTILINE_QUOTE( - -typedef char int8_t; -typedef uchar uint8_t; -typedef short int16_t; -typedef ushort uint16_t; -typedef int int32_t; -typedef uint uint32_t; - -struct __attribute__ ((packed)) block_q4_0 -{ - half d; - uint8_t qs[QK4_0 / 2]; -}; - -struct __attribute__ ((packed)) block_q4_1 -{ - half d; - half m; - uint8_t qs[QK4_1 / 2]; -}; - -struct __attribute__ ((packed)) block_q5_0 -{ - half d; - uint32_t qh; - uint8_t qs[QK5_0 / 2]; -}; - -struct __attribute__ ((packed)) block_q5_1 -{ - half d; - half m; - uint32_t qh; - uint8_t qs[QK5_1 / 2]; -}; - -struct __attribute__ ((packed)) block_q8_0 -{ - half d; - int8_t qs[QK8_0]; -}; - -struct __attribute__((packed)) block_q2_K -{ - uint8_t scales[16]; - uint8_t qs[64]; - half d; - half dmin; -}; - -struct __attribute__((packed)) block_q3_K -{ - uint8_t hmask[32]; - uint8_t qs[64]; - uint8_t scales[12]; - half d; -}; - -struct __attribute__((packed)) block_q4_K -{ - half d; - half dmin; - uint8_t scales[12]; - uint8_t qs[128]; -}; - -struct __attribute__((packed)) block_q5_K -{ - half d; - half dmin; - uint8_t scales[12]; - uint8_t qh[32]; - uint8_t qs[128]; -}; - -struct __attribute__((packed)) block_q6_K -{ - uint8_t ql[128]; - uint8_t qh[64]; - int8_t scales[16]; - half d; -}; - -__kernel void convert_fp16_to_fp32(__global half* x, __global float* y) { - const uint i = get_global_id(0); - - y[i] = vload_half(0, &x[i]); -} - -void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, &x[ib].d); - - const uint8_t vui = x[ib].qs[iqs]; - - const int8_t vi0 = vui & 0xF; - const int8_t vi1 = vui >> 4; - - *v0 = (vi0 - 8)*d; - *v1 = (vi1 - 8)*d; -} -void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, &x[ib].d); - const float m = vload_half(0, &x[ib].m); - - const uint8_t vui = x[ib].qs[iqs]; - - const int8_t vi0 = vui & 0xF; - const int8_t vi1 = vui >> 4; - - *v0 = vi0*d + m; - *v1 = vi1*d + m; -} -void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, &x[ib].d); - - uint32_t qh = x[ib].qh; - - const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16; - const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16; - - *v0 = x0*d; - *v1 = x1*d; -} -void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, &x[ib].d); - const float m = vload_half(0, &x[ib].m); - - uint32_t qh = x[ib].qh; - - const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0); - const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1); - - *v0 = x0*d + m; - *v1 = x1*d + m; -} -void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, &x[ib].d); - - const int8_t vi0 = x[ib].qs[iqs + 0]; - const int8_t vi1 = x[ib].qs[iqs + 1]; - - *v0 = vi0*d; - *v1 = vi1*d; -} -void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){ - *v0 = vload_half(0, &x[ib + 0]); - *v1 = vload_half(0, &x[ib + 1]); -} -); - -static std::string k_quants_source = MULTILINE_QUOTE( -inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m) -{ - if (j < 4) - { - *d = q[j] & 63; - *m = q[j + 4] & 63; - } - else - { - *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); - *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); - } -} - -__kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy) -{ - const int i = get_group_id(0) + get_global_offset(0); - const int tid = get_local_id(0); - const int n = tid / 32; - const int l = tid - 32 * n; - const int is = 8 * n + l / 16; - - const uint8_t q = x[i].qs[32 * n + l]; - __global float *y = yy + get_group_id(0) * QK_K + 128 * n; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - y[l + 0] = dall * (x[i].scales[is + 0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is + 0] >> 4); - y[l + 32] = dall * (x[i].scales[is + 2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is + 2] >> 4); - y[l + 64] = dall * (x[i].scales[is + 4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is + 4] >> 4); - y[l + 96] = dall * (x[i].scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is + 6] >> 4); -} - -__kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy) -{ - int r = get_local_id(0) / 4; - int i = get_group_id(0) + get_global_offset(0); - int tid = r / 2; - int is0 = r % 2; - int l0 = 16 * is0 + 4 * (get_local_id(0) % 4); - int n = tid / 4; - int j = tid - 4 * n; - - uint8_t m = 1 << (4 * n + j); - int is = 8 * n + 2 * j + is0; - int shift = 2 * j; - - int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4) - : is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4) - : is < 12 ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4) - : (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4); - float d_all = vload_half(0, &x[i].d); - float dl = d_all * (us - 32); - - __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j; - const __global uint8_t *q = x[i].qs + 32 * n; - const __global uint8_t *hm = x[i].hmask; - - for (int l = l0; l < l0 + 4; ++l) - y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); -} - -__kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy) -{ - const int i = get_group_id(0) + get_global_offset(0); - const int tid = get_local_id(0); - const int il = tid / 8; - const int ir = tid % 8; - const int is = 2 * il; - const int n = 4; - - __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - __global const uint8_t *q = x[i].qs + 32 * il + n * ir; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[i].scales, &sc, &m); - float d1 = dall * sc; - float m1 = dmin * m; - get_scale_min_k4(is + 1, x[i].scales, &sc, &m); - float d2 = dall * sc; - float m2 = dmin * m; - for (int l = 0; l < n; ++l) - { - y[l + 0] = d1 * (q[l] & 0xF) - m1; - y[l + 32] = d2 * (q[l] >> 4) - m2; - } -} - -__kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy) -{ - const int i = get_group_id(0) + get_global_offset(0); - const int tid = get_local_id(0); - const int il = tid / 16; - const int ir = tid % 16; - const int is = 2 * il; - - __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - __global const uint8_t *ql = x[i].qs + 32 * il + 2 * ir; - __global const uint8_t *qh = x[i].qh + 2 * ir; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[i].scales, &sc, &m); - const float d1 = dall * sc; - const float m1 = dmin * m; - get_scale_min_k4(is + 1, x[i].scales, &sc, &m); - const float d2 = dall * sc; - const float m2 = dmin * m; - - uint8_t hm = 1 << (2 * il); - y[0] = d1 * ((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0)) - m1; - y[1] = d1 * ((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0)) - m1; - hm <<= 1; - y[32] = d2 * ((ql[0] >> 4) + (qh[0] & hm ? 16 : 0)) - m2; - y[33] = d2 * ((ql[1] >> 4) + (qh[1] & hm ? 16 : 0)) - m2; -} - -__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy) -{ - const int i = get_group_id(0) + get_global_offset(0); - const int tid = get_local_id(0); - const int ip = tid / 32; - const int il = tid - 32 * ip; - const int is = 8 * ip + il / 16; - - __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il; - - const float d = vload_half(0, &x[i].d); - - __global const uint8_t *ql = x[i].ql + 64 * ip + il; - const uint8_t qh = x[i].qh[32 * ip + il]; - __global const int8_t *sc = x[i].scales + is; - - y[0] = d * sc[0] * ((int8_t)((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); - y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); - y[64] = d * sc[4] * ((int8_t)((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); - y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); -} - -__kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { - - const int row = get_group_id(0); - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row + get_global_offset(0); - - __global const struct block_q2_K * x = xx + ib0; - - const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 - const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1 - - const int step = 16/K_QUANTS_PER_ITERATION; - - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 - - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int s_offset = 8*im; - const int y_offset = 128*im + l0; - - tmp[16 * ix + tid] = 0; - - uint32_t aux[4]; - const uint8_t * d = (const uint8_t *)aux; - const uint8_t * m = (const uint8_t *)(aux + 2); - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - __global const float * y = yy + i * QK_K + y_offset; - __global const uint8_t * q = x[i].qs + q_offset; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - __global const uint32_t * a = (__global const uint32_t *)(x[i].scales + s_offset); - aux[0] = a[0] & 0x0f0f0f0f; - aux[1] = a[1] & 0x0f0f0f0f; - aux[2] = (a[0] >> 4) & 0x0f0f0f0f; - aux[3] = (a[1] >> 4) & 0x0f0f0f0f; - - float sum1 = 0, sum2 = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) - + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) - + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) - + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) - + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) - + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) - + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) - +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); - sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] - + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; - - } - tmp[16 * ix + tid] += dall * sum1 - dmin * sum2; - - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=16; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} - -__kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { - const uint16_t kmask1 = 0x0303; - const uint16_t kmask2 = 0x0f0f; - - const int row = get_group_id(0); - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row + get_global_offset(0); - - __global const struct block_q3_K * x = xx + ib0; - - const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1 - - const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop - const int step = 16/K_QUANTS_PER_ITERATION; - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0....15 or 0...7 - - const uint8_t m = 1 << (4*im); - - const int l0 = n*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int y_offset = 128*im + l0; - - uint16_t utmp[4]; - const int8_t * s = (const int8_t *)utmp; - - const uint16_t s_shift = 4*im; - - tmp[16 * ix + tid] = 0; - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - __global const float * y = yy + i * QK_K + y_offset; - __global const uint8_t * q = x[i].qs + q_offset; - __global const uint8_t * h = x[i].hmask + l0; - - __global const uint16_t * a = (__global const uint16_t *)x[i].scales; - utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); - utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); - utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); - utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); - - const float d = vload_half(0, &x[i].d); - - float sum = 0; - for (int l = 0; l < n; ++l) { - sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) - + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) - + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) - + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); - sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) - + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) - + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) - + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); - } - tmp[16 * ix + tid] += d * sum; - - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=16; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} - -__kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { - - //to rename it later, just to test now - const uint16_t kmask1 = 0x3f3f; - const uint16_t kmask2 = 0x0f0f; - const uint16_t kmask3 = 0xc0c0; - - const int row = get_group_id(0); - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row + get_global_offset(0); - - const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15 - const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; - - const int step = 8/K_QUANTS_PER_ITERATION; - - const int il = tid/step; // 0...3 - const int ir = tid - step*il;// 0...3 - const int n = 2*K_QUANTS_PER_ITERATION; - - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; - - const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; - - uint16_t aux[4]; - const uint8_t * sc = (const uint8_t *)aux; - - __global const struct block_q4_K * x = xx + ib0; - - tmp[16 * ix + tid] = 0; - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - __global const uint8_t * q1 = x[i].qs + q_offset; - __global const uint8_t * q2 = q1 + 64; - __global const float * y1 = yy + i*QK_K + y_offset; - __global const float * y2 = y1 + 128; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - __global const uint16_t * a = (__global const uint16_t *)x[i].scales; - aux[0] = a[im+0] & kmask1; - aux[1] = a[im+2] & kmask1; - aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); - aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); - - float4 s = (float4)(0.f); - float smin = 0; - for (int l = 0; l < n; ++l) { - s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4); - s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4); - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin; - - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=16; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} - -__kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { - - const uint16_t kmask1 = 0x3f3f; - const uint16_t kmask2 = 0x0f0f; - const uint16_t kmask3 = 0xc0c0; - - const int row = get_group_id(0); - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row + get_global_offset(0); - - const int tid = get_local_id(0)/2; // 0...15 - const int ix = get_local_id(0)%2; - - const int il = tid/4; // 0...3 - const int ir = tid - 4*il;// 0...3 - const int n = 2; - - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; - - const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; - - const uint8_t hm1 = 1 << (2*im); - const uint8_t hm2 = hm1 << 4; - - uint16_t aux[4]; - const uint8_t * sc = (const uint8_t *)aux; - - __global const struct block_q5_K * x = xx + ib0; - - tmp[16 * ix + tid] = 0; - - for (int i = ix; i < num_blocks_per_row; i += 2) { - - __global const uint8_t * ql1 = x[i].qs + q_offset; - __global const uint8_t * ql2 = ql1 + 64; - __global const uint8_t * qh = x[i].qh + l0; - __global const float * y1 = yy + i*QK_K + y_offset; - __global const float * y2 = y1 + 128; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - __global const uint16_t * a = (__global const uint16_t *)x[i].scales; - aux[0] = a[im+0] & kmask1; - aux[1] = a[im+2] & kmask1; - aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); - aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); - - float4 sum = (float4)(0.f); - float smin = 0; - for (int l = 0; l < n; ++l) { - sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) - + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0)); - sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0)) - + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0)); - sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) - + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0)); - sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0)) - + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0)); - smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] - + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; - } - tmp[16 * ix + tid] += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; - - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=16; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} - -__kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) { - - const int row = get_group_id(0); - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row + get_global_offset(0); - - __global const struct block_q6_K * x = xx + ib0; - - const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0, 1 - - const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 - - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 - -\n#if K_QUANTS_PER_ITERATION == 1\n - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 - const int is = 0; - -\n#else\n - - const int l0 = 4 * in; // 0, 4, 8, ..., 28 - const int is = in / 4; - -\n#endif\n - - const int ql_offset = 64*im + l0; - const int qh_offset = 32*im + l0; - const int s_offset = 8*im + is; - const int y_offset = 128*im + l0; - - tmp[16 * ix + tid] = 0; // partial sum for thread in warp - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - __global const float * y = yy + i * QK_K + y_offset; - __global const uint8_t * ql = x[i].ql + ql_offset; - __global const uint8_t * qh = x[i].qh + qh_offset; - __global const int8_t * s = x[i].scales + s_offset; - - const float d = vload_half(0, &x[i].d); - -\n#if K_QUANTS_PER_ITERATION == 1\n - float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) - + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) - + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) - + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) - + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) - + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) - + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) - +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); - tmp[16 * ix + tid] += sum; -\n#else\n - float sum = 0; - for (int l = 0; l < 4; ++l) { - sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) - + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) - + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) - + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); - } - tmp[16 * ix + tid] += sum; -\n#endif\n - - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=16; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} - -); - - -static std::string dequant_template = MULTILINE_QUOTE( -__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { - const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2; - - if (i >= get_global_size(0)) { - return; - } - - const uint qk = QUANT_K; - const uint qr = QUANT_R; - - const int ib = i/qk + get_global_offset(0); // block index - const int iqs = (i%qk)/qr; // quant index - const int iybs = i - i%qk; // y block start index - const int y_offset = qr == 1 ? 1 : qk/2; - - // dequantize - float v0, v1; - DEQUANT_FUNC(x, ib, iqs, &v0, &v1); - y[iybs + iqs + 0] = v0; - y[iybs + iqs + y_offset] = v1; -} -); - -static std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE( -__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { - const int local_size = get_local_size(0); - const int row = get_group_id(0); - const int tid = get_local_id(0); - - const uint qk = QUANT_K; - const uint qr = QUANT_R; - - const int col_step = local_size * 2; - const int y_offset = qr == 1 ? 1 : qk/2; - - x += get_global_offset(0); - - tmp[tid] = 0; - - for (int col = tid*2; col < ncols; col += col_step) { - const int ib = (row*ncols + col)/qk; // block index - const int iqs = (col%qk)/qr; // quant index - const int iybs = col - col%qk; // y block start index - - // dequantize - float v0, v1; - DEQUANT_FUNC(x, ib, iqs, &v0, &v1); - - // matrix multiplication - tmp[tid] += v0 * y[iybs + iqs + 0]; - tmp[tid] += v1 * y[iybs + iqs + y_offset]; - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=local_size/2; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} -); - - -static std::string mul_template = MULTILINE_QUOTE( -__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) { - const int i = get_group_id(0)*get_local_size(0) + get_local_id(0); - - if (i >= get_global_size(0)) { - return; - } - - dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky]; -} -); - -#define CL_CHECK(err) \ - do { \ - cl_int err_ = (err); \ - if (err_ != CL_SUCCESS) { \ - fprintf(stderr, "ggml_v3_opencl: %s error %d at %s:%d\n", \ - #err, err_, __FILE__, __LINE__); \ - fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n");\ - exit(1); \ - } \ - } while (0) - -#define CLBLAST_CHECK(err) \ - do { \ - CLBlastStatusCode err_ = (err); \ - if (err_ != CLBlastSuccess) { \ - fprintf(stderr, "ggml_v3_opencl: %s error %d at %s:%d\n", \ - #err, err_, __FILE__, __LINE__); \ - fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n");\ - exit(1); \ - } \ - } while (0) - -static std::array dequant_str_keys = { - "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC" -}; - -static std::array dequant_str_values = { - "dequantize_row_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0", - "dequantize_row_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1", - "dequantize_row_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0", - "dequantize_row_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1", - "dequantize_row_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0", - "convert_row_f16", "half", "1", "1", "convert_f16" -}; - -static std::array dequant_mul_mat_vec_str_values = { - "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0", - "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1", - "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0", - "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1", - "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0", - "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16" -}; - -static std::array mul_str_keys = { - "KERNEL_NAME", "TYPE" -}; -static std::array mul_str_values = { - "mul_f32", "float" -}; - -static std::string& replace(std::string& s, const std::string& from, const std::string& to) { - size_t pos = 0; - while ((pos = s.find(from, pos)) != std::string::npos) { - s.replace(pos, from.length(), to); - pos += to.length(); - } - return s; -} - -static std::string generate_kernels() { - std::stringstream src; - src << program_source << '\n'; - src << k_quants_source << '\n'; - for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) { - std::string dequant_kernel = dequant_template; - std::string dmmv_kernel = dequant_mul_mat_vec_template; - for (size_t j = 0; j < dequant_str_keys.size(); j++) { - replace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]); - replace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]); - } - src << dequant_kernel << '\n'; - src << dmmv_kernel << '\n'; - } - for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) { - std::string mul_kernel = mul_template; - for (size_t j = 0; j < mul_str_keys.size(); j++) { - replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]); - } - src << mul_kernel << '\n'; - } - - return src.str(); -} - -static cl_platform_id platform; -static cl_device_id device; -static cl_context context; -static cl_command_queue queue; -static cl_program program; -static cl_kernel convert_row_f16_cl; -static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; -static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; -static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl; -static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl; -static cl_kernel mul_f32_cl; -static bool fp16_support; - -static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { - cl_program p; - char *program_log; - size_t program_size; - size_t log_size; - int err; - - program_size = strlen(program_buffer); - - p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); - if(err < 0) { - fprintf(stderr, "OpenCL error creating program"); - exit(1); - } - - std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math " - "-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 " - "-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION); - - err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL); - if(err < 0) { - - clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - program_log = (char*) malloc(log_size + 1); - program_log[log_size] = '\0'; - clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); - fprintf(stderr, "ggml_v3_opencl: kernel compile error:\n\n%s\n", program_log); - free(program_log); - exit(1); - } - - return p; -} - -void ggml_v3_cl_init(void) { - cl_int err; - - struct cl_device; - struct cl_platform { - cl_platform_id id; - unsigned number; - char name[128]; - char vendor[128]; - struct cl_device * devices; - unsigned n_devices; - struct cl_device * default_device; - }; - - struct cl_device { - struct cl_platform * platform; - cl_device_id id; - unsigned number; - cl_device_type type; - char name[128]; - }; - - enum { NPLAT = 16, NDEV = 16 }; - - struct cl_platform platforms[NPLAT]; - unsigned n_platforms = 0; - struct cl_device devices[NDEV]; - unsigned n_devices = 0; - struct cl_device * default_device = NULL; - - platform = NULL; - device = NULL; - - cl_platform_id platform_ids[NPLAT]; - CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms)); - - for (unsigned i = 0; i < n_platforms; i++) { - struct cl_platform * p = &platforms[i]; - p->number = i; - p->id = platform_ids[i]; - CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL)); - CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL)); - - cl_device_id device_ids[NDEV]; - cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices); - if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) { - p->n_devices = 0; - } else { - CL_CHECK(clGetDeviceIDsError); - } - p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL; - p->default_device = NULL; - - for (unsigned j = 0; j < p->n_devices; j++) { - struct cl_device * d = &devices[n_devices]; - d->number = n_devices++; - d->id = device_ids[j]; - d->platform = p; - CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL)); - CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL)); - printf("\nPlatform:%d Device:%d - %s with %s",i,j,p->name,d->name); - - if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) { - p->default_device = d; - } - } - - if (default_device == NULL && p->default_device != NULL) { - default_device = p->default_device; - } - } - - printf("\n\n"); - - if (n_devices == 0) { - fprintf(stderr, "ggml_v3_opencl: could find any OpenCL devices.\n"); - exit(1); - } - - char * user_platform_string = getenv("GGML_OPENCL_PLATFORM"); - char * user_device_string = getenv("GGML_OPENCL_DEVICE"); - int user_platform_number = -1; - int user_device_number = -1; - - unsigned n; - if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) { - user_platform_number = (int)n; - } - if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) { - user_device_number = (int)n; - } - if (user_platform_number != -1 && user_device_number != -1) { - cl_platform* platform = &platforms[user_platform_number]; - if ((unsigned)user_device_number >= platform->n_devices) { - fprintf(stderr, "ggml_v3_opencl: invalid device number %d\n", user_device_number); - exit(1); - } - default_device = &platform->devices[user_device_number]; - } else { - - struct cl_device * selected_devices = devices; - unsigned n_selected_devices = n_devices; - - if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) { - for (unsigned i = 0; i < n_platforms; i++) { - struct cl_platform * p = &platforms[i]; - if (strstr(p->name, user_platform_string) != NULL || - strstr(p->vendor, user_platform_string) != NULL) { - user_platform_number = (int)i; - break; - } - } - if (user_platform_number == -1) { - fprintf(stderr, "ggml_v3_opencl: no platform matching '%s' was found.\n", user_platform_string); - exit(1); - } - } - if (user_platform_number != -1) { - struct cl_platform * p = &platforms[user_platform_number]; - selected_devices = p->devices; - n_selected_devices = p->n_devices; - default_device = p->default_device; - if (n_selected_devices == 0) { - fprintf(stderr, "ggml_v3_opencl: selected platform '%s' does not have any devices.\n", p->name); - exit(1); - } - } - - if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) { - for (unsigned i = 0; i < n_selected_devices; i++) { - struct cl_device * d = &selected_devices[i]; - if (strstr(d->name, user_device_string) != NULL) { - user_device_number = d->number; - break; - } - } - if (user_device_number == -1) { - fprintf(stderr, "ggml_v3_opencl: no device matching '%s' was found.\n", user_device_string); - exit(1); - } - } - if (user_device_number != -1) { - selected_devices = &devices[user_device_number]; - n_selected_devices = 1; - default_device = &selected_devices[0]; - } - - GGML_V3_ASSERT(n_selected_devices > 0); - - if (default_device == NULL) { - default_device = &selected_devices[0]; - } - } - - fprintf(stderr, "ggml_v3_opencl: selecting platform: '%s'\n", default_device->platform->name); - fprintf(stderr, "ggml_v3_opencl: selecting device: '%s'\n", default_device->name); - if (default_device->type != CL_DEVICE_TYPE_GPU) { - fprintf(stderr, "ggml_v3_opencl: warning, not a GPU: '%s'.\n", default_device->name); - } - - platform = default_device->platform->id; - device = default_device->id; - - size_t ext_str_size; - clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size); - char *ext_buffer = (char *)alloca(ext_str_size + 1); - clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL); - ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated - // Check if ext_buffer contains cl_khr_fp16 - fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL; - fprintf(stderr, "ggml_v3_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false"); - fp16_support = false; - printf("CL FP16 temporarily disabled pending further optimization.\n"); - - cl_context_properties properties[] = { - (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0 - }; - - CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err)); - - CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err), - (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err : - (queue = clCreateCommandQueue(context, device, 0, &err), err) - ))); - - const std::string kernel_src = generate_kernels(); - - program = build_program_from_source(context, device, kernel_src.c_str()); - - // FP16 to FP32 kernel - CL_CHECK((convert_row_f16_cl = clCreateKernel(program, "convert_row_f16", &err), err)); - - // Dequantize kernels - CL_CHECK((dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err), err)); - CL_CHECK((dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err), err)); - CL_CHECK((dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err), err)); - CL_CHECK((dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err), err)); - CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err)); - CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err)); - CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err)); - CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err)); - CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err)); - CL_CHECK((dequantize_block_q5_k_cl = clCreateKernel(program, "dequantize_block_q5_K", &err), err)); - CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err)); - - // dequant mul mat kernel - CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err)); - CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K", &err), err)); - - // mul kernel - CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err)); -} - -static cl_kernel* ggml_v3_get_to_fp32_cl(ggml_v3_type type) { - switch (type) { - case GGML_V3_TYPE_Q4_0: - return &dequantize_row_q4_0_cl; - case GGML_V3_TYPE_Q4_1: - return &dequantize_row_q4_1_cl; - case GGML_V3_TYPE_Q5_0: - return &dequantize_row_q5_0_cl; - case GGML_V3_TYPE_Q5_1: - return &dequantize_row_q5_1_cl; - case GGML_V3_TYPE_Q8_0: - return &dequantize_row_q8_0_cl; - case GGML_V3_TYPE_Q2_K: - return &dequantize_block_q2_k_cl; - case GGML_V3_TYPE_Q3_K: - return &dequantize_block_q3_k_cl; - case GGML_V3_TYPE_Q4_K: - return &dequantize_block_q4_k_cl; - case GGML_V3_TYPE_Q5_K: - return &dequantize_block_q5_k_cl; - case GGML_V3_TYPE_Q6_K: - return &dequantize_block_q6_k_cl; - case GGML_V3_TYPE_F16: - return &convert_row_f16_cl; - default: - return nullptr; - } -} - -static size_t ggml_v3_cl_global_denom(ggml_v3_type type) { - switch (type) { - case GGML_V3_TYPE_Q4_0: - case GGML_V3_TYPE_Q4_1: - case GGML_V3_TYPE_Q5_0: - case GGML_V3_TYPE_Q5_1: - case GGML_V3_TYPE_Q8_0: - return 1; - case GGML_V3_TYPE_Q2_K: - case GGML_V3_TYPE_Q3_K: - return 4; - case GGML_V3_TYPE_Q4_K: - return 8; - case GGML_V3_TYPE_Q5_K: - case GGML_V3_TYPE_Q6_K: - return 4; - case GGML_V3_TYPE_F16: - default: - return 1; - } -} - -static size_t ggml_v3_cl_local_size(ggml_v3_type type) { - switch (type) { - case GGML_V3_TYPE_Q4_0: - case GGML_V3_TYPE_Q4_1: - case GGML_V3_TYPE_Q5_0: - case GGML_V3_TYPE_Q5_1: - case GGML_V3_TYPE_Q8_0: - return 0; - case GGML_V3_TYPE_Q2_K: - case GGML_V3_TYPE_Q3_K: - return 64; - case GGML_V3_TYPE_Q4_K: - return 32; - case GGML_V3_TYPE_Q5_K: - case GGML_V3_TYPE_Q6_K: - return 64; - case GGML_V3_TYPE_F16: - default: - return 0; - } -} - -static cl_kernel* ggml_v3_get_dequantize_mul_mat_vec_cl(ggml_v3_type type) { - switch (type) { - case GGML_V3_TYPE_Q4_0: - return &dequantize_mul_mat_vec_q4_0_cl; - case GGML_V3_TYPE_Q4_1: - return &dequantize_mul_mat_vec_q4_1_cl; - case GGML_V3_TYPE_Q5_0: - return &dequantize_mul_mat_vec_q5_0_cl; - case GGML_V3_TYPE_Q5_1: - return &dequantize_mul_mat_vec_q5_1_cl; - case GGML_V3_TYPE_Q8_0: - return &dequantize_mul_mat_vec_q8_0_cl; - case GGML_V3_TYPE_F16: - return &convert_mul_mat_vec_f16_cl; - case GGML_V3_TYPE_Q2_K: - return &dequantize_mul_mat_vec_q2_K_cl; - case GGML_V3_TYPE_Q3_K: - return &dequantize_mul_mat_vec_q3_K_cl; - case GGML_V3_TYPE_Q4_K: - return &dequantize_mul_mat_vec_q4_K_cl; - case GGML_V3_TYPE_Q5_K: - return &dequantize_mul_mat_vec_q5_K_cl; - case GGML_V3_TYPE_Q6_K: - return &dequantize_mul_mat_vec_q6_K_cl; - default: - return nullptr; - } -} - -// buffer pool for cl -#define MAX_CL_BUFFERS 400 - -struct scoped_spin_lock { - std::atomic_flag& lock; - scoped_spin_lock(std::atomic_flag& lock) : lock(lock) { - while (lock.test_and_set(std::memory_order_acquire)) { - ; // spin - } - } - ~scoped_spin_lock() { - lock.clear(std::memory_order_release); - } - scoped_spin_lock(const scoped_spin_lock&) = delete; - scoped_spin_lock& operator=(const scoped_spin_lock&) = delete; -}; - -struct cl_buffer { - cl_mem mem; - size_t size = 0; -}; - -static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS]; -static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT; - -static cl_mem ggml_v3_cl_pool_malloc(size_t size, size_t * actual_size) { - scoped_spin_lock lock(g_cl_pool_lock); - cl_int err; - - int best_i = -1; - size_t best_size = std::numeric_limits::max(); //smallest unused buffer that fits our needs - int worst_i = -1; - size_t worst_size = 0; //largest unused buffer seen so far - for (int i = 0; i < MAX_CL_BUFFERS; ++i) { - cl_buffer &b = g_cl_buffer_pool[i]; - if (b.size > 0 && b.size >= size && b.size < best_size) - { - best_i = i; - best_size = b.size; - } - if (b.size > 0 && b.size > worst_size) - { - worst_i = i; - worst_size = b.size; - } - } - if(best_i!=-1) //found the smallest buffer that fits our needs - { - cl_buffer& b = g_cl_buffer_pool[best_i]; - cl_mem mem = b.mem; - *actual_size = b.size; - b.size = 0; - return mem; - } - if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory - { - cl_buffer& b = g_cl_buffer_pool[worst_i]; - cl_mem mem = b.mem; - b.size = 0; - clReleaseMemObject(mem); - } - cl_mem mem; - CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err)); - *actual_size = size; - return mem; -} - -static void ggml_v3_cl_pool_free(cl_mem mem, size_t size) { - scoped_spin_lock lock(g_cl_pool_lock); - - for (int i = 0; i < MAX_CL_BUFFERS; ++i) { - cl_buffer& b = g_cl_buffer_pool[i]; - if (b.size == 0) { - b.mem = mem; - b.size = size; - return; - } - } - fprintf(stderr, "WARNING: cl buffer pool full, increase MAX_CL_BUFFERS\n"); - clReleaseMemObject(mem); -} - -void ggml_v3_cl_free_data(const struct ggml_v3_tensor* tensor) { - if (tensor->backend != GGML_V3_BACKEND_GPU) { - return; - } - - cl_mem mem = (cl_mem)tensor->extra; - clReleaseMemObject(mem); -} - -static cl_int ggml_v3_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_v3_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) { - cl_int err; - const uint64_t ne0 = src->ne[0]; - const uint64_t ne1 = src->ne[1]; - const uint64_t nb0 = src->nb[0]; - const uint64_t nb1 = src->nb[1]; - const uint64_t nb2 = src->nb[2]; - const uint64_t nb3 = src->nb[3]; - const enum ggml_v3_type type = src->type; - const size_t ts = ggml_v3_type_size(type); - const size_t bs = ggml_v3_blck_size(type); - const uint64_t row_size = ts*ne0/bs; - - const char * x = (const char *) src->data + i2*nb2 + i3*nb3; - if (nb0 == ts && nb1 == row_size) { - return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev); - } - if (nb0 == ts) { - const size_t buffer_origin[3] = { offset, 0, 0 }; - const size_t host_origin[3] = { 0, 0, 0 }; - const size_t region[3] = { row_size, ne1, 1 }; - return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev); - } - std::vector events; - if (ev && ne1>1) events.reserve(ne1-1); - for (uint64_t i1 = 0; i1 < ne1; i1++) { - // pretend the row is a matrix with cols=1 - const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 }; - const size_t host_origin[3] = { 0, 0, 0 }; - const size_t region[3] = { ts, ne0/bs, 1 }; - // if an event is requested, make the last write wait for all previous writes to complete - if (ev && i1) { - events.push_back(*ev); - } - cl_uint nevents = i1 == ne1-1 ? events.size() : 0U; - err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev); - if (err != CL_SUCCESS) { - for (auto event : events) { - clReleaseEvent(event); - } - return err; - } - } - for (auto event : events) { - CL_CHECK(clReleaseEvent(event)); - } - return CL_SUCCESS; -} - -static void ggml_v3_cl_mul_f32(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { - GGML_V3_ASSERT(src1->backend == GGML_V3_BACKEND_GPU); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - size_t x_size; - size_t d_size; - - cl_mem d_X = ggml_v3_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0 - cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted. - cl_mem d_D = ggml_v3_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst - - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - cl_event ev; - - // copy src0 to device - CL_CHECK(ggml_v3_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev)); - - const int64_t i13 = i03%ne13; - const int64_t i12 = i02%ne12; - const int i1 = i13*ne12*ne11 + i12*ne11; - - cl_int x_offset = 0; - cl_int y_offset = i1*ne10; - cl_int d_offset = 0; - - size_t global = ne00 * ne01; - cl_int ky = ne10 * ne11; - - CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky)); - CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); - - CL_CHECK(clReleaseEvent(ev)); - CL_CHECK(clFinish(queue)); - - // copy dst to host - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL)); - } - } - ggml_v3_cl_pool_free(d_X, x_size); - ggml_v3_cl_pool_free(d_D, d_size); -} - -void ggml_v3_cl_mul(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst) { - GGML_V3_ASSERT(src0->type == GGML_V3_TYPE_F32 && src1->type == GGML_V3_TYPE_F32 && dst->type == GGML_V3_TYPE_F32); - ggml_v3_cl_mul_f32(src0, src1, dst); -} - -static void ggml_v3_cl_mul_mat_f32(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size; - size_t y_size; - size_t d_size; - cl_mem d_X; - if (src0->backend == GGML_V3_BACKEND_GPU) { // NOLINT - d_X = (cl_mem) src0->extra; - } else { - d_X = ggml_v3_cl_pool_malloc(sizeof(float) * x_ne, &x_size); - } - cl_mem d_Y = ggml_v3_cl_pool_malloc(sizeof(float) * y_ne, &y_size); - cl_mem d_D = ggml_v3_cl_pool_malloc(sizeof(float) * d_ne, &d_size); - - size_t x_offset = 0; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - // TODO: copy src0 here when r3>1 - for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - if (src0->backend == GGML_V3_BACKEND_GPU) { - x_offset = (i03 * ne02 + i02) * x_ne; - } else { - // copy src0 to device - CL_CHECK(ggml_v3_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); - } - - for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { - // copy src1 to device - CL_CHECK(ggml_v3_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); - - CL_CHECK(clFinish(queue)); - - // compute - cl_event ev_sgemm; - clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor, - (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, x_offset, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, &ev_sgemm); - - if (status != clblast::StatusCode::kSuccess) { - printf("\nF32 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast(status),ne00,ne01,ne10,ne11); - GGML_V3_ASSERT(false); - } - - // copy dst to host - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); - } - } - } - } - - if (src0->backend != GGML_V3_BACKEND_GPU) { - ggml_v3_cl_pool_free(d_X, x_size); - } - ggml_v3_cl_pool_free(d_Y, y_size); - ggml_v3_cl_pool_free(d_D, d_size); -} - -static void ggml_v3_cl_mul_mat_f16(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst, void * wdata, size_t wsize) { - GGML_V3_ASSERT(fp16_support); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - const ggml_v3_fp16_t alpha = ggml_v3_fp32_to_fp16(1.0f); - const ggml_v3_fp16_t beta = ggml_v3_fp32_to_fp16(0.0f); - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - GGML_V3_ASSERT(wsize >= sizeof(ggml_v3_fp16_t) * y_ne); - GGML_V3_ASSERT(wsize >= sizeof(ggml_v3_fp16_t) * d_ne); - ggml_v3_fp16_t * const tmp = (ggml_v3_fp16_t *) wdata; - - size_t x_size; - size_t y_size; - size_t d_size; - cl_mem d_X; - if (src0->backend == GGML_V3_BACKEND_GPU) { // NOLINT - d_X = (cl_mem) src0->extra; - } else { - d_X = ggml_v3_cl_pool_malloc(sizeof(ggml_v3_fp16_t) * x_ne, &x_size); - } - cl_mem d_Y = ggml_v3_cl_pool_malloc(sizeof(ggml_v3_fp16_t) * y_ne, &y_size); - cl_mem d_D = ggml_v3_cl_pool_malloc(sizeof(ggml_v3_fp16_t) * d_ne, &d_size); - - bool src1_cont_rows = nb10 == sizeof(float); - bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); - - size_t x_offset = 0; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - // TODO: copy src0 here when r3>1 - for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - if (src0->backend == GGML_V3_BACKEND_GPU) { - x_offset = (i03 * ne02 + i02) * x_ne; - } else { - // copy src0 to device - CL_CHECK(ggml_v3_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); - } - - for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { - // convert src1 to fp16 - // TODO: use multiple threads - char * src1i = (char *) src1->data + i13*nb13 + i12*nb12; - if (src1_cont_rows) { - if (src1_cont_cols) { - ggml_v3_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); - } - else { - for (int64_t i11 = 0; i11 < ne11; i11++) { - ggml_v3_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10); - } - } - } - else { - for (int64_t i11 = 0; i11 < ne11; i11++) { - for (int64_t i10 = 0; i10 < ne10; i10++) { - // very slow due to no inlining - tmp[i11*ne10 + i10] = ggml_v3_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10)); - } - } - } - - // copy src1 to device - CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_v3_fp16_t) * y_ne, tmp, 0, NULL, NULL)); - - CL_CHECK(clFinish(queue)); - - // compute - cl_event ev_sgemm; - clblast::StatusCode status = (clblast::StatusCode)CLBlastHgemm((CLBlastLayout)clblast::Layout::kColMajor, - (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, x_offset, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, &ev_sgemm); - - if (status != clblast::StatusCode::kSuccess) { - printf("\nF16 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast(status),ne00,ne01,ne10,ne11); - GGML_V3_ASSERT(false); - } - - // copy dst to host, then convert to float - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_v3_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); - - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - - ggml_v3_fp16_to_fp32_row(tmp, d, d_ne); - } - } - } - } - - if (src0->backend != GGML_V3_BACKEND_GPU) { - ggml_v3_cl_pool_free(d_X, x_size); - } - ggml_v3_cl_pool_free(d_Y, y_size); - ggml_v3_cl_pool_free(d_D, d_size); -} - -static void ggml_v3_cl_mul_mat_q_f32(const ggml_v3_tensor * src0, const ggml_v3_tensor * src1, ggml_v3_tensor * dst) { - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - const ggml_v3_type type = src0->type; - const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0; - - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - const int x_bps = x_ne / ggml_v3_blck_size(type); // blocks per 2D slice - const size_t q_sz = ggml_v3_type_size(type) * x_bps; - - size_t x_size; - size_t y_size; - size_t d_size; - size_t q_size; - cl_mem d_X; - if (!mul_mat_vec) { - d_X = ggml_v3_cl_pool_malloc(sizeof(float) * x_ne, &x_size); - } - cl_mem d_Y = ggml_v3_cl_pool_malloc(sizeof(float) * y_ne, &y_size); - cl_mem d_D = ggml_v3_cl_pool_malloc(sizeof(float) * d_ne, &d_size); - cl_mem d_Q; - if (src0->backend == GGML_V3_BACKEND_CPU) { - d_Q = ggml_v3_cl_pool_malloc(q_sz, &q_size); - } - - cl_kernel* to_fp32_cl = ggml_v3_get_to_fp32_cl(type); - cl_kernel* dmmv = ggml_v3_get_dequantize_mul_mat_vec_cl(type); - GGML_V3_ASSERT(to_fp32_cl != nullptr); - - const size_t global_denom = ggml_v3_cl_global_denom(type); - const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_v3_cl_local_size(type); - - size_t ev_idx = 0; - std::vector events; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - // TODO: copy and dequantize src0 here when r3>1 - for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - // copy src0 to device if necessary - if (src0->backend == GGML_V3_BACKEND_CPU) { - events.emplace_back(); - CL_CHECK(ggml_v3_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); - } else if (src0->backend == GGML_V3_BACKEND_GPU) { - d_Q = (cl_mem) src0->extra; - } else { - GGML_V3_ASSERT(false); - } - - if (!mul_mat_vec) { - // convert src0 to fp32 on device - const size_t global = x_ne / global_denom; - const size_t offset = src0->backend == GGML_V3_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; - CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); - CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); - CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); - } - - for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { - if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel - // copy src1 to device - events.emplace_back(); - CL_CHECK(ggml_v3_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++)); - - // compute - const size_t global = ne01 * local; - const size_t offset = src0->backend == GGML_V3_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; - const cl_int ncols = ne00; - events.emplace_back(); - CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); - CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL)); - CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y)); - CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D)); - CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols)); - CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++)); - } else { // CLBlast matrix matrix multiplication - // copy src1 to device - CL_CHECK(ggml_v3_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); - - // wait for conversion - CL_CHECK(clFinish(queue)); - - // compute - events.emplace_back(); - clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor, - (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, 0, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, events.data() + ev_idx++); - - if (status != clblast::StatusCode::kSuccess) { - printf("\nQF32 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast(status),ne00,ne01,ne10,ne11); - GGML_V3_ASSERT(false); - } - } - - // copy dst to host - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL)); - for (auto *event : events) { - clReleaseEvent(event); - } - - ev_idx = 0; - events.clear(); - } - } - } - } - - if (!mul_mat_vec) { - ggml_v3_cl_pool_free(d_X, x_size); - } - ggml_v3_cl_pool_free(d_Y, y_size); - ggml_v3_cl_pool_free(d_D, d_size); - if (src0->backend == GGML_V3_BACKEND_CPU) { - ggml_v3_cl_pool_free(d_Q, q_size); - } -} - - -bool ggml_v3_cl_can_mul_mat(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst) { - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these - if ((src0->type == GGML_V3_TYPE_F32 || src0->type == GGML_V3_TYPE_F16 || ggml_v3_is_quantized(src0->type)) && - src1->type == GGML_V3_TYPE_F32 && - dst->type == GGML_V3_TYPE_F32 && - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_V3_BACKEND_GPU)) { - return true; - } - - return false; -} - -static bool ggml_v3_cl_mul_mat_use_f16(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * /* dst */) { - // If device doesn't support FP16 - if (!fp16_support) { - return false; - } - - size_t src0_sz = ggml_v3_nbytes(src0); - size_t src1_sz = ggml_v3_nbytes(src1); - - // mul_mat_q: src0 is converted to fp32 on device - size_t mul_mat_q_transfer = src0_sz + src1_sz; - - // mul_mat_f16: src1 is converted to fp16 on cpu - size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_v3_fp16_t) * ggml_v3_nelements(src1); - - // choose the smaller one to transfer to the device - // TODO: this is not always the best choice due to the overhead of converting to fp16 - return mul_mat_f16_transfer < mul_mat_q_transfer; -} - -void ggml_v3_cl_mul_mat(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst, void * wdata, size_t wsize) { - GGML_V3_ASSERT(ggml_v3_cl_can_mul_mat(src0, src1, dst)); - - if (src0->type == GGML_V3_TYPE_F32) { - ggml_v3_cl_mul_mat_f32(src0, src1, dst); - } - else if (src0->type == GGML_V3_TYPE_F16) { - if (ggml_v3_cl_mul_mat_use_f16(src0, src1, dst)) { - ggml_v3_cl_mul_mat_f16(src0, src1, dst, wdata, wsize); - } - else { - ggml_v3_cl_mul_mat_q_f32(src0, src1, dst); - } - } - else if (ggml_v3_is_quantized(src0->type)) { - ggml_v3_cl_mul_mat_q_f32(src0, src1, dst); - } - else { - GGML_V3_ASSERT(false); - } -} - -size_t ggml_v3_cl_mul_mat_get_wsize(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst) { - if (src0->type == GGML_V3_TYPE_F16 && ggml_v3_cl_mul_mat_use_f16(src0, src1, dst)) { - return sizeof(ggml_v3_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]); - } - return 0; -} - -void ggml_v3_cl_transform_tensor(void * data, ggml_v3_tensor * tensor) { - const int64_t ne0 = tensor->ne[0]; - const int64_t ne1 = tensor->ne[1]; - const int64_t ne2 = tensor->ne[2]; - const int64_t ne3 = tensor->ne[3]; - - const ggml_v3_type type = tensor->type; - const size_t s_sz = ggml_v3_type_size(type) * (size_t) (ne0 * ne1 / ggml_v3_blck_size(type)); - const size_t q_sz = s_sz * (size_t) (ne2 * ne3); - - size_t q_size; - cl_mem dst = ggml_v3_cl_pool_malloc(q_sz, &q_size); - - tensor->data = data; - // copy tensor to device - size_t offset = 0; - for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = 0; i2 < ne2; i2++) { - CL_CHECK(ggml_v3_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL)); - offset += s_sz; - } - } - - CL_CHECK(clFinish(queue)); - - tensor->extra = dst; - GGML_V3_ASSERT(tensor->backend == GGML_V3_BACKEND_GPU); -} \ No newline at end of file diff --git a/otherarch/ggml_v3-opencl.h b/otherarch/ggml_v3-opencl.h deleted file mode 100644 index 3aa499ba9..000000000 --- a/otherarch/ggml_v3-opencl.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include "ggml_v3.h" - -#ifdef __cplusplus -extern "C" { -#endif - -GGML_V3_API void ggml_v3_cl_init(void); - -GGML_V3_API void ggml_v3_cl_mul(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst); -GGML_V3_API bool ggml_v3_cl_can_mul_mat(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst); -GGML_V3_API size_t ggml_v3_cl_mul_mat_get_wsize(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst); -GGML_V3_API void ggml_v3_cl_mul_mat(const struct ggml_v3_tensor * src0, const struct ggml_v3_tensor * src1, struct ggml_v3_tensor * dst, void * wdata, size_t wsize); - -GGML_V3_API void * ggml_v3_cl_host_malloc(size_t size); -GGML_V3_API void ggml_v3_cl_host_free(void * ptr); - -GGML_V3_API void ggml_v3_cl_free_data(const struct ggml_v3_tensor* tensor); - -GGML_V3_API void ggml_v3_cl_transform_tensor(void * data, struct ggml_v3_tensor * tensor); - -#ifdef __cplusplus -} -#endif \ No newline at end of file diff --git a/otherarch/ggml_v3b-opencl.cpp b/otherarch/ggml_v3b-opencl.cpp deleted file mode 100644 index 0ec647a57..000000000 --- a/otherarch/ggml_v3b-opencl.cpp +++ /dev/null @@ -1,2015 +0,0 @@ -#include "ggml.h" -#include "ggml_v3b-opencl.h" -#include "ggml-backend-impl.h" -#include "ggml-cpu.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#define CL_TARGET_OPENCL_VERSION 120 -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -#define CL_DMMV_LOCAL_SIZE 32 - -#ifndef K_QUANTS_PER_ITERATION -#define K_QUANTS_PER_ITERATION 1 -#else -static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); -#endif - -#define MULTILINE_QUOTE(...) #__VA_ARGS__ -static std::string program_source = MULTILINE_QUOTE( - -typedef char int8_t; -typedef uchar uint8_t; -typedef short int16_t; -typedef ushort uint16_t; -typedef int int32_t; -typedef uint uint32_t; - -struct __attribute__ ((packed)) block_q4_0 -{ - half d; - uint8_t qs[QK4_0 / 2]; -}; - -struct __attribute__ ((packed)) block_q4_1 -{ - half d; - half m; - uint8_t qs[QK4_1 / 2]; -}; - -struct __attribute__ ((packed)) block_q5_0 -{ - half d; - uint32_t qh; - uint8_t qs[QK5_0 / 2]; -}; - -struct __attribute__ ((packed)) block_q5_1 -{ - half d; - half m; - uint32_t qh; - uint8_t qs[QK5_1 / 2]; -}; - -struct __attribute__ ((packed)) block_q8_0 -{ - half d; - int8_t qs[QK8_0]; -}; - -struct __attribute__((packed)) block_q2_K -{ - uint8_t scales[16]; - uint8_t qs[64]; - half d; - half dmin; -}; - -struct __attribute__((packed)) block_q3_K -{ - uint8_t hmask[32]; - uint8_t qs[64]; - uint8_t scales[12]; - half d; -}; - -struct __attribute__((packed)) block_q4_K -{ - half d; - half dmin; - uint8_t scales[12]; - uint8_t qs[128]; -}; - -struct __attribute__((packed)) block_q5_K -{ - half d; - half dmin; - uint8_t scales[12]; - uint8_t qh[32]; - uint8_t qs[128]; -}; - -struct __attribute__((packed)) block_q6_K -{ - uint8_t ql[128]; - uint8_t qh[64]; - int8_t scales[16]; - half d; -}; - -__kernel void convert_fp16_to_fp32(__global half* x, __global float* y) { - const uint i = get_global_id(0); - - y[i] = vload_half(0, &x[i]); -} - -void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, &x[ib].d); - - const uint8_t vui = x[ib].qs[iqs]; - - const int8_t vi0 = vui & 0xF; - const int8_t vi1 = vui >> 4; - - *v0 = (vi0 - 8)*d; - *v1 = (vi1 - 8)*d; -} -void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, &x[ib].d); - const float m = vload_half(0, &x[ib].m); - - const uint8_t vui = x[ib].qs[iqs]; - - const int8_t vi0 = vui & 0xF; - const int8_t vi1 = vui >> 4; - - *v0 = vi0*d + m; - *v1 = vi1*d + m; -} -void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, &x[ib].d); - - uint32_t qh = x[ib].qh; - - const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16; - const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16; - - *v0 = x0*d; - *v1 = x1*d; -} -void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, &x[ib].d); - const float m = vload_half(0, &x[ib].m); - - uint32_t qh = x[ib].qh; - - const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0); - const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1); - - *v0 = x0*d + m; - *v1 = x1*d + m; -} -void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, &x[ib].d); - - const int8_t vi0 = x[ib].qs[iqs + 0]; - const int8_t vi1 = x[ib].qs[iqs + 1]; - - *v0 = vi0*d; - *v1 = vi1*d; -} -void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){ - *v0 = vload_half(0, &x[ib + 0]); - *v1 = vload_half(0, &x[ib + 1]); -} -); - -static std::string k_quants_source = MULTILINE_QUOTE( -inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m) -{ - if (j < 4) - { - *d = q[j] & 63; - *m = q[j + 4] & 63; - } - else - { - *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); - *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); - } -} - -__kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy) -{ - const int i = get_group_id(0) + get_global_offset(0); - const int tid = get_local_id(0); - const int n = tid / 32; - const int l = tid - 32 * n; - const int is = 8 * n + l / 16; - - const uint8_t q = x[i].qs[32 * n + l]; - __global float *y = yy + get_group_id(0) * QK_K + 128 * n; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - y[l + 0] = dall * (x[i].scales[is + 0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is + 0] >> 4); - y[l + 32] = dall * (x[i].scales[is + 2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is + 2] >> 4); - y[l + 64] = dall * (x[i].scales[is + 4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is + 4] >> 4); - y[l + 96] = dall * (x[i].scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is + 6] >> 4); -} - -__kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy) -{ - int r = get_local_id(0) / 4; - int i = get_group_id(0) + get_global_offset(0); - int tid = r / 2; - int is0 = r % 2; - int l0 = 16 * is0 + 4 * (get_local_id(0) % 4); - int n = tid / 4; - int j = tid - 4 * n; - - uint8_t m = 1 << (4 * n + j); - int is = 8 * n + 2 * j + is0; - int shift = 2 * j; - - int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4) - : is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4) - : is < 12 ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4) - : (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4); - float d_all = vload_half(0, &x[i].d); - float dl = d_all * (us - 32); - - __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j; - const __global uint8_t *q = x[i].qs + 32 * n; - const __global uint8_t *hm = x[i].hmask; - - for (int l = l0; l < l0 + 4; ++l) - y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); -} - -__kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy) -{ - const int i = get_group_id(0) + get_global_offset(0); - const int tid = get_local_id(0); - const int il = tid / 8; - const int ir = tid % 8; - const int is = 2 * il; - const int n = 4; - - __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - __global const uint8_t *q = x[i].qs + 32 * il + n * ir; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[i].scales, &sc, &m); - float d1 = dall * sc; - float m1 = dmin * m; - get_scale_min_k4(is + 1, x[i].scales, &sc, &m); - float d2 = dall * sc; - float m2 = dmin * m; - for (int l = 0; l < n; ++l) - { - y[l + 0] = d1 * (q[l] & 0xF) - m1; - y[l + 32] = d2 * (q[l] >> 4) - m2; - } -} - -__kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy) -{ - const int i = get_group_id(0) + get_global_offset(0); - const int tid = get_local_id(0); - const int il = tid / 16; - const int ir = tid % 16; - const int is = 2 * il; - - __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - __global const uint8_t *ql = x[i].qs + 32 * il + 2 * ir; - __global const uint8_t *qh = x[i].qh + 2 * ir; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[i].scales, &sc, &m); - const float d1 = dall * sc; - const float m1 = dmin * m; - get_scale_min_k4(is + 1, x[i].scales, &sc, &m); - const float d2 = dall * sc; - const float m2 = dmin * m; - - uint8_t hm = 1 << (2 * il); - y[0] = d1 * ((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0)) - m1; - y[1] = d1 * ((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0)) - m1; - hm <<= 1; - y[32] = d2 * ((ql[0] >> 4) + (qh[0] & hm ? 16 : 0)) - m2; - y[33] = d2 * ((ql[1] >> 4) + (qh[1] & hm ? 16 : 0)) - m2; -} - -__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy) -{ - const int i = get_group_id(0) + get_global_offset(0); - const int tid = get_local_id(0); - const int ip = tid / 32; - const int il = tid - 32 * ip; - const int is = 8 * ip + il / 16; - - __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il; - - const float d = vload_half(0, &x[i].d); - - __global const uint8_t *ql = x[i].ql + 64 * ip + il; - const uint8_t qh = x[i].qh[32 * ip + il]; - __global const int8_t *sc = x[i].scales + is; - - y[0] = d * sc[0] * ((int8_t)((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); - y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); - y[64] = d * sc[4] * ((int8_t)((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); - y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); -} - -__kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { - - const int row = get_group_id(0); - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row + get_global_offset(0); - - __global const struct block_q2_K * x = xx + ib0; - - const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 - const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1 - - const int step = 16/K_QUANTS_PER_ITERATION; - - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 - - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int s_offset = 8*im; - const int y_offset = 128*im + l0; - - tmp[16 * ix + tid] = 0; - - uint32_t aux[4]; - const uint8_t * d = (const uint8_t *)aux; - const uint8_t * m = (const uint8_t *)(aux + 2); - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - __global const float * y = yy + i * QK_K + y_offset; - __global const uint8_t * q = x[i].qs + q_offset; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - __global const uint32_t * a = (__global const uint32_t *)(x[i].scales + s_offset); - aux[0] = a[0] & 0x0f0f0f0f; - aux[1] = a[1] & 0x0f0f0f0f; - aux[2] = (a[0] >> 4) & 0x0f0f0f0f; - aux[3] = (a[1] >> 4) & 0x0f0f0f0f; - - float sum1 = 0, sum2 = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) - + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) - + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) - + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) - + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) - + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) - + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) - +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); - sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] - + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; - - } - tmp[16 * ix + tid] += dall * sum1 - dmin * sum2; - - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=16; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} - -__kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { - const uint16_t kmask1 = 0x0303; - const uint16_t kmask2 = 0x0f0f; - - const int row = get_group_id(0); - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row + get_global_offset(0); - - __global const struct block_q3_K * x = xx + ib0; - - const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1 - - const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop - const int step = 16/K_QUANTS_PER_ITERATION; - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0....15 or 0...7 - - const uint8_t m = 1 << (4*im); - - const int l0 = n*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int y_offset = 128*im + l0; - - uint16_t utmp[4]; - const int8_t * s = (const int8_t *)utmp; - - const uint16_t s_shift = 4*im; - - tmp[16 * ix + tid] = 0; - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - __global const float * y = yy + i * QK_K + y_offset; - __global const uint8_t * q = x[i].qs + q_offset; - __global const uint8_t * h = x[i].hmask + l0; - - __global const uint16_t * a = (__global const uint16_t *)x[i].scales; - utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); - utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); - utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); - utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); - - const float d = vload_half(0, &x[i].d); - - float sum = 0; - for (int l = 0; l < n; ++l) { - sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) - + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) - + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) - + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); - sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) - + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) - + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) - + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); - } - tmp[16 * ix + tid] += d * sum; - - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=16; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} - -__kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { - - //to rename it later, just to test now - const uint16_t kmask1 = 0x3f3f; - const uint16_t kmask2 = 0x0f0f; - const uint16_t kmask3 = 0xc0c0; - - const int row = get_group_id(0); - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row + get_global_offset(0); - - const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15 - const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; - - const int step = 8/K_QUANTS_PER_ITERATION; - - const int il = tid/step; // 0...3 - const int ir = tid - step*il;// 0...3 - const int n = 2*K_QUANTS_PER_ITERATION; - - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; - - const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; - - uint16_t aux[4]; - const uint8_t * sc = (const uint8_t *)aux; - - __global const struct block_q4_K * x = xx + ib0; - - tmp[16 * ix + tid] = 0; - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - __global const uint8_t * q1 = x[i].qs + q_offset; - __global const uint8_t * q2 = q1 + 64; - __global const float * y1 = yy + i*QK_K + y_offset; - __global const float * y2 = y1 + 128; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - __global const uint16_t * a = (__global const uint16_t *)x[i].scales; - aux[0] = a[im+0] & kmask1; - aux[1] = a[im+2] & kmask1; - aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); - aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); - - float4 s = (float4)(0.f); - float smin = 0; - for (int l = 0; l < n; ++l) { - s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4); - s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4); - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin; - - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=16; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} - -__kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { - - const uint16_t kmask1 = 0x3f3f; - const uint16_t kmask2 = 0x0f0f; - const uint16_t kmask3 = 0xc0c0; - - const int row = get_group_id(0); - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row + get_global_offset(0); - - const int tid = get_local_id(0)/2; // 0...15 - const int ix = get_local_id(0)%2; - - const int il = tid/4; // 0...3 - const int ir = tid - 4*il;// 0...3 - const int n = 2; - - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; - - const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; - - const uint8_t hm1 = 1 << (2*im); - const uint8_t hm2 = hm1 << 4; - - uint16_t aux[4]; - const uint8_t * sc = (const uint8_t *)aux; - - __global const struct block_q5_K * x = xx + ib0; - - tmp[16 * ix + tid] = 0; - - for (int i = ix; i < num_blocks_per_row; i += 2) { - - __global const uint8_t * ql1 = x[i].qs + q_offset; - __global const uint8_t * ql2 = ql1 + 64; - __global const uint8_t * qh = x[i].qh + l0; - __global const float * y1 = yy + i*QK_K + y_offset; - __global const float * y2 = y1 + 128; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - __global const uint16_t * a = (__global const uint16_t *)x[i].scales; - aux[0] = a[im+0] & kmask1; - aux[1] = a[im+2] & kmask1; - aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); - aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); - - float4 sum = (float4)(0.f); - float smin = 0; - for (int l = 0; l < n; ++l) { - sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) - + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0)); - sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0)) - + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0)); - sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) - + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0)); - sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0)) - + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0)); - smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] - + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; - } - tmp[16 * ix + tid] += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; - - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=16; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} - -__kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) { - - const int row = get_group_id(0); - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row + get_global_offset(0); - - __global const struct block_q6_K * x = xx + ib0; - - const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0, 1 - - const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 - - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 - -\n#if K_QUANTS_PER_ITERATION == 1\n - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 - const int is = 0; - -\n#else\n - - const int l0 = 4 * in; // 0, 4, 8, ..., 28 - const int is = in / 4; - -\n#endif\n - - const int ql_offset = 64*im + l0; - const int qh_offset = 32*im + l0; - const int s_offset = 8*im + is; - const int y_offset = 128*im + l0; - - tmp[16 * ix + tid] = 0; // partial sum for thread in warp - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - __global const float * y = yy + i * QK_K + y_offset; - __global const uint8_t * ql = x[i].ql + ql_offset; - __global const uint8_t * qh = x[i].qh + qh_offset; - __global const int8_t * s = x[i].scales + s_offset; - - const float d = vload_half(0, &x[i].d); - -\n#if K_QUANTS_PER_ITERATION == 1\n - float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) - + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) - + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) - + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) - + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) - + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) - + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) - +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); - tmp[16 * ix + tid] += sum; -\n#else\n - float sum = 0; - for (int l = 0; l < 4; ++l) { - sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) - + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) - + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) - + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); - } - tmp[16 * ix + tid] += sum; -\n#endif\n - - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=16; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} -); - - -std::string dequant_template = MULTILINE_QUOTE( -__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { - const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2; - - if (i >= get_global_size(0)) { - return; - } - - const uint qk = QUANT_K; - const uint qr = QUANT_R; - - const int ib = i/qk + get_global_offset(0); // block index - const int iqs = (i%qk)/qr; // quant index - const int iybs = i - i%qk; // y block start index - const int y_offset = qr == 1 ? 1 : qk/2; - - // dequantize - float v0, v1; - DEQUANT_FUNC(x, ib, iqs, &v0, &v1); - y[iybs + iqs + 0] = v0; - y[iybs + iqs + y_offset] = v1; -} -); - -std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE( -__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { - const int local_size = get_local_size(0); - const int row = get_group_id(0); - const int tid = get_local_id(0); - - const uint qk = QUANT_K; - const uint qr = QUANT_R; - - const int col_step = local_size * 2; - const int y_offset = qr == 1 ? 1 : qk/2; - - x += get_global_offset(0); - - tmp[tid] = 0; - - for (int col = tid*2; col < ncols; col += col_step) { - const int ib = (row*ncols + col)/qk; // block index - const int iqs = (col%qk)/qr; // quant index - const int iybs = col - col%qk; // y block start index - - // dequantize - float v0, v1; - DEQUANT_FUNC(x, ib, iqs, &v0, &v1); - - // matrix multiplication - tmp[tid] += v0 * y[iybs + iqs + 0]; - tmp[tid] += v1 * y[iybs + iqs + y_offset]; - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=local_size/2; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} - -); - - -std::string mul_template = MULTILINE_QUOTE( -__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) { - const int i = get_group_id(0)*get_local_size(0) + get_local_id(0); - - if (i >= get_global_size(0)) { - return; - } - - dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky]; -} -); - -std::string add_template = MULTILINE_QUOTE( -__kernel void add_f32(__global float * x, const int x_offset, __global float * y, const int y_offset, __global float * dst, const int dst_offset, const int ky) { - const int i = get_group_id(0)*get_local_size(0) + get_local_id(0); - - if (i >= get_global_size(0)) { - return; - } - - dst[dst_offset + i] = x[x_offset + i] + y[y_offset + i%ky]; -} -); - -#define CL_CHECK(err) \ - do { \ - cl_int err_ = (err); \ - if (err_ != CL_SUCCESS) { \ - fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \ - #err, err_, __FILE__, __LINE__); \ - fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n");\ - exit(1); \ - } \ - } while (0) - -#define CLBLAST_CHECK(err) \ - do { \ - CLBlastStatusCode err_ = (err); \ - if (err_ != CLBlastSuccess) { \ - fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \ - #err, err_, __FILE__, __LINE__); \ - fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n");\ - exit(1); \ - } \ - } while (0) - -std::array dequant_str_keys = { - "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC" -}; - -std::array dequant_str_values = { - "dequantize_row_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0", - "dequantize_row_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1", - "dequantize_row_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0", - "dequantize_row_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1", - "dequantize_row_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0", - "convert_row_f16", "half", "1", "1", "convert_f16" -}; - -std::array dequant_mul_mat_vec_str_values = { - "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0", - "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1", - "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0", - "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1", - "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0", - "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16" -}; - -std::array mul_str_keys = { - "KERNEL_NAME", "TYPE" -}; -std::array mul_str_values = { - "mul_f32", "float" -}; - -static std::string& replace(std::string& s, const std::string& from, const std::string& to) { - size_t pos = 0; - while ((pos = s.find(from, pos)) != std::string::npos) { - s.replace(pos, from.length(), to); - pos += to.length(); - } - return s; -} - -static std::string generate_kernels() { - std::stringstream src; - src << program_source << '\n'; - src << k_quants_source << '\n'; - for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) { - std::string dequant_kernel = dequant_template; - std::string dmmv_kernel = dequant_mul_mat_vec_template; - for (size_t j = 0; j < dequant_str_keys.size(); j++) { - replace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]); - replace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]); - } - src << dequant_kernel << '\n'; - src << dmmv_kernel << '\n'; - } - for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) { - std::string mul_kernel = mul_template; - for (size_t j = 0; j < mul_str_keys.size(); j++) { - replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]); - } - src << mul_kernel << '\n'; - } - src << add_template << '\n'; - - return src.str(); -} - -static cl_platform_id platform; -static cl_device_id device; -static cl_context context; -static cl_command_queue queue; -static cl_program program; -static cl_kernel convert_row_f16_cl; -static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; -static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; -static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl; -static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl; -static cl_kernel mul_f32_cl; -static cl_kernel add_f32_cl; -static bool fp16_support; - -static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { - cl_program p; - char *program_log; - size_t program_size; - size_t log_size; - int err; - - program_size = strlen(program_buffer); - - p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); - if(err < 0) { - fprintf(stderr, "OpenCL error creating program"); - exit(1); - } - - std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math " - "-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 " - "-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION); - - err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL); - if(err < 0) { - - clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - program_log = (char*) malloc(log_size + 1); - program_log[log_size] = '\0'; - clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); - fprintf(stderr, "ggml_opencl: kernel compile error:\n\n%s\n", program_log); - free(program_log); - exit(1); - } - - return p; -} - -void ggml_cl_init(void) { - static bool initialized = false; - if (initialized) { - return; - } - initialized = true; - - cl_int err; - - struct cl_device; - struct cl_platform { - cl_platform_id id; - unsigned number; - char name[128]; - char vendor[128]; - struct cl_device * devices; - unsigned n_devices; - struct cl_device * default_device; - }; - - struct cl_device { - struct cl_platform * platform; - cl_device_id id; - unsigned number; - cl_device_type type; - char name[128]; - }; - - enum { NPLAT = 16, NDEV = 16 }; - - struct cl_platform platforms[NPLAT]; - unsigned n_platforms = 0; - struct cl_device devices[NDEV]; - unsigned n_devices = 0; - struct cl_device * default_device = NULL; - - platform = NULL; - device = NULL; - - cl_platform_id platform_ids[NPLAT]; - CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms)); - - for (unsigned i = 0; i < n_platforms; i++) { - struct cl_platform * p = &platforms[i]; - p->number = i; - p->id = platform_ids[i]; - CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL)); - CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL)); - - cl_device_id device_ids[NDEV]; - cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices); - if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) { - p->n_devices = 0; - } else { - CL_CHECK(clGetDeviceIDsError); - } - p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL; - p->default_device = NULL; - - for (unsigned j = 0; j < p->n_devices; j++) { - struct cl_device * d = &devices[n_devices]; - d->number = n_devices++; - d->id = device_ids[j]; - d->platform = p; - CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL)); - CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL)); - printf("\nPlatform:%d Device:%d - %s with %s",i,j,p->name,d->name); - - if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) { - p->default_device = d; - } - } - - if (default_device == NULL && p->default_device != NULL) { - default_device = p->default_device; - } - } - - printf("\n\n"); - - if (n_devices == 0) { - fprintf(stderr, "ggml_opencl: could find any OpenCL devices.\n"); - exit(1); - } - - char * user_platform_string = getenv("GGML_OPENCL_PLATFORM"); - char * user_device_string = getenv("GGML_OPENCL_DEVICE"); - int user_platform_number = -1; - int user_device_number = -1; - - unsigned n; - if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) { - user_platform_number = (int)n; - } - if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) { - user_device_number = (int)n; - } - if (user_platform_number != -1 && user_device_number != -1) { - cl_platform* platform = &platforms[user_platform_number]; - if ((unsigned)user_device_number >= platform->n_devices) { - fprintf(stderr, "ggml_opencl: invalid device number %d\n", user_device_number); - exit(1); - } - default_device = &platform->devices[user_device_number]; - } else { - - struct cl_device * selected_devices = devices; - unsigned n_selected_devices = n_devices; - - if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) { - for (unsigned i = 0; i < n_platforms; i++) { - struct cl_platform * p = &platforms[i]; - if (strstr(p->name, user_platform_string) != NULL || - strstr(p->vendor, user_platform_string) != NULL) { - user_platform_number = (int)i; - break; - } - } - if (user_platform_number == -1) { - fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string); - exit(1); - } - } - if (user_platform_number != -1) { - struct cl_platform * p = &platforms[user_platform_number]; - selected_devices = p->devices; - n_selected_devices = p->n_devices; - default_device = p->default_device; - if (n_selected_devices == 0) { - fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name); - exit(1); - } - } - - if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) { - for (unsigned i = 0; i < n_selected_devices; i++) { - struct cl_device * d = &selected_devices[i]; - if (strstr(d->name, user_device_string) != NULL) { - user_device_number = d->number; - break; - } - } - if (user_device_number == -1) { - fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string); - exit(1); - } - } - if (user_device_number != -1) { - selected_devices = &devices[user_device_number]; - n_selected_devices = 1; - default_device = &selected_devices[0]; - } - - GGML_ASSERT(n_selected_devices > 0); - - if (default_device == NULL) { - default_device = &selected_devices[0]; - } - } - - fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name); - fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name); - if (default_device->type != CL_DEVICE_TYPE_GPU) { - fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name); - } - - platform = default_device->platform->id; - device = default_device->id; - - size_t ext_str_size; - clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size); - char *ext_buffer = (char *)alloca(ext_str_size + 1); - clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL); - ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated - // Disabled due to faulty outputs - // Check if ext_buffer contains cl_khr_fp16 - fp16_support = false; // strstr(ext_buffer, "cl_khr_fp16") != NULL; - // fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false"); - - cl_context_properties properties[] = { - (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0 - }; - - CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err)); - - CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err), - (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err : - (queue = clCreateCommandQueue(context, device, 0, &err), err) - ))); - - const std::string kernel_src = generate_kernels(); - - program = build_program_from_source(context, device, kernel_src.c_str()); - - // FP16 to FP32 kernel - CL_CHECK((convert_row_f16_cl = clCreateKernel(program, "convert_row_f16", &err), err)); - - // Dequantize kernels - CL_CHECK((dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err), err)); - CL_CHECK((dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err), err)); - CL_CHECK((dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err), err)); - CL_CHECK((dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err), err)); - CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err)); - CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err)); - CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err)); - CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err)); - CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err)); - CL_CHECK((dequantize_block_q5_k_cl = clCreateKernel(program, "dequantize_block_q5_K", &err), err)); - CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err)); - - // dequant mul mat kernel - CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err)); - CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K", &err), err)); - - // mul kernel - CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err)); - - CL_CHECK((add_f32_cl = clCreateKernel(program, "add_f32", &err), err)); -} - -static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { - switch (type) { - case GGML_TYPE_Q4_0: - return &dequantize_row_q4_0_cl; - case GGML_TYPE_Q4_1: - return &dequantize_row_q4_1_cl; - case GGML_TYPE_Q5_0: - return &dequantize_row_q5_0_cl; - case GGML_TYPE_Q5_1: - return &dequantize_row_q5_1_cl; - case GGML_TYPE_Q8_0: - return &dequantize_row_q8_0_cl; - case GGML_TYPE_Q2_K: - return &dequantize_block_q2_k_cl; - case GGML_TYPE_Q3_K: - return &dequantize_block_q3_k_cl; - case GGML_TYPE_Q4_K: - return &dequantize_block_q4_k_cl; - case GGML_TYPE_Q5_K: - return &dequantize_block_q5_k_cl; - case GGML_TYPE_Q6_K: - return &dequantize_block_q6_k_cl; - case GGML_TYPE_F16: - return &convert_row_f16_cl; - default: - return nullptr; - } -} - -static size_t ggml_cl_global_denom(ggml_type type) { - switch (type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - return 1; - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - return 4; - case GGML_TYPE_Q4_K: - return 8; - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - return 4; - case GGML_TYPE_F16: - default: - return 1; - } -} - -static size_t ggml_cl_local_size(ggml_type type) { - switch (type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - return 0; - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - return 64; - case GGML_TYPE_Q4_K: - return 32; - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - return 64; - case GGML_TYPE_F16: - default: - return 0; - } -} - -static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) { - switch (type) { - case GGML_TYPE_Q4_0: - return &dequantize_mul_mat_vec_q4_0_cl; - case GGML_TYPE_Q4_1: - return &dequantize_mul_mat_vec_q4_1_cl; - case GGML_TYPE_Q5_0: - return &dequantize_mul_mat_vec_q5_0_cl; - case GGML_TYPE_Q5_1: - return &dequantize_mul_mat_vec_q5_1_cl; - case GGML_TYPE_Q8_0: - return &dequantize_mul_mat_vec_q8_0_cl; - case GGML_TYPE_F16: - return &convert_mul_mat_vec_f16_cl; - case GGML_TYPE_Q2_K: - return &dequantize_mul_mat_vec_q2_K_cl; - case GGML_TYPE_Q3_K: - return &dequantize_mul_mat_vec_q3_K_cl; - case GGML_TYPE_Q4_K: - return &dequantize_mul_mat_vec_q4_K_cl; - case GGML_TYPE_Q5_K: - return &dequantize_mul_mat_vec_q5_K_cl; - case GGML_TYPE_Q6_K: - return &dequantize_mul_mat_vec_q6_K_cl; - default: - return nullptr; - } -} - -// buffer pool for cl -#define MAX_CL_BUFFERS 400 - -struct scoped_spin_lock { - std::atomic_flag& lock; - scoped_spin_lock(std::atomic_flag& lock) : lock(lock) { - while (lock.test_and_set(std::memory_order_acquire)) { - ; // spin - } - } - ~scoped_spin_lock() { - lock.clear(std::memory_order_release); - } - scoped_spin_lock(const scoped_spin_lock&) = delete; - scoped_spin_lock& operator=(const scoped_spin_lock&) = delete; -}; - -struct cl_buffer { - cl_mem mem; - size_t size = 0; -}; - -static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS]; -static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT; - -static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) { - scoped_spin_lock lock(g_cl_pool_lock); - cl_int err; - - int best_i = -1; - size_t best_size = std::numeric_limits::max(); //smallest unused buffer that fits our needs - int worst_i = -1; - size_t worst_size = 0; //largest unused buffer seen so far - for (int i = 0; i < MAX_CL_BUFFERS; ++i) { - cl_buffer &b = g_cl_buffer_pool[i]; - if (b.size > 0 && b.size >= size && b.size < best_size) - { - best_i = i; - best_size = b.size; - } - if (b.size > 0 && b.size > worst_size) - { - worst_i = i; - worst_size = b.size; - } - } - if(best_i!=-1) //found the smallest buffer that fits our needs - { - cl_buffer& b = g_cl_buffer_pool[best_i]; - cl_mem mem = b.mem; - *actual_size = b.size; - b.size = 0; - return mem; - } - if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory - { - cl_buffer& b = g_cl_buffer_pool[worst_i]; - cl_mem mem = b.mem; - b.size = 0; - clReleaseMemObject(mem); - } - cl_mem mem; - CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err)); - *actual_size = size; - return mem; -} - -static void ggml_cl_pool_free(cl_mem mem, size_t size) { - scoped_spin_lock lock(g_cl_pool_lock); - - for (int i = 0; i < MAX_CL_BUFFERS; ++i) { - cl_buffer& b = g_cl_buffer_pool[i]; - if (b.size == 0) { - b.mem = mem; - b.size = size; - return; - } - } - fprintf(stderr, "WARNING: cl buffer pool full, increase MAX_CL_BUFFERS\n"); - clReleaseMemObject(mem); -} - -void ggml_cl_free_data(const struct ggml_tensor* tensor) { - if (!tensor->clblast_offload_gpu) { - return; - } - - cl_mem mem = (cl_mem)tensor->extra; - clReleaseMemObject(mem); -} - -static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) { - cl_int err; - const uint64_t ne0 = src->ne[0]; - const uint64_t ne1 = src->ne[1]; - const uint64_t nb0 = src->nb[0]; - const uint64_t nb1 = src->nb[1]; - const uint64_t nb2 = src->nb[2]; - const uint64_t nb3 = src->nb[3]; - const enum ggml_type type = src->type; - const size_t ts = ggml_type_size(type); - const size_t bs = ggml_blck_size(type); - const uint64_t row_size = ts*ne0/bs; - - const char * x = (const char *) src->data + i2*nb2 + i3*nb3; - if (nb0 == ts && nb1 == row_size) { - return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev); - } - if (nb0 == ts) { - const size_t buffer_origin[3] = { offset, 0, 0 }; - const size_t host_origin[3] = { 0, 0, 0 }; - const size_t region[3] = { row_size, ne1, 1 }; - return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev); - } - std::vector events; - if (ev && ne1>1) events.reserve(ne1-1); - for (uint64_t i1 = 0; i1 < ne1; i1++) { - // pretend the row is a matrix with cols=1 - const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 }; - const size_t host_origin[3] = { 0, 0, 0 }; - const size_t region[3] = { ts, ne0/bs, 1 }; - // if an event is requested, make the last write wait for all previous writes to complete - if (ev && i1) { - events.push_back(*ev); - } - cl_uint nevents = i1 == ne1-1 ? events.size() : 0U; - err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev); - if (err != CL_SUCCESS) { - for (auto event : events) { - clReleaseEvent(event); - } - return err; - } - } - for (auto event : events) { - CL_CHECK(clReleaseEvent(event)); - } - return CL_SUCCESS; -} - -static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src1->clblast_offload_gpu); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - size_t x_size; - size_t d_size; - - cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0 - cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted. - cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst - - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - cl_event ev; - - // copy src0 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev)); - - const int64_t i13 = i03%ne13; - const int64_t i12 = i02%ne12; - const int i1 = i13*ne12*ne11 + i12*ne11; - - cl_int x_offset = 0; - cl_int y_offset = i1*ne10; - cl_int d_offset = 0; - - size_t global = ne00 * ne01; - cl_int ky = ne10 * ne11; - - CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky)); - CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); - - CL_CHECK(clReleaseEvent(ev)); - CL_CHECK(clFinish(queue)); - - // copy dst to host - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL)); - } - } - ggml_cl_pool_free(d_X, x_size); - ggml_cl_pool_free(d_D, d_size); -} - -void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cl_mul_f32(src0, src1, dst); -} - -static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src1->clblast_offload_gpu); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - size_t x_size; - size_t d_size; - - cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0 - cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted. - cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst - - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - cl_event ev; - - // copy src0 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev)); - - const int64_t i13 = i03%ne13; - const int64_t i12 = i02%ne12; - const int i1 = i13*ne12*ne11 + i12*ne11; - - cl_int x_offset = 0; - cl_int y_offset = i1*ne10; - cl_int d_offset = 0; - - size_t global = ne00 * ne01; - cl_int ky = ne10 * ne11; - - CL_CHECK(clSetKernelArg(add_f32_cl, 0, sizeof(cl_mem), &d_X)); - CL_CHECK(clSetKernelArg(add_f32_cl, 1, sizeof(cl_int), &x_offset)); - CL_CHECK(clSetKernelArg(add_f32_cl, 2, sizeof(cl_mem), &d_Y)); - CL_CHECK(clSetKernelArg(add_f32_cl, 3, sizeof(cl_int), &y_offset)); - CL_CHECK(clSetKernelArg(add_f32_cl, 4, sizeof(cl_mem), &d_D)); - CL_CHECK(clSetKernelArg(add_f32_cl, 5, sizeof(cl_int), &d_offset)); - CL_CHECK(clSetKernelArg(add_f32_cl, 6, sizeof(cl_int), &ky)); - CL_CHECK(clEnqueueNDRangeKernel(queue, add_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); - - CL_CHECK(clReleaseEvent(ev)); - CL_CHECK(clFinish(queue)); - - // copy dst to host - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL)); - } - } - ggml_cl_pool_free(d_X, x_size); - ggml_cl_pool_free(d_D, d_size); -} - -void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cl_add_f32(src0, src1, dst); -} - -static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size; - size_t y_size; - size_t d_size; - cl_mem d_X; - if (src0->clblast_offload_gpu) { // NOLINT - d_X = (cl_mem) src0->extra; - } else { - d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); - } - cl_mem d_Y = src1->clblast_offload_gpu ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); - cl_mem d_D = dst->clblast_offload_gpu ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); - - size_t x_offset = 0; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - // TODO: copy src0 here when r3>1 - for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - if (src0->clblast_offload_gpu) { - x_offset = (i03 * ne02 + i02) * x_ne; - } else { - // copy src0 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); - } - - for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { - // copy src1 to device - if (!src1->clblast_offload_gpu) { - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); - } - - CL_CHECK(clFinish(queue)); - - // compute - cl_event ev_sgemm; - clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor, - (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, x_offset, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, &ev_sgemm); - - if (status != clblast::StatusCode::kSuccess) { - printf("\nF32 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast(status),ne00,ne01,ne10,ne11); - GGML_ASSERT(false); - } - - // copy dst to host - if (!dst->clblast_offload_gpu) { - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); - } - } - } - } - } - - if (!src0->clblast_offload_gpu) { - ggml_cl_pool_free(d_X, x_size); - } - if (!src1->clblast_offload_gpu) { - ggml_cl_pool_free(d_Y, y_size); - } - if (!dst->clblast_offload_gpu) { - ggml_cl_pool_free(d_D, d_size); - } -} - -static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) { - GGML_ASSERT(fp16_support); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f); - const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f); - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne); - GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne); - ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata; - - size_t x_size; - size_t y_size; - size_t d_size; - cl_mem d_X; - if (src0->clblast_offload_gpu) { // NOLINT - d_X = (cl_mem) src0->extra; - } else { - d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size); - } - cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size); - cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size); - - bool src1_cont_rows = nb10 == sizeof(float); - bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); - - size_t x_offset = 0; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - // TODO: copy src0 here when r3>1 - for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - if (src0->clblast_offload_gpu) { - x_offset = (i03 * ne02 + i02) * x_ne; - } else { - // copy src0 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); - } - - // FIXME: convert on device - - for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { - // convert src1 to fp16 - // TODO: use multiple threads - char * src1i = (char *) src1->data + i13*nb13 + i12*nb12; - if (src1_cont_rows) { - if (src1_cont_cols) { - ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); - } - else { - for (int64_t i11 = 0; i11 < ne11; i11++) { - ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10); - } - } - } - else { - for (int64_t i11 = 0; i11 < ne11; i11++) { - for (int64_t i10 = 0; i10 < ne10; i10++) { - // very slow due to no inlining - tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10)); - } - } - } - - // copy src1 to device - CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL)); - - CL_CHECK(clFinish(queue)); - - // compute - cl_event ev_sgemm; - clblast::StatusCode status = (clblast::StatusCode)CLBlastHgemm((CLBlastLayout)clblast::Layout::kColMajor, - (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, x_offset, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, &ev_sgemm); - - if (status != clblast::StatusCode::kSuccess) { - printf("\nF16 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast(status),ne00,ne01,ne10,ne11); - GGML_ASSERT(false); - } - - // copy dst to host, then convert to float - if (!dst->clblast_offload_gpu) { - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - ggml_fp16_to_fp32_row(tmp, d, d_ne); - } else { - // FIXME: convert dst to fp32 on device - } - } - } - } - } - - if (!src0->clblast_offload_gpu) { - ggml_cl_pool_free(d_X, x_size); - } - ggml_cl_pool_free(d_Y, y_size); - ggml_cl_pool_free(d_D, d_size); -} - -static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - const ggml_type type = src0->type; - const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0; - - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice - const size_t q_sz = ggml_type_size(type) * x_bps; - - size_t x_size; - size_t y_size; - size_t d_size; - size_t q_size; - cl_mem d_X; - if (!mul_mat_vec) { - d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); - } - cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); - cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); - cl_mem d_Q; - if (!src0->clblast_offload_gpu) { - d_Q = ggml_cl_pool_malloc(q_sz, &q_size); - } - - cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type); - cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type); - if(to_fp32_cl==nullptr) - { - printf("\nOpenCL: Unsupported Tensor Type Detected: %d\n",type); - printf("Note that CLBlast only works on classic quants and K-quants!\n"); - } - GGML_ASSERT(to_fp32_cl != nullptr); - - const size_t global_denom = ggml_cl_global_denom(type); - const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type); - - size_t ev_idx = 0; - std::vector events; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - // TODO: copy and dequantize src0 here when r3>1 - for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - // copy src0 to device if necessary - if (!src0->clblast_offload_gpu) { - events.emplace_back(); - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); - } else if (src0->clblast_offload_gpu) { - d_Q = (cl_mem) src0->extra; - } else { - GGML_ASSERT(false); - } - - if (!mul_mat_vec) { - // convert src0 to fp32 on device - const size_t global = x_ne / global_denom; - const size_t offset = src0->clblast_offload_gpu ? (i03 * ne02 + i02) * x_bps : 0; - CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); - CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); - CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); - } - - int64_t i12 = i02 * r2; - int64_t e12 = i12 + r2; - events.reserve(e12 - i12); - for (; i12 < e12; i12++) { - if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel - // copy src1 to device - events.emplace_back(); - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++)); - - // compute - const size_t global = ne01 * local; - const size_t offset = src0->clblast_offload_gpu ? (i03 * ne02 + i02) * x_bps : 0; - const cl_int ncols = ne00; - events.emplace_back(); - CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); - CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL)); - CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y)); - CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D)); - CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols)); - CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++)); - } else { // CLBlast matrix matrix multiplication - // copy src1 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); - - // wait for conversion - CL_CHECK(clFinish(queue)); - - // compute - events.emplace_back(); - clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor, - (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, 0, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, events.data() + ev_idx++); - - if (status != clblast::StatusCode::kSuccess) { - printf("\nQF32 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast(status),ne00,ne01,ne10,ne11); - GGML_ASSERT(false); - } - } - - // copy dst to host - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL)); - for (auto *event : events) { - clReleaseEvent(event); - } - - ev_idx = 0; - events.clear(); - } - } - } - } - - if (!mul_mat_vec) { - ggml_cl_pool_free(d_X, x_size); - } - ggml_cl_pool_free(d_Y, y_size); - ggml_cl_pool_free(d_D, d_size); - if (!src0->clblast_offload_gpu) { - ggml_cl_pool_free(d_Q, q_size); - } -} - - -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) { - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these - if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && - src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 && - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->clblast_offload_gpu)) { - return true; - } - - return false; -} - -static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) { - // If device doesn't support FP16 - if (!fp16_support) { - return false; - } - - size_t src0_sz = ggml_nbytes(src0); - size_t src1_sz = ggml_nbytes(src1); - - // mul_mat_q: src0 is converted to fp32 on device - size_t mul_mat_q_transfer = src0_sz + src1_sz; - - // mul_mat_f16: src1 is converted to fp16 on cpu - size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_fp16_t) * ggml_nelements(src1); - - // choose the smaller one to transfer to the device - // TODO: this is not always the best choice due to the overhead of converting to fp16 - return mul_mat_f16_transfer < mul_mat_q_transfer; -} - -void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) { - GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); - - if (src0->type == GGML_TYPE_F32) { - ggml_cl_mul_mat_f32(src0, src1, dst); - } - else if (src0->type == GGML_TYPE_F16) { - if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) { - ggml_cl_mul_mat_f16(src0, src1, dst, wdata, wsize); - } - else { - ggml_cl_mul_mat_q_f32(src0, src1, dst); - } - } - else if (ggml_is_quantized(src0->type)) { - ggml_cl_mul_mat_q_f32(src0, src1, dst); - } - else { - GGML_ASSERT(false); - } -} - -size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) { - return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]); - } - return 0; -} - -void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) { - const int64_t ne0 = tensor->ne[0]; - const int64_t ne1 = tensor->ne[1]; - const int64_t ne2 = tensor->ne[2]; - const int64_t ne3 = tensor->ne[3]; - - const ggml_type type = tensor->type; - const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type)); - const size_t q_sz = s_sz * (size_t) (ne2 * ne3); - - size_t q_size; - cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size); - - tensor->data = data; - // copy tensor to device - size_t offset = 0; - for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = 0; i2 < ne2; i2++) { - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL)); - offset += s_sz; - } - } - - CL_CHECK(clFinish(queue)); - - tensor->extra = dst; - GGML_ASSERT(tensor->clblast_offload_gpu); -} diff --git a/otherarch/ggml_v3b-opencl.h b/otherarch/ggml_v3b-opencl.h deleted file mode 100644 index f63e5ad4f..000000000 --- a/otherarch/ggml_v3b-opencl.h +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once - -#include "ggml.h" -#include "ggml-cpu.h" -#include "ggml-backend.h" - -#ifdef __cplusplus -extern "C" { -#endif - -GGML_API void ggml_cl_init(void); - -GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst); -GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); - -GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor); -GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor); - -#ifdef __cplusplus -} -#endif \ No newline at end of file diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index f220595d3..5875b7f14 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -8,10 +8,6 @@ #include #include -#if defined(GGML_USE_CLBLAST) -# include "ggml_v3b-opencl.h" -#endif - static const size_t kiB = 1024; static const size_t MiB = 1024*kiB; static const size_t GiB = 1024*MiB; @@ -973,7 +969,6 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { } } -static int clblast_offload_fallback_layers = 0; static int layer_name_to_number(std::string inputString) { size_t firstDotPosition = inputString.find('.'); @@ -1220,18 +1215,6 @@ bool llama_model_loader::load_all_data( } } - #if defined(GGML_USE_CLBLAST) - int layernum = layer_name_to_number(cur->name); - bool shouldoffload = (layernum>=0 && clblast_offload_fallback_layers>layernum); - if(shouldoffload) - { - cur->clblast_offload_gpu = true; - ggml_cl_transform_tensor(cur->data, cur); - }else{ - cur->clblast_offload_gpu = false; - } - #endif - size_done += n_size; } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c7b70c978..8f575dcb0 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -131,10 +131,6 @@ #include "models/mistral3.cpp" #include "models/graph-context-mamba.cpp" -#if defined(GGML_USE_CLBLAST) -# include "ggml_v3b-opencl.h" -#endif - const char * llm_type_name(llm_type type) { switch (type) { case LLM_TYPE_14M: return "14M"; @@ -2639,12 +2635,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0); - #if defined(GGML_USE_CLBLAST) - printf("\nOpenCL GPU Offload Fallback...\n"); - clblast_offload_fallback_layers = n_gpu_layers; - i_gpu_start = std::max((int64_t) hparams.n_layer, (int64_t) 0); - #endif - const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1); auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il); @@ -7563,9 +7553,6 @@ static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t d throw std::runtime_error(format("failed to create ggml context")); } - #if defined(GGML_USE_CLBLAST) - ggml_cl_init(); - #endif ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) }; ggml_tensor * op_tensor = fn(ctx.get()); for (int i = 0; i < GGML_MAX_SRC; i++) { diff --git a/src/llama.cpp b/src/llama.cpp index 2e184f7a1..f4b8bc90d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -43,8 +43,6 @@ static bool old_mixtral_warning_showed = false; #ifdef GGML_USE_CUDA # include "ggml-cuda.h" -#elif defined(GGML_USE_CLBLAST) -# include "ggml_v3b-opencl.h" #endif #if defined(_MSC_VER) @@ -804,13 +802,9 @@ bool llama_supports_mlock(void) { } bool llama_supports_gpu_offload(void) { - #if defined(GGML_USE_CLBLAST) - return true; - #else return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr || ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr || llama_supports_rpc(); - #endif } bool llama_supports_rpc(void) {