diff --git a/CMakePresets.json b/CMakePresets.json index e2b7a79e3..fba22af9a 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -11,9 +11,21 @@ "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." } }, - + { + "name": "sycl-base", + "hidden": true, + "generator": "Ninja", + "binaryDir": "${sourceDir}/build-${presetName}", + "cacheVariables": { + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "CMAKE_CXX_COMPILER": "icx", + "LLAMA_SYCL": "ON", + "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." + } + }, { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } }, - { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, + { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } }, + { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, { "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } }, { @@ -35,15 +47,18 @@ }, { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] }, - { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] }, - { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] }, + { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] }, + { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] }, { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] }, - { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] }, - { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }, + { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] }, + { "name": "arm64-windows-msvc+static-release", "inherits": [ 
"base", "arm64-windows-msvc", "reldbg", "static" ] }, { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] }, - { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] }, - { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] } + { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] }, + { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] }, + + { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] }, + { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] } ] } diff --git a/Makefile b/Makefile index da83137e5..859ad8420 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ default: koboldcpp_default koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip whispermain sdmain gguf-split dev: koboldcpp_openblas dev2: koboldcpp_clblast - +dev3: koboldcpp_vulkan ifndef UNAME_S UNAME_S := $(shell uname -s) @@ -158,7 +158,7 @@ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instanc ifdef LLAMA_CUBLAS CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include - CUBLASLD_FLAGS = -lcuda -lcublas -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/local/cuda/targets/sbsa-linux/lib -L/usr/lib/wsl/lib + CUBLASLD_FLAGS = -lcuda -lcublas -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/local/cuda/targets/sbsa-linux/lib -L/usr/lib/wsl/lib CUBLAS_OBJS = ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o CUBLAS_OBJS += $(patsubst %.cu,%.o,$(wildcard 
ggml-cuda/*.cu)) CUBLAS_OBJS += $(OBJS_CUDA_TEMP_INST) diff --git a/common/common.cpp b/common/common.cpp index 24eb8b64b..1bceee198 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -7,7 +7,6 @@ #include "llama.h" #include -#include #include #include #include @@ -543,6 +542,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } else { invalid_param = true; } return true; } @@ -1871,6 +1871,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "backend" }); options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" }); + if (llama_supports_mlock()) { options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" }); } @@ -2658,7 +2659,14 @@ static bool llama_download_file(const std::string & url, const std::string & pat } // Set the output file - std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb"), fclose); + + struct FILE_deleter { + void operator()(FILE * f) const { + fclose(f); + } + }; + + std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb")); if (!outfile) { fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str()); return false; diff --git a/common/common.h b/common/common.h index 29e12c59d..123c148f0 100644 --- a/common/common.h +++ b/common/common.h @@ -69,7 +69,6 @@ struct gpt_params { int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs 
- int32_t n_beams = 0; // if non-zero then use beam search of given width. int32_t grp_attn_n = 1; // group-attention factor int32_t grp_attn_w = 512; // group-attention width int32_t n_print = -1; // print token count every n tokens (-1 = disabled) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index fbf1e1ea3..67598b561 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -214,7 +214,7 @@ src_func = f""" """ convert_py_pth = pathlib.Path("convert-hf-to-gguf.py") -convert_py = convert_py_pth.read_text() +convert_py = convert_py_pth.read_text(encoding="utf-8") convert_py = re.sub( r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)", lambda m: m.group(1) + src_func + m.group(3), @@ -222,7 +222,7 @@ convert_py = re.sub( flags=re.DOTALL | re.MULTILINE, ) -convert_py_pth.write_text(convert_py) +convert_py_pth.write_text(convert_py, encoding="utf-8") logger.info("+++ convert-hf-to-gguf.py was updated") diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 9f3263bd0..f861a7785 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -18,9 +18,10 @@ static std::vector split_lines(const std::string & s) { return lines; } -static void batch_add_seq(llama_batch & batch, const std::vector & tokens, int seq_id) { - for (size_t i = 0; i < tokens.size(); i++) { - llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1); +static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { + size_t n_tokens = tokens.size(); + for (size_t i = 0; i < n_tokens; i++) { + llama_batch_add(batch, tokens[i], i, { seq_id }, true); } } @@ -41,13 +42,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu // try to get sequence embeddings - supported only when pooling_type is not NONE const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - if (embd == 
NULL) { - embd = llama_get_embeddings_ith(ctx, i); - if (embd == NULL) { - fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i); - continue; - } - } + GGML_ASSERT(embd != NULL && "failed to get sequence embeddings"); float * out = output + batch.seq_id[i][0] * n_embd; //TODO: I would also add a parameter here to enable normalization or not. @@ -98,6 +93,12 @@ int main(int argc, char ** argv) { const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); + const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); + if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__); + return 1; + } + if (n_ctx > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 213515791..2c61c2e1e 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -44,6 +44,7 @@ static std::vector> encode(llama_context * ctx, const std::ve // clear previous kv_cache values (irrelevant for embeddings) llama_kv_cache_clear(ctx); + llama_set_embeddings(ctx, true); llama_set_causal_attn(ctx, false); // run model @@ -98,7 +99,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo llama_token eos_token = llama_token_eos(mdl); llama_kv_cache_clear(ctx); + llama_set_embeddings(ctx, false); llama_set_causal_attn(ctx, true); + llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); std::vector inputs = llama_tokenize(mdl, prompt, false, true); @@ -166,8 +169,7 @@ int main(int argc, char * argv[]) { llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams); - // create new context - set to embedding mode - cparams.embeddings = true; + // create generation context llama_context * ctx = llama_new_context_with_model(mdl, cparams); // ### 
Embedding/Representation ### diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 467034d93..64c6ab9cf 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -224,7 +224,11 @@ int main(int argc, char ** argv) { inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); embd_inp = inp_pfx; embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); - embd_inp.push_back(llama_token_middle(model)); + + const llama_token middle_token = llama_token_middle(model); + if (middle_token >= 0) { + embd_inp.push_back(middle_token); + } LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix)); LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix)); @@ -529,7 +533,12 @@ int main(int argc, char ** argv) { inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); embd_inp = inp_pfx; embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); - embd_inp.push_back(llama_token_middle(model)); + + const llama_token middle_token = llama_token_middle(model); + if (middle_token >= 0) { + embd_inp.push_back(middle_token); + } + embd.clear(); n_remain = params.n_predict; n_past = 0; diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift index 5bde18917..2c1e3f61b 100644 --- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift +++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift @@ -131,22 +131,29 @@ class LlamaState: ObservableObject { messageLog += "\(text)" - while await llamaContext.n_cur < llamaContext.n_len { - let result = await llamaContext.completion_loop() - messageLog += "\(result)" + Task.detached { + while await llamaContext.n_cur < llamaContext.n_len { + let result = await llamaContext.completion_loop() + await MainActor.run { + self.messageLog += "\(result)" + } + } + + let t_end = DispatchTime.now().uptimeNanoseconds + let t_generation = Double(t_end - t_heat_end) / self.NS_PER_S + let tokens_per_second = 
Double(await llamaContext.n_len) / t_generation + + await llamaContext.clear() + + await MainActor.run { + self.messageLog += """ + \n + Done + Heat up took \(t_heat)s + Generated \(tokens_per_second) t/s\n + """ + } } - - let t_end = DispatchTime.now().uptimeNanoseconds - let t_generation = Double(t_end - t_heat_end) / NS_PER_S - let tokens_per_second = Double(await llamaContext.n_len) / t_generation - - await llamaContext.clear() - messageLog += """ - \n - Done - Heat up took \(t_heat)s - Generated \(tokens_per_second) t/s\n - """ } func bench() async { diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 55b7b2f70..eb89d16da 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -73,9 +73,10 @@ static std::vector chunk_file(const std::string & filename, int chunk_siz return chunks; } -static void batch_add_seq(llama_batch & batch, const std::vector & tokens, int seq_id) { - for (size_t i = 0; i < tokens.size(); i++) { - llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1); +static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { + size_t n_tokens = tokens.size(); + for (size_t i = 0; i < n_tokens; i++) { + llama_batch_add(batch, tokens[i], i, { seq_id }, true); } } @@ -160,6 +161,12 @@ int main(int argc, char ** argv) { const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); + const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); + if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__); + return 1; + } + if (n_ctx > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5e0b05421..19549c5c4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ 
-1595,7 +1595,7 @@ struct server_context { } else { std::string prompt; if (task.data.contains("prompt") && task.data.at("prompt").is_string()) { - json_value(task.data, "prompt", std::string()); + prompt = json_value(task.data, "prompt", std::string()); } slot = get_available_slot(prompt); @@ -2039,7 +2039,12 @@ struct server_context { prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model)); prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end()); - prefix_tokens.push_back(llama_token_middle(model)); + + const llama_token middle_token = llama_token_middle(model); + if (middle_token >= 0) { + prefix_tokens.push_back(middle_token); + } + prompt_tokens = prefix_tokens; } else { prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt diff --git a/examples/sycl/win-build-sycl.bat b/examples/sycl/win-build-sycl.bat index b8037aae8..027173b0a 100644 --- a/examples/sycl/win-build-sycl.bat +++ b/examples/sycl/win-build-sycl.bat @@ -13,16 +13,16 @@ if %errorlevel% neq 0 goto ERROR :: for FP16 :: faster for long-prompt inference -:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON +:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON :: for FP32 -cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release +cmake -G "Ninja" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release if %errorlevel% neq 0 goto ERROR :: build example/main only :: make main :: build all binary -make -j +cmake --build . -j if %errorlevel% neq 0 goto ERROR cd .. 
diff --git a/ggml-backend.c b/ggml-backend.c index 26dce7f72..13c71c310 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1706,14 +1706,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { bool backend_ids_changed = false; for (int i = 0; i < sched->graph->n_nodes; i++) { - if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) { + if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] && + sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) { backend_ids_changed = true; break; } } if (!backend_ids_changed) { for (int i = 0; i < sched->graph->n_leafs; i++) { - if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) { + if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] && + sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) { backend_ids_changed = true; break; } @@ -1977,6 +1979,15 @@ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) { return sched->n_copies; } +int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) { + return sched->n_backends; +} + +ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) { + GGML_ASSERT(i >= 0 && i < sched->n_backends); + return sched->backends[i]; +} + size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) { int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); diff --git a/ggml-backend.h b/ggml-backend.h index 47fd81475..4a38eeb5c 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -182,6 +182,9 @@ extern "C" { // Initialize backend buffers from a measure graph GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + GGML_API int 
ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched); + GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i); + // Get the number of splits of the last graph GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d1cc37749..e31e0d9e6 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -635,7 +635,7 @@ static int64_t get_row_rounding(const std::array & } const int cc = ggml_cuda_info().devices[id].cc; - row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc))); + row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc)); } return row_rounding; } diff --git a/ggml-cuda/common.cuh b/ggml-cuda/common.cuh index ed93f65cb..e8eb834e5 100644 --- a/ggml-cuda/common.cuh +++ b/ggml-cuda/common.cuh @@ -652,8 +652,8 @@ static int get_mmq_x_max_host(const int cc) { } // Round rows to this value for --split-mode row: -static int get_mmq_y_host(const int cc, const int mmq_x) { - return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64; +static int get_mmq_y_host(const int cc) { + return cc >= CC_VOLTA ? 
128 : 64; } ////////////////////// diff --git a/ggml-cuda/mmq.cu b/ggml-cuda/mmq.cu index 1d6b9e698..6dbd85fef 100644 --- a/ggml-cuda/mmq.cu +++ b/ggml-cuda/mmq.cu @@ -30,34 +30,34 @@ void ggml_cuda_op_mul_mat_q( switch (src0->type) { case GGML_TYPE_Q4_0: - mul_mat_q_case(args, stream); + mul_mat_q_case(ctx, args, stream); break; case GGML_TYPE_Q4_1: - mul_mat_q_case(args, stream); + mul_mat_q_case(ctx, args, stream); break; case GGML_TYPE_Q5_0: - mul_mat_q_case(args, stream); + mul_mat_q_case(ctx, args, stream); break; case GGML_TYPE_Q5_1: - mul_mat_q_case(args, stream); + mul_mat_q_case(ctx, args, stream); break; case GGML_TYPE_Q8_0: - mul_mat_q_case(args, stream); + mul_mat_q_case(ctx, args, stream); break; case GGML_TYPE_Q2_K: - mul_mat_q_case(args, stream); + mul_mat_q_case(ctx, args, stream); break; case GGML_TYPE_Q3_K: - mul_mat_q_case(args, stream); + mul_mat_q_case(ctx, args, stream); break; case GGML_TYPE_Q4_K: - mul_mat_q_case(args, stream); + mul_mat_q_case(ctx, args, stream); break; case GGML_TYPE_Q5_K: - mul_mat_q_case(args, stream); + mul_mat_q_case(ctx, args, stream); break; case GGML_TYPE_Q6_K: - mul_mat_q_case(args, stream); + mul_mat_q_case(ctx, args, stream); break; default: GGML_ASSERT(false); diff --git a/ggml-cuda/mmq.cuh b/ggml-cuda/mmq.cuh index 6d57974fb..e2d07c202 100644 --- a/ggml-cuda/mmq.cuh +++ b/ggml-cuda/mmq.cuh @@ -8,6 +8,7 @@ #include #define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1) +#define MMQ_NWARPS 8 typedef void (*load_tiles_mmq_t)( const char * __restrict__ x, int * __restrict__ x_qs, half2 * __restrict__ x_dm, @@ -15,7 +16,7 @@ typedef void (*load_tiles_mmq_t)( typedef void (*vec_dot_mmq_t)( const int * __restrict__ x_qs, const half2 * __restrict__ x_dm, const int * __restrict__ x_sc, const int * __restrict__ y, float * __restrict__ sum, const int & k0); -typedef void (*mmq_write_back_t)(const float * __restrict__ sum, float * __restrict__ dst, const int & ne0, const int & ne1); +typedef void (*mmq_write_back_t)(const 
float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max); struct block_q8_1_mmq { half2 ds[4]; @@ -50,21 +51,17 @@ static constexpr __device__ int get_mmq_x_max_device() { // get_mmq_y_host is in common.cuh so that it can be used to determine the correct way to round for --split-mode row +static constexpr __device__ int get_mmq_y_device() { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -static constexpr __device__ int get_mmq_y_device(int mmq_x) { - return mmq_x >= 32 ? 128 : 64; -} + return 128; #else #if __CUDA_ARCH__ >= CC_VOLTA -static constexpr __device__ int get_mmq_y_device(int mmq_x) { - return mmq_x >= 32 ? 128 : 64; -} + return 128; #else -static constexpr __device__ int get_mmq_y_device(int /*mmq_x*/) { return 64; -} #endif // __CUDA_ARCH__ >= CC_VOLTA #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +} #define TILE_X_SIZES_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0} #define TILE_X_SIZES_Q4_1 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_1 + mmq_y/QI4_1, 0} @@ -1734,30 +1731,34 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( } template -static __device__ __forceinline__ void mmq_write_back_dp4a(const float * __restrict__ sum, float * __restrict__ dst, const int & ne0, const int & ne1) { +static __device__ __forceinline__ void mmq_write_back_dp4a( + const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) { + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { - const int j = blockIdx.y*mmq_x + j0 + threadIdx.y; + const int j = j0 + threadIdx.y; - if (j >= ne1) { + if (j > j_max) { return; } #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { - const int i = blockIdx.x*mmq_y + i0 + threadIdx.x; + const int i = i0 + threadIdx.x; - if (need_check && i >= ne0) { + if (need_check && i > i_max) { continue; } - dst[j*ne0 + i] = 
sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + dst[j*stride + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; } } } template -static __device__ __forceinline__ void mmq_write_back_mma(const float * __restrict__ sum, float * __restrict__ dst, const int & ne0, const int & ne1) { +static __device__ __forceinline__ void mmq_write_back_mma( + const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) { + typedef mma_int_C_I16J8 mma_C; const int i0 = threadIdx.y*mma_C::I; @@ -1769,19 +1770,19 @@ static __device__ __forceinline__ void mmq_write_back_mma(const float * __restri for (int j0 = 0; j0 < mmq_x; j0 += mma_C::J) { #pragma unroll for (int l = 0; l < mma_C::ne; ++l) { - const int j = blockIdx.y*mmq_x + j0 + mma_C::get_j(l); + const int j = j0 + mma_C::get_j(l); - if (j >= ne1) { + if (j > j_max) { continue; } - const int i = blockIdx.x*mmq_y + i0 + mma_C::get_i(l); + const int i = i0 + mma_C::get_i(l); - if (need_check && i >= ne0) { + if (need_check && i > i_max) { continue; } - dst[j*ne0 + i] = sum[(j0/mma_C::J)*mma_C::ne + l]; + dst[j*stride + i] = sum[(j0/mma_C::J)*mma_C::ne + l]; } } } @@ -1896,32 +1897,16 @@ static bool mmq_need_sum(const ggml_type type_x) { return false; } -template -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*nwarps, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#else -#if __CUDA_ARCH__ >= CC_VOLTA - __launch_bounds__(WARP_SIZE*nwarps, 1) -#else - __launch_bounds__(WARP_SIZE*nwarps, 2) -#endif // __CUDA_ARCH__ >= CC_VOLTA -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -static __global__ void mul_mat_q( - const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, - const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) { - - // Skip unused template specializations for 
faster compilation: - if (mmq_x > get_mmq_x_max_device()) { - NO_DEVICE_CODE; - return; - } +template +static __device__ void mul_mat_q_process_tile( + const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, + const int & ne00, const int & ne01, const int & stride01, const int & ne10, const int & ne11, const int & stride11, const int & ne0, + const int & it, const int & jt, const int & kb0_start, const int & kb0_stop) { constexpr int qk = ggml_cuda_type_traits::qk; constexpr int qr = ggml_cuda_type_traits::qr; constexpr int qi = ggml_cuda_type_traits::qi; - constexpr int mmq_y = get_mmq_y_device(mmq_x); + constexpr int mmq_y = get_mmq_y_device(); constexpr int vdr = mmq_type_traits::vdr; constexpr load_tiles_mmq_t load_tiles = mmq_type_traits::load_tiles; @@ -1941,20 +1926,18 @@ static __global__ void mul_mat_q( int * tile_x_sc = (int *) (tile_x_dm + txs.dm); int * tile_y = (int *) (tile_x_sc + txs.sc); // [mmq_x * (WARP_SIZE + WARP_SIZE/QI8_1)] - const int blocks_per_row_x = ne00 / qk; - const int blocks_per_warp = WARP_SIZE / qi; - - const int & ne1 = ne11; - - const int tile_x_max_i = ne01 - blockIdx.x*mmq_y - 1; - - const int * y = (const int *) yc + blockIdx.y*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int)); + constexpr int blocks_per_warp = WARP_SIZE / qi; float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f}; - for (int kb0 = 0; kb0 < blocks_per_row_x; kb0 += blocks_per_warp) { + const int tile_x_max_i = ne01 - it*mmq_y - 1; + const int tile_y_max_j = ne11 - jt*mmq_x - 1; - load_tiles(x, tile_x_qs, tile_x_dm, tile_x_sc, stride01*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride01); + const int * y = (const int *) yc + jt*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int)); + + for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_warp) { + + load_tiles(x, tile_x_qs, tile_x_dm, tile_x_sc, stride01*it*mmq_y + kb0, tile_x_max_i, stride01); #pragma unroll for (int kr = 0; kr < qr; ++kr) { @@ -1977,7 +1960,176 @@ 
static __global__ void mul_mat_q( } } - write_back(sum, dst, ne0, ne1); + if (fixup) { + write_back(sum, tmp_fixup + blockIdx.x*(mmq_x*mmq_y), mmq_y, mmq_y, mmq_x); + } else { + write_back(sum, dst + jt*mmq_x*ne0 + it*mmq_y, ne0, tile_x_max_i, tile_y_max_j); + } +} + + +// The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598 + +template +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*nwarps, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#else +#if __CUDA_ARCH__ >= CC_VOLTA + __launch_bounds__(WARP_SIZE*nwarps, 1) +#else + __launch_bounds__(WARP_SIZE*nwarps, 2) +#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +static __global__ void mul_mat_q( + const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, + const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) { + + // Skip unused template specializations for faster compilation: + if (mmq_x > get_mmq_x_max_device()) { + NO_DEVICE_CODE; + return; + } + + constexpr int qk = ggml_cuda_type_traits::qk; + constexpr int qi = ggml_cuda_type_traits::qi; + constexpr int mmq_y = get_mmq_y_device(); + + // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead: +#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA + { + constexpr bool fixup = false; + mul_mat_q_process_tile + (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0, + blockIdx.x, blockIdx.y, 0, ne00/qk); + return; + } +#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA + + const int64_t blocks_per_ne00 = ne00 / qk; + constexpr int blocks_per_warp = WARP_SIZE / qi; + + const int ntx = (ne11 + mmq_x - 1) / 
mmq_x; // Number of tiles x + const int nty = (ne01 + mmq_y - 1) / mmq_y; // Number of tiles y + + // kbc == k block continuous, current index in continuous ijk space. + int64_t kbc = GGML_PAD((int64_t) blockIdx.x *blocks_per_ne00*ntx*nty / gridDim.x, blocks_per_warp); + const int64_t kbc_stop = GGML_PAD((int64_t)(blockIdx.x + 1)*blocks_per_ne00*ntx*nty / gridDim.x, blocks_per_warp); + + // kb0 == k index when doing the matrix multiplication for an output tile. + int kb0_start = kbc % blocks_per_ne00; + int kb0_stop = min(blocks_per_ne00, kb0_start + kbc_stop - kbc); + while (kbc < kbc_stop && kb0_stop == blocks_per_ne00) { + const int jt = kbc / (blocks_per_ne00*nty); // j index of current tile. + const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; // i index of current tile. + + constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer. + mul_mat_q_process_tile + (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0, + it, jt, kb0_start, kb0_stop); + + kbc += blocks_per_ne00; + kbc -= kbc % blocks_per_ne00; + + kb0_start = 0; + kb0_stop = min(blocks_per_ne00, kbc_stop - kbc); + } + + if (kbc >= kbc_stop) { + return; + } + + const int jt = kbc / (blocks_per_ne00*nty); + const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; + + constexpr bool fixup = true; // Last index writes it data to fixup buffer to avoid data races with other blocks. 
+ mul_mat_q_process_tile + (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0, + it, jt, kb0_start, kb0_stop); +} + + +template +static __global__ void mul_mat_q_stream_k_fixup( + float * __restrict__ dst, const float * __restrict__ tmp_last_tile, const int ne00, const int ne01, const int ne11, const int ne0, const int block_num_mmq) { + + constexpr int mmq_y = get_mmq_y_device(); + constexpr int qk = ggml_cuda_type_traits::qk; + constexpr int qi = ggml_cuda_type_traits::qi; + constexpr int blocks_per_warp = WARP_SIZE / qi; + const int64_t blocks_per_ne00 = ne00 / qk; + + float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f}; + + const int ntx = (ne11 + mmq_x - 1) / mmq_x; + const int nty = (ne01 + mmq_y - 1) / mmq_y; + + bool any_fixup = false; + + const int bidx_start = (blockIdx.y*nty + blockIdx.x) * block_num_mmq / (gridDim.y*gridDim.x); + const int bidx_stop = (blockIdx.y*nty + blockIdx.x + 1) * block_num_mmq / (gridDim.y*gridDim.x) + 1; + + for (int bidx = bidx_start; bidx < bidx_stop; ++bidx) { + const int64_t kbc = GGML_PAD((int64_t) bidx *blocks_per_ne00*ntx*nty / block_num_mmq, blocks_per_warp); + const int64_t kbc_stop = GGML_PAD((int64_t)(bidx + 1)*blocks_per_ne00*ntx*nty / block_num_mmq, blocks_per_warp); + + // Skip fixup tile if the MMQ CUDA block never wrote anything to it: + if (kbc == kbc_stop || kbc_stop % blocks_per_ne00 == 0) { + continue; + } + + const int jt = kbc_stop / (blocks_per_ne00*nty); + const int it = (kbc_stop - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; + + // Skip fixup tile if it's unrelated to the output tile assigned to this CUDA block: + if (it != blockIdx.x || jt != blockIdx.y) { + continue; + } + + any_fixup = true; + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = j0 + threadIdx.y; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE] += tmp_last_tile[bidx*(mmq_x*mmq_y) 
+ j*mmq_y + i]; + } + } + } + + if (!any_fixup) { + return; + } + + dst += blockIdx.y*mmq_x*ne0 + blockIdx.x*mmq_y; + + const int i_max = ne01 - blockIdx.x*mmq_y - 1; + const int j_max = ne11 - blockIdx.y*mmq_x - 1; + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + if (j > j_max) { + return; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + if (need_check && i > i_max) { + continue; + } + + dst[j*ne0 + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + } + } } struct mmq_args { @@ -1987,124 +2139,151 @@ struct mmq_args { int64_t ne0; }; -constexpr int mmq_get_nwarps(int mmq_x) { - return mmq_x >= 32 ? 8 : 4; -} - static int mmq_get_shmem(const ggml_type type, const int mmq_x, const int mmq_y) { const tile_x_sizes txs = get_tile_x_sizes_host(type, mmq_y); - const int nwarps = mmq_get_nwarps(mmq_x); const int shmem_x = txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int); const int shmem_y = mmq_x*WARP_SIZE*sizeof(int) + mmq_x*(WARP_SIZE/QI8_1)*sizeof(half2); - return shmem_x + GGML_PAD(shmem_y, nwarps*WARP_SIZE*sizeof(int)); + return shmem_x + GGML_PAD(shmem_y, MMQ_NWARPS*WARP_SIZE*sizeof(int)); } -template -static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) { +template +static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { const int id = ggml_cuda_get_device(); const int cc = ggml_cuda_info().devices[id].cc; - const int mmq_y = get_mmq_y_host(cc, mmq_x); + const int nsm = ggml_cuda_info().devices[id].nsm; + const int mmq_y = get_mmq_y_host(cc); - const int block_num_x = (args.ne01 + mmq_y - 1) / mmq_y; - const int block_num_y = (args.ne11 + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE, MMQ_NWARPS, 1); const int shmem = mmq_get_shmem(type, mmq_x, 
mmq_y); #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; if (!shmem_limit_raised[id]) { - CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); - CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); + CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); + CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); shmem_limit_raised[id] = true; } #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) + const int nty = (args.ne01 + mmq_y - 1) / mmq_y; + const int ntx = (args.ne11 + mmq_x - 1) / mmq_x; + const dim3 block_nums_xy_tiling(nty, ntx, 1); + + const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD; + if (!use_stream_k) { + if (args.ne01 % mmq_y == 0) { + constexpr bool need_check = false; + mul_mat_q<<>> + (args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + } else { + constexpr bool need_check = true; + mul_mat_q<<>> + (args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + } + return; + } + + const dim3 block_nums_mmq(nsm, 1, 1); + + ggml_cuda_pool & pool = ctx.pool(); + ggml_cuda_pool_alloc tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y); + if (args.ne01 % mmq_y == 0) { - const bool need_check = false; - mul_mat_q<<>> - (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + constexpr bool need_check = false; + + mul_mat_q<<>> + (args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + + mul_mat_q_stream_k_fixup<<>> + (args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x); } else { - const 
bool need_check = true; - mul_mat_q<<>> - (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + constexpr bool need_check = true; + + mul_mat_q<<>> + (args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + + mul_mat_q_stream_k_fixup<<>> + (args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x); } } template -void mul_mat_q_case(const mmq_args & args, cudaStream_t stream) { +void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { const int id = ggml_cuda_get_device(); const int nsm = ggml_cuda_info().devices[id].nsm; const int cc = ggml_cuda_info().devices[id].cc; const int smpbo = ggml_cuda_info().devices[id].smpbo; const int mmq_x_max = get_mmq_x_max_host(cc); - const int mmq_y = get_mmq_y_host(cc, mmq_x_max); + const int mmq_y = get_mmq_y_host(cc); const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y; + const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD; int mmq_x_best = 0; - int nwaves_best = INT_MAX; + int nparts_best = INT_MAX; - for (int mmq_x = 8; mmq_x <= mmq_x_max && nwaves_best > 1; mmq_x += 8) { - const int block_num_x = (args.ne11 + mmq_x - 1) / mmq_x; - const int nwaves = (block_num_x*block_num_y + nsm - 1) / nsm; + for (int mmq_x = 8; mmq_x <= mmq_x_max && nparts_best > 1; mmq_x += 8) { + const int ntiles_x = (args.ne11 + mmq_x - 1) / mmq_x; + const int nwaves_xy_tiling = ntiles_x*block_num_y; - if (nwaves < nwaves_best && mmq_get_shmem(type, mmq_x, mmq_y) <= smpbo) { + const int nparts = use_stream_k ? 
ntiles_x : nwaves_xy_tiling; + + if (nparts < nparts_best && mmq_get_shmem(type, mmq_x, mmq_y) <= smpbo) { mmq_x_best = mmq_x; - nwaves_best = nwaves; + nparts_best = nparts; } } switch (mmq_x_best) { case 8: - launch_mul_mat_q(args, stream); + launch_mul_mat_q(ctx, args, stream); break; case 16: - launch_mul_mat_q(args, stream); + launch_mul_mat_q(ctx, args, stream); break; case 24: - launch_mul_mat_q(args, stream); + launch_mul_mat_q(ctx, args, stream); break; case 32: - launch_mul_mat_q(args, stream); + launch_mul_mat_q(ctx, args, stream); break; case 40: - launch_mul_mat_q(args, stream); + launch_mul_mat_q(ctx, args, stream); break; case 48: - launch_mul_mat_q(args, stream); + launch_mul_mat_q(ctx, args, stream); break; case 56: - launch_mul_mat_q(args, stream); + launch_mul_mat_q(ctx, args, stream); break; case 64: - launch_mul_mat_q(args, stream); + launch_mul_mat_q(ctx, args, stream); break; case 72: - launch_mul_mat_q(args, stream); + launch_mul_mat_q(ctx, args, stream); break; case 80: - launch_mul_mat_q(args, stream); + launch_mul_mat_q(ctx, args, stream); break; case 88: - launch_mul_mat_q(args, stream); + launch_mul_mat_q(ctx, args, stream); break; case 96: - launch_mul_mat_q(args, stream); + launch_mul_mat_q(ctx, args, stream); break; case 104: - launch_mul_mat_q(args, stream); + launch_mul_mat_q(ctx, args, stream); break; case 112: - launch_mul_mat_q(args, stream); + launch_mul_mat_q(ctx, args, stream); break; case 120: - launch_mul_mat_q(args, stream); + launch_mul_mat_q(ctx, args, stream); break; case 128: - launch_mul_mat_q(args, stream); + launch_mul_mat_q(ctx, args, stream); break; default: fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best); @@ -2114,7 +2293,7 @@ void mul_mat_q_case(const mmq_args & args, cudaStream_t stream) { } #define DECL_MMQ_CASE(type) \ - template void mul_mat_q_case(const mmq_args & args, cudaStream_t stream) \ + template void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) \ extern 
DECL_MMQ_CASE(GGML_TYPE_Q4_0); extern DECL_MMQ_CASE(GGML_TYPE_Q4_1); diff --git a/ggml-metal.m b/ggml-metal.m index 0829b3d41..70a318150 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -735,6 +735,12 @@ static id ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs } static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) { + for (size_t i = 0, n = 3; i < n; ++i) { + if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) { + return false; + } + } + switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { diff --git a/ggml-quants.c b/ggml-quants.c index c2df9ffa4..cc25e434f 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -8815,7 +8815,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r #endif } -#if defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx) +#if defined (__AVX__) || defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx) static const int8_t keven_signs_q2xs[1024] = { 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, @@ -8948,6 +8948,61 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void *s = 0.125f * hsum_float_8(accumf); +#elif defined(__AVX__) + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[4]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib32 = 
0; ib32 < QK_K/32; ib32 += 2) { + const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; + const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); + const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]); + const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); + const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]); + const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]); + const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); + const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]); + const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0); + const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1); + const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0); + const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1); + const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); + const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); + const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); + const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); + const uint16_t ls1 = aux32[1] >> 28; + const uint16_t ls2 = aux32[3] >> 28; + const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); + sumi1_0 = 
_mm_add_epi32(sumi1_0, p1_0); + sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); + sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); + sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); + } + + accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + #elif defined(__POWER9_VECTOR__) const vector int v0 = vec_splats((int32_t)0); vector float vsumf0 = vec_splats(0.0f); @@ -9291,6 +9346,165 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * } *s = 0.125f * hsum_float_8(accumf); + +#elif defined(__AVX__) + const __m128i mone = _mm_set1_epi8(1); + static const char block_sign_shuffle_mask_1[32] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + }; + static const char block_sign_shuffle_mask_2[32] = { + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, + 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, + }; + static const uint8_t bit_selector_mask_bytes[32] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes); + const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1); + const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1); + const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1); + const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2); + const __m128i 
block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1); + + static const uint8_t k_bit_helper[32] = { + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + }; + const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper); + const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1); + const __m128i m511 = _mm_set1_epi16(511); + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m1 = _mm_set1_epi8(1); + + uint64_t aux64; + + // somewhat hacky, but gives a significant boost in performance + __m256i aux_gindex; + const uint16_t * gindex = (const uint16_t *)&aux_gindex; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + __m128i stmp = _mm_set1_epi64x(aux64); + stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); + const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); + + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { + + const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2); + const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1); q2 += 16; + aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511)); + + const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9); + const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9); + const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13); + const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13); + const 
__m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0); + const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1); + + const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0); + const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1); + const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0); + const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1); + + const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + + const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]); + const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]); + const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]); + const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]); + const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]); + const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]); + const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); + const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]); + + // AVX2 full_signs_1 is full_sign_bits_0 here + // AVX2 full_signs_2 is full_sign_bits_1 here + __m128i signs_0, signs_1; + 
signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0); + signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1); + signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); + signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); + const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone)); + const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone)); + + signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0); + signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1); + signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); + signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); + const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone)); + const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone)); + + signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0); + signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1); + signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); + signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); + const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone)); + const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone)); + + signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0); + signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1); + signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); + signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); + const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone)); + const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone)); + + const __m128i dot1_0 = 
_mm_maddubs_epi16(q2_1_0, q8s_1_0); + const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); + const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); + const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); + const __m128i dot3_0 = _mm_maddubs_epi16(q2_3_0, q8s_3_0); + const __m128i dot3_1 = _mm_maddubs_epi16(q2_3_1, q8s_3_1); + const __m128i dot4_0 = _mm_maddubs_epi16(q2_4_0, q8s_4_0); + const __m128i dot4_1 = _mm_maddubs_epi16(q2_4_1, q8s_4_1); + + __m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)); + const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp); + const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); + sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)); + const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp); + const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); + sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)); + const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp); + const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); + sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)); + const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp); + const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); + + sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0)); + sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1)); + sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0)); + sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1)); + sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0)); + sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1)); + sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0)); + sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1)); + } + + accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + #elif 
defined(__loongarch_asx) const __m256i mone = __lasx_xvreplgr2vr_b(1); @@ -9694,6 +9908,98 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * *s = 0.125f * hsum_float_8(accumf); +#elif defined(__AVX__) + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m1 = _mm_set1_epi8(1); + + const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1); + const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1); + const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2); + const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1); + + uint64_t aux64; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict qs = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * restrict q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1); + const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8); + const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8)); + + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const 
__m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)], + iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]); + const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)], + iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]); + const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)], + iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]); + const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)], + iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]); + qs += 8; + + __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16)); + __m128i aux128_1 = aux128_0; + aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); + aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); + const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); + const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); + const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0); + const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1); + + aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16)); + aux128_1 = aux128_0; + aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); + aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); + const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); + const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); + const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0); + const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1); + + signs += 4; + + const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); + const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); + const __m128i dot2_0 = 
_mm_maddubs_epi16(q2_2_0, q8s_2_0); + const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); + + const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0))); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1))); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0))); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1))); + sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); + sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); + sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); + sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); + } + + accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + #elif defined(__POWER9_VECTOR__) static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 @@ -10020,6 +10326,63 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void *s = 0.25f * hsum_float_8(accumf); +#elif defined(__AVX__) + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[2]; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict gas = x[i].qs + QK_K/4; + const int8_t * restrict q8 = y[i].qs; + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib32 = 0; ib32 < 
QK_K/32; ib32 += 2) { + const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]); + q3 += 8; + const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]); + q3 += 8; + memcpy(aux32, gas, 8); gas += 8; + const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); + const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]); + const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]); + const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0); + const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1); + const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0); + const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1); + const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); + const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); + const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); + const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); + const uint16_t ls1 = aux32[0] >> 28; + const uint16_t ls2 = aux32[1] >> 28; + const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); + const __m128i p2_0 = 
_mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); + sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); + sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); + sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); + sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); + } + + accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); + + } + + *s = 0.25f * hsum_float_8(accumf); + #elif defined(__POWER9_VECTOR__) const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; @@ -10371,6 +10734,112 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * *s = hsum_float_8(accumf); +#elif defined(__AVX__) + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1); + const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1); + const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2); + const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1); + + const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256); + const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16); + const __m128i idx_mask = _mm_set1_epi32(256); + + typedef union { + __m128i vec[4]; + uint32_t index[16]; + } index_t; + + index_t idx; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict qs = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const 
uint16_t * restrict signs = (const uint16_t *)x[i].signs; + const int8_t * restrict q8 = y[i].qs; + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs); + const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp); + const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16; + idx.vec[0] = _mm_set1_epi32(qh[ib32+0]); + idx.vec[1] = idx.vec[0]; + idx.vec[2] = _mm_set1_epi32(qh[ib32+1]); + idx.vec[3] = idx.vec[2]; + + idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask); + idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask); + idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask); + idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask); + + idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0)); + idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8))); + idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1)); + idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8))); + + const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]); + const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]); + const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], 
iq3s_grid[idx.index[8]]); + const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]); + + __m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16)); + __m128i aux128_1 = aux128_0; + aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); + aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); + const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); + const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); + const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0); + const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1); + + aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16)); + aux128_1 = aux128_0; + aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); + aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); + const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); + const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); + const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0); + const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1); + + signs += 4; + + const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); + const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); + const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); + const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); + const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; + const uint16_t ls2 = x[i].scales[ib32/2] >> 4; + const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); + sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); + sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); + sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); + sumi2_1 = 
_mm_add_epi32(sumi2_1, p2_1); + } + + accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); /* add the sumi1/sumi2 partial sums per 128-bit half, join the halves into one 256-bit vector, convert to float, scale by d and accumulate */ + + } + + *s = hsum_float_8(accumf); + #elif defined(__POWER9_VECTOR__) static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 @@ -10608,6 +11077,14 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * } +#if defined(__AVX__) +static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) { /* 128-bit helper: multiplies the signed bytes of x and y lane by lane and returns the sums of adjacent product pairs as 16-bit lanes. _mm_maddubs_epi16 needs an unsigned first operand, so the sign of x is moved onto y first. */ + const __m128i ax = _mm_sign_epi8(x, x); /* ax = |x| per byte */ + const __m128i sy = _mm_sign_epi8(y, x); /* sy = y negated where x < 0 and zeroed where x == 0, so ax*sy == x*y */ + return _mm_maddubs_epi16(ax, sy); /* unsigned(ax) * signed(sy) per byte, adjacent pairs added with signed 16-bit saturation */ +} +#endif + #if defined(__AVX2__) static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { const __m256i ax = _mm256_sign_epi8(x, x); @@ -10725,6 +11202,54 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; +#elif defined __AVX__ + __m256 accum = _mm256_setzero_ps(); /* 8-lane float accumulator; the integer partial sums below are kept in 128-bit __m128i halves */ + float accum1 = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + int sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ib += 2) { /* each iteration consumes two qh entries (ib and ib+1) and eight qs bytes */ + const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]); /* grid index = qs byte in the low 8 bits, plus 3 bits of qh shifted into bits 8..10 (mask 0x700) */ + const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]); + const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]); + const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | 
((qh[ib+1] << 2) & 0x700)]); + qs += 8; + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + + const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0); + const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1); + const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0); + const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1); + const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; + const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; + const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1)); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1)); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2)); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2)); + + sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0)); + sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1)); + sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1 + + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? 
-1 : 1) * ls2; + } + + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum); + accum1 += d * sumi1; + + } + + *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; + #elif defined(__POWER9_VECTOR__) const vector unsigned char v0 = vec_splats((unsigned char)0x0); const vector unsigned short vsign = vec_splats((unsigned short)0x8000); @@ -11063,6 +11588,92 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2); +#elif defined __AVX__ + const __m128i mask = _mm_set1_epi16(0x7); + const __m128i mone = _mm_set1_epi16(1); + + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib = 0; ib < QK_K/32; ib += 2) { + const __m128i q1b_1_0 = _mm_set_epi64x( + iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]); + const __m128i q1b_1_1 = _mm_set_epi64x( + iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]); + const __m128i q1b_2_0 = _mm_set_epi64x( + iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]); + const __m128i q1b_2_1 = _mm_set_epi64x( + iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]); + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i 
q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + + const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0); + const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1); + const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0); + const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1); + + const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); + const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); + const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); + const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[3] & 0x08 ? 
0xffffffffffffffff : 0x0101010101010101); + + const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0); + const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1); + const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0); + const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1); + + __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0); + __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3); + __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6); + __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9); + + scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone); + scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone); + scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone); + scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone); + const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1); + const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0); + const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1); + const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0); + const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1); + + sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0)); + sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1)); + sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0)); + sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1)); + + qs += 8; qh += 4; + } + + const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16)); + + accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1); + accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2); + } + + *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2); + #else int sum1[2], sum2[2], delta[4]; @@ -11193,6 
+11804,44 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * *s = hsum_float_8(_mm256_add_ps(accum1, accum2)); +#elif defined __AVX__ + const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); + const __m128i m4b = _mm_set1_epi8(0x0f); + const __m128i mone = _mm_set1_epi16(1); + + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (int ib = 0; ib < nb; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[0].qs); + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[1].qs); + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[0].qs); + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[0].qs + 1); + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[1].qs); + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[1].qs + 1); + + const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)); + const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)); + const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)); + const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)); + const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); + const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1); + const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0); + const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1); + const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone); + const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone); + const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone); + const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone); + accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)), + _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1); + accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)), + 
_mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2); + + y += 2; + x += 2; + } + + *s = hsum_float_8(_mm256_add_ps(accum1, accum2)); + #elif defined(__POWER9_VECTOR__) const vector signed char lowMask = vec_splats((signed char)0xF); const vector signed int v0 = vec_splats((int32_t)0); @@ -11383,6 +12032,54 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * *s = hsum_float_8(accum); +#elif defined __AVX__ + const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); + const __m128i m4b = _mm_set1_epi8(0x0f); + + __m256 accum = _mm256_setzero_ps(); + for (int ibl = 0; ibl < nb; ++ibl) { + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + uint16_t sh = x[ibl].scales_h; + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib = 0; ib < QK_K/32; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16; + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16; + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)); + const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)); + const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)); + const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)); + const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); + const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1); + const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0); + const __m128i p16_2_1 = 
mul_add_epi8_sse(q4b_2_1, q8b_2_1); + const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; + const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; + sh >>= 4; + const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1)); + const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1)); + const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2)); + const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2)); + sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0); + sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1); + sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0); + sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1); + } + __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0); + __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1); + accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d), + _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum); + } + + *s = hsum_float_8(accum); + #elif defined(__POWER9_VECTOR__) const vector signed char lowMask = vec_splats((signed char)0xF); const vector int v0 = vec_splats((int32_t)0); diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 6bd42b960..e5ddf4a34 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -38,67 +38,17 @@ #include "ggml-sycl/backend.hpp" -/* -Following definition copied from DPCT head files, which are used by ggml-sycl.cpp -*/ -// COPY from DPCT head files -#include -#include -#include - -#if defined(__linux__) -#include -#elif defined(_WIN64) -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -#else -#error "Only support Windows and Linux." 
-#endif - -#if defined(__linux__) -#include -#include -#endif -#if defined(_WIN64) -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -#endif - -#define DPCT_COMPATIBILITY_TEMP (900) - -#if defined(_MSC_VER) -#define __dpct_align__(n) __declspec(align(n)) -#define __dpct_inline__ __forceinline -#else -#define __dpct_align__(n) __attribute__((aligned(n))) -#define __dpct_inline__ __inline__ __attribute__((always_inline)) -#endif - -#if defined(_MSC_VER) -#define __dpct_noinline__ __declspec(noinline) -#else -#define __dpct_noinline__ __attribute__((noinline)) -#endif - bool ggml_sycl_loaded(void); void ggml_sycl_free_data(struct ggml_tensor * tensor); -void ggml_sycl_assign_buffers(struct ggml_tensor * tensor); -void ggml_sycl_assign_buffers_no_scratch(struct ggml_tensor * tensor); -void ggml_sycl_assign_buffers_force_inplace(struct ggml_tensor * tensor); -void ggml_sycl_assign_buffers_no_alloc(struct ggml_tensor * tensor); void ggml_sycl_copy_to_device(struct ggml_tensor * tensor); void ggml_sycl_set_main_device(int main_device); void ggml_sycl_set_mul_mat_q(bool mul_mat_q); -void ggml_sycl_set_scratch_size(size_t scratch_size); -void ggml_sycl_free_scratch(void); void ggml_sycl_get_device_description(int device, char * description, size_t description_size); bool ggml_backend_is_sycl(ggml_backend_t backend); int ggml_backend_sycl_get_device(ggml_backend_t backend); static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer); +static inline int get_sycl_env(const char *env_name, int default_val); +static inline int get_work_group_size(const sycl::device& device); void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { @@ -108,45 +58,6 @@ void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, free(host_buf); } -static __dpct_inline__ int get_int_from_int8(const int8_t *x8, const int &i32) { - const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at 
least 2 byte alignment - - int x32 = 0; - x32 |= x16[0] << 0; - x32 |= x16[1] << 16; - - return x32; -} - -static __dpct_inline__ int get_int_from_uint8(const uint8_t *x8, - const int &i32) { - const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment - - int x32 = 0; - x32 |= x16[0] << 0; - x32 |= x16[1] << 16; - - return x32; -} - -static __dpct_inline__ int get_int_from_int8_aligned(const int8_t *x8, - const int &i32) { - return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment -} - -static __dpct_inline__ int get_int_from_uint8_aligned(const uint8_t *x8, - const int &i32) { - return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment -} - -template -using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y, - int k, queue_ptr stream); -typedef to_t_sycl_t to_fp32_sycl_t; -typedef to_t_sycl_t to_fp16_sycl_t; - -typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); -typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); typedef void (*cpy_kernel_t)(const char * cx, char * cdst); typedef void (*ggml_sycl_func_t)(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); typedef void (*ggml_sycl_op_mul_mat_t)( @@ -162,22 +73,6 @@ typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const gg const float *src1_dd, float *dst_dd, const queue_ptr &main_stream); -typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); -typedef void (*allocate_tiles_sycl_t)(int **x_ql, sycl::half2 **x_dm, - int **x_qh, int **x_sc); -typedef void (*load_tiles_sycl_t)(const void *__restrict__ vx, - int *__restrict__ x_ql, - sycl::half2 *__restrict__ x_dm, - int *__restrict__ x_qh, - int *__restrict__ x_sc, const int &i_offset, 
- const int &i_max, const int &k, - const int &blocks_per_row); -typedef float (*vec_dot_q_mul_mat_sycl_t)( - const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, - const int *__restrict__ x_qh, const int *__restrict__ x_sc, - const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ms, - const int &i, const int &j, const int &k); - static __dpct_inline__ float warp_reduce_sum(float x, const sycl::nd_item<3> &item_ct1) { #pragma unroll @@ -664,1069 +559,6 @@ static void rms_norm_f32(const float * x, float * dst, const int ncols, const fl } } -static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib, - const int iqs, dfloat2 &v) { - const block_q4_0 * x = (const block_q4_0 *) vx; - - const dfloat d = x[ib].d; - - const int vui = x[ib].qs[iqs]; - - v.x() = vui & 0xF; - v.y() = vui >> 4; - -#ifdef GGML_SYCL_F16 - // v = v - {8.0f, 8.0f}; - // v = v * {d, d}; - v.s0() = (v.s0() - 8.0f) * d; - v.s1() = (v.s1() - 8.0f) * d; - -#else - v.x() = (v.x() - 8.0f) * d; - v.y() = (v.y() - 8.0f) * d; -#endif // GGML_SYCL_F16 -} - -static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib, - const int iqs, dfloat2 &v) { - const block_q4_1 * x = (const block_q4_1 *) vx; - - const dfloat d = x[ib].dm[0]; - const dfloat m = x[ib].dm[1]; - - const int vui = x[ib].qs[iqs]; - - v.x() = vui & 0xF; - v.y() = vui >> 4; - -#ifdef GGML_SYCL_F16 - // v = v * {d, d}; - // v = v + {m, m}; - v.s0() = (v.s0() * d) + m; - v.s1() = (v.s1() * d) + m; - -#else - v.x() = (v.x() * d) + m; - v.y() = (v.y() * d) + m; -#endif // GGML_SYCL_F16 -} - -static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib, - const int iqs, dfloat2 &v) { - const block_q5_0 * x = (const block_q5_0 *) vx; - - const dfloat d = x[ib].d; - - uint32_t qh; - memcpy(&qh, x[ib].qh, sizeof(qh)); - - const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0); - v.y() = ((x[ib].qs[iqs] 
>> 4) | xh_1); - -#ifdef GGML_SYCL_F16 - // v = v - {16.0f, 16.0f}; - // v = v * {d, d}; - v.s0() = (v.s0() - 16.0f) * d; - v.s1() = (v.s1() - 16.0f) * d; - -#else - v.x() = (v.x() - 16.0f) * d; - v.y() = (v.y() - 16.0f) * d; -#endif // GGML_SYCL_F16 -} - -static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib, - const int iqs, dfloat2 &v) { - const block_q5_1 * x = (const block_q5_1 *) vx; - - const dfloat d = x[ib].dm[0]; - const dfloat m = x[ib].dm[1]; - - uint32_t qh; - memcpy(&qh, x[ib].qh, sizeof(qh)); - - const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0); - v.y() = ((x[ib].qs[iqs] >> 4) | xh_1); - -#ifdef GGML_SYCL_F16 - // v = v * {d, d}; - // v = v + {m, m}; - v.s0() = (v.s0() * d) + m; - v.s1() = (v.s1() * d) + m; -#else - v.x() = (v.x() * d) + m; - v.y() = (v.y() * d) + m; -#endif // GGML_SYCL_F16 -} - -static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib, - const int iqs, dfloat2 &v) { - const block_q8_0 * x = (const block_q8_0 *) vx; - - const dfloat d = x[ib].d; - - v.x() = x[ib].qs[iqs + 0]; - v.y() = x[ib].qs[iqs + 1]; - -#ifdef GGML_SYCL_F16 - // v = v * {d, d}; - v.s0() *= d; - v.s1() *= d; -#else - v.x() *= d; - v.y() *= d; -#endif // GGML_SYCL_F16 -} - -template -static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32, - const sycl::nd_item<3> &item_ct1) { - - const int i = item_ct1.get_group(2); - - // assume 32 threads - const int tid = item_ct1.get_local_id(2); - const int il = tid/8; - const int ir = tid%8; - const int ib = 8*i + ir; - if (ib >= nb32) { - return; - } - - dst_t * y = yy + 256*i + 32*ir + 4*il; - - const block_q4_0 * x = (const block_q4_0 *)vx + ib; - const float d = sycl::vec(x->d) - .convert()[0]; - const float dm = -8*d; - - const uint8_t * q = x->qs + 4*il; - - for (int l = 0; l < 4; ++l) { - y[l+ 0] = d * (q[l] & 0xF) + dm; - y[l+16] = d * (q[l] >> 4) + 
dm; - } -} - -template -static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32, - const sycl::nd_item<3> &item_ct1) { - - const int i = item_ct1.get_group(2); - - // assume 32 threads - const int tid = item_ct1.get_local_id(2); - const int il = tid/8; - const int ir = tid%8; - const int ib = 8*i + ir; - if (ib >= nb32) { - return; - } - - dst_t * y = yy + 256*i + 32*ir + 4*il; - - const block_q4_1 * x = (const block_q4_1 *)vx + ib; - const sycl::float2 d = - x->dm.convert(); - - const uint8_t * q = x->qs + 4*il; - - for (int l = 0; l < 4; ++l) { - y[l + 0] = d.x() * (q[l] & 0xF) + d.y(); - y[l + 16] = d.x() * (q[l] >> 4) + d.y(); - } -} - - -//================================== k-quants - -template -static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1) { - - const int i = item_ct1.get_group(2); - const block_q2_K * x = (const block_q2_K *) vx; - - const int tid = item_ct1.get_local_id(2); - const int n = tid/32; - const int l = tid - 32*n; - const int is = 8*n + l/16; - - const uint8_t q = x[i].qs[32*n + l]; - dst_t * y = yy + i*QK_K + 128*n; - - float dall = x[i].dm[0]; - float dmin = x[i].dm[1]; - y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); - y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); - y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); - y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); -} - -template -static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1) { - - const int i = item_ct1.get_group(2); - const block_q3_K * x = (const block_q3_K *) vx; - - const int r = item_ct1.get_local_id(2) / 4; - const int tid = r/2; - const int is0 = r%2; - const int l0 = 16 * is0 + 4 * 
(item_ct1.get_local_id(2) % 4); - const int n = tid / 4; - const int j = tid - 4*n; - - uint8_t m = 1 << (4*n + j); - int is = 8*n + 2*j + is0; - int shift = 2*j; - - int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) : - is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) : - is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) : - (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4); - float d_all = x[i].d; - float dl = d_all * (us - 32); - - dst_t * y = yy + i*QK_K + 128*n + 32*j; - const uint8_t * q = x[i].qs + 32*n; - const uint8_t * hm = x[i].hmask; - - for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); -} - -static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { - if (j < 4) { - d = q[j] & 63; m = q[j + 4] & 63; - } else { - d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); - m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); - } -} - -template -static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1) { - const block_q4_K * x = (const block_q4_K *) vx; - - const int i = item_ct1.get_group(2); - - // assume 32 threads - const int tid = item_ct1.get_local_id(2); - const int il = tid/8; - const int ir = tid%8; - const int is = 2*il; - const int n = 4; - - dst_t * y = yy + i*QK_K + 64*il + n*ir; - - const float dall = x[i].dm[0]; - const float dmin = x[i].dm[1]; - - const uint8_t * q = x[i].qs + 32*il + n*ir; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[i].scales, sc, m); - const float d1 = dall * sc; const float m1 = dmin * m; - get_scale_min_k4(is + 1, x[i].scales, sc, m); - const float d2 = dall * sc; const float m2 = dmin * m; - for (int l = 0; l < n; ++l) { - y[l + 0] = d1 * (q[l] & 0xF) - m1; - y[l +32] = d2 * (q[l] >> 4) - m2; - } -} - -template -static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * 
__restrict__ yy, - const sycl::nd_item<3> &item_ct1) { - const block_q5_K * x = (const block_q5_K *) vx; - - const int i = item_ct1.get_group(2); - - // assume 64 threads - this is very slightly better than the one below - const int tid = item_ct1.get_local_id(2); - const int il = tid/16; // il is in 0...3 - const int ir = tid%16; // ir is in 0...15 - const int is = 2*il; // is is in 0...6 - - dst_t * y = yy + i*QK_K + 64*il + 2*ir; - - const float dall = x[i].dm[0]; - const float dmin = x[i].dm[1]; - - const uint8_t * ql = x[i].qs + 32*il + 2*ir; - const uint8_t * qh = x[i].qh + 2*ir; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[i].scales, sc, m); - const float d1 = dall * sc; const float m1 = dmin * m; - get_scale_min_k4(is + 1, x[i].scales, sc, m); - const float d2 = dall * sc; const float m2 = dmin * m; - - uint8_t hm = 1 << (2*il); - y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1; - y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1; - hm <<= 1; - y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; - y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 
16 : 0)) - m2; -} - -template -static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1) { - const block_q6_K * x = (const block_q6_K *) vx; - - const int i = item_ct1.get_group(2); - - // assume 64 threads - this is very slightly better than the one below - const int tid = item_ct1.get_local_id(2); - const int ip = tid/32; // ip is 0 or 1 - const int il = tid - 32*ip; // 0...32 - const int is = 8*ip + il/16; - - dst_t * y = yy + i*QK_K + 128*ip + il; - - const float d = x[i].d; - - const uint8_t * ql = x[i].ql + 64*ip + il; - const uint8_t qh = x[i].qh[32*ip + il]; - const int8_t * sc = x[i].scales + is; - - y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); - y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); - y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); - y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); -} - -template -static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1, - const uint64_t *iq2xxs_grid_ptr, - const uint8_t *ksigns_iq2xs_ptr, - const uint8_t *kmask_iq2xs_ptr) { - - const int i = item_ct1.get_group(2); - const block_iq2_xxs * x = (const block_iq2_xxs *) vx; - - const int tid = item_ct1.get_local_id(2); - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint16_t * q2 = x[i].qs + 4*ib; - const uint8_t * aux8 = (const uint8_t *)q2; - const uint8_t * grid = (const uint8_t *)(iq2xxs_grid_ptr + aux8[il]); - const uint32_t aux32 = q2[2] | (q2[3] << 16); - const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f; - const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127]; - for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? 
-1.f : 1.f); -} - -template -static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1, - const uint64_t *iq2xs_grid, - const uint8_t *ksigns_iq2xs, - const uint8_t *kmask_iq2xs) { - - const int i = item_ct1.get_group(2); - const block_iq2_xs * x = (const block_iq2_xs *) vx; - - const int tid = item_ct1.get_local_id(2); - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint16_t * q2 = x[i].qs + 4*ib; - const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511)); - const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; - const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; - for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); -} - -template -__dpct_inline__ static void -dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy, - const sycl::nd_item<3> &item_ct1) { - - const int i = item_ct1.get_group(2); - const block_iq2_s * x = (const block_iq2_s *) vx; - - const int tid = item_ct1.get_local_id(2); - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300))); - const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; - const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; -#pragma unroll - for (int j = 0; j < 8; ++j) { - y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? 
-1.f : 1.f); - } -} - -template -static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq3xxs_grid, - const uint8_t *ksigns_iq2xs, - const uint8_t *kmask_iq2xs) { - - const int i = item_ct1.get_group(2); - const block_iq3_xxs * x = (const block_iq3_xxs *) vx; - - const int tid = item_ct1.get_local_id(2); - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint8_t * q3 = x[i].qs + 8*ib; - const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib; - const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]); - const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]); - const uint32_t aux32 = gas[0] | (gas[1] << 16); - const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f; - const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; - for (int j = 0; j < 4; ++j) { - y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); - y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? 
-1.f : 1.f); - } -} - -template -__dpct_inline__ static void -dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy, - const sycl::nd_item<3> &item_ct1, - const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) { - - const int i = item_ct1.get_group(2); - const block_iq3_s * x = (const block_iq3_s *) vx; - - const int tid = item_ct1.get_local_id(2); - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint8_t * qs = x[i].qs + 8*ib; - const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); - const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); - const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)); - const uint8_t signs = x[i].signs[4*ib + il]; -#pragma unroll - for (int j = 0; j < 4; ++j) { - y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); - y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); - } -} - -template -__dpct_inline__ static void -dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq1s_grid_gpu) { - - const int i = item_ct1.get_group(2); - const block_iq1_s * x = (const block_iq1_s *) vx; - - const int tid = item_ct1.get_local_id(2); - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const float delta = x[i].qh[ib] & 0x8000 ? 
-1 - IQ1S_DELTA : -1 + IQ1S_DELTA; - const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1); - uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; - grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)]; - grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; - grid32[0] &= 0x0f0f0f0f; -#pragma unroll - for (int j = 0; j < 8; ++j) { - y[j] = d * (q[j] + delta); - } -} - -template -__dpct_inline__ static void -dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq1s_grid_gpu) { - - const int i = item_ct1.get_group(2); - const block_iq1_m * x = (const block_iq1_m *) vx; - - const int tid = item_ct1.get_local_id(2); - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint16_t * sc = (const uint16_t *)x[i].scales; - iq1m_scale_t scale; - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4); - const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1); - const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? 
-1 - IQ1M_DELTA : -1 + IQ1M_DELTA; - uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; - grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)]; - grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; - grid32[0] &= 0x0f0f0f0f; -#pragma unroll - for (int j = 0; j < 8; ++j) { - y[j] = d * (q[j] + delta); - } -} - -template -__dpct_inline__ static void -dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy, - const sycl::nd_item<3> &item_ct1) { - - const int i = item_ct1.get_group(2); - const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL); - - const int tid = item_ct1.get_local_id(2); - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 4*il; - const uint8_t * q4 = x[ib].qs + 4*il; - const float d = (float)x[ib].d; -#pragma unroll - for (int j = 0; j < 4; ++j) { - y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; - y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; - } - -} - - -template -__dpct_inline__ static void -dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_group(2); - const block_iq4_xs * x = (const block_iq4_xs *)vx; - - const int tid = item_ct1.get_local_id(2); - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 4*il; - const uint8_t * q4 = x[i].qs + 16*ib + 4*il; - const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32); -#pragma unroll - for (int j = 0; j < 4; ++j) { - y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; - y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; - } -} - - - -/* -DPCT1110:4: The total declared local variable size in device function -dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register -pressure. 
Consult with your hardware vendor to find the total register size -available and adjust the code, or use smaller sub-group size to avoid high -register pressure. -*/ -static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, - const float *__restrict__ yy, - float *__restrict__ dst, - const int ncols, int nrows, - const sycl::nd_item<3> &item_ct1) { - - static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); - - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - if (row > nrows) return; - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - const block_q2_K * x = (const block_q2_K *)vx + ib0; - - float tmp = 0; // partial sum for thread in warp - - const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15 - const int ix = - item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 - - const int step = 16/K_QUANTS_PER_ITERATION; - - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
- const int in = tid - step*im; // 0...15 or 0...7 - - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int s_offset = 8*im; - const int y_offset = 128*im + l0; - - uint32_t aux[4]; - const uint8_t * d = (const uint8_t *)aux; - const uint8_t * m = (const uint8_t *)(aux + 2); - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + y_offset; - const uint8_t * q = x[i].qs + q_offset; - - const float dall = x[i].dm[0]; - const float dmin = x[i].dm[1]; - - const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); - aux[0] = a[0] & 0x0f0f0f0f; - aux[1] = a[1] & 0x0f0f0f0f; - aux[2] = (a[0] >> 4) & 0x0f0f0f0f; - aux[3] = (a[1] >> 4) & 0x0f0f0f0f; - - float sum1 = 0, sum2 = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) - + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) - + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) - + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) - + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) - + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) - + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) - +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); - sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] - + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; - - } - tmp += dall * sum1 - dmin * sum2; - - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[row] = tmp; - } -} - -/* -DPCT1110:5: The total declared local variable size in device function -dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register -pressure. Consult with your hardware vendor to find the total register size -available and adjust the code, or use smaller sub-group size to avoid high -register pressure. 
-*/ -static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, - const float *__restrict__ yy, - float *__restrict__ dst, - const int ncols, int nrows, - const sycl::nd_item<3> &item_ct1) { - - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - if (row > nrows) return; - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - const block_q3_K * x = (const block_q3_K *)vx + ib0; - - float tmp = 0; // partial sum for thread in warp - - const uint16_t kmask1 = 0x0303; - const uint16_t kmask2 = 0x0f0f; - - const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = - item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 - - const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop - const int step = 16/K_QUANTS_PER_ITERATION; - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0....15 or 0...7 - - const uint8_t m = 1 << (4*im); - - const int l0 = n*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int y_offset = 128*im + l0; - - uint16_t utmp[4]; - const int8_t * s = (const int8_t *)utmp; - - const uint16_t s_shift = 4*im; - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + y_offset; - const uint8_t * q = x[i].qs + q_offset; - const uint8_t * h = x[i].hmask + l0; - - const uint16_t * a = (const uint16_t *)x[i].scales; - utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); - utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); - utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); - utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); - - const float d = x[i].d; - - float sum = 0; - for (int l = 0; l < n; ++l) { - sum += y[l+ 0] * (s[0] - 
32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) - + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) - + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) - + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); - sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) - + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) - + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) - + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); - } - tmp += d * sum; - - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[row] = tmp; - } -} - -/* -DPCT1110:6: The total declared local variable size in device function -dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register -pressure. Consult with your hardware vendor to find the total register size -available and adjust the code, or use smaller sub-group size to avoid high -register pressure. 
-*/ -static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, - const float *__restrict__ yy, - float *__restrict__ dst, - const int ncols, int nrows, - const sycl::nd_item<3> &item_ct1) { - - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - if (row > nrows) return; - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - const block_q4_K * x = (const block_q4_K *)vx + ib0; - - const uint16_t kmask1 = 0x3f3f; - const uint16_t kmask2 = 0x0f0f; - const uint16_t kmask3 = 0xc0c0; - - const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = - item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 - - const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 - - const int il = tid/step; // 0...3 - const int ir = tid - step*il; // 0...7 or 0...3 - const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 - - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; - - const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; - - uint16_t aux[4]; - const uint8_t * sc = (const uint8_t *)aux; - -#if K_QUANTS_PER_ITERATION == 2 - uint32_t q32[4]; - const uint8_t * q4 = (const uint8_t *)q32; -#else - uint16_t q16[4]; - const uint8_t * q4 = (const uint8_t *)q16; -#endif - - float tmp = 0; // partial sum for thread in warp - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - const float * y1 = yy + i*QK_K + y_offset; - const float * y2 = y1 + 128; - - const float dall = x[i].dm[0]; - const float dmin = x[i].dm[1]; - - const uint16_t * a = (const uint16_t *)x[i].scales; - aux[0] = a[im+0] & kmask1; - aux[1] = a[im+2] & kmask1; - aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); - aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); - -#if K_QUANTS_PER_ITERATION == 2 - const uint32_t * q1 = 
(const uint32_t *)(x[i].qs + q_offset); - const uint32_t * q2 = q1 + 16; - - q32[0] = q1[0] & 0x0f0f0f0f; - q32[1] = q1[0] & 0xf0f0f0f0; - q32[2] = q2[0] & 0x0f0f0f0f; - q32[3] = q2[0] & 0xf0f0f0f0; - - sycl::float4 s = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - for (int l = 0; l < 4; ++l) { - s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4]; - s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12]; - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f + - s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) - - dmin * smin; -#else - const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); - const uint16_t * q2 = q1 + 32; - - q16[0] = q1[0] & 0x0f0f; - q16[1] = q1[0] & 0xf0f0; - q16[2] = q2[0] & 0x0f0f; - q16[3] = q2[0] & 0xf0f0; - - float4 s = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - for (int l = 0; l < 2; ++l) { - s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; - s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; -#endif - - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (tid == 0) { - dst[row] = tmp; - } -} - -/* -DPCT1110:7: The total declared local variable size in device function -dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register -pressure. Consult with your hardware vendor to find the total register size -available and adjust the code, or use smaller sub-group size to avoid high -register pressure. 
-*/ -static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, - const float *__restrict__ yy, - float *__restrict__ dst, - const int ncols, - const sycl::nd_item<3> &item_ct1) { - - const int row = item_ct1.get_group(2); - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - const block_q5_K * x = (const block_q5_K *)vx + ib0; - - float tmp = 0; // partial sum for thread in warp - - const uint16_t kmask1 = 0x3f3f; - const uint16_t kmask2 = 0x0f0f; - const uint16_t kmask3 = 0xc0c0; - - const int tid = item_ct1.get_local_id(2) / 2; // 0...15 - const int ix = item_ct1.get_local_id(2) % 2; - - const int il = tid/4; // 0...3 - const int ir = tid - 4*il;// 0...3 - const int n = 2; - - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; - - const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; - - const uint8_t hm1 = 1 << (2*im); - const uint8_t hm2 = hm1 << 4; - - uint16_t aux[4]; - const uint8_t * sc = (const uint8_t *)aux; - - uint16_t q16[8]; - const uint8_t * q4 = (const uint8_t *)q16; - - for (int i = ix; i < num_blocks_per_row; i += 2) { - - const uint8_t * ql1 = x[i].qs + q_offset; - const uint8_t * qh = x[i].qh + l0; - const float * y1 = yy + i*QK_K + y_offset; - const float * y2 = y1 + 128; - - const float dall = x[i].dm[0]; - const float dmin = x[i].dm[1]; - - const uint16_t * a = (const uint16_t *)x[i].scales; - aux[0] = a[im+0] & kmask1; - aux[1] = a[im+2] & kmask1; - aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); - aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); - - sycl::float4 sum = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - const uint16_t * q1 = (const uint16_t *)ql1; - const uint16_t * q2 = q1 + 32; - q16[0] = q1[0] & 0x0f0f; - q16[1] = q1[8] & 0x0f0f; - q16[2] = (q1[0] >> 4) & 0x0f0f; - q16[3] = (q1[8] >> 4) & 0x0f0f; - q16[4] = q2[0] & 0x0f0f; - q16[5] = q2[8] & 0x0f0f; 
- q16[6] = (q2[0] >> 4) & 0x0f0f; - q16[7] = (q2[8] >> 4) & 0x0f0f; - for (int l = 0; l < n; ++l) { - sum.x() += - y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) + - y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0)); - sum.y() += - y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) + - y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0)); - sum.z() += - y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) + - y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0)); - sum.w() += - y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) + - y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0)); - smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] - + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; - } - tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] + - sum.w() * sc[5]) - - dmin * smin; - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[row] = tmp; - } -} - -static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows, - const sycl::nd_item<3> &item_ct1) { - - static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); - - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - if (row > nrows) return; - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - const block_q6_K * x = (const block_q6_K *)vx + ib0; - - const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = - item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1 - - const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 - - 
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 - -#if K_QUANTS_PER_ITERATION == 1 - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 - const int is = 0; -#else - const int l0 = 4 * in; // 0, 4, 8, ..., 28 - const int is = in / 4; -#endif - const int ql_offset = 64*im + l0; - const int qh_offset = 32*im + l0; - const int s_offset = 8*im + is; - const int y_offset = 128*im + l0; - - float tmp = 0; // partial sum for thread in warp - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + y_offset; - const uint8_t * ql = x[i].ql + ql_offset; - const uint8_t * qh = x[i].qh + qh_offset; - const int8_t * s = x[i].scales + s_offset; - - const float d = x[i].d; - -#if K_QUANTS_PER_ITERATION == 1 - float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) - + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) - + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) - + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) - + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) - + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) - + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) - +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); - tmp += sum; -#else - float sum = 0; - for (int l = 0; l < 4; ++l) { - sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) - + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) - + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) - + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); - } - tmp += sum; -#endif - - } - - // sum up partial sums and write back result -#pragma unroll - 
for (int mask = 16; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (tid == 0) { - dst[row] = tmp; - } -} - -static void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){ - const sycl::half *x = (const sycl::half *)vx; - - // automatic half -> float type cast if dfloat == float - v.x() = x[ib + iqs + 0]; - v.y() = x[ib + iqs + 1]; -} - -static void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){ - const float * x = (const float *) vx; - - // automatic half -> float type cast if dfloat == float - v.x() = x[ib + iqs + 0]; - v.y() = x[ib + iqs + 1]; -} - static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded, const sycl::nd_item<3> &item_ct1) { const int ix = item_ct1.get_local_range(2) * item_ct1.get_group(2) + @@ -1848,3287 +680,6 @@ static void k_get_rows_float( dst_row[i00] = src0_row[i00]; } -template -static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = 2 * (item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2)); - - if (i >= k) { - return; - } - - const int ib = i/qk; // block index - const int iqs = (i%qk)/qr; // quant index - const int iybs = i - i%qk; // y block start index - const int y_offset = qr == 1 ? 
1 : qk/2; - - // dequantize - dfloat2 v; - dequantize_kernel(vx, ib, iqs, v); - - y[iybs + iqs + 0] = v.x(); - y[iybs + iqs + y_offset] = v.y(); -} - -template -static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - - const src_t * x = (src_t *) vx; - - y[i] = x[i]; -} - -// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called -// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q - -#define VDR_Q4_0_Q8_1_MMVQ 2 -#define VDR_Q4_0_Q8_1_MMQ 4 - -template -static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u, - const float &d4, - const sycl::half2 &ds8) { - int sumi = 0; -#pragma unroll - for (int i = 0; i < vdr; ++i) { - const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; - const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; - - // SIMD dot product of quantized values - sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi); - sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi); - } - - const sycl::float2 ds8f = - ds8.convert(); - - // second part effectively subtracts 8 from each quant value - return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y()); -} - -#define VDR_Q4_1_Q8_1_MMVQ 2 -#define VDR_Q4_1_Q8_1_MMQ 4 - -template -static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u, - const sycl::half2 &dm4, - const sycl::half2 &ds8) { - - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; - const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; - - // SIMD dot product of quantized values - sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi); - sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi); - } - -#ifdef GGML_SYCL_F16 - const sycl::float2 tmp = - (dm4 * ds8).convert(); - const float d4d8 = tmp.x(); - const float m4s8 = tmp.y(); -#else - const sycl::float2 dm4f = - dm4.convert(); 
- const sycl::float2 ds8f = - ds8.convert(); - const float d4d8 = dm4f.x() * ds8f.x(); - const float m4s8 = dm4f.y() * ds8f.y(); -#endif // GGML_SYCL_F16 - - // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it - return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); -} - -#define VDR_Q5_0_Q8_1_MMVQ 2 -#define VDR_Q5_0_Q8_1_MMQ 4 - -template -static __dpct_inline__ float -vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u, - const float &d5, const sycl::half2 &ds8) { - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits - vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 - vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 - vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 - vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 - sumi = dpct::dp4a(vi0, u[2 * i + 0], - sumi); // SIMD dot product of quantized values - - int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits - vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 - vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 - vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 - vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 - sumi = dpct::dp4a(vi1, u[2 * i + 1], - sumi); // SIMD dot product of quantized values - } - - const sycl::float2 ds8f = - ds8.convert(); - - // second part effectively subtracts 16 from each quant value - return d5 * (sumi * ds8f.x() - (16 * vdr / QI5_0) * ds8f.y()); -} - -#define VDR_Q5_1_Q8_1_MMVQ 2 -#define VDR_Q5_1_Q8_1_MMQ 4 - -template -static __dpct_inline__ float -vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u, - const sycl::half2 &dm5, const sycl::half2 &ds8) { - - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits - vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 - vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 - vi0 |= (vh[i] 
<< 18) & 0x00100000; // 2 -> 20 - vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 - sumi = dpct::dp4a(vi0, u[2 * i + 0], - sumi); // SIMD dot product of quantized values - - int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits - vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 - vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 - vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 - vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 - sumi = dpct::dp4a(vi1, u[2 * i + 1], - sumi); // SIMD dot product of quantized values - } - -#ifdef GGML_SYCL_F16 - const sycl::float2 tmp = - (dm5 * ds8).convert(); - const float d5d8 = tmp.x(); - const float m5s8 = tmp.y(); - - -#else - const sycl::float2 dm5f = - dm5.convert(); - const sycl::float2 ds8f = - ds8.convert(); - const float d5d8 = dm5f.x() * ds8f.x(); - const float m5s8 = dm5f.y() * ds8f.y(); -#endif // GGML_SYCL_F16 - - // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it - return sumi*d5d8 + m5s8 / (QI5_1 / vdr); -} - -#define VDR_Q8_0_Q8_1_MMVQ 2 -#define VDR_Q8_0_Q8_1_MMQ 8 - -template -static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u, - const float &d8_0, - const float &d8_1) { - - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - // SIMD dot product of quantized values - sumi = dpct::dp4a(v[i], u[i], sumi); - } - - return d8_0*d8_1 * sumi; -} - -template -static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u, - const sycl::half2 &dm8, - const sycl::half2 &ds8) { - - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - // SIMD dot product of quantized values - sumi = dpct::dp4a(v[i], u[i], sumi); - } - -#ifdef GGML_SYCL_F16 - const sycl::float2 tmp = - (dm8 * ds8).convert(); - const float d8d8 = tmp.x(); - const float m8s8 = tmp.y(); -#else - const sycl::float2 dm8f = - dm8.convert(); - const sycl::float2 ds8f = - ds8.convert(); - const float d8d8 = dm8f.x() * ds8f.x(); - const float 
m8s8 = dm8f.y() * ds8f.y(); -#endif // GGML_SYCL_F16 - - // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it - return sumi*d8d8 + m8s8 / (QI8_1 / vdr); -} - -#define VDR_Q2_K_Q8_1_MMVQ 1 -#define VDR_Q2_K_Q8_1_MMQ 2 - -// contiguous v/x values -static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq( - const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales, - const sycl::half2 &dm2, const float *__restrict__ d8) { - - float sumf_d = 0.0f; - float sumf_m = 0.0f; - -#pragma unroll - for (int i = 0; i < QR2_K; ++i) { - const int sc = scales[2*i]; - - const int vi = (v >> (2*i)) & 0x03030303; - - sumf_d += - d8[i] * (dpct::dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product - - // fill int with 4x m - int m = sc >> 4; - m |= m << 8; - m |= m << 16; - sumf_m += d8[i] * - dpct::dp4a( - m, u[i], - 0); // multiply constant q2_K part with sum of q8_1 values - } - - const sycl::float2 dm2f = - dm2.convert(); - - return dm2f.x() * sumf_d - dm2f.y() * sumf_m; -} - -// contiguous u/y values -static __dpct_inline__ float -vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, - const uint8_t *__restrict__ scales, - const sycl::half2 &dm2, const float &d8) { - - int sumi_d = 0; - int sumi_m = 0; - -#pragma unroll - for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) { - int sumi_d_sc = 0; - - const int sc = scales[i0 / (QI8_1/2)]; - - // fill int with 4x m - int m = sc >> 4; - m |= m << 8; - m |= m << 16; - -#pragma unroll - for (int i = i0; i < i0 + QI8_1/2; ++i) { - sumi_d_sc = dpct::dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product - sumi_m = dpct::dp4a(m, u[i], - sumi_m); // multiply sum of q8_1 values with m - } - - sumi_d += sumi_d_sc * (sc & 0xF); - } - - const sycl::float2 dm2f = - dm2.convert(); - - return d8 * (dm2f.x() * sumi_d - dm2f.y() * sumi_m); -} - -#define VDR_Q3_K_Q8_1_MMVQ 1 -#define VDR_Q3_K_Q8_1_MMQ 2 - -// contiguous v/x values -static __dpct_inline__ float 
vec_dot_q3_K_q8_1_impl_mmvq( - const int &vl, const int &vh, const int *__restrict__ u, - const uint8_t *__restrict__ scales, const int &scale_offset, - const float &d3, const float *__restrict__ d8) { - - float sumf = 0.0f; - -#pragma unroll - for (int i = 0; i < QR3_K; ++i) { - const int isc = scale_offset + 2*i; - - const int isc_low = isc % (QK_K/32); - const int sc_shift_low = 4 * (isc / (QK_K/32)); - const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF; - - const int isc_high = isc % (QK_K/64); - const int sc_shift_high = 2 * (isc / (QK_K/64)); - const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; - - const int sc = (sc_low | sc_high) - 32; - - const int vil = (vl >> (2*i)) & 0x03030303; - - const int vih = ((vh >> i) << 2) & 0x04040404; - - const int vi = - dpct::vectorized_binary(vil, vih, dpct::sub_sat()); - - sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product - } - - return d3 * sumf; -} - -// contiguous u/y values -static __dpct_inline__ float -vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, - const int8_t *__restrict__ scales, const float &d3, - const float &d8) { - - int sumi = 0; - -#pragma unroll - for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) { - int sumi_sc = 0; - - for (int i = i0; i < i0 + QI8_1/2; ++i) { - sumi_sc = dpct::dp4a(v[i], u[i], sumi_sc); // SIMD dot product - } - - sumi += sumi_sc * scales[i0 / (QI8_1/2)]; - } - - return d3*d8 * sumi; -} - -#define VDR_Q4_K_Q8_1_MMVQ 2 -#define VDR_Q4_K_Q8_1_MMQ 8 - -// contiguous v/x values -static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq( - const int *__restrict__ v, const int *__restrict__ u, - const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, - const sycl::half2 &dm4, const float *__restrict__ d8) { - - float sumf_d = 0.0f; - float sumf_m = 0.0f; - -#pragma unroll - for (int i = 0; i < QR4_K; ++i) { - const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F; - const int v1i = (v[1] >> 
(4*i)) & 0x0F0F0F0F; - - const int dot1 = - dpct::dp4a(v1i, u[2 * i + 1], - dpct::dp4a(v0i, u[2 * i + 0], 0)); // SIMD dot product - const int dot2 = - dpct::dp4a(0x01010101, u[2 * i + 1], - dpct::dp4a(0x01010101, u[2 * i + 0], 0)); // sum of u - - sumf_d += d8[i] * (dot1 * sc[i]); - sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values - } - - const sycl::float2 dm4f = - dm4.convert(); - - return dm4f.x() * sumf_d - dm4f.y() * sumf_m; -} - -// contiguous u/y values -static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq( - const int *__restrict__ v, const int *__restrict__ u, - const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, - const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) { - - float sumf_d = 0.0f; - float sumf_m = 0.0f; - -#pragma unroll - for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) { - int sumi_d = 0; - -#pragma unroll - for (int j = 0; j < QI8_1; ++j) { - sumi_d = dpct::dp4a((v[j] >> (4 * i)) & 0x0F0F0F0F, - u[i * QI8_1 + j], sumi_d); // SIMD dot product - } - - const sycl::float2 ds8f = - ds8[i].convert(); - - sumf_d += ds8f.x() * (sc[i] * sumi_d); - sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val - } - - const sycl::float2 dm4f = - dm4.convert(); - - return dm4f.x() * sumf_d - dm4f.y() * sumf_m; -} - -#define VDR_Q5_K_Q8_1_MMVQ 2 -#define VDR_Q5_K_Q8_1_MMQ 8 - -// contiguous v/x values -static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq( - const int *__restrict__ vl, const int *__restrict__ vh, - const int *__restrict__ u, const uint8_t *__restrict__ sc, - const uint8_t *__restrict__ m, const sycl::half2 &dm5, - const float *__restrict__ d8) { - - float sumf_d = 0.0f; - float sumf_m = 0.0f; - -#pragma unroll - for (int i = 0; i < QR5_K; ++i) { - const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F; - const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F; - - const int vh0i = ((vh[0] >> i) << 4) & 0x10101010; - const int vh1i = ((vh[1] >> i) << 4) & 0x10101010; - - const 
int v0i = vl0i | vh0i; - const int v1i = vl1i | vh1i; - - const int dot1 = - dpct::dp4a(v0i, u[2 * i + 0], - dpct::dp4a(v1i, u[2 * i + 1], 0)); // SIMD dot product - const int dot2 = - dpct::dp4a(0x01010101, u[2 * i + 0], - dpct::dp4a(0x01010101, u[2 * i + 1], 0)); // sum of u - - sumf_d += d8[i] * (dot1 * sc[i]); - sumf_m += d8[i] * (dot2 * m[i]); - - } - - const sycl::float2 dm5f = - dm5.convert(); - - return dm5f.x() * sumf_d - dm5f.y() * sumf_m; -} - -// contiguous u/y values -static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq( - const int *__restrict__ v, const int *__restrict__ u, - const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, - const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) { - - float sumf_d = 0.0f; - float sumf_m = 0.0f; - -#pragma unroll - for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) { - int sumi_d = 0; - -#pragma unroll - for (int j = 0; j < QI8_1; ++j) { - sumi_d = dpct::dp4a(v[i * QI8_1 + j], u[i * QI8_1 + j], - sumi_d); // SIMD dot product - } - - const sycl::float2 ds8f = - ds8[i].convert(); - - sumf_d += ds8f.x() * (sc[i] * sumi_d); - sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val - } - - const sycl::float2 dm4f = - dm4.convert(); - - return dm4f.x() * sumf_d - dm4f.y() * sumf_m; -} - -#define VDR_Q6_K_Q8_1_MMVQ 1 -#define VDR_Q6_K_Q8_1_MMQ 8 - -// contiguous v/x values -static __dpct_inline__ float -vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh, - const int *__restrict__ u, - const int8_t *__restrict__ scales, const float &d, - const float *__restrict__ d8) { - - float sumf = 0.0f; - -#pragma unroll - for (int i = 0; i < QR6_K; ++i) { - const int sc = scales[4*i]; - - const int vil = (vl >> (4*i)) & 0x0F0F0F0F; - - const int vih = ((vh >> (4*i)) << 4) & 0x30303030; - - const int vi = dpct::vectorized_binary( - (vil | vih), 0x20202020, dpct::sub_sat()); // vi = (vil | vih) - 32 - - sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product - } - - return d*sumf; -} - 
-// contiguous u/y values -static __dpct_inline__ float -vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, - const int8_t *__restrict__ sc, const float &d6, - const float *__restrict__ d8) { - - float sumf_d = 0.0f; - -#pragma unroll - for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) { - sycl::int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale - -#pragma unroll - for (int i = i0; i < i0 + 2; ++i) { - sumi_d.x() = dpct::dp4a(v[2 * i + 0], u[2 * i + 0], - sumi_d.x()); // SIMD dot product - sumi_d.x() = dpct::dp4a(v[2 * i + 1], u[2 * i + 1], - sumi_d.x()); // SIMD dot product - - sumi_d.y() = dpct::dp4a(v[2 * i + 4], u[2 * i + 4], - sumi_d.y()); // SIMD dot product - sumi_d.y() = dpct::dp4a(v[2 * i + 5], u[2 * i + 5], - sumi_d.y()); // SIMD dot product - } - - sumf_d += d8[i0 / 4] * - (sc[i0 / 2 + 0] * sumi_d.x() + sc[i0 / 2 + 1] * sumi_d.y()); - } - - return d6 * sumf_d; -} - -static __dpct_inline__ float -vec_dot_q4_0_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs) { - - const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; - - int v[VDR_Q4_0_Q8_1_MMVQ]; - int u[2*VDR_Q4_0_Q8_1_MMVQ]; - -#pragma unroll - for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) { - v[i] = get_int_from_uint8(bq4_0->qs, iqs + i); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0); - } - - return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); -} - -template -static __dpct_inline__ void -allocate_tiles_q4_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, - int *tile_x_qs_q4_0, float *tile_x_d_q4_0) { - (void)x_qh; (void)x_sc; - - *x_ql = tile_x_qs_q4_0; - *x_dm = (sycl::half2 *)tile_x_d_q4_0; -} - -template -static __dpct_inline__ void -load_tiles_q4_0(const void *__restrict__ vx, int *__restrict__ x_ql, - sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, - int *__restrict__ x_sc, const int &i_offset, const int &i_max, - const int 
&k, const int &blocks_per_row) { - (void)x_qh; (void)x_sc; - GGML_SYCL_ASSUME(i_offset >= 0); - GGML_SYCL_ASSUME(i_offset < nwarps); - GGML_SYCL_ASSUME(k >= 0); - GGML_SYCL_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI4_0; - const int kqsx = k % QI4_0; - - const block_q4_0 * bx0 = (const block_q4_0 *) vx; - - float * x_dmf = (float *) x_dm; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); - // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { - int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d; - } -} - -static __dpct_inline__ float vec_dot_q4_0_q8_1_mul_mat( - const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, - const int *__restrict__ x_qh, const int *__restrict__ x_sc, - const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k) { - (void)x_qh; (void)x_sc; - - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const float * x_dmf = (const float *) x_dm; - - int u[2*VDR_Q4_0_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; - } - - return vec_dot_q4_0_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], - y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); -} - -static __dpct_inline__ float 
-vec_dot_q4_1_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs) { - - const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; - - int v[VDR_Q4_1_Q8_1_MMVQ]; - int u[2*VDR_Q4_1_Q8_1_MMVQ]; - -#pragma unroll - for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) { - v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1); - } - - return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); -} - -template -static __dpct_inline__ void -allocate_tiles_q4_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, - int *tile_x_qs_q4_1, sycl::half2 *tile_x_dm_q4_1) { - (void)x_qh; (void)x_sc; - - *x_ql = tile_x_qs_q4_1; - *x_dm = tile_x_dm_q4_1; -} - -template -static __dpct_inline__ void -load_tiles_q4_1(const void *__restrict__ vx, int *__restrict__ x_ql, - sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, - int *__restrict__ x_sc, const int &i_offset, const int &i_max, - const int &k, const int &blocks_per_row) { - (void)x_qh; (void)x_sc; - - GGML_SYCL_ASSUME(i_offset >= 0); - GGML_SYCL_ASSUME(i_offset < nwarps); - GGML_SYCL_ASSUME(k >= 0); - GGML_SYCL_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI4_1; - const int kqsx = k % QI4_1; - - const block_q4_1 * bx0 = (const block_q4_1 *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI4_1; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { - int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd; - 
- x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; - } -} - -static __dpct_inline__ float vec_dot_q4_1_q8_1_mul_mat( - const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, - const int *__restrict__ x_qh, const int *__restrict__ x_sc, - const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k) { - (void)x_qh; (void)x_sc; - - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - - int u[2*VDR_Q4_1_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; - } - - return vec_dot_q4_1_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], - y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); -} - -static __dpct_inline__ float -vec_dot_q5_0_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs) { - - const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; - - int vl[VDR_Q5_0_Q8_1_MMVQ]; - int vh[VDR_Q5_0_Q8_1_MMVQ]; - int u[2*VDR_Q5_0_Q8_1_MMVQ]; - -#pragma unroll - for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) { - vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i); - vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i)); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0); - } - - return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); -} - -template -static __dpct_inline__ void -allocate_tiles_q5_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, - int *tile_x_ql_q5_0, float *tile_x_d_q5_0) { - (void)x_qh; (void)x_sc; - - *x_ql = tile_x_ql_q5_0; - *x_dm = (sycl::half2 *)tile_x_d_q5_0; -} - -template -static __dpct_inline__ void -load_tiles_q5_0(const void *__restrict__ vx, int *__restrict__ x_ql, - sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, - int 
*__restrict__ x_sc, const int &i_offset, const int &i_max, - const int &k, const int &blocks_per_row) { - (void)x_qh; (void)x_sc; - - GGML_SYCL_ASSUME(i_offset >= 0); - GGML_SYCL_ASSUME(i_offset < nwarps); - GGML_SYCL_ASSUME(k >= 0); - GGML_SYCL_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI5_0; - const int kqsx = k % QI5_0; - - const block_q5_0 * bx0 = (const block_q5_0 *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx; - - const int ql = get_int_from_uint8(bxi->qs, kqsx); - const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0)); - - int qs0 = (ql >> 0) & 0x0F0F0F0F; - qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 - qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 - qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 - qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 - qs0 = dpct::vectorized_binary( - qs0, 0x10101010, dpct::sub_sat()); // subtract 16 - - x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; - - int qs1 = (ql >> 4) & 0x0F0F0F0F; - qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 - qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 - qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 - qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 - qs1 = dpct::vectorized_binary( - qs1, 0x10101010, dpct::sub_sat()); // subtract 16 - - x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; - const int kbxd = k % blocks_per_tile_x_row; - float * x_dmf = (float *) x_dm; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { - int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d; - } -} - -static __dpct_inline__ float vec_dot_q5_0_q8_1_mul_mat( - const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, - 
const int *__restrict__ x_qh, const int *__restrict__ x_sc, - const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k) { - (void)x_qh; (void)x_sc; - - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - - int u[2*VDR_Q5_0_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; - } - - return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); -} - -static __dpct_inline__ float -vec_dot_q5_1_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs) { - - const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; - - int vl[VDR_Q5_1_Q8_1_MMVQ]; - int vh[VDR_Q5_1_Q8_1_MMVQ]; - int u[2*VDR_Q5_1_Q8_1_MMVQ]; - -#pragma unroll - for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) { - vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i); - vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i)); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1); - } - - return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); -} - -template -static __dpct_inline__ void -allocate_tiles_q5_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, - int *tile_x_ql_q5_1, sycl::half2 *tile_x_dm_q5_1) { - (void)x_qh; (void)x_sc; - - *x_ql = tile_x_ql_q5_1; - *x_dm = tile_x_dm_q5_1; -} - -template -static __dpct_inline__ void -load_tiles_q5_1(const void *__restrict__ vx, int *__restrict__ x_ql, - sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, - int *__restrict__ x_sc, const int &i_offset, const int &i_max, - const int 
&k, const int &blocks_per_row) { - (void)x_qh; (void)x_sc; - - GGML_SYCL_ASSUME(i_offset >= 0); - GGML_SYCL_ASSUME(i_offset < nwarps); - GGML_SYCL_ASSUME(k >= 0); - GGML_SYCL_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI5_1; - const int kqsx = k % QI5_1; - - const block_q5_1 * bx0 = (const block_q5_1 *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx; - - const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); - const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1)); - - int qs0 = (ql >> 0) & 0x0F0F0F0F; - qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 - qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 - qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 - qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 - - x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; - - int qs1 = (ql >> 4) & 0x0F0F0F0F; - qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 - qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 - qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 - qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 - - x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) { - int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; - } -} - -static __dpct_inline__ float vec_dot_q5_1_q8_1_mul_mat( - const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, - const int *__restrict__ x_qh, const int *__restrict__ x_sc, - const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k) { - (void)x_qh; (void)x_sc; - - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / 
(QI8_1/2)); - const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; - - int u[2*VDR_Q5_1_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; - } - - return vec_dot_q8_1_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); -} - -static __dpct_inline__ float -vec_dot_q8_0_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs) { - - const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; - - int v[VDR_Q8_0_Q8_1_MMVQ]; - int u[VDR_Q8_0_Q8_1_MMVQ]; - -#pragma unroll - for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) { - v[i] = get_int_from_int8(bq8_0->qs, iqs + i); - u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - } - - return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, - bq8_1->ds[0]); -} - -template -static __dpct_inline__ void -allocate_tiles_q8_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, - int *tile_x_qs_q8_0, float *tile_x_d_q8_0) { - (void)x_qh; (void)x_sc; - - *x_ql = tile_x_qs_q8_0; - *x_dm = (sycl::half2 *)tile_x_d_q8_0; -} - -template -static __dpct_inline__ void -load_tiles_q8_0(const void *__restrict__ vx, int *__restrict__ x_ql, - sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, - int *__restrict__ x_sc, const int &i_offset, const int &i_max, - const int &k, const int &blocks_per_row) { - (void)x_qh; (void)x_sc; - - GGML_SYCL_ASSUME(i_offset >= 0); - GGML_SYCL_ASSUME(i_offset < nwarps); - GGML_SYCL_ASSUME(k >= 0); - GGML_SYCL_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI8_0; - const int kqsx = k % QI8_0; - float * x_dmf = (float *) x_dm; - - const block_q8_0 * bx0 = (const block_q8_0 *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q8_0 * bxi = 
bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { - int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d; - } -} - -static __dpct_inline__ float vec_dot_q8_0_q8_1_mul_mat( - const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, - const int *__restrict__ x_qh, const int *__restrict__ x_sc, - const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k) { - (void)x_qh; (void)x_sc; - - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - - return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0], - y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]); -} - -static __dpct_inline__ float -vec_dot_q2_K_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs) { - - const block_q2_K * bq2_K = (const block_q2_K *) vbq; - - const int bq8_offset = QR2_K * (iqs / QI8_1); - const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); - - const uint8_t * scales = bq2_K->scales + scale_offset; - - const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs); - int u[QR2_K]; - float d8[QR2_K]; - -#pragma unroll - for (int i = 0; i < QR2_K; ++ i) { - u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + i].ds[0]; - } - - return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); -} - -template -static __dpct_inline__ void -allocate_tiles_q2_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, - int 
*tile_x_ql_q2_K, sycl::half2 *tile_x_dm_q2_K, - int *tile_x_sc_q2_K) { - (void)x_qh; - - *x_ql = tile_x_ql_q2_K; - *x_dm = tile_x_dm_q2_K; - *x_sc = tile_x_sc_q2_K; -} - -template -static __dpct_inline__ void -load_tiles_q2_K(const void *__restrict__ vx, int *__restrict__ x_ql, - sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, - int *__restrict__ x_sc, const int &i_offset, const int &i_max, - const int &k, const int &blocks_per_row) { - (void)x_qh; - - GGML_SYCL_ASSUME(i_offset >= 0); - GGML_SYCL_ASSUME(i_offset < nwarps); - GGML_SYCL_ASSUME(k >= 0); - GGML_SYCL_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI2_K; - const int kqsx = k % QI2_K; - - const block_q2_K * bx0 = (const block_q2_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI2_K; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { - int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm; - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { - int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4); - - x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); - } -} - -static __dpct_inline__ float vec_dot_q2_K_q8_1_mul_mat( - const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, - const int *__restrict__ x_qh, const int *__restrict__ x_sc, - 
const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k) { - (void)x_qh; - - const int kbx = k / QI2_K; - const int ky = (k % QI2_K) * QR2_K; - const float * y_df = (const float *) y_ds; - - int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; - - const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); - const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); - -#pragma unroll - for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) { - v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; - } - - const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; - - const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; - return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); -} - -static __dpct_inline__ float -vec_dot_q3_K_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs) { - - const block_q3_K * bq3_K = (const block_q3_K *) vbq; - - const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); - const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); - - const float d = bq3_K->d; - - const int vl = get_int_from_uint8(bq3_K->qs, iqs); - - // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted - const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset; - - int u[QR3_K]; - float d8[QR3_K]; - -#pragma unroll - for (int i = 0; i < QR3_K; ++i) { - u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + i].ds[0]; - } - - return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); -} - -template -static __dpct_inline__ void -allocate_tiles_q3_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, - int *tile_x_ql_q3_K, sycl::half2 *tile_x_dm_q3_K, - int *tile_x_qh_q3_K, int *tile_x_sc_q3_K) { - - *x_ql = tile_x_ql_q3_K; - *x_dm = tile_x_dm_q3_K; - *x_qh = 
tile_x_qh_q3_K; - *x_sc = tile_x_sc_q3_K; -} - -template -static __dpct_inline__ void -load_tiles_q3_K(const void *__restrict__ vx, int *__restrict__ x_ql, - sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, - int *__restrict__ x_sc, const int &i_offset, const int &i_max, - const int &k, const int &blocks_per_row) { - - GGML_SYCL_ASSUME(i_offset >= 0); - GGML_SYCL_ASSUME(i_offset < nwarps); - GGML_SYCL_ASSUME(k >= 0); - GGML_SYCL_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI3_K; - const int kqsx = k % QI3_K; - - const block_q3_K * bx0 = (const block_q3_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI3_K; - const int kbxd = k % blocks_per_tile_x_row; - float * x_dmf = (float *) x_dm; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { - int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d; - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { - int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2); - - // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted - x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { - int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % 
(WARP_SIZE/4)) / (QI3_K/4); - - const int ksc = k % (QI3_K/4); - - const int ksc_low = ksc % (QI3_K/8); - const int shift_low = 4 * (ksc / (QI3_K/8)); - const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F; - - const int ksc_high = QI3_K/8; - const int shift_high = 2 * ksc; - const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030; - - const int sc = dpct::vectorized_binary( - sc_low | sc_high, 0x20202020, dpct::sub_sat()); - - x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc; - } -} - -static __dpct_inline__ float vec_dot_q3_K_q8_1_mul_mat( - const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, - const int *__restrict__ x_qh, const int *__restrict__ x_sc, - const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k) { - - const int kbx = k / QI3_K; - const int ky = (k % QI3_K) * QR3_K; - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - - const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; - - int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { - const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); - const int shift = 2 * ((ky % 32) / 8); - const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; - - const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); - const int vlh = (vh << 2) & 0x04040404; - - v[l] = dpct::vectorized_binary(vll, vlh, dpct::sub_sat()); - } - - const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; - return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); -} - -static __dpct_inline__ float -vec_dot_q4_K_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs) { - - const 
block_q4_K * bq4_K = (const block_q4_K *) vbq; - - int v[2]; - int u[2*QR4_K]; - float d8[QR4_K]; - - // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6 - const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2)); - - // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12 - // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44 - // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76 - // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108 - - const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); - v[0] = q4[0]; - v[1] = q4[4]; - - const uint16_t * scales = (const uint16_t *)bq4_K->scales; - uint16_t aux[2]; - const int j = bq8_offset/2; - if (j < 2) { - aux[0] = scales[j+0] & 0x3f3f; - aux[1] = scales[j+2] & 0x3f3f; - } else { - aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); - aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); - } - const uint8_t * sc = (const uint8_t *)aux; - const uint8_t * m = sc + 2; - - for (int i = 0; i < QR4_K; ++i) { - const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = bq8i->ds[0]; - - const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); - u[2*i+0] = q8[0]; - u[2*i+1] = q8[4]; - } - - return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); -} - -template -static __dpct_inline__ void -allocate_tiles_q4_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, - int *tile_x_ql_q4_K, sycl::half2 *tile_x_dm_q4_K, - int *tile_x_sc_q4_K) { - (void)x_qh; - - *x_ql = tile_x_ql_q4_K; - *x_dm = tile_x_dm_q4_K; - *x_sc = tile_x_sc_q4_K; -} - -template -static __dpct_inline__ void -load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql, - sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, - int *__restrict__ x_sc, const int &i_offset, const int &i_max, - const int &k, const int &blocks_per_row) { - (void)x_qh; - - GGML_SYCL_ASSUME(i_offset >= 0); - GGML_SYCL_ASSUME(i_offset < nwarps); - 
GGML_SYCL_ASSUME(k >= 0); - GGML_SYCL_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI4_K; // == 0 if QK_K == 256 - const int kqsx = k % QI4_K; // == k if QK_K == 256 - - const block_q4_K * bx0 = (const block_q4_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 - const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { - int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); - - const int * scales = (const int *) bxi->scales; - - const int ksc = k % (WARP_SIZE/8); - - // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 - int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits - scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits - - x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; - } -} - -static __dpct_inline__ float vec_dot_q4_K_q8_1_mul_mat( - const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, - const int *__restrict__ x_qh, const int *__restrict__ x_sc, - const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k) { - (void)x_qh; 
- - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); - - const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; - return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8, - x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); -} - -static __dpct_inline__ float -vec_dot_q5_K_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs) { - - const block_q5_K * bq5_K = (const block_q5_K *) vbq; - - int vl[2]; - int vh[2]; - int u[2*QR5_K]; - float d8[QR5_K]; - - const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2)); - const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); - const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4)); - - vl[0] = ql[0]; - vl[1] = ql[4]; - - vh[0] = qh[0] >> bq8_offset; - vh[1] = qh[4] >> bq8_offset; - - const uint16_t * scales = (const uint16_t *)bq5_K->scales; - uint16_t aux[2]; - const int j = bq8_offset/2; - if (j < 2) { - aux[0] = scales[j+0] & 0x3f3f; - aux[1] = scales[j+2] & 0x3f3f; - } else { - aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); - aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); - } - const uint8_t * sc = (const uint8_t *)aux; - const uint8_t * m = sc + 2; - -#pragma unroll - for (int i = 0; i < QR5_K; ++i) { - const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = bq8i->ds[0]; - - const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); - u[2*i+0] = q8[0]; - u[2*i+1] = q8[4]; - } - - return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); -} - -template -static __dpct_inline__ void -allocate_tiles_q5_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, - int *tile_x_ql_q5_K, sycl::half2 *tile_x_dm_q5_K, - int *tile_x_sc_q5_K) { - (void)x_qh; - - *x_ql = tile_x_ql_q5_K; - *x_dm = tile_x_dm_q5_K; - *x_sc = tile_x_sc_q5_K; -} - -template -static __dpct_inline__ void -load_tiles_q5_K(const void 
*__restrict__ vx, int *__restrict__ x_ql, - sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, - int *__restrict__ x_sc, const int &i_offset, const int &i_max, - const int &k, const int &blocks_per_row) { - (void)x_qh; - - GGML_SYCL_ASSUME(i_offset >= 0); - GGML_SYCL_ASSUME(i_offset < nwarps); - GGML_SYCL_ASSUME(k >= 0); - GGML_SYCL_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI5_K; // == 0 if QK_K == 256 - const int kqsx = k % QI5_K; // == k if QK_K == 256 - - const block_q5_K * bx0 = (const block_q5_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx; - const int ky = QR5_K*kqsx; - - const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); - const int ql0 = (ql >> 0) & 0x0F0F0F0F; - const int ql1 = (ql >> 4) & 0x0F0F0F0F; - - const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4)); - const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010; - const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010; - - const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0; - const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4); - - x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0; - x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1; - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256 - const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { - int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; - - if (need_check) { - i = sycl::min(i, 
i_max); - } - - const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); - - const int * scales = (const int *) bxi->scales; - - const int ksc = k % (WARP_SIZE/8); - - // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 - int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits - scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits - - x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; - } -} - -static __dpct_inline__ float vec_dot_q5_K_q8_1_mul_mat( - const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, - const int *__restrict__ x_qh, const int *__restrict__ x_sc, - const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k) { - (void)x_qh; - - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); - - const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; - const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; - return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, - x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); -} - -static __dpct_inline__ float -vec_dot_q6_K_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs) { - - const block_q6_K * bq6_K = (const block_q6_K *) vbq; - - const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); - const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); - const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); - - const int vl = get_int_from_uint8(bq6_K->ql, iqs); - const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift; - - const int8_t * scales = bq6_K->scales + scale_offset; - - int u[QR6_K]; - float d8[QR6_K]; - -#pragma unroll - for (int i = 0; i < QR6_K; ++i) { - u[i] = 
get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + 2 * i].ds[0]; - } - - return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); -} - -template -static __dpct_inline__ void -allocate_tiles_q6_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, - int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) { - (void)x_qh; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; - *x_sc = tile_x_sc; -} - -template -static __dpct_inline__ void -load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql, - sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, - int *__restrict__ x_sc, const int &i_offset, const int &i_max, - const int &k, const int &blocks_per_row) { - (void)x_qh; - - GGML_SYCL_ASSUME(i_offset >= 0); - GGML_SYCL_ASSUME(i_offset < nwarps); - GGML_SYCL_ASSUME(k >= 0); - GGML_SYCL_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI6_K; // == 0 if QK_K == 256 - const int kqsx = k % QI6_K; // == k if QK_K == 256 - - const block_q6_K * bx0 = (const block_q6_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx; - const int ky = QR6_K*kqsx; - - const int ql = get_int_from_uint8(bxi->ql, kqsx); - const int ql0 = (ql >> 0) & 0x0F0F0F0F; - const int ql1 = (ql >> 4) & 0x0F0F0F0F; - - const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)); - const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030; - const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030; - - const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0; - const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2); - - x_ql[i * (2 * WARP_SIZE + 1) + kq0] = - dpct::vectorized_binary(ql0 | qh0, 0x20202020, - dpct::sub_sat()); - x_ql[i * (2 * WARP_SIZE + 1) + kq1] = - dpct::vectorized_binary(ql1 | qh1, 0x20202020, - 
dpct::sub_sat()); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 - const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 - float * x_dmf = (float *) x_dm; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) { - int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d; - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; - - if (need_check) { - i = sycl::min(i, i_max); - } - - const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4; - - x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); - } -} - -static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat( - const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, - const int *__restrict__ x_qh, const int *__restrict__ x_sc, - const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k) { - (void)x_qh; - - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - - const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]); - - const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; - const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE; - return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); -} - - -static __dpct_inline__ float -vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs, - const uint8_t *kmask_iq2xs) { - const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq; - -#if QR2_XXS == 8 - const int ib32 = iqs; - const 
uint16_t * q2 = bq2->qs + 4*ib32; - const uint8_t * aux8 = (const uint8_t *)q2; - const int8_t * q8 = bq8_1[ib32].qs; - uint32_t aux32 = q2[2] | (q2[3] << 16); - int sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); - const uint8_t signs = ksigns_iq2xs[aux32 & 127]; - for (int j = 0; j < 8; ++j) { - sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1); - } - q8 += 8; - aux32 >>= 7; - } - const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.25f; - return d * sumi; -#else - // iqs is 0...15 - const int ib32 = iqs/2; - const int il = iqs%2; - const uint16_t * q2 = bq2->qs + 4*ib32; - const uint8_t * aux8 = (const uint8_t *)q2; - const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]); - const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]); - const uint32_t aux32 = q2[2] | (q2[3] << 16); - const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * bq8_1[ib32].ds[0] * 0.25f; - const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127]; - const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127]; - const int8_t * q8 = bq8_1[ib32].qs + 16*il; - int sumi1 = 0, sumi2 = 0; - for (int j = 0; j < 8; ++j) { - sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1); - sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? 
-1 : 1); - } - return d * (sumi1 + sumi2); -#endif -} - -static __dpct_inline__ float -vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const uint64_t *iq2xs_grid, const uint64_t *ksigns64) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq; - - const int ib32 = iqs; - const uint16_t * q2 = bq2->qs + 4*ib32; - const int8_t * q8 = bq8_1[ib32].qs; - const uint8_t ls1 = bq2->scales[ib32] & 0xf; - const uint8_t ls2 = bq2->scales[ib32] >> 4; - int sumi1 = 0; - for (int l = 0; l < 2; ++l) { - const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511)); - const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9)); - const int grid_l = dpct::vectorized_binary( - grid[0] ^ signs[0], signs[0], std::minus<>()); - const int grid_h = dpct::vectorized_binary( - grid[1] ^ signs[1], signs[1], std::minus<>()); - sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1); - sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1); - q8 += 8; - } - int sumi2 = 0; - for (int l = 2; l < 4; ++l) { - const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511)); - const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9)); - const int grid_l = dpct::vectorized_binary( - grid[0] ^ signs[0], signs[0], std::minus<>()); - const int grid_h = dpct::vectorized_binary( - grid[1] ^ signs[1], signs[1], std::minus<>()); - sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2); - sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2); - q8 += 8; - } - const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f; - return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); -#else - assert(false); - return 0.f; -#endif -} - -static __dpct_inline__ float -vec_dot_iq2_s_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs) { - const block_iq2_s * bq2 = (const 
block_iq2_s *) vbq; - - const int ib32 = iqs; - const int8_t * q8 = bq8_1[ib32].qs; - const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32; - const uint8_t ls1 = bq2->scales[ib32] & 0xf; - const uint8_t ls2 = bq2->scales[ib32] >> 4; - int sumi1 = 0; - for (int l = 0; l < 2; ++l) { - const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); - const uint32_t signs0 = dpct::vectorized_binary( - ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, - std::equal_to<>()); - const uint32_t signs1 = dpct::vectorized_binary( - ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, - std::equal_to<>()); - const int grid_l = dpct::vectorized_binary( - grid[0] ^ signs0, signs0, std::minus<>()); - const int grid_h = dpct::vectorized_binary( - grid[1] ^ signs1, signs1, std::minus<>()); - sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1); - sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1); - q8 += 8; - } - int sumi2 = 0; - for (int l = 2; l < 4; ++l) { - const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); - const uint32_t signs0 = dpct::vectorized_binary( - ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, - std::equal_to<>()); - const uint32_t signs1 = dpct::vectorized_binary( - ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, - std::equal_to<>()); - const int grid_l = dpct::vectorized_binary( - grid[0] ^ signs0, signs0, std::minus<>()); - const int grid_h = dpct::vectorized_binary( - grid[1] ^ signs1, signs1, std::minus<>()); - sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2); - sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2); - q8 += 8; - } - const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f; - return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); -} - -static __dpct_inline__ float -vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - 
const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq; - - const int ib32 = iqs; - const uint8_t * q3 = bq2->qs + 8*ib32; - const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32; - const int8_t * q8 = bq8_1[ib32].qs; - uint32_t aux32 = gas[0] | (gas[1] << 16); - int sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0]; - const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1]; - const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127)); - const int grid_l = dpct::vectorized_binary( - grid1[0] ^ signs[0], signs[0], std::minus<>()); - const int grid_h = dpct::vectorized_binary( - grid2[0] ^ signs[1], signs[1], std::minus<>()); - sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi); - sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi); - q8 += 8; - aux32 >>= 7; - } - const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.5f; - return d * sumi; -#else - assert(false); - return 0.f; -#endif -} - -static __dpct_inline__ float -vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const uint32_t *iq3s_grid) { - const block_iq3_s * bq2 = (const block_iq3_s *) vbq; - - const int ib32 = iqs; - const uint8_t * qs = bq2->qs + 8*ib32; - const int8_t * q8 = bq8_1[ib32].qs; - int sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); - const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)); - uint32_t signs0 = dpct::vectorized_binary( - ((bq2->signs[4 * ib32 + l] & 0xf) * 0x01010101) & 0x08040201, - 0x08040201, std::equal_to<>()); - uint32_t signs1 = dpct::vectorized_binary( - ((bq2->signs[4 * ib32 + l] >> 4) * 0x01010101) & 0x08040201, - 0x08040201, std::equal_to<>()); - const int grid_l = 
dpct::vectorized_binary( - grid1[0] ^ signs0, signs0, std::minus<>()); - const int grid_h = dpct::vectorized_binary( - grid2[0] ^ signs1, signs1, std::minus<>()); - sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi); - sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi); - q8 += 8; - } - const float d = - (float)bq2->d * - (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) * - bq8_1[ib32].ds[0]; - return d * sumi; -} - -static __dpct_inline__ float -vec_dot_iq1_s_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const uint32_t *iq1s_grid_gpu) { - const block_iq1_s * bq1 = (const block_iq1_s *) vbq; - - const int ib32 = iqs; - int sumi = 0; - const int * q8 = (const int *)bq8_1[ib32].qs; - for (int l = 0; l < 4; ++l) { - const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8))); - int grid0 = grid[0] & 0x0f0f0f0f; - int grid1 = (grid[0] >> 4) & 0x0f0f0f0f; - sumi = dpct::dp4a(q8[2 * l + 1], grid1, - dpct::dp4a(q8[2 * l + 0], grid0, sumi)); - } - - const float delta = bq1->qh[ib32] & 0x8000 ? 
-1-IQ1S_DELTA : -1+IQ1S_DELTA; - const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1); - const float d = d1q * bq8_1[ib32].ds[0]; - const float m = d1q * bq8_1[ib32].ds[1]; - return d * sumi + m * delta; -} - -static __dpct_inline__ float -vec_dot_iq1_m_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs) { - const block_iq1_m * bq1 = (const block_iq1_m *) vbq; - - const int ib32 = iqs; - int sumi[2] = {0, 0}; - float sumf[2] = {0.f, 0.f}; - - const int * q8 = (const int *)bq8_1[ib32].qs; - for (int l = 0; l < 4; ++l) { - const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8))); - int grid0 = grid[0] & 0x0f0f0f0f; - int grid1 = (grid[0] >> 4) & 0x0f0f0f0f; - sumi[l / 2] = dpct::dp4a(q8[2 * l + 1], grid1, - dpct::dp4a(q8[2 * l + 0], grid0, sumi[l / 2])); - const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA; - const int sumy = dpct::dp4a(q8[2 * l + 1], 0x01010101, - dpct::dp4a(q8[2 * l + 0], 0x01010101, 0)); - sumf[l/2] += delta*sumy; - } - - iq1m_scale_t scale; - const uint16_t * sc = (const uint16_t *)bq1->scales; - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - const float d = (float)scale.f16 * bq8_1[ib32].ds[0]; - return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1)); -} - -static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4, - const uint8_t *values, - int &val1, int &val2) { - - uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32; - aux32 = q4 & 0x0f0f0f0f; - uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8); - uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8); - val1 = v1 | (v2 << 16); - aux32 = (q4 >> 4) & 0x0f0f0f0f; - v1 = values[q8[0]] | (values[q8[1]] << 8); - v2 = values[q8[2]] | (values[q8[3]] << 8); - val2 = v1 | (v2 << 16); 
-} - - -static __dpct_inline__ float -vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs) { - - const block_iq4_nl * bq = (const block_iq4_nl *) vbq; - - const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs; - const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs; - - const uint8_t * values = (const uint8_t *)kvalues_iq4nl; - - int v1, v2; - int sumi1 = 0, sumi2 = 0; - for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) { - const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16); - get_int_from_table_16(aux, values, v1, v2); - sumi1 = dpct::dp4a(v1, q8[l + 0], sumi1); - sumi2 = dpct::dp4a(v2, q8[l + 4], sumi2); - } - - const float d = (float)bq->d * bq8_1->ds[0]; - return d * (sumi1 + sumi2); -} - - -static __dpct_inline__ float -vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs) { - - const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; - const uint8_t * values = (const uint8_t *)kvalues_iq4nl; - - // iqs is 0...7 - const int ib32 = iqs; - const int32_t * q8 = (const int *)bq8_1[ib32].qs; - const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32; - const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4); - const float d = (float)bq4->d * (ls - 32) * bq8_1[ib32].ds[0]; - int v1, v2; - int sumi1 = 0, sumi2 = 0; - for (int j = 0; j < 4; ++j) { - get_int_from_table_16(q4[j], values, v1, v2); - sumi1 = dpct::dp4a(v1, q8[j + 0], sumi1); - sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2); - } - return d * (sumi1 + sumi2); -} - -template -/* -DPCT1110:8: The total declared local variable size in device function mul_mat_q -exceeds 128 bytes and may cause high register pressure. Consult with your -hardware vendor to find the total register size available and adjust the code, -or use smaller sub-group size to avoid high register pressure. 
-*/ -static __dpct_inline__ void -mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy, - float *__restrict__ dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_qh, - int *tile_x_sc, const sycl::nd_item<3> &item_ct1, int *tile_y_qs, - sycl::half2 *tile_y_ds) { - - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; - - const int blocks_per_row_x = ncols_x / qk; - const int blocks_per_col_y = nrows_y / QK8_1; - const int blocks_per_warp = WARP_SIZE / qi; - - const int & ncols_dst = ncols_y; - - const int row_dst_0 = item_ct1.get_group(2) * mmq_y; - const int & row_x_0 = row_dst_0; - - const int col_dst_0 = item_ct1.get_group(1) * mmq_x; - const int & col_y_0 = col_dst_0; - - float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}}; - - for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { - - load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, - tile_x_qh, tile_x_sc, item_ct1.get_local_id(1), - nrows_x - row_x_0 - 1, item_ct1.get_local_id(2), - blocks_per_row_x); - -#pragma unroll - for (int ir = 0; ir < qr; ++ir) { - const int kqs = ir * WARP_SIZE + item_ct1.get_local_id(2); - const int kbxd = kqs / QI8_1; - -#pragma unroll - for (int i = 0; i < mmq_x; i += nwarps) { - const int col_y_eff = dpct::min( - (unsigned int)(col_y_0 + item_ct1.get_local_id(1) + i), - ncols_y - 1); // to prevent out-of-bounds memory accesses - - const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd]; - - const int index_y = (item_ct1.get_local_id(1) + i) * WARP_SIZE + - kqs % WARP_SIZE; - tile_y_qs[index_y] = get_int_from_int8_aligned( - by0->qs, item_ct1.get_local_id(2) % QI8_1); - } - -#pragma unroll - for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { - const int ids = - (ids0 + item_ct1.get_local_id(1) * QI8_1 + - item_ct1.get_local_id(2) / (WARP_SIZE / QI8_1)) 
% - mmq_x; - const int kby = item_ct1.get_local_id(2) % (WARP_SIZE / QI8_1); - const int col_y_eff = sycl::min(col_y_0 + ids, ncols_y - 1); - - // if the sum is not needed it's faster to transform the scale to f32 ahead of time - const sycl::half2 *dsi_src = - &y[col_y_eff * blocks_per_col_y + ib0 * (qk / QK8_1) + - ir * (WARP_SIZE / QI8_1) + kby] - .ds; - sycl::half2 *dsi_dst = - &tile_y_ds[ids * (WARP_SIZE / QI8_1) + kby]; - if (need_sum) { - *dsi_dst = *dsi_src; - } else { - float * dfi_dst = (float *) dsi_dst; - *dfi_dst = (*dsi_src)[0]; - } - } - - /* - DPCT1118:9: SYCL group functions and algorithms must be encountered - in converged control flow. You may need to adjust the code. - */ - /* - DPCT1065:56: Consider replacing sycl::nd_item::barrier() with - sycl::nd_item::barrier(sycl::access::fence_space::local_space) for - better performance if there is no access to global memory. - */ - item_ct1.barrier(); - -// #pragma unroll // unrolling this loop causes too much register pressure - for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) { -#pragma unroll - for (int j = 0; j < mmq_x; j += nwarps) { -#pragma unroll - for (int i = 0; i < mmq_y; i += WARP_SIZE) { - sum[i / WARP_SIZE][j / nwarps] += vec_dot( - tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, - tile_y_qs, tile_y_ds, item_ct1.get_local_id(2) + i, - item_ct1.get_local_id(1) + j, k); - } - } - } - - /* - DPCT1118:10: SYCL group functions and algorithms must be encountered - in converged control flow. You may need to adjust the code. - */ - /* - DPCT1065:57: Consider replacing sycl::nd_item::barrier() with - sycl::nd_item::barrier(sycl::access::fence_space::local_space) for - better performance if there is no access to global memory. 
- */ - item_ct1.barrier(); - } - } - -#pragma unroll - for (int j = 0; j < mmq_x; j += nwarps) { - const int col_dst = col_dst_0 + j + item_ct1.get_local_id(1); - - if (col_dst >= ncols_dst) { - return; - } - -#pragma unroll - for (int i = 0; i < mmq_y; i += WARP_SIZE) { - const int row_dst = row_dst_0 + item_ct1.get_local_id(2) + i; - - if (row_dst >= nrows_dst) { - continue; - } - - dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps]; - } - } -} - -#define MMQ_X_Q4_0_RDNA2 64 -#define MMQ_Y_Q4_0_RDNA2 128 -#define NWARPS_Q4_0_RDNA2 8 -#define MMQ_X_Q4_0_RDNA1 64 -#define MMQ_Y_Q4_0_RDNA1 64 -#define NWARPS_Q4_0_RDNA1 8 -#if defined(SYCL_USE_XMX) -#define MMQ_X_Q4_0_AMPERE 4 -#define MMQ_Y_Q4_0_AMPERE 32 -#define NWARPS_Q4_0_AMPERE 4 -#else -#define MMQ_X_Q4_0_AMPERE 64 -#define MMQ_Y_Q4_0_AMPERE 128 -#define NWARPS_Q4_0_AMPERE 4 -#endif -#define MMQ_X_Q4_0_PASCAL 64 -#define MMQ_Y_Q4_0_PASCAL 64 -#define NWARPS_Q4_0_PASCAL 8 - -template static void - mul_mat_q4_0( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_0, float *tile_x_d_q4_0, - int *tile_y_qs, sycl::half2 *tile_y_ds) { - int * tile_x_ql = nullptr; - sycl::half2 *tile_x_dm = nullptr; - int * tile_x_qh = nullptr; - int * tile_x_sc = nullptr; - -//sycl_todo: change according to hardware - - const int mmq_x = MMQ_X_Q4_0_AMPERE; - const int mmq_y = MMQ_Y_Q4_0_AMPERE; - const int nwarps = NWARPS_Q4_0_AMPERE; - allocate_tiles_q4_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, - tile_x_qs_q4_0, tile_x_d_q4_0); - mul_mat_q, VDR_Q4_0_Q8_1_MMQ, - vec_dot_q4_0_q8_1_mul_mat>( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, - tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); -} - -#define MMQ_X_Q4_1_RDNA2 64 -#define MMQ_Y_Q4_1_RDNA2 128 -#define NWARPS_Q4_1_RDNA2 8 -#define 
MMQ_X_Q4_1_RDNA1 64 -#define MMQ_Y_Q4_1_RDNA1 64 -#define NWARPS_Q4_1_RDNA1 8 -#if defined(SYCL_USE_XMX) -#define MMQ_X_Q4_1_AMPERE 4 -#define MMQ_Y_Q4_1_AMPERE 32 -#define NWARPS_Q4_1_AMPERE 4 -#else -#define MMQ_X_Q4_1_AMPERE 64 -#define MMQ_Y_Q4_1_AMPERE 128 -#define NWARPS_Q4_1_AMPERE 4 -#endif -#define MMQ_X_Q4_1_PASCAL 64 -#define MMQ_Y_Q4_1_PASCAL 64 -#define NWARPS_Q4_1_PASCAL 8 - -template static void - mul_mat_q4_1( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_1, - sycl::half2 *tile_x_dm_q4_1, int *tile_y_qs, sycl::half2 *tile_y_ds) { - int * tile_x_ql = nullptr; - sycl::half2 *tile_x_dm = nullptr; - int * tile_x_qh = nullptr; - int * tile_x_sc = nullptr; - -//sycl_todo: change according to hardware - const int mmq_x = MMQ_X_Q4_1_AMPERE; - const int mmq_y = MMQ_Y_Q4_1_AMPERE; - const int nwarps = NWARPS_Q4_1_AMPERE; - allocate_tiles_q4_1(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, - tile_x_qs_q4_1, tile_x_dm_q4_1); - mul_mat_q, VDR_Q4_1_Q8_1_MMQ, - vec_dot_q4_1_q8_1_mul_mat>( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, - tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); -} - -#define MMQ_X_Q5_0_RDNA2 64 -#define MMQ_Y_Q5_0_RDNA2 128 -#define NWARPS_Q5_0_RDNA2 8 -#define MMQ_X_Q5_0_RDNA1 64 -#define MMQ_Y_Q5_0_RDNA1 64 -#define NWARPS_Q5_0_RDNA1 8 -#if defined(SYCL_USE_XMX) -#define MMQ_X_Q5_0_AMPERE 4 -#define MMQ_Y_Q5_0_AMPERE 32 -#define NWARPS_Q5_0_AMPERE 4 -#else -#define MMQ_X_Q5_0_AMPERE 128 -#define MMQ_Y_Q5_0_AMPERE 64 -#define NWARPS_Q5_0_AMPERE 4 -#endif -#define MMQ_X_Q5_0_PASCAL 64 -#define MMQ_Y_Q5_0_PASCAL 64 -#define NWARPS_Q5_0_PASCAL 8 - -template static void - mul_mat_q5_0( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int 
nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_0, float *tile_x_d_q5_0, - int *tile_y_qs, sycl::half2 *tile_y_ds) { - int * tile_x_ql = nullptr; - sycl::half2 *tile_x_dm = nullptr; - int * tile_x_qh = nullptr; - int * tile_x_sc = nullptr; - -//sycl_todo: change according to hardware - const int mmq_x = MMQ_X_Q5_0_AMPERE; - const int mmq_y = MMQ_Y_Q5_0_AMPERE; - const int nwarps = NWARPS_Q5_0_AMPERE; - allocate_tiles_q5_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, - tile_x_ql_q5_0, tile_x_d_q5_0); - mul_mat_q, VDR_Q5_0_Q8_1_MMQ, - vec_dot_q5_0_q8_1_mul_mat>( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, - tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); -} - -#define MMQ_X_Q5_1_RDNA2 64 -#define MMQ_Y_Q5_1_RDNA2 128 -#define NWARPS_Q5_1_RDNA2 8 -#define MMQ_X_Q5_1_RDNA1 64 -#define MMQ_Y_Q5_1_RDNA1 64 -#define NWARPS_Q5_1_RDNA1 8 -#if defined(SYCL_USE_XMX) -#define MMQ_X_Q5_1_AMPERE 4 -#define MMQ_Y_Q5_1_AMPERE 32 -#define NWARPS_Q5_1_AMPERE 4 -#else -#define MMQ_X_Q5_1_AMPERE 128 -#define MMQ_Y_Q5_1_AMPERE 64 -#define NWARPS_Q5_1_AMPERE 4 -#endif -#define MMQ_X_Q5_1_PASCAL 64 -#define MMQ_Y_Q5_1_PASCAL 64 -#define NWARPS_Q5_1_PASCAL 8 - -template static void -mul_mat_q5_1( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_1, - sycl::half2 *tile_x_dm_q5_1, int *tile_y_qs, sycl::half2 *tile_y_ds) { - int * tile_x_ql = nullptr; - sycl::half2 *tile_x_dm = nullptr; - int * tile_x_qh = nullptr; - int * tile_x_sc = nullptr; - -//sycl_todo: change according to hardware - const int mmq_x = MMQ_X_Q5_1_AMPERE; - const int mmq_y = MMQ_Y_Q5_1_AMPERE; - const int nwarps = NWARPS_Q5_1_AMPERE; - allocate_tiles_q5_1(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, - 
tile_x_ql_q5_1, tile_x_dm_q5_1); - mul_mat_q, VDR_Q5_1_Q8_1_MMQ, - vec_dot_q5_1_q8_1_mul_mat>( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, - tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); -} - -#define MMQ_X_Q8_0_RDNA2 64 -#define MMQ_Y_Q8_0_RDNA2 128 -#define NWARPS_Q8_0_RDNA2 8 -#define MMQ_X_Q8_0_RDNA1 64 -#define MMQ_Y_Q8_0_RDNA1 64 -#define NWARPS_Q8_0_RDNA1 8 -#if defined(SYCL_USE_XMX) -#define MMQ_X_Q8_0_AMPERE 4 -#define MMQ_Y_Q8_0_AMPERE 32 -#define NWARPS_Q8_0_AMPERE 4 -#else -#define MMQ_X_Q8_0_AMPERE 128 -#define MMQ_Y_Q8_0_AMPERE 64 -#define NWARPS_Q8_0_AMPERE 4 -#endif -#define MMQ_X_Q8_0_PASCAL 64 -#define MMQ_Y_Q8_0_PASCAL 64 -#define NWARPS_Q8_0_PASCAL 8 - -template static void - mul_mat_q8_0( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q8_0, float *tile_x_d_q8_0, - int *tile_y_qs, sycl::half2 *tile_y_ds) { - int * tile_x_ql = nullptr; - sycl::half2 *tile_x_dm = nullptr; - int * tile_x_qh = nullptr; - int * tile_x_sc = nullptr; - -//sycl_todo: change according to hardware - const int mmq_x = MMQ_X_Q8_0_AMPERE; - const int mmq_y = MMQ_Y_Q8_0_AMPERE; - const int nwarps = NWARPS_Q8_0_AMPERE; - allocate_tiles_q8_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, - tile_x_qs_q8_0, tile_x_d_q8_0); - mul_mat_q, VDR_Q8_0_Q8_1_MMQ, - vec_dot_q8_0_q8_1_mul_mat>( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, - tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); -} - -#define MMQ_X_Q2_K_RDNA2 64 -#define MMQ_Y_Q2_K_RDNA2 128 -#define NWARPS_Q2_K_RDNA2 8 -#define MMQ_X_Q2_K_RDNA1 128 -#define MMQ_Y_Q2_K_RDNA1 32 -#define NWARPS_Q2_K_RDNA1 8 -#if defined(SYCL_USE_XMX) -#define MMQ_X_Q2_K_AMPERE 4 -#define MMQ_Y_Q2_K_AMPERE 32 -#define NWARPS_Q2_K_AMPERE 4 -#else -#define 
MMQ_X_Q2_K_AMPERE 64 -#define MMQ_Y_Q2_K_AMPERE 128 -#define NWARPS_Q2_K_AMPERE 4 -#endif -#define MMQ_X_Q2_K_PASCAL 64 -#define MMQ_Y_Q2_K_PASCAL 64 -#define NWARPS_Q2_K_PASCAL 8 - -template static void -mul_mat_q2_K( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q2_K, - sycl::half2 *tile_x_dm_q2_K, int *tile_x_sc_q2_K, int *tile_y_qs, - sycl::half2 *tile_y_ds) { - int * tile_x_ql = nullptr; - sycl::half2 *tile_x_dm = nullptr; - int * tile_x_qh = nullptr; - int * tile_x_sc = nullptr; - -//sycl_todo: change according to hardware - const int mmq_x = MMQ_X_Q2_K_AMPERE; - const int mmq_y = MMQ_Y_Q2_K_AMPERE; - const int nwarps = NWARPS_Q2_K_AMPERE; - allocate_tiles_q2_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, - tile_x_ql_q2_K, tile_x_dm_q2_K, tile_x_sc_q2_K); - mul_mat_q, VDR_Q2_K_Q8_1_MMQ, - vec_dot_q2_K_q8_1_mul_mat>( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, - tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); -} - -#define MMQ_X_Q3_K_RDNA2 128 -#define MMQ_Y_Q3_K_RDNA2 64 -#define NWARPS_Q3_K_RDNA2 8 -#define MMQ_X_Q3_K_RDNA1 32 -#define MMQ_Y_Q3_K_RDNA1 128 -#define NWARPS_Q3_K_RDNA1 8 -#if defined(SYCL_USE_XMX) -#define MMQ_X_Q3_K_AMPERE 4 -#define MMQ_Y_Q3_K_AMPERE 32 -#define NWARPS_Q3_K_AMPERE 4 -#else -#define MMQ_X_Q3_K_AMPERE 128 -#define MMQ_Y_Q3_K_AMPERE 128 -#define NWARPS_Q3_K_AMPERE 4 -#endif -#define MMQ_X_Q3_K_PASCAL 64 -#define MMQ_Y_Q3_K_PASCAL 64 -#define NWARPS_Q3_K_PASCAL 8 - -template static void -mul_mat_q3_K( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q3_K, - sycl::half2 *tile_x_dm_q3_K, int *tile_x_qh_q3_K, 
int *tile_x_sc_q3_K, - int *tile_y_qs, sycl::half2 *tile_y_ds) { - int * tile_x_ql = nullptr; - sycl::half2 *tile_x_dm = nullptr; - int * tile_x_qh = nullptr; - int * tile_x_sc = nullptr; - -//sycl_todo: change according to hardware - const int mmq_x = MMQ_X_Q3_K_AMPERE; - const int mmq_y = MMQ_Y_Q3_K_AMPERE; - const int nwarps = NWARPS_Q3_K_AMPERE; - allocate_tiles_q3_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, - tile_x_ql_q3_K, tile_x_dm_q3_K, tile_x_qh_q3_K, - tile_x_sc_q3_K); - mul_mat_q, VDR_Q3_K_Q8_1_MMQ, - vec_dot_q3_K_q8_1_mul_mat>( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, - tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); -} - -#define MMQ_X_Q4_K_RDNA2 64 -#define MMQ_Y_Q4_K_RDNA2 128 -#define NWARPS_Q4_K_RDNA2 8 -#define MMQ_X_Q4_K_RDNA1 32 -#define MMQ_Y_Q4_K_RDNA1 64 -#define NWARPS_Q4_K_RDNA1 8 -#if defined(SYCL_USE_XMX) -#define MMQ_X_Q4_K_AMPERE 4 -#define MMQ_Y_Q4_K_AMPERE 32 -#define NWARPS_Q4_K_AMPERE 4 -#else -#define MMQ_X_Q4_K_AMPERE 64 -#define MMQ_Y_Q4_K_AMPERE 128 -#define NWARPS_Q4_K_AMPERE 4 -#endif -#define MMQ_X_Q4_K_PASCAL 64 -#define MMQ_Y_Q4_K_PASCAL 64 -#define NWARPS_Q4_K_PASCAL 8 - -template static void - mul_mat_q4_K( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q4_K, - sycl::half2 *tile_x_dm_q4_K, int *tile_x_sc_q4_K, int *tile_y_qs, - sycl::half2 *tile_y_ds) { - int * tile_x_ql = nullptr; - sycl::half2 *tile_x_dm = nullptr; - int * tile_x_qh = nullptr; - int * tile_x_sc = nullptr; - -//sycl_todo: change according to hardware - const int mmq_x = MMQ_X_Q4_K_AMPERE; - const int mmq_y = MMQ_Y_Q4_K_AMPERE; - const int nwarps = NWARPS_Q4_K_AMPERE; - allocate_tiles_q4_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, - tile_x_ql_q4_K, tile_x_dm_q4_K, tile_x_sc_q4_K); - mul_mat_q, 
VDR_Q4_K_Q8_1_MMQ, - vec_dot_q4_K_q8_1_mul_mat>( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, - tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); -} - -#define MMQ_X_Q5_K_RDNA2 64 -#define MMQ_Y_Q5_K_RDNA2 128 -#define NWARPS_Q5_K_RDNA2 8 -#define MMQ_X_Q5_K_RDNA1 32 -#define MMQ_Y_Q5_K_RDNA1 64 -#define NWARPS_Q5_K_RDNA1 8 -#if defined(SYCL_USE_XMX) -#define MMQ_X_Q5_K_AMPERE 4 -#define MMQ_Y_Q5_K_AMPERE 32 -#define NWARPS_Q5_K_AMPERE 4 -#else -#define MMQ_X_Q5_K_AMPERE 64 -#define MMQ_Y_Q5_K_AMPERE 128 -#define NWARPS_Q5_K_AMPERE 4 -#endif -#define MMQ_X_Q5_K_PASCAL 64 -#define MMQ_Y_Q5_K_PASCAL 64 -#define NWARPS_Q5_K_PASCAL 8 - -template static void -mul_mat_q5_K( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_K, - sycl::half2 *tile_x_dm_q5_K, int *tile_x_sc_q5_K, int *tile_y_qs, - sycl::half2 *tile_y_ds) { - int * tile_x_ql = nullptr; - sycl::half2 *tile_x_dm = nullptr; - int * tile_x_qh = nullptr; - int * tile_x_sc = nullptr; - -//sycl_todo: change according to hardware - const int mmq_x = MMQ_X_Q5_K_AMPERE; - const int mmq_y = MMQ_Y_Q5_K_AMPERE; - const int nwarps = NWARPS_Q5_K_AMPERE; - allocate_tiles_q5_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, - tile_x_ql_q5_K, tile_x_dm_q5_K, tile_x_sc_q5_K); - mul_mat_q, VDR_Q5_K_Q8_1_MMQ, - vec_dot_q5_K_q8_1_mul_mat>( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, - tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); -} - -#define MMQ_X_Q6_K_RDNA2 64 -#define MMQ_Y_Q6_K_RDNA2 128 -#define NWARPS_Q6_K_RDNA2 8 -#define MMQ_X_Q6_K_RDNA1 32 -#define MMQ_Y_Q6_K_RDNA1 64 -#define NWARPS_Q6_K_RDNA1 8 -#if defined(SYCL_USE_XMX) -#define MMQ_X_Q6_K_AMPERE 4 -#define MMQ_Y_Q6_K_AMPERE 32 -#define NWARPS_Q6_K_AMPERE 4 -#else -#define 
MMQ_X_Q6_K_AMPERE 64 -#define MMQ_Y_Q6_K_AMPERE 64 -#define NWARPS_Q6_K_AMPERE 4 -#endif -#define MMQ_X_Q6_K_PASCAL 64 -#define MMQ_Y_Q6_K_PASCAL 64 -#define NWARPS_Q6_K_PASCAL 8 - -template static void - mul_mat_q6_K( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm, - int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) { - // int * tile_x_ql = nullptr; - // sycl::half2 *tile_x_dm = nullptr; - int * tile_x_qh = nullptr; - // int * tile_x_sc = nullptr; - -//sycl_todo: change according to hardware - const int mmq_x = MMQ_X_Q6_K_AMPERE; - const int mmq_y = MMQ_Y_Q6_K_AMPERE; - const int nwarps = NWARPS_Q6_K_AMPERE; - allocate_tiles_q6_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, - tile_x_ql, tile_x_dm, tile_x_sc); - mul_mat_q, VDR_Q6_K_Q8_1_MMQ, - vec_dot_q6_K_q8_1_mul_mat>( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, - tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); -} - -template -static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1) { - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - - if (row >= nrows) { - return; - } - - const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - - const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block - - // partial sum for each thread - float tmp = 0.0f; - - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; - - for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row; - i += blocks_per_warp) { - const int ibx = row * blocks_per_row + i; // x block index - - const int iby = i * (qk / 
QK8_1); // y block index that aligns with ibx - - const int iqs = - vdr * - (item_ct1.get_local_id(2) - - i * qi_vdr); // x block quant index when casting the quants to int - - tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs); - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[row] = tmp; - } -} - -template -static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx, - const void *__restrict__ vy, - float *__restrict__ dst, const int ncols, - const int nrows, - const sycl::nd_item<3> &item_ct1) { - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - - if (row >= nrows) { - return; - } - - const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - -// partial sum for each thread - float tmp = 0.0f; - - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; - - for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; - i += blocks_per_warp) { - const int ibx = row*blocks_per_row + i; // x block index - - const int iby = i * (qk/QK8_1); // y block index that aligns with ibx - - const int iqs = - vdr * - (item_ct1.get_local_id(2) % - (qi / vdr)); // x block quant index when casting the quants to int - - tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs); - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[row] = tmp; - } -} - -template -static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx, - const void *__restrict__ vy, - float *__restrict__ dst, const int ncols, - const int nrows, - const 
sycl::nd_item<3> &item_ct1) { - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - - if (row >= nrows) { - return; - } - - const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - -// partial sum for each thread - float tmp = 0.0f; - - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; - - for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; - i += blocks_per_warp) { - const int ibx = row*blocks_per_row + i; // x block index - - const int iby = i * (qk/QK8_1); // y block index that aligns with ibx - - const int iqs = - vdr * - (item_ct1.get_local_id(2) % - (qi / vdr)); // x block quant index when casting the quants to int - - tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64); - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[row] = tmp; - } -} - -template -static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx, - const void *__restrict__ vy, - float *__restrict__ dst, const int ncols, - const int nrows, - const sycl::nd_item<3> &item_ct1) { - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - - if (row >= nrows) { - return; - } - - const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - -// partial sum for each thread - float tmp = 0.0f; - - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; - - for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; - i += blocks_per_warp) { - const int ibx = row*blocks_per_row + i; // x block index - - const int iby = i * (qk/QK8_1); // y block index that aligns with ibx - - const int iqs = - vdr * - 
(item_ct1.get_local_id(2) % - (qi / vdr)); // x block quant index when casting the quants to int - - tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs); - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[row] = tmp; - } -} - -template -static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx, - const void *__restrict__ vy, - float *__restrict__ dst, const int ncols, - const int nrows, - const sycl::nd_item<3> &item_ct1) { - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - - if (row >= nrows) { - return; - } - - const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - -// partial sum for each thread - float tmp = 0.0f; - - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; - - for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; - i += blocks_per_warp) { - const int ibx = row*blocks_per_row + i; // x block index - - const int iby = i * (qk/QK8_1); // y block index that aligns with ibx - - const int iqs = - vdr * - (item_ct1.get_local_id(2) % - (qi / vdr)); // x block quant index when casting the quants to int - - tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64); - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[row] = tmp; - } -} - -template -static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx, - const void *__restrict__ vy, - float *__restrict__ dst, const int ncols, - const int nrows, - const sycl::nd_item<3> &item_ct1) { - const int row = item_ct1.get_group(2) * 
item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - - if (row >= nrows) { - return; - } - - const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - -// partial sum for each thread - float tmp = 0.0f; - - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; - - for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; - i += blocks_per_warp) { - const int ibx = row*blocks_per_row + i; // x block index - - const int iby = i * (qk/QK8_1); // y block index that aligns with ibx - - const int iqs = - vdr * - (item_ct1.get_local_id(2) % - (qi / vdr)); // x block quant index when casting the quants to int - - tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid); - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[row] = tmp; - } -} - -template -static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx, - const void *__restrict__ vy, - float *__restrict__ dst, const int ncols, - const int nrows, - const sycl::nd_item<3> &item_ct1) { - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - - if (row >= nrows) { - return; - } - - const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - -// partial sum for each thread - float tmp = 0.0f; - - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; - - for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; - i += blocks_per_warp) { - const int ibx = row*blocks_per_row + i; // x block index - - const int iby = i * (qk/QK8_1); // y block index that aligns with ibx - - const int iqs = - vdr * - (item_ct1.get_local_id(2) % - (qi / vdr)); // x block quant index when casting the quants to int - 
- tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu); - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[row] = tmp; - } -} - -template -static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx, - const void *__restrict__ vy, - float *__restrict__ dst, const int ncols, - const int nrows, - const sycl::nd_item<3> &item_ct1) { - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - - if (row >= nrows) { - return; - } - - const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - -// partial sum for each thread - float tmp = 0.0f; - - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; - - for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; - i += blocks_per_warp) { - const int ibx = row*blocks_per_row + i; // x block index - - const int iby = i * (qk/QK8_1); // y block index that aligns with ibx - - const int iqs = - vdr * - (item_ct1.get_local_id(2) % - (qi / vdr)); // x block quant index when casting the quants to int - - tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs); - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[row] = tmp; - } -} - -template -static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx, - const void *__restrict__ vy, - float *__restrict__ dst, const int ncols, - const int nrows, - const sycl::nd_item<3> &item_ct1) { - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - - if (row >= nrows) { - return; - } - - const int blocks_per_row = ncols / 
qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - -// partial sum for each thread - float tmp = 0.0f; - - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; - - for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; - i += blocks_per_warp) { - const int ibx = row*blocks_per_row + i; // x block index - - const int iby = i * (qk/QK8_1); // y block index that aligns with ibx - - const int iqs = - vdr * - (item_ct1.get_local_id(2) % - (qi / vdr)); // x block quant index when casting the quants to int - - tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs); - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[row] = tmp; - } -} - - -template -static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx, - const void *__restrict__ vy, - float *__restrict__ dst, const int ncols, - const int nrows, - const sycl::nd_item<3> &item_ct1) { - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - - if (row >= nrows) { - return; - } - - const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - -// partial sum for each thread - float tmp = 0.0f; - - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; - - for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; - i += blocks_per_warp) { - const int ibx = row*blocks_per_row + i; // x block index - - const int iby = i * (qk/QK8_1); // y block index that aligns with ibx - - const int iqs = - vdr * - (item_ct1.get_local_id(2) % - (qi / vdr)); // x block quant index when casting the quants to int - - tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs); - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = 
16; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[row] = tmp; - } -} - - -template -static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1) { - // qk = quantized weights per x block - // qr = number of quantized weights per data value in x block - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - - if (row >= nrows) { - return; - } - - const int tid = item_ct1.get_local_id(2); - - const int iter_stride = 2*GGML_SYCL_DMMV_X; - const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter - const int y_offset = qr == 1 ? 1 : qk/2; - -// partial sum for each thread -#ifdef GGML_SYCL_F16 - sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics -#else - float tmp = 0.0f; -#endif // GGML_SYCL_F16 - - for (int i = 0; i < ncols; i += iter_stride) { - const int col = i + vals_per_iter*tid; - const int ib = (row*ncols + col)/qk; // x block index - const int iqs = (col%qk)/qr; // x quant index - const int iybs = col - col%qk; // y block start index - -// processing >2 values per i iter is faster for fast GPUs -#pragma unroll - for (int j = 0; j < vals_per_iter; j += 2) { - // process 2 vals per j iter - - // dequantize - // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val - dfloat2 v; - dequantize_kernel(vx, ib, iqs + j/qr, v); - - // matrix multiplication - // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 -#ifdef GGML_SYCL_F16 - dfloat2 t1{y[iybs + iqs + j / qr + 0], - y[iybs + iqs + j / qr + y_offset]}; - - tmp += v * t1; -#else - tmp += v.x() * y[iybs + iqs + j / qr + 0]; - tmp += v.y() * y[iybs + iqs + j / qr + y_offset]; -#endif // 
GGML_SYCL_F16 - } - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (tid == 0) { -#ifdef GGML_SYCL_F16 - dst[row] = tmp.x() + tmp.y(); -#else - dst[row] = tmp; -#endif // GGML_SYCL_F16 - } -} - static void mul_mat_p021_f16_f32( const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y, @@ -6219,8 +1770,7 @@ static void norm_f32_sycl(const float *x, float *dst, const int ncols, }); }); } else { - // FIXME: 1024 from cuda - const int work_group_size = GROUP_SIZE; + const int work_group_size = get_work_group_size(stream->get_device()); const sycl::range<3> block_dims(1, 1, work_group_size); /* DPCT1049:17: The work-group size passed to the SYCL kernel may exceed @@ -6266,7 +1816,7 @@ static void group_norm_f32_sycl(const float *x, float *dst, }); }); } else { - const int work_group_size = GROUP_SIZE; + const int work_group_size = get_work_group_size(stream->get_device()); const sycl::range<3> block_dims(1, 1, work_group_size); /* DPCT1049:18: The work-group size passed to the SYCL kernel may exceed @@ -6355,7 +1905,7 @@ static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols, }); }); } else { - const int work_group_size = GROUP_SIZE; + const int work_group_size = get_work_group_size(stream->get_device()); const sycl::range<3> block_dims(1, 1, work_group_size); /* DPCT1049:19: The work-group size passed to the SYCL kernel may exceed @@ -6396,2298 +1946,6 @@ static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx, } } -template -static void dequantize_block_sycl(const void *__restrict__ vx, - dst_t *__restrict__ y, const int k, - queue_ptr stream) { - const int num_blocks = (k + 2*SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / (2*SYCL_DEQUANTIZE_BLOCK_SIZE); - { - 
dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>( - sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block(vx, y, k, item_ct1); - }); - } -} - -template -static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k, - queue_ptr stream) { - const int nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 64), - sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q2_K(vx, y, item_ct1); - }); - } -} - -template -static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k, - queue_ptr stream) { - const int nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 64), - sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q3_K(vx, y, item_ct1); - }); - } -} - -template -static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int k, - queue_ptr stream) { - const int nb32 = k / 32; - const int nb = (k + 255) / 256; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q4_0(vx, y, nb32, item_ct1); - }); - } -} - -template -static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int k, - queue_ptr stream) { - const int nb32 = k / 32; - const int nb = (k + 255) / 256; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - 
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q4_1(vx, y, nb32, item_ct1); - }); - } -} - - -template -static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k, - queue_ptr stream) { - const int nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q4_K(vx, y, item_ct1); - }); - } -} - -template -static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k, - queue_ptr stream) { - const int nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 64), - sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q5_K(vx, y, item_ct1); - }); - } -} - -template -static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k, - queue_ptr stream) { - const int nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 64), - sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q6_K(vx, y, item_ct1); - }); - } -} - -template -static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k, - queue_ptr stream) { - const int nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq1_s( - vx, y, item_ct1, 
iq1s_grid_gpu - ); - }); - }); - } -} - -template -static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int k, - queue_ptr stream) { - const int nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq1_m( - vx, y, item_ct1, iq1s_grid_gpu - ); - }); - }); - } -} - -template -static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k, - queue_ptr stream) { - const int nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq2_xxs( - vx, y, item_ct1, iq2xxs_grid, - ksigns_iq2xs, kmask_iq2xs); - }); - }); - } -} - -template -static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k, - queue_ptr stream) { - const int nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq2_xs( - vx, y, item_ct1, iq2xs_grid, - ksigns_iq2xs, kmask_iq2xs); - }); - }); - } -} - -template -static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k, - queue_ptr stream) { - const int nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - 
[=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq2_s(vx, y, item_ct1); - }); - }); - } -} - - -template -static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k, - queue_ptr stream) { - const int nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq3_xxs( - vx, y, item_ct1, iq3xxs_grid, - ksigns_iq2xs, kmask_iq2xs); - }); - }); - } -} - -template -static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k, - queue_ptr stream) { - const int nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq3_s( - vx, y, item_ct1, kmask_iq2xs, iq3s_grid); - }); - }); - } -} - -template -static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k, - queue_ptr stream) { - const int nb = (k + QK_K - 1) / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq4_xs(vx, y, item_ct1); - }); - }); - } -} - - -template -static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int k, - queue_ptr stream) { - const int nb = (k + QK_K - 1) / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - 
sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq4_nl(vx, y, item_ct1); - }); - }); - } -} - - - -template -static void convert_unary_sycl(const void *__restrict__ vx, - dst_t *__restrict__ y, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>( - sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - convert_unary(vx, y, k, item_ct1); - }); - } -} - - -static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try { - int id; - switch (type) { - case GGML_TYPE_Q4_0: - return dequantize_block_sycl; - case GGML_TYPE_Q4_1: - return dequantize_block_sycl; - case GGML_TYPE_Q5_0: - return dequantize_block_sycl; - case GGML_TYPE_Q5_1: - return dequantize_block_sycl; - case GGML_TYPE_Q8_0: - return dequantize_block_sycl; - case GGML_TYPE_Q2_K: - return dequantize_row_q2_K_sycl; - case GGML_TYPE_Q3_K: - return dequantize_row_q3_K_sycl; - case GGML_TYPE_Q4_K: - return dequantize_row_q4_K_sycl; - case GGML_TYPE_Q5_K: - return dequantize_row_q5_K_sycl; - case GGML_TYPE_Q6_K: - return dequantize_row_q6_K_sycl; - case GGML_TYPE_IQ1_S: - return dequantize_row_iq1_s_sycl; - case GGML_TYPE_IQ1_M: - return dequantize_row_iq1_m_sycl; - case GGML_TYPE_IQ2_XXS: - return dequantize_row_iq2_xxs_sycl; - case GGML_TYPE_IQ2_XS: - return dequantize_row_iq2_xs_sycl; - case GGML_TYPE_IQ2_S: - return dequantize_row_iq2_s_sycl; - case GGML_TYPE_IQ3_XXS: - return dequantize_row_iq3_xxs_sycl; - case GGML_TYPE_IQ3_S: - return dequantize_row_iq3_s_sycl; - case GGML_TYPE_IQ4_XS: - return dequantize_row_iq4_xs_sycl; - case GGML_TYPE_IQ4_NL: - return dequantize_row_iq4_nl_sycl; - case GGML_TYPE_F32: - return 
convert_unary_sycl; - default: - return nullptr; - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) { - switch (type) { - case GGML_TYPE_Q4_0: - return dequantize_row_q4_0_sycl; - case GGML_TYPE_Q4_1: - return dequantize_row_q4_1_sycl; - case GGML_TYPE_Q5_0: - return dequantize_block_sycl; - case GGML_TYPE_Q5_1: - return dequantize_block_sycl; - case GGML_TYPE_Q8_0: - return dequantize_block_sycl; - case GGML_TYPE_Q2_K: - return dequantize_row_q2_K_sycl; - case GGML_TYPE_Q3_K: - return dequantize_row_q3_K_sycl; - case GGML_TYPE_Q4_K: - return dequantize_row_q4_K_sycl; - case GGML_TYPE_Q5_K: - return dequantize_row_q5_K_sycl; - case GGML_TYPE_Q6_K: - return dequantize_row_q6_K_sycl; - case GGML_TYPE_IQ1_S: - return dequantize_row_iq1_s_sycl; - case GGML_TYPE_IQ1_M: - return dequantize_row_iq1_m_sycl; - case GGML_TYPE_IQ2_XXS: - return dequantize_row_iq2_xxs_sycl; - case GGML_TYPE_IQ2_XS: - return dequantize_row_iq2_xs_sycl; - case GGML_TYPE_IQ2_S: - return dequantize_row_iq2_s_sycl; - case GGML_TYPE_IQ3_XXS: - return dequantize_row_iq3_xxs_sycl; - case GGML_TYPE_IQ3_S: - return dequantize_row_iq3_s_sycl; - case GGML_TYPE_IQ4_XS: - return dequantize_row_iq4_xs_sycl; - case GGML_TYPE_IQ4_NL: - return dequantize_row_iq4_nl_sycl; - case GGML_TYPE_F16: - return convert_unary_sycl; - default: - return nullptr; - } -} - -static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, 
GGML_SYCL_MMV_Y, WARP_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); - } -} - -static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); - } -} - -static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); - } -} - -static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); - const int block_num_y = (nrows + 
GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); - } -} - -static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); - } -} - -static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 - const int block_num_y = (nrows + ny - 1) / ny; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, 32); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1); - }); -} - -static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - 
GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2 / K_QUANTS_PER_ITERATION; - const int block_num_y = (nrows + ny - 1) / ny; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, 32); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1); - }); -} - -static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2 / K_QUANTS_PER_ITERATION; - const int block_num_y = (nrows + ny - 1) / ny; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, 32); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1); - }); -} - -static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const sycl::range<3> block_dims(1, 1, 32); - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1); - }); -} - -static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2 / K_QUANTS_PER_ITERATION; - const int block_num_y = (nrows + ny - 1) / ny; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, 32); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - 
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1); - }); -} - -static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols, - nrows, item_ct1); - }); - } -} - - -static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK4_0 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK4_1 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums 
* block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK5_0 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK5_1 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK8_0 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - 
[=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - 
[[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - - -static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - 
mul_mat_vec_q_iq2_xxs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - stream->submit([&](sycl::handler &cgh) { - auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0]; - auto ksigns64_ptr_ct1 = &ksigns64[0]; - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q_iq2_xs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - stream->submit([&](sycl::handler &cgh) { - auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0]; - auto ksigns64_ptr_ct1 = &ksigns64[0]; - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q_iq2_s_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - 
stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0]; - auto ksigns64_ptr_ct1 = &ksigns64[0]; - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q_iq3_xxs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - stream->submit([&](sycl::handler &cgh) { - auto iq3s_grid_ptr_ct1 = &iq3s_grid[0]; - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q_iq3_s_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - stream->submit([&](sycl::handler &cgh) { - auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0]; - auto ksigns64_ptr_ct1 = &ksigns64[0]; - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q_iq1_s_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K 
== 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q_iq1_m_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK4_NL == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q_iq4_nl_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q_iq4_xs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); - }); - } -} - -static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols_x, - const int nrows_x, const int ncols_y, - const int nrows_y, const int 
nrows_dst, - queue_ptr stream) try { - - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= VER_GEN13) { - mmq_x = MMQ_X_Q4_0_RDNA2; - mmq_y = MMQ_Y_Q4_0_RDNA2; - nwarps = NWARPS_Q4_0_RDNA2; - } else if (compute_capability >= VER_GEN12) { - mmq_x = MMQ_X_Q4_0_RDNA1; - mmq_y = MMQ_Y_Q4_0_RDNA1; - nwarps = NWARPS_Q4_0_RDNA1; - } else if (compute_capability >= VER_GEN9) { - mmq_x = MMQ_X_Q4_0_AMPERE; - mmq_y = MMQ_Y_Q4_0_AMPERE; - nwarps = NWARPS_Q4_0_AMPERE; - } else if (compute_capability >= VER_4VEC) { - mmq_x = MMQ_X_Q4_0_PASCAL; - mmq_y = MMQ_Y_Q4_0_PASCAL; - nwarps = NWARPS_Q4_0_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const sycl::range<3> block_nums(1, block_num_y, block_num_x); - const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - /* - DPCT1049:20: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. 
- */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_qs_q4_0_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_d_q4_0_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0), - cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q4_0( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_qs_q4_0_acc_ct1.get_pointer(), - tile_x_d_q4_0_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } else { - const bool need_check = true; - /* - DPCT1049:21: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. 
- */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_qs_q4_0_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_d_q4_0_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0), - cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q4_0( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_qs_q4_0_acc_ct1.get_pointer(), - tile_x_d_q4_0_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols_x, - const int nrows_x, const int ncols_y, - const int nrows_y, const int nrows_dst, - queue_ptr stream) try { - - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= VER_GEN13) { - mmq_x = MMQ_X_Q4_1_RDNA2; - mmq_y = MMQ_Y_Q4_1_RDNA2; - nwarps = NWARPS_Q4_1_RDNA2; - } else if (compute_capability >= VER_GEN12) { - mmq_x = MMQ_X_Q4_1_RDNA1; - mmq_y = MMQ_Y_Q4_1_RDNA1; - nwarps = NWARPS_Q4_1_RDNA1; - } else if (compute_capability >= VER_GEN9) { - mmq_x = MMQ_X_Q4_1_AMPERE; - mmq_y = MMQ_Y_Q4_1_AMPERE; - nwarps = NWARPS_Q4_1_AMPERE; - } else if (compute_capability >= VER_4VEC) { - mmq_x = MMQ_X_Q4_1_PASCAL; - mmq_y = MMQ_Y_Q4_1_PASCAL; - nwarps = NWARPS_Q4_1_PASCAL; - } 
else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const sycl::range<3> block_nums(1, block_num_y, block_num_x); - const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - /* - DPCT1049:22: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_qs_q4_1_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh); - sycl::local_accessor tile_x_dm_q4_1_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1), - cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q4_1( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_qs_q4_1_acc_ct1.get_pointer(), - tile_x_dm_q4_1_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } else { - const bool need_check = true; - /* - DPCT1049:23: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. 
- */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_qs_q4_1_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh); - sycl::local_accessor tile_x_dm_q4_1_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1), - cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q4_1( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_qs_q4_1_acc_ct1.get_pointer(), - tile_x_dm_q4_1_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols_x, - const int nrows_x, const int ncols_y, - const int nrows_y, const int nrows_dst, - queue_ptr stream) try { - - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= VER_GEN13) { - mmq_x = MMQ_X_Q5_0_RDNA2; - mmq_y = MMQ_Y_Q5_0_RDNA2; - nwarps = NWARPS_Q5_0_RDNA2; - } else if (compute_capability >= VER_GEN12) { - mmq_x = MMQ_X_Q5_0_RDNA1; - mmq_y = MMQ_Y_Q5_0_RDNA1; - nwarps = NWARPS_Q5_0_RDNA1; - } else if (compute_capability >= VER_GEN9) { - mmq_x = MMQ_X_Q5_0_AMPERE; - mmq_y = MMQ_Y_Q5_0_AMPERE; - nwarps = NWARPS_Q5_0_AMPERE; - } else if (compute_capability >= VER_4VEC) { - mmq_x = MMQ_X_Q5_0_PASCAL; - mmq_y = MMQ_Y_Q5_0_PASCAL; - nwarps = NWARPS_Q5_0_PASCAL; - } 
else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const sycl::range<3> block_nums(1, block_num_y, block_num_x); - const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - /* - DPCT1049:24: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_ql_q5_0_acc_ct1( - sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_d_q5_0_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0), - cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q5_0( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_ql_q5_0_acc_ct1.get_pointer(), - tile_x_d_q5_0_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } else { - const bool need_check = true; - /* - DPCT1049:25: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. 
- */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_ql_q5_0_acc_ct1( - sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_d_q5_0_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0), - cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q5_0( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_ql_q5_0_acc_ct1.get_pointer(), - tile_x_d_q5_0_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols_x, - const int nrows_x, const int ncols_y, - const int nrows_y, const int nrows_dst, - queue_ptr stream) try { - - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= VER_GEN13) { - mmq_x = MMQ_X_Q5_1_RDNA2; - mmq_y = MMQ_Y_Q5_1_RDNA2; - nwarps = NWARPS_Q5_1_RDNA2; - } else if (compute_capability >= VER_GEN12) { - mmq_x = MMQ_X_Q5_1_RDNA1; - mmq_y = MMQ_Y_Q5_1_RDNA1; - nwarps = NWARPS_Q5_1_RDNA1; - } else if (compute_capability >= VER_GEN9) { - mmq_x = MMQ_X_Q5_1_AMPERE; - mmq_y = MMQ_Y_Q5_1_AMPERE; - nwarps = NWARPS_Q5_1_AMPERE; - } else if (compute_capability >= VER_4VEC) { - mmq_x = MMQ_X_Q5_1_PASCAL; - mmq_y = MMQ_Y_Q5_1_PASCAL; - nwarps = NWARPS_Q5_1_PASCAL; - 
} else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const sycl::range<3> block_nums(1, block_num_y, block_num_x); - const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - /* - DPCT1049:26: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_ql_q5_1_acc_ct1( - sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_dm_q5_1_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1), - cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q5_1( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_ql_q5_1_acc_ct1.get_pointer(), - tile_x_dm_q5_1_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } else { - const bool need_check = true; - /* - DPCT1049:27: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. 
- */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_ql_q5_1_acc_ct1( - sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_dm_q5_1_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1), - cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q5_1( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_ql_q5_1_acc_ct1.get_pointer(), - tile_x_dm_q5_1_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols_x, - const int nrows_x, const int ncols_y, - const int nrows_y, const int nrows_dst, - queue_ptr stream) try { - - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= VER_GEN13) { - mmq_x = MMQ_X_Q8_0_RDNA2; - mmq_y = MMQ_Y_Q8_0_RDNA2; - nwarps = NWARPS_Q8_0_RDNA2; - } else if (compute_capability >= VER_GEN12) { - mmq_x = MMQ_X_Q8_0_RDNA1; - mmq_y = MMQ_Y_Q8_0_RDNA1; - nwarps = NWARPS_Q8_0_RDNA1; - } else if (compute_capability >= VER_GEN9) { - mmq_x = MMQ_X_Q8_0_AMPERE; - mmq_y = MMQ_Y_Q8_0_AMPERE; - nwarps = NWARPS_Q8_0_AMPERE; - } else if (compute_capability >= VER_4VEC) { - mmq_x = MMQ_X_Q8_0_PASCAL; - mmq_y = MMQ_Y_Q8_0_PASCAL; - nwarps = NWARPS_Q8_0_PASCAL; 
- } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const sycl::range<3> block_nums(1, block_num_y, block_num_x); - const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - /* - DPCT1049:28: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_qs_q8_0_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_d_q8_0_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0), - cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q8_0( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_qs_q8_0_acc_ct1.get_pointer(), - tile_x_d_q8_0_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } else { - const bool need_check = true; - /* - DPCT1049:29: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. 
- */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_qs_q8_0_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_d_q8_0_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0), - cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q8_0( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_qs_q8_0_acc_ct1.get_pointer(), - tile_x_d_q8_0_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols_x, - const int nrows_x, const int ncols_y, - const int nrows_y, const int nrows_dst, - queue_ptr stream) try { - - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= VER_GEN13) { - mmq_x = MMQ_X_Q2_K_RDNA2; - mmq_y = MMQ_Y_Q2_K_RDNA2; - nwarps = NWARPS_Q2_K_RDNA2; - } else if (compute_capability >= VER_GEN12) { - mmq_x = MMQ_X_Q2_K_RDNA1; - mmq_y = MMQ_Y_Q2_K_RDNA1; - nwarps = NWARPS_Q2_K_RDNA1; - } else if (compute_capability >= VER_GEN9) { - mmq_x = MMQ_X_Q2_K_AMPERE; - mmq_y = MMQ_Y_Q2_K_AMPERE; - nwarps = NWARPS_Q2_K_AMPERE; - } else if (compute_capability >= VER_4VEC) { - mmq_x = MMQ_X_Q2_K_PASCAL; - mmq_y = MMQ_Y_Q2_K_PASCAL; - nwarps = NWARPS_Q2_K_PASCAL; - } 
else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const sycl::range<3> block_nums(1, block_num_y, block_num_x); - const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - /* - DPCT1049:30: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_ql_q2_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_dm_q2_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K), - cgh); - sycl::local_accessor tile_x_sc_q2_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q2_K( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_ql_q2_K_acc_ct1.get_pointer(), - tile_x_dm_q2_K_acc_ct1.get_pointer(), - tile_x_sc_q2_K_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } else { - const bool need_check = true; - /* - DPCT1049:31: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. 
- */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_ql_q2_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_dm_q2_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K), - cgh); - sycl::local_accessor tile_x_sc_q2_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q2_K( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_ql_q2_K_acc_ct1.get_pointer(), - tile_x_dm_q2_K_acc_ct1.get_pointer(), - tile_x_sc_q2_K_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols_x, - const int nrows_x, const int ncols_y, - const int nrows_y, const int nrows_dst, - queue_ptr stream) try { - - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= VER_GEN13) { - mmq_x = MMQ_X_Q3_K_RDNA2; - mmq_y = MMQ_Y_Q3_K_RDNA2; - nwarps = NWARPS_Q3_K_RDNA2; - } else if (compute_capability >= VER_GEN12) { - mmq_x = MMQ_X_Q3_K_RDNA1; - mmq_y = MMQ_Y_Q3_K_RDNA1; - nwarps = NWARPS_Q3_K_RDNA1; - } else if (compute_capability >= VER_GEN9) { - mmq_x = MMQ_X_Q3_K_AMPERE; - mmq_y = MMQ_Y_Q3_K_AMPERE; - nwarps = 
NWARPS_Q3_K_AMPERE; - } else if (compute_capability >= VER_4VEC) { - mmq_x = MMQ_X_Q3_K_PASCAL; - mmq_y = MMQ_Y_Q3_K_PASCAL; - nwarps = NWARPS_Q3_K_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const sycl::range<3> block_nums(1, block_num_y, block_num_x); - const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - /* - DPCT1049:32: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_ql_q3_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_dm_q3_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K), - cgh); - sycl::local_accessor tile_x_qh_q3_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh); - sycl::local_accessor tile_x_sc_q3_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q3_K( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_ql_q3_K_acc_ct1.get_pointer(), - tile_x_dm_q3_K_acc_ct1.get_pointer(), - tile_x_qh_q3_K_acc_ct1.get_pointer(), - tile_x_sc_q3_K_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } else { - const bool need_check = true; - /* - DPCT1049:33: The work-group size passed to the SYCL kernel may 
exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_ql_q3_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_dm_q3_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K), - cgh); - sycl::local_accessor tile_x_qh_q3_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh); - sycl::local_accessor tile_x_sc_q3_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q3_K( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_ql_q3_K_acc_ct1.get_pointer(), - tile_x_dm_q3_K_acc_ct1.get_pointer(), - tile_x_qh_q3_K_acc_ct1.get_pointer(), - tile_x_sc_q3_K_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols_x, - const int nrows_x, const int ncols_y, - const int nrows_y, const int nrows_dst, - queue_ptr stream) try { - - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= VER_GEN13) { - mmq_x = MMQ_X_Q4_K_RDNA2; - mmq_y = MMQ_Y_Q4_K_RDNA2; - nwarps = 
NWARPS_Q4_K_RDNA2; - } else if (compute_capability >= VER_GEN12) { - mmq_x = MMQ_X_Q4_K_RDNA1; - mmq_y = MMQ_Y_Q4_K_RDNA1; - nwarps = NWARPS_Q4_K_RDNA1; - } else if (compute_capability >= VER_GEN9) { - mmq_x = MMQ_X_Q4_K_AMPERE; - mmq_y = MMQ_Y_Q4_K_AMPERE; - nwarps = NWARPS_Q4_K_AMPERE; - } else if (compute_capability >= VER_4VEC) { - mmq_x = MMQ_X_Q4_K_PASCAL; - mmq_y = MMQ_Y_Q4_K_PASCAL; - nwarps = NWARPS_Q4_K_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const sycl::range<3> block_nums(1, block_num_y, block_num_x); - const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - /* - DPCT1049:34: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_ql_q4_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_dm_q4_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K), - cgh); - sycl::local_accessor tile_x_sc_q4_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q4_K( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_ql_q4_K_acc_ct1.get_pointer(), - tile_x_dm_q4_K_acc_ct1.get_pointer(), - tile_x_sc_q4_K_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - 
}); - } - } else { - const bool need_check = true; - /* - DPCT1049:35: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_ql_q4_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_dm_q4_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K), - cgh); - sycl::local_accessor tile_x_sc_q4_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q4_K( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_ql_q4_K_acc_ct1.get_pointer(), - tile_x_dm_q4_K_acc_ct1.get_pointer(), - tile_x_sc_q4_K_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols_x, - const int nrows_x, const int ncols_y, - const int nrows_y, const int nrows_dst, - queue_ptr stream) try { - - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= VER_GEN13) { - mmq_x = MMQ_X_Q5_K_RDNA2; - mmq_y = MMQ_Y_Q5_K_RDNA2; - nwarps = NWARPS_Q5_K_RDNA2; - } else if 
(compute_capability >= VER_GEN12) { - mmq_x = MMQ_X_Q5_K_RDNA1; - mmq_y = MMQ_Y_Q5_K_RDNA1; - nwarps = NWARPS_Q5_K_RDNA1; - } else if (compute_capability >= VER_GEN9) { - mmq_x = MMQ_X_Q5_K_AMPERE; - mmq_y = MMQ_Y_Q5_K_AMPERE; - nwarps = NWARPS_Q5_K_AMPERE; - } else if (compute_capability >= VER_4VEC) { - mmq_x = MMQ_X_Q5_K_PASCAL; - mmq_y = MMQ_Y_Q5_K_PASCAL; - nwarps = NWARPS_Q5_K_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const sycl::range<3> block_nums(1, block_num_y, block_num_x); - const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - /* - DPCT1049:36: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_ql_q5_K_acc_ct1( - sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_dm_q5_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K), - cgh); - sycl::local_accessor tile_x_sc_q5_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q5_K( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_ql_q5_K_acc_ct1.get_pointer(), - tile_x_dm_q5_K_acc_ct1.get_pointer(), - tile_x_sc_q5_K_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } else { - const 
bool need_check = true; - /* - DPCT1049:37: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_ql_q5_K_acc_ct1( - sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_dm_q5_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K), - cgh); - sycl::local_accessor tile_x_sc_q5_K_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q5_K( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_ql_q5_K_acc_ct1.get_pointer(), - tile_x_dm_q5_K_acc_ct1.get_pointer(), - tile_x_sc_q5_K_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols_x, - const int nrows_x, const int ncols_y, - const int nrows_y, const int nrows_dst, - queue_ptr stream) try { - - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= VER_GEN13) { - mmq_x = MMQ_X_Q6_K_RDNA2; - mmq_y = MMQ_Y_Q6_K_RDNA2; - nwarps = NWARPS_Q6_K_RDNA2; - } else if (compute_capability >= 
VER_GEN12) { - mmq_x = MMQ_X_Q6_K_RDNA1; - mmq_y = MMQ_Y_Q6_K_RDNA1; - nwarps = NWARPS_Q6_K_RDNA1; - } else if (compute_capability >= VER_GEN9) { - mmq_x = MMQ_X_Q6_K_AMPERE; - mmq_y = MMQ_Y_Q6_K_AMPERE; - nwarps = NWARPS_Q6_K_AMPERE; - } else if (compute_capability >= VER_4VEC) { - mmq_x = MMQ_X_Q6_K_PASCAL; - mmq_y = MMQ_Y_Q6_K_PASCAL; - nwarps = NWARPS_Q6_K_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const sycl::range<3> block_nums(1, block_num_y, block_num_x); - const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - /* - DPCT1049:38: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_ql_acc_ct1( - sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_dm_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K), - cgh); - sycl::local_accessor tile_x_sc_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q6_K( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_ql_acc_ct1.get_pointer(), - tile_x_dm_acc_ct1.get_pointer(), - tile_x_sc_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } else { - const bool need_check = true; - /* - DPCT1049:39: The 
work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor tile_x_ql_acc_ct1( - sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); - sycl::local_accessor tile_x_dm_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K), - cgh); - sycl::local_accessor tile_x_sc_acc_ct1( - sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); - sycl::local_accessor tile_y_qs_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE), cgh); - sycl::local_accessor tile_y_ds_acc_ct1( - sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q6_K( - vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, - nrows_dst, item_ct1, - tile_x_ql_acc_ct1.get_pointer(), - tile_x_dm_acc_ct1.get_pointer(), - tile_x_sc_acc_ct1.get_pointer(), - tile_y_qs_acc_ct1.get_pointer(), - tile_y_ds_acc_ct1.get_pointer()); - }); - }); - } - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y, float *dst, const int ncols_x, const int nrows_x, @@ -9187,7 +2445,7 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const int nrows_y, const float scale, const float max_bias, queue_ptr stream) { int nth = WARP_SIZE; - int max_block_size = GROUP_SIZE; + int max_block_size = get_work_group_size(stream->get_device()); while (nth < ncols_x && nth < max_block_size) nth *= 2; if (nth>max_block_size) nth = max_block_size; @@ -9339,7 +2597,7 @@ void ggml_backend_sycl_print_sycl_devices() { } } -int get_sycl_env(const char *env_name, 
int default_val) { +static inline int get_sycl_env(const char *env_name, int default_val) { char *user_device_string = getenv(env_name); int user_number = default_val; @@ -9353,10 +2611,9 @@ int get_sycl_env(const char *env_name, int default_val) { return user_number; } -int get_work_group_size(int user_device_id) { +static inline int get_work_group_size(const sycl::device& device) { dpct::device_info prop; - dpct::get_device_info(prop, - dpct::dev_mgr::instance().get_device(user_device_id)); + dpct::get_device_info(prop, device); return prop.get_max_work_group_size(); } @@ -10042,76 +3299,6 @@ inline void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, const ggml_te (void) src1_dd; } -inline void ggml_sycl_op_mul_mat_q( - ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, - const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, - float *dst_dd_i, const int64_t row_low, const int64_t row_high, - const int64_t src1_ncols, const int64_t src1_padded_row_size, - const queue_ptr &stream) try { - - const int64_t ne00 = src0->ne[0]; - - const int64_t ne10 = src1->ne[0]; - GGML_ASSERT(ne10 % QK8_1 == 0); - - const int64_t ne0 = dst->ne[0]; - - const int64_t row_diff = row_high - row_low; - - int device_id; - SYCL_CHECK( - CHECK_TRY_ERROR(device_id = get_current_device_id())); - - // the main device has a larger memory buffer to hold the results from all GPUs - // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into - const int64_t nrows_dst = device_id == ctx.device ? 
ne0 : row_diff; - - switch (src0->type) { - case GGML_TYPE_Q4_0: - ggml_mul_mat_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q4_1: - ggml_mul_mat_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q5_0: - ggml_mul_mat_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q5_1: - ggml_mul_mat_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q8_0: - ggml_mul_mat_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q2_K: - ggml_mul_mat_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q3_K: - ggml_mul_mat_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q4_K: - ggml_mul_mat_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q5_K: - ggml_mul_mat_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q6_K: - ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - default: - GGML_ASSERT(false); - break; - } - - (void) src1; - (void) dst; - (void) src1_ddf_i; -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - static int64_t 
get_row_rounding(ggml_type type, const std::array & tensor_split) { int64_t min_compute_capability = INT_MAX; int64_t max_compute_capability = INT_MIN; @@ -10160,179 +3347,6 @@ static int64_t get_row_rounding(ggml_type type, const std::arrayne[0]; - GGML_ASSERT(ne10 % QK8_1 == 0); - - const int64_t ne00 = src0->ne[0]; - const int64_t row_diff = row_high - row_low; - - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_id())); - - // the main device has a larger memory buffer to hold the results from all GPUs - // nrows_dst == nrows of the matrix that the kernel writes into - const int64_t nrows_dst = id == ctx.device ? ne00 : row_diff; - - switch (src0->type) { - case GGML_TYPE_Q4_0: - mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q4_1: - mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q5_0: - mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q5_1: - mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q8_0: - mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q2_K: - mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q3_K: - mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q4_K: - mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q5_K: - mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q6_K: - mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_IQ1_S: - mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - 
case GGML_TYPE_IQ1_M: - mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_IQ2_XXS: - mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_IQ2_XS: - mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_IQ2_S: - mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_IQ3_XXS: - mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_IQ3_S: - mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_IQ4_NL: - mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_IQ4_XS: - mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; - default: - GGML_ASSERT(false); - break; - } - - (void) src1; - (void) dst; - (void) src1_ddf_i; - (void) src1_ncols; - (void) src1_padded_row_size; -} - - -inline void ggml_sycl_op_dequantize_mul_mat_vec( - ggml_backend_sycl_context & ctx, - const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, - const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, - float *dst_dd_i, const int64_t row_low, const int64_t row_high, - const int64_t src1_ncols, const int64_t src1_padded_row_size, - const queue_ptr &stream) { - - const int64_t ne00 = src0->ne[0]; - const int64_t row_diff = row_high - row_low; - - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics -#ifdef GGML_SYCL_F16 - ggml_sycl_pool_alloc src1_dfloat_a(ctx.pool()); - sycl::half *src1_dfloat = nullptr; // dfloat == half - - bool src1_convert_f16 = - src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || - src0->type == 
GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || - src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; - - if (src1_convert_f16) { - src1_dfloat = src1_dfloat_a.alloc(ne00); - const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type); - GGML_ASSERT(to_fp16_sycl != nullptr); - to_fp16_sycl(src1_ddf_i, src1_dfloat, ne00, stream); - } -#else - const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion -#endif // GGML_SYCL_F16 - - switch (src0->type) { - case GGML_TYPE_Q4_0: - dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q4_1: - dequantize_mul_mat_vec_q4_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q5_0: - dequantize_mul_mat_vec_q5_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q5_1: - dequantize_mul_mat_vec_q5_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q8_0: - dequantize_mul_mat_vec_q8_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q2_K: - dequantize_mul_mat_vec_q2_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q3_K: - dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q4_K: - dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q5_K: - dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q6_K: - dequantize_mul_mat_vec_q6_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_F16: - convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - default: - printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type); - 
GGML_ASSERT(false); - break; - } - - (void) src1; - (void) dst; - (void) src1_ddq_i; - (void) src1_ncols; - (void) src1_padded_row_size; -} - inline void ggml_sycl_op_mul_mat_sycl( ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, @@ -11897,7 +4911,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS01; SYCL_CHECK(ggml_sycl_set_device(ctx.device)); queue_ptr main_stream = ctx.stream(); diff --git a/ggml-sycl/backend.hpp b/ggml-sycl/backend.hpp index 88bae5967..2d37e271f 100644 --- a/ggml-sycl/backend.hpp +++ b/ggml-sycl/backend.hpp @@ -14,5 +14,10 @@ #define GGML_SYCL_BACKEND_HPP #include "common.hpp" +#include "convert.hpp" +#include "dequantize.hpp" +#include "dmmv.hpp" +#include "mmq.hpp" +#include "mmvq.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml-sycl/convert.cpp b/ggml-sycl/convert.cpp new file mode 100644 index 000000000..ce9de2b42 --- /dev/null +++ b/ggml-sycl/convert.cpp @@ -0,0 +1,544 @@ +#include "convert.hpp" +#include "dequantize.hpp" +#include "presets.hpp" + +template +static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = 2 * (item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2)); + + if (i >= k) { + return; + } + + const int ib = i/qk; // block index + const int iqs = (i%qk)/qr; // quant index + const int iybs = i - i%qk; // y block start index + const int y_offset = qr == 1 ? 
1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(vx, ib, iqs, v); + + y[iybs + iqs + 0] = v.x(); + y[iybs + iqs + y_offset] = v.y(); +} + +template +static void dequantize_block_sycl(const void *__restrict__ vx, + dst_t *__restrict__ y, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + 2*SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / (2*SYCL_DEQUANTIZE_BLOCK_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block(vx, y, k, item_ct1); + }); + } +} + +template +static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q2_K(vx, y, item_ct1); + }); + } +#else + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q2_K(vx, y, item_ct1); + }); + } + +#endif +} + +template +static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q3_K(vx, y, item_ct1); + }); + } +#else + { + 
dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q3_K(vx, y, item_ct1); + }); + } +#endif +} + +template +static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb32 = k / 32; + const int nb = (k + 255) / 256; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q4_0(vx, y, nb32, item_ct1); + }); + } +} + +template +static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb32 = k / 32; + const int nb = (k + 255) / 256; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q4_1(vx, y, nb32, item_ct1); + }); + } +} + + +template +static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q4_K(vx, y, item_ct1); + }); + } +} + +template +static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + 
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q5_K(vx, y, item_ct1); + }); + } +#else + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q5_K(vx, y, item_ct1); + }); + } + +#endif +} + +template +static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q6_K(vx, y, item_ct1); + }); + } +#else + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q6_K(vx, y, item_ct1); + }); + } + +#endif +} + +template +static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq1_s( + vx, y, item_ct1, iq1s_grid_gpu + ); + }); + }); + } +} + +template +static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + 
dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq1_m( + vx, y, item_ct1, iq1s_grid_gpu + ); + }); + }); + } +} + +template +static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq2_xxs( + vx, y, item_ct1, iq2xxs_grid, + ksigns_iq2xs, kmask_iq2xs); + }); + }); + } +} + +template +static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq2_xs( + vx, y, item_ct1, iq2xs_grid, + ksigns_iq2xs, kmask_iq2xs); + }); + }); + } +} + +template +static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq2_s(vx, y, item_ct1); + }); + }); + } +} + + +template +static void dequantize_row_iq3_xxs_sycl(const void *vx, 
dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq3_xxs( + vx, y, item_ct1, iq3xxs_grid, + ksigns_iq2xs, kmask_iq2xs); + }); + }); + } +} + +template +static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq3_s( + vx, y, item_ct1, kmask_iq2xs, iq3s_grid); + }); + }); + } +} + +template +static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = (k + QK_K - 1) / QK_K; +#if QK_K == 64 + dequantize_row_iq4_nl_sycl(vx, y, k, stream); +#else + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq4_xs(vx, y, item_ct1); + }); + }); + } +#endif +} + +template +static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = (k + QK_K - 1) / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + 
[=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq4_nl(vx, y, item_ct1); + }); + }); + } +} + +template +static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + + const src_t * x = (src_t *) vx; + + y[i] = x[i]; +} + +template +static void convert_unary_sycl(const void *__restrict__ vx, + dst_t *__restrict__ y, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + convert_unary(vx, y, k, item_ct1); + }); + } +} + +to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_block_sycl; + case GGML_TYPE_Q4_1: + return dequantize_block_sycl; + case GGML_TYPE_Q5_0: + return dequantize_block_sycl; + case GGML_TYPE_Q5_1: + return dequantize_block_sycl; + case GGML_TYPE_Q8_0: + return dequantize_block_sycl; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_sycl; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_sycl; + case GGML_TYPE_Q4_K: + return dequantize_row_q4_K_sycl; + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_sycl; + case GGML_TYPE_Q6_K: + return dequantize_row_q6_K_sycl; + case GGML_TYPE_IQ1_S: + return dequantize_row_iq1_s_sycl; + case GGML_TYPE_IQ1_M: + return dequantize_row_iq1_m_sycl; + case GGML_TYPE_IQ2_XXS: + return dequantize_row_iq2_xxs_sycl; + case GGML_TYPE_IQ2_XS: + return dequantize_row_iq2_xs_sycl; + case GGML_TYPE_IQ2_S: + return dequantize_row_iq2_s_sycl; + case GGML_TYPE_IQ3_XXS: + return 
dequantize_row_iq3_xxs_sycl; + case GGML_TYPE_IQ3_S: + return dequantize_row_iq3_s_sycl; + case GGML_TYPE_IQ4_XS: + return dequantize_row_iq4_xs_sycl; + case GGML_TYPE_IQ4_NL: + return dequantize_row_iq4_nl_sycl; + case GGML_TYPE_F32: + return convert_unary_sycl; + default: + return nullptr; + } +} + +to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_row_q4_0_sycl; + case GGML_TYPE_Q4_1: + return dequantize_row_q4_1_sycl; + case GGML_TYPE_Q5_0: + return dequantize_block_sycl; + case GGML_TYPE_Q5_1: + return dequantize_block_sycl; + case GGML_TYPE_Q8_0: + return dequantize_block_sycl; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_sycl; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_sycl; + case GGML_TYPE_Q4_K: + return dequantize_row_q4_K_sycl; + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_sycl; + case GGML_TYPE_Q6_K: + return dequantize_row_q6_K_sycl; + case GGML_TYPE_IQ1_S: + return dequantize_row_iq1_s_sycl; + case GGML_TYPE_IQ1_M: + return dequantize_row_iq1_m_sycl; + case GGML_TYPE_IQ2_XXS: + return dequantize_row_iq2_xxs_sycl; + case GGML_TYPE_IQ2_XS: + return dequantize_row_iq2_xs_sycl; + case GGML_TYPE_IQ2_S: + return dequantize_row_iq2_s_sycl; + case GGML_TYPE_IQ3_XXS: + return dequantize_row_iq3_xxs_sycl; + case GGML_TYPE_IQ3_S: + return dequantize_row_iq3_s_sycl; + case GGML_TYPE_IQ4_XS: + return dequantize_row_iq4_xs_sycl; + case GGML_TYPE_IQ4_NL: + return dequantize_row_iq4_nl_sycl; + case GGML_TYPE_F16: + return convert_unary_sycl; + default: + return nullptr; + } +} diff --git a/ggml-sycl/convert.hpp b/ggml-sycl/convert.hpp new file mode 100644 index 000000000..b1f10d635 --- /dev/null +++ b/ggml-sycl/convert.hpp @@ -0,0 +1,27 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_CONVERT_HPP +#define GGML_SYCL_CONVERT_HPP + +#include "common.hpp" + +template +using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y, + int k, dpct::queue_ptr stream); +typedef to_t_sycl_t to_fp32_sycl_t; +typedef to_t_sycl_t to_fp16_sycl_t; + +to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type); +to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type); + +#endif // GGML_SYCL_CONVERT_HPP diff --git a/ggml-sycl/dequantize.hpp b/ggml-sycl/dequantize.hpp new file mode 100644 index 000000000..b6080d83a --- /dev/null +++ b/ggml-sycl/dequantize.hpp @@ -0,0 +1,690 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_DEQUANTIZE_HPP +#define GGML_SYCL_DEQUANTIZE_HPP + +#include "common.hpp" + +typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); + +static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q4_0 * x = (const block_q4_0 *) vx; + + const dfloat d = x[ib].d; + + const int vui = x[ib].qs[iqs]; + + v.x() = vui & 0xF; + v.y() = vui >> 4; + +#ifdef GGML_SYCL_F16 + // v = v - {8.0f, 8.0f}; + // v = v * {d, d}; + v.s0() = (v.s0() - 8.0f) * d; + v.s1() = (v.s1() - 8.0f) * d; + +#else + v.x() = (v.x() - 8.0f) * d; + v.y() = (v.y() - 8.0f) * d; +#endif // GGML_SYCL_F16 +} + +static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q4_1 * x = (const block_q4_1 *) vx; + + const dfloat d = x[ib].dm[0]; + const dfloat m = x[ib].dm[1]; + + const int vui = x[ib].qs[iqs]; 
+ + v.x() = vui & 0xF; + v.y() = vui >> 4; + +#ifdef GGML_SYCL_F16 + // v = v * {d, d}; + // v = v + {m, m}; + v.s0() = (v.s0() * d) + m; + v.s1() = (v.s1() * d) + m; + +#else + v.x() = (v.x() * d) + m; + v.y() = (v.y() * d) + m; +#endif // GGML_SYCL_F16 +} + +static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q5_0 * x = (const block_q5_0 *) vx; + + const dfloat d = x[ib].d; + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y() = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_SYCL_F16 + // v = v - {16.0f, 16.0f}; + // v = v * {d, d}; + v.s0() = (v.s0() - 16.0f) * d; + v.s1() = (v.s1() - 16.0f) * d; + +#else + v.x() = (v.x() - 16.0f) * d; + v.y() = (v.y() - 16.0f) * d; +#endif // GGML_SYCL_F16 +} + +static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q5_1 * x = (const block_q5_1 *) vx; + + const dfloat d = x[ib].dm[0]; + const dfloat m = x[ib].dm[1]; + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y() = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_SYCL_F16 + // v = v * {d, d}; + // v = v + {m, m}; + v.s0() = (v.s0() * d) + m; + v.s1() = (v.s1() * d) + m; +#else + v.x() = (v.x() * d) + m; + v.y() = (v.y() * d) + m; +#endif // GGML_SYCL_F16 +} + +static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q8_0 * x = (const block_q8_0 *) vx; + + const dfloat d = x[ib].d; + + v.x() = x[ib].qs[iqs + 0]; + v.y() = x[ib].qs[iqs + 1]; + +#ifdef GGML_SYCL_F16 + // v = v * {d, d}; + v.s0() *= d; + v.s1() *= d; +#else + v.x() *= d; + v.y() *= d; +#endif // GGML_SYCL_F16 +} + 
+template +static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + + // assume 32 threads + const int tid = item_ct1.get_local_id(2); + const int il = tid/8; + const int ir = tid%8; + const int ib = 8*i + ir; + if (ib >= nb32) { + return; + } + + dst_t * y = yy + 256*i + 32*ir + 4*il; + + const block_q4_0 * x = (const block_q4_0 *)vx + ib; + const float d = sycl::vec(x->d) + .convert()[0]; + const float dm = -8*d; + + const uint8_t * q = x->qs + 4*il; + + for (int l = 0; l < 4; ++l) { + y[l+ 0] = d * (q[l] & 0xF) + dm; + y[l+16] = d * (q[l] >> 4) + dm; + } +} + +template +static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + + // assume 32 threads + const int tid = item_ct1.get_local_id(2); + const int il = tid/8; + const int ir = tid%8; + const int ib = 8*i + ir; + if (ib >= nb32) { + return; + } + + dst_t * y = yy + 256*i + 32*ir + 4*il; + + const block_q4_1 * x = (const block_q4_1 *)vx + ib; + const sycl::float2 d = + x->dm.convert(); + + const uint8_t * q = x->qs + 4*il; + + for (int l = 0; l < 4; ++l) { + y[l + 0] = d.x() * (q[l] & 0xF) + d.y(); + y[l + 16] = d.x() * (q[l] >> 4) + d.y(); + } +} + + +//================================== k-quants + +template +static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + const block_q2_K * x = (const block_q2_K *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int n = tid/32; + const int l = tid - 32*n; + const int is = 8*n + l/16; + + const uint8_t q = x[i].qs[32*n + l]; + dst_t * y = yy + i*QK_K + 128*n; + + float dall = x[i].dm[0]; + float dmin = x[i].dm[1]; + y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * 
(x[i].scales[is+0] >> 4); + y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); + y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); + y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); +#else + const int is = tid/16; // 0 or 1 + const int il = tid%16; // 0...15 + const uint8_t q = x[i].qs[il] >> (2*is); + dst_t * y = yy + i*QK_K + 16*is + il; + + float dall = x[i].dm[0]; + float dmin = x[i].dm[1]; + y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); +#endif + +} + +template +static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + const block_q3_K * x = (const block_q3_K *) vx; + +#if QK_K == 256 + const int r = item_ct1.get_local_id(2) / 4; + const int tid = r/2; + const int is0 = r%2; + const int l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4); + const int n = tid / 4; + const int j = tid - 4*n; + + uint8_t m = 1 << (4*n + j); + int is = 8*n + 2*j + is0; + int shift = 2*j; + + int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) : + is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) : + is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) : + (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4); + float d_all = x[i].d; + float dl = d_all * (us - 32); + + dst_t * y = yy + i*QK_K + 128*n + 32*j; + const uint8_t * q = x[i].qs + 32*n; + const uint8_t * hm = x[i].hmask; + + for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 
0 : 4)); +#else + const int tid = item_ct1.get_local_id(2); + const int is = tid/16; // 0 or 1 + const int il = tid%16; // 0...15 + const int im = il/8; // 0...1 + const int in = il%8; // 0...7 + + dst_t * y = yy + i*QK_K + 16*is + il; + + const uint8_t q = x[i].qs[il] >> (2*is); + const uint8_t h = x[i].hmask[in] >> (2*is + im); + const float d = (float)x[i].d; + + if (is == 0) { + y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } else { + y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } +#endif + +} + +#if QK_K == 256 +static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { + if (j < 4) { + d = q[j] & 63; m = q[j + 4] & 63; + } else { + d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); + } +} +#endif + +template +static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const block_q4_K * x = (const block_q4_K *) vx; + + const int i = item_ct1.get_group(2); + +#if QK_K == 256 + // assume 32 threads + const int tid = item_ct1.get_local_id(2); + const int il = tid/8; + const int ir = tid%8; + const int is = 2*il; + const int n = 4; + + dst_t * y = yy + i*QK_K + 64*il + n*ir; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint8_t * q = x[i].qs + 32*il + n*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + for (int l = 0; l < n; ++l) { + y[l + 0] = d1 * (q[l] & 0xF) - m1; + y[l +32] = d2 * (q[l] >> 4) - m2; + } +#else + const int tid = 
item_ct1.get_local_id(2); + const uint8_t * q = x[i].qs; + dst_t * y = yy + i*QK_K; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); + y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); +#endif +} + +template +static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const block_q5_K * x = (const block_q5_K *) vx; + + const int i = item_ct1.get_group(2); + +#if QK_K == 256 + // assume 64 threads - this is very slightly better than the one below + const int tid = item_ct1.get_local_id(2); + const int il = tid/16; // il is in 0...3 + const int ir = tid%16; // ir is in 0...15 + const int is = 2*il; // is is in 0...6 + + dst_t * y = yy + i*QK_K + 64*il + 2*ir; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint8_t * ql = x[i].qs + 32*il + 2*ir; + const uint8_t * qh = x[i].qh + 2*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + + uint8_t hm = 1 << (2*il); + y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1; + y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1; + hm <<= 1; + y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; + y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; +#else + const int tid = item_ct1.get_local_id(2); + const uint8_t q = x[i].qs[tid]; + const int im = tid/8; // 0...3 + const int in = tid%8; // 0...7 + const int is = tid/16; // 0 or 1 + const uint8_t h = x[i].qh[in] >> im; + const float d = x[i].d; + dst_t * y = yy + i*QK_K + tid; + y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); + y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 
0 : 16)); +#endif +} + +template +static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const block_q6_K * x = (const block_q6_K *) vx; + + const int i = item_ct1.get_group(2); +#if QK_K == 256 + + // assume 64 threads - this is very slightly better than the one below + const int tid = item_ct1.get_local_id(2); + const int ip = tid/32; // ip is 0 or 1 + const int il = tid - 32*ip; // 0...32 + const int is = 8*ip + il/16; + + dst_t * y = yy + i*QK_K + 128*ip + il; + + const float d = x[i].d; + + const uint8_t * ql = x[i].ql + 64*ip + il; + const uint8_t qh = x[i].qh[32*ip + il]; + const int8_t * sc = x[i].scales + is; + + y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +#else + + // assume 32 threads + const int tid = item_ct1.get_local_id(2); + const int ip = tid/16; // 0 or 1 + const int il = tid - 16*ip; // 0...15 + + dst_t * y = yy + i*QK_K + 16*ip + il; + + const float d = x[i].d; + + const uint8_t ql = x[i].ql[16*ip + il]; + const uint8_t qh = x[i].qh[il] >> (2*ip); + const int8_t * sc = x[i].scales; + + y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32); +#endif +} + +template +static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint64_t *iq2xxs_grid_ptr, + const uint8_t *ksigns_iq2xs_ptr, + const uint8_t *kmask_iq2xs_ptr) { + + const int i = item_ct1.get_group(2); + const block_iq2_xxs * x = (const block_iq2_xxs *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; 
// 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * q2 = x[i].qs + 4*ib; + const uint8_t * aux8 = (const uint8_t *)q2; + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid_ptr + aux8[il]); + const uint32_t aux32 = q2[2] | (q2[3] << 16); + const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f; + const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127]; + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f); +#else + assert(false); +#endif + +} + +template +static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint64_t *iq2xs_grid, + const uint8_t *ksigns_iq2xs, + const uint8_t *kmask_iq2xs) { + + const int i = item_ct1.get_group(2); + const block_iq2_xs * x = (const block_iq2_xs *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * q2 = x[i].qs + 4*ib; + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511)); + const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; + const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? 
-1.f : 1.f); +#else + assert(false); +#endif + +} + +template +__dpct_inline__ static void +dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + const block_iq2_s * x = (const block_iq2_s *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300))); + const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; + const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; +#pragma unroll + for (int j = 0; j < 8; ++j) + y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); +#else + assert(false); + +#endif + +} + +template +static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint32_t *iq3xxs_grid, + const uint8_t *ksigns_iq2xs, + const uint8_t *kmask_iq2xs) { + + const int i = item_ct1.get_group(2); + const block_iq3_xxs * x = (const block_iq3_xxs *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * q3 = x[i].qs + 8*ib; + const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib; + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]); + const uint32_t aux32 = gas[0] | (gas[1] << 16); + const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f; + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; + for (int j = 0; j < 4; ++j) { + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? 
-1.f : 1.f); + } +#else + assert(false); +#endif + +} + +template +__dpct_inline__ static void +dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) { + + const int i = item_ct1.get_group(2); + const block_iq3_s * x = (const block_iq3_s *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * qs = x[i].qs + 8*ib; + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); + const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)); + const uint8_t signs = x[i].signs[4*ib + il]; +#pragma unroll + for (int j = 0; j < 4; ++j) { + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + } +#else + assert(false); +#endif + +} + +template +__dpct_inline__ static void +dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint32_t *iq1s_grid_gpu) { + + const int i = item_ct1.get_group(2); + const block_iq1_s * x = (const block_iq1_s *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const float delta = x[i].qh[ib] & 0x8000 ? 
-1 - IQ1S_DELTA : -1 + IQ1S_DELTA; + const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1); + uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; + grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)]; + grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; + grid32[0] &= 0x0f0f0f0f; +#pragma unroll + for (int j = 0; j < 8; ++j) { + y[j] = d * (q[j] + delta); + } +#else + assert(false); +#endif + +} + +template +__dpct_inline__ static void +dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint32_t *iq1s_grid_gpu) { + + const int i = item_ct1.get_group(2); + const block_iq1_m * x = (const block_iq1_m *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * sc = (const uint16_t *)x[i].scales; + iq1m_scale_t scale; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4); + const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1); + const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? 
-1 - IQ1M_DELTA : -1 + IQ1M_DELTA; + uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; + grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)]; + grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; + grid32[0] &= 0x0f0f0f0f; +#pragma unroll + for (int j = 0; j < 8; ++j) { + y[j] = d * (q[j] + delta); + } +#else + assert(false); +#endif + +} + +template +__dpct_inline__ static void +dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL); + + const int tid = item_ct1.get_local_id(2); + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 4*il; + const uint8_t * q4 = x[ib].qs + 4*il; + const float d = (float)x[ib].d; +#pragma unroll + for (int j = 0; j < 4; ++j) { + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; + } + +} + + +template +__dpct_inline__ static void +dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_group(2); + const block_iq4_xs * x = (const block_iq4_xs *)vx; + + const int tid = item_ct1.get_local_id(2); + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 4*il; + const uint8_t * q4 = x[i].qs + 16*ib + 4*il; + const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32); +#pragma unroll + for (int j = 0; j < 4; ++j) { + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; + } +} + + +#endif // GGML_SYCL_DEQUANTIZE_HPP diff --git a/ggml-sycl/dmmv.cpp b/ggml-sycl/dmmv.cpp new file mode 100644 index 000000000..3a87d3ef8 --- /dev/null +++ b/ggml-sycl/dmmv.cpp @@ -0,0 +1,1022 @@ +#include "convert.hpp" +#include "dmmv.hpp" +#include 
"dequantize.hpp" +#include "presets.hpp" + +static void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const sycl::half *x = (const sycl::half *)vx; + + // automatic half -> float type cast if dfloat == float + v.x() = x[ib + iqs + 0]; + v.y() = x[ib + iqs + 1]; +} + +static void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const float * x = (const float *) vx; + + // automatic half -> float type cast if dfloat == float + v.x() = x[ib + iqs + 0]; + v.y() = x[ib + iqs + 1]; +} + +template +static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, + const sycl::nd_item<3> &item_ct1) { + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int tid = item_ct1.get_local_id(2); + + const int iter_stride = 2*GGML_SYCL_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter + const int y_offset = qr == 1 ? 
1 : qk/2; + +// partial sum for each thread +#ifdef GGML_SYCL_F16 + sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics +#else + float tmp = 0.0f; +#endif // GGML_SYCL_F16 + + for (int i = 0; i < ncols; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int ib = (row*ncols + col)/qk; // x block index + const int iqs = (col%qk)/qr; // x quant index + const int iybs = col - col%qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel(vx, ib, iqs + j/qr, v); + + // matrix multiplication + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 +#ifdef GGML_SYCL_F16 + dfloat2 t1{y[iybs + iqs + j / qr + 0], + y[iybs + iqs + j / qr + y_offset]}; + + tmp += v * t1; +#else + tmp += v.x() * y[iybs + iqs + j / qr + 0]; + tmp += v.y() * y[iybs + iqs + j / qr + y_offset]; +#endif // GGML_SYCL_F16 + } + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (tid == 0) { +#ifdef GGML_SYCL_F16 + dst[row] = tmp.x() + tmp.y(); +#else + dst[row] = tmp; +#endif // GGML_SYCL_F16 + } +} + +static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + 
sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols, + nrows, item_ct1); + }); + } +} + +/* +DPCT1110:4: The total declared local variable size in device function +dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q2_K * x = (const block_q2_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 16/K_QUANTS_PER_ITERATION; + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
+ const int in = tid - step*im; // 0...15 or 0...7 + + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; + + uint32_t aux[4]; + const uint8_t * d = (const uint8_t *)aux; + const uint8_t * m = (const uint8_t *)(aux + 2); + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + + } + tmp += dall * sum1 - dmin * sum2; + + } +#else + const int tid = item_ct1.get_local_id(2) / + (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = item_ct1.get_local_id(2) % + (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; + + uint32_t uaux[2]; + const uint8_t * d = (const uint8_t *)uaux; + + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint32_t * s = (const uint32_t *)x[i].scales; + + uaux[0] = s[0] & 0x0f0f0f0f; + uaux[1] = (s[0] >> 4) 
& 0x0f0f0f0f; + + const sycl::float2 dall = + x[i].dm.convert(); + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t ql = q[l]; + sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) + + y[l+16] * d[1] * ((ql >> 2) & 3) + + y[l+32] * d[2] * ((ql >> 4) & 3) + + y[l+48] * d[3] * ((ql >> 6) & 3); + sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; + } + tmp += dall.x() * sum1 - dall.y() * sum2; + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +/* +DPCT1110:5: The total declared local variable size in device function +dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. 
+*/ +static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q3_K * x = (const block_q3_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0....15 or 0...7 + + const uint8_t m = 1 << (4*im); + + const int l0 = n*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + + uint16_t utmp[4]; + const int8_t * s = (const int8_t *)utmp; + + const uint16_t s_shift = 4*im; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + const uint8_t * h = x[i].hmask + l0; + + const uint16_t * a = (const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + const float d = x[i].d; + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += 
y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp += d * sum; + + } +#else + + const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14 + const int in = offset/8; // 0 or 1 + const int im = offset%8; // 0...7 + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint8_t * s = x[i].scales; + + const float dall = (float)x[i].d; + + float sum = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t hl = x[i].hmask[im+l] >> in; + const uint8_t ql = q[l]; + sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) + + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) + + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) + + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 
0 : 4)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +/* +DPCT1110:6: The total declared local variable size in device function +dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q4_K * x = (const block_q4_K *)vx + ib0; + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 + + const int il = tid/step; // 0...3 + const int ir = tid - step*il; // 0...7 or 0...3 + const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 + + const int im = il/2; // 0 or 1. 
0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + +#if K_QUANTS_PER_ITERATION == 2 + uint32_t q32[4]; + const uint8_t * q4 = (const uint8_t *)q32; +#else + uint16_t q16[4]; + const uint8_t * q4 = (const uint8_t *)q16; +#endif + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + +#if K_QUANTS_PER_ITERATION == 2 + const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); + const uint32_t * q2 = q1 + 16; + + q32[0] = q1[0] & 0x0f0f0f0f; + q32[1] = q1[0] & 0xf0f0f0f0; + q32[2] = q2[0] & 0x0f0f0f0f; + q32[3] = q2[0] & 0xf0f0f0f0; + + sycl::float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 4; ++l) { + s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4]; + s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f + + s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) - + dmin * smin; +#else + const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); + const uint16_t * q2 = q1 + 32; + + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[0] & 0xf0f0; + q16[2] = q2[0] & 0x0f0f; + q16[3] = q2[0] & 0xf0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 2; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; + s.z += 
y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; +#endif + + } +#else + const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); + + const int step = tid * K_QUANTS_PER_ITERATION; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + float tmp = 0; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const float * y = yy + i*QK_K + step; + const uint16_t * a = (const uint16_t *)x[i].scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) + + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) + + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3]) + + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]); + } + tmp += sum; + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +/* +DPCT1110:7: The total declared local variable size in device function +dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. 
+*/ +static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2); + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q5_K * x = (const block_q5_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = item_ct1.get_local_id(2) / 2; // 0...15 + const int ix = item_ct1.get_local_id(2) % 2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 2; + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + uint16_t q16[8]; + const uint8_t * q4 = (const uint8_t *)q16; + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + const uint8_t * ql1 = x[i].qs + q_offset; + const uint8_t * qh = x[i].qh + l0; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + sycl::float4 sum = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + const uint16_t * q1 = (const uint16_t *)ql1; + const uint16_t * q2 = q1 + 32; + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[8] & 0x0f0f; + q16[2] = (q1[0] >> 4) & 0x0f0f; + q16[3] = (q1[8] >> 4) & 0x0f0f; + q16[4] = q2[0] & 0x0f0f; + q16[5] = 
q2[8] & 0x0f0f; + q16[6] = (q2[0] >> 4) & 0x0f0f; + q16[7] = (q2[8] >> 4) & 0x0f0f; + for (int l = 0; l < n; ++l) { + sum.x() += + y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) + + y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0)); + sum.y() += + y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) + + y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0)); + sum.z() += + y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) + + y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0)); + sum.w() += + y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) + + y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + } + tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] + + sum.w() * sc[5]) - + dmin * smin; + } + +#else + const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); + const int step = tid * K_QUANTS_PER_ITERATION; + const int im = step/8; + const int in = step%8; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const int8_t * s = x[i].scales; + const float * y = yy + i*QK_K + step; + const float d = x[i].d; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + const uint8_t h = x[i].qh[in+j] >> im; + sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) + + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) + + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) + + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 
0 : 16)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q6_K * x = (const block_q6_K *)vx + ib0; + +#if QK_K == 256 + + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1 + + const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
+ const int in = tid - step*im; // 0...15 or 0...7 + +#if K_QUANTS_PER_ITERATION == 1 + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 + const int is = 0; +#else + const int l0 = 4 * in; // 0, 4, 8, ..., 28 + const int is = in / 4; +#endif + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * ql = x[i].ql + ql_offset; + const uint8_t * qh = x[i].qh + qh_offset; + const int8_t * s = x[i].scales + s_offset; + + const float d = x[i].d; + +#if K_QUANTS_PER_ITERATION == 1 + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp += sum; +#else + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp += sum; +#endif + + } + +#else + + const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7 + const int ix = 
item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3 + + const int step = tid * K_QUANTS_PER_ITERATION; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + step; + const uint8_t * ql = x[i].ql + step; + const uint8_t * qh = x[i].qh + step; + const int8_t * s = x[i].scales; + + const float d = x[i+0].d; + + float sum = 0; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) + + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) + + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32) + + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32); + } + tmp += sum; + + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + + +static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, 
const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { 
+ dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, 32); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, 32); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + 
dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, 32); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const sycl::range<3> block_dims(1, 1, 32); + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, 32); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +void ggml_sycl_op_dequantize_mul_mat_vec( + ggml_backend_sycl_context & ctx, + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char 
*src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream) { + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics +#ifdef GGML_SYCL_F16 + ggml_sycl_pool_alloc src1_dfloat_a(ctx.pool()); + sycl::half *src1_dfloat = nullptr; // dfloat == half + + bool src1_convert_f16 = + src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || + src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || + src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; + + if (src1_convert_f16) { + src1_dfloat = src1_dfloat_a.alloc(ne00); + const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type); + GGML_ASSERT(to_fp16_sycl != nullptr); + to_fp16_sycl(src1_ddf_i, src1_dfloat, ne00, stream); + } +#else + const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion +#endif // GGML_SYCL_F16 + + switch (src0->type) { + case GGML_TYPE_Q4_0: + dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + dequantize_mul_mat_vec_q4_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + dequantize_mul_mat_vec_q5_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + dequantize_mul_mat_vec_q5_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + dequantize_mul_mat_vec_q8_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + dequantize_mul_mat_vec_q2_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, 
row_diff, stream); + break; + case GGML_TYPE_Q4_K: + dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + dequantize_mul_mat_vec_q6_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_F16: + convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + default: + printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type); + GGML_ASSERT(false); + break; + } + + (void) src1; + (void) dst; + (void) src1_ddq_i; + (void) src1_ncols; + (void) src1_padded_row_size; +} diff --git a/ggml-sycl/dmmv.hpp b/ggml-sycl/dmmv.hpp new file mode 100644 index 000000000..bd8373564 --- /dev/null +++ b/ggml-sycl/dmmv.hpp @@ -0,0 +1,27 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_DMMV_HPP +#define GGML_SYCL_DMMV_HPP + +#include "common.hpp" + + +void ggml_sycl_op_dequantize_mul_mat_vec( + ggml_backend_sycl_context & ctx, + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream); + +#endif // GGML_SYCL_DMMV_HPP diff --git a/ggml-sycl/dpct/helper.hpp b/ggml-sycl/dpct/helper.hpp index 017fd6ee1..1ff297218 100644 --- a/ggml-sycl/dpct/helper.hpp +++ b/ggml-sycl/dpct/helper.hpp @@ -588,266 +588,222 @@ namespace dpct out = prop; } - /// dpct device extension - class device_ext : public sycl::device - { - typedef std::mutex mutex_type; + /// dpct device extension + class device_ext : public sycl::device { + typedef std::mutex mutex_type; - public: - device_ext() : sycl::device(), _ctx(*this) {} - ~device_ext() - { - std::lock_guard lock(m_mutex); - clear_queues(); - } - device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this) - { - std::lock_guard lock(m_mutex); - init_queues(); - } + public: + device_ext() : sycl::device() {} + ~device_ext() { + std::lock_guard lock(m_mutex); + clear_queues(); + } + device_ext(const sycl::device &base) : sycl::device(base) { + std::lock_guard lock(m_mutex); + init_queues(); + } - int is_native_atomic_supported() { return 0; } - int get_major_version() const - { - return dpct::get_major_version(*this); - } + int is_native_atomic_supported() { return 0; } + int get_major_version() const { return dpct::get_major_version(*this); } - int get_minor_version() const - { - return dpct::get_minor_version(*this); - } + int get_minor_version() const { return dpct::get_minor_version(*this); } - int get_max_compute_units() const - { - return get_device_info().get_max_compute_units(); - 
} + int get_max_compute_units() const { + return get_device_info().get_max_compute_units(); + } - /// Return the maximum clock frequency of this device in KHz. - int get_max_clock_frequency() const - { - return get_device_info().get_max_clock_frequency(); - } + /// Return the maximum clock frequency of this device in KHz. + int get_max_clock_frequency() const { + return get_device_info().get_max_clock_frequency(); + } - int get_integrated() const { return get_device_info().get_integrated(); } + int get_integrated() const { return get_device_info().get_integrated(); } - int get_max_sub_group_size() const - { - return get_device_info().get_max_sub_group_size(); - } + int get_max_sub_group_size() const { + return get_device_info().get_max_sub_group_size(); + } - int get_max_register_size_per_work_group() const - { - return get_device_info().get_max_register_size_per_work_group(); - } + int get_max_register_size_per_work_group() const { + return get_device_info().get_max_register_size_per_work_group(); + } - int get_max_work_group_size() const - { - return get_device_info().get_max_work_group_size(); - } + int get_max_work_group_size() const { + return get_device_info().get_max_work_group_size(); + } - int get_mem_base_addr_align() const - { - return get_info(); - } + int get_mem_base_addr_align() const { + return get_info(); + } - size_t get_global_mem_size() const - { - return get_device_info().get_global_mem_size(); - } + size_t get_global_mem_size() const { + return get_device_info().get_global_mem_size(); + } - size_t get_max_mem_alloc_size() const - { - return get_device_info().get_max_mem_alloc_size(); - } + size_t get_max_mem_alloc_size() const { + return get_device_info().get_max_mem_alloc_size(); + } - /// Get the number of bytes of free and total memory on the SYCL device. - /// \param [out] free_memory The number of bytes of free memory on the SYCL device. - /// \param [out] total_memory The number of bytes of total memory on the SYCL device. 
- void get_memory_info(size_t &free_memory, size_t &total_memory) - { - total_memory = get_device_info().get_global_mem_size(); - const char *warning_info = "get_memory_info: [warning] ext_intel_free_memory is not " - "supported (export/set ZES_ENABLE_SYSMAN=1 to support), " - "use total memory as free memory"; + /// Get the number of bytes of free and total memory on the SYCL device. + /// \param [out] free_memory The number of bytes of free memory on the + /// SYCL device. \param [out] total_memory The number of bytes of total + /// memory on the SYCL device. + void get_memory_info(size_t &free_memory, size_t &total_memory) { + total_memory = get_device_info().get_global_mem_size(); + const char *warning_info = + "get_memory_info: [warning] ext_intel_free_memory is not " + "supported (export/set ZES_ENABLE_SYSMAN=1 to support), " + "use total memory as free memory"; #if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105) - if (!has(sycl::aspect::ext_intel_free_memory)) - { - std::cerr << warning_info << std::endl; - free_memory = total_memory; - } - else - { - free_memory = get_info(); - } + if (!has(sycl::aspect::ext_intel_free_memory)) { + std::cerr << warning_info << std::endl; + free_memory = total_memory; + } else { + free_memory = get_info(); + } #else - std::cerr << warning_info << std::endl; - free_memory = total_memory; + std::cerr << warning_info << std::endl; + free_memory = total_memory; #if defined(_MSC_VER) && !defined(__clang__) #pragma message("Querying the number of bytes of free memory is not supported") #else #warning "Querying the number of bytes of free memory is not supported" #endif #endif + } + + void get_device_info(device_info &out) const { + dpct::get_device_info(out, *this); + } + + device_info get_device_info() const { + device_info prop; + dpct::get_device_info(prop, *this); + return prop; + } + + void reset() { + std::lock_guard lock(m_mutex); + clear_queues(); + init_queues(); + } + + sycl::queue 
&in_order_queue() { return _q_in_order; } + + sycl::queue &out_of_order_queue() { return _q_out_of_order; } + + sycl::queue &default_queue() { return in_order_queue(); } + + void queues_wait_and_throw() { + std::unique_lock lock(m_mutex); + lock.unlock(); + for (auto &q : _queues) { + q.wait_and_throw(); } + // Guard the destruct of current_queues to make sure the ref count is + // safe. + lock.lock(); + } - void get_device_info(device_info &out) const - { - dpct::get_device_info(out, *this); - } + sycl::queue create_queue(bool enable_exception_handler = false) { + return create_in_order_queue(enable_exception_handler); + } - device_info get_device_info() const - { - device_info prop; - dpct::get_device_info(prop, *this); - return prop; - } + sycl::queue create_queue(sycl::device device, + bool enable_exception_handler = false) { + return create_in_order_queue(device, enable_exception_handler); + } - void reset() - { - std::lock_guard lock(m_mutex); - clear_queues(); - init_queues(); - } + sycl::queue create_in_order_queue(bool enable_exception_handler = false) { + std::lock_guard lock(m_mutex); + return create_queue_impl(enable_exception_handler, + sycl::property::queue::in_order()); + } - sycl::queue &in_order_queue() { return *_q_in_order; } - - sycl::queue &out_of_order_queue() { return *_q_out_of_order; } - - sycl::queue &default_queue() - { - return in_order_queue(); - } - - void queues_wait_and_throw() - { - std::unique_lock lock(m_mutex); - std::vector> current_queues( - _queues); - lock.unlock(); - for (const auto &q : current_queues) - { - q->wait_and_throw(); - } - // Guard the destruct of current_queues to make sure the ref count is safe. 
- lock.lock(); - } - - sycl::queue *create_queue(bool enable_exception_handler = false) - { - return create_in_order_queue(enable_exception_handler); - } - - sycl::queue *create_queue(sycl::context context, sycl::device device, - bool enable_exception_handler = false) { - return create_in_order_queue(context, device, enable_exception_handler); - } - - sycl::queue *create_in_order_queue(bool enable_exception_handler = false) { - std::lock_guard lock(m_mutex); - return create_queue_impl(enable_exception_handler, - sycl::property::queue::in_order()); - } - - sycl::queue *create_in_order_queue(sycl::context context, sycl::device device, + sycl::queue create_in_order_queue(sycl::device device, bool enable_exception_handler = false) { - std::lock_guard lock(m_mutex); - return create_queue_impl(context, device, enable_exception_handler, - sycl::property::queue::in_order()); - } + std::lock_guard lock(m_mutex); + return create_queue_impl(device, enable_exception_handler, + sycl::property::queue::in_order()); + } - sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false) { - std::lock_guard lock(m_mutex); - return create_queue_impl(enable_exception_handler); - } + sycl::queue create_out_of_order_queue( + bool enable_exception_handler = false) { + std::lock_guard lock(m_mutex); + return create_queue_impl(enable_exception_handler); + } - void destroy_queue(sycl::queue *&queue) - { - std::lock_guard lock(m_mutex); - _queues.erase(std::remove_if(_queues.begin(), _queues.end(), - [=](const std::shared_ptr &q) -> bool - { - return q.get() == queue; - }), - _queues.end()); - queue = nullptr; - } - void set_saved_queue(sycl::queue *q) - { - std::lock_guard lock(m_mutex); - _saved_queue = q; - } - sycl::queue *get_saved_queue() const - { - std::lock_guard lock(m_mutex); - return _saved_queue; - } - sycl::context get_context() const { return _ctx; } + void destroy_queue(sycl::queue queue) { + std::lock_guard lock(m_mutex); + _queues.clear(); + } + void 
set_saved_queue(sycl::queue q) { + std::lock_guard lock(m_mutex); + _saved_queue = q; + } + sycl::queue get_saved_queue() const { + std::lock_guard lock(m_mutex); + return _saved_queue; + } - private: - void clear_queues() - { - _queues.clear(); - _q_in_order = _q_out_of_order = _saved_queue = nullptr; - } + private: + void clear_queues() { _queues.clear(); } - void init_queues() - { - _q_in_order = create_queue_impl(true, sycl::property::queue::in_order()); - _q_out_of_order = create_queue_impl(true); - _saved_queue = &default_queue(); - } + void init_queues() { + _q_in_order = + create_queue_impl(true, sycl::property::queue::in_order()); + _q_out_of_order = create_queue_impl(true); + _saved_queue = default_queue(); + } - /// Caller should acquire resource \p m_mutex before calling this function. - template - sycl::queue *create_queue_impl(bool enable_exception_handler, - Properties... properties) - { - sycl::async_handler eh = {}; - if (enable_exception_handler) - { - eh = exception_handler; - } - _queues.push_back(std::make_shared( - _ctx, *this, eh, - sycl::property_list( + /// Caller should acquire resource \p m_mutex before calling this + /// function. + template + sycl::queue create_queue_impl(bool enable_exception_handler, + Properties... properties) { + sycl::async_handler eh = {}; + if (enable_exception_handler) { + eh = exception_handler; + } + auto q = sycl::queue(*this, eh, + sycl::property_list( #ifdef DPCT_PROFILING_ENABLED - sycl::property::queue::enable_profiling(), + sycl::property::queue::enable_profiling(), #endif - properties...))); + properties...)); + _queues.push_back(q); - return _queues.back().get(); - } + return _queues.back(); + } - template - sycl::queue *create_queue_impl(sycl::context context, sycl::device device, + template + sycl::queue create_queue_impl(sycl::device device, bool enable_exception_handler, Properties... 
properties) { - sycl::async_handler eh = {}; - if (enable_exception_handler) { - eh = exception_handler; - } - _queues.push_back(std::make_shared( - context, device, eh, - sycl::property_list( - #ifdef DPCT_PROFILING_ENABLED - sycl::property::queue::enable_profiling(), - #endif - properties...))); - - return _queues.back().get(); + sycl::async_handler eh = {}; + if (enable_exception_handler) { + eh = exception_handler; } + _queues.push_back( + sycl::queue(device, eh, + sycl::property_list( +#ifdef DPCT_PROFILING_ENABLED + sycl::property::queue::enable_profiling(), +#endif + properties...))); - void get_version(int &major, int &minor) const - { - detail::get_version(*this, major, minor); - } - sycl::queue *_q_in_order, *_q_out_of_order; - sycl::queue *_saved_queue; - sycl::context _ctx; - std::vector> _queues; - mutable mutex_type m_mutex; + return _queues.back(); + } + + void get_version(int &major, int &minor) const { + detail::get_version(*this, major, minor); + } + sycl::queue _q_in_order, _q_out_of_order; + sycl::queue _saved_queue; + std::vector _queues; + mutable mutex_type m_mutex; }; + /// device manager class dev_mgr { diff --git a/ggml-sycl/mmq.cpp b/ggml-sycl/mmq.cpp new file mode 100644 index 000000000..b514f0040 --- /dev/null +++ b/ggml-sycl/mmq.cpp @@ -0,0 +1,3031 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "mmq.hpp" +#include "vecdotq.hpp" + +typedef void (*allocate_tiles_sycl_t)( + int** x_ql, + sycl::half2** x_dm, + int** x_qh, + int** x_sc); +typedef void (*load_tiles_sycl_t)( + const void* __restrict__ vx, + int* __restrict__ x_ql, + sycl::half2* __restrict__ x_dm, + int* __restrict__ x_qh, + int* __restrict__ x_sc, + const int& i_offset, + const int& i_max, + const int& k, + const int& blocks_per_row); +typedef float (*vec_dot_q_mul_mat_sycl_t)( + const int* __restrict__ x_ql, + const sycl::half2* __restrict__ x_dm, + const int* __restrict__ x_qh, + const int* __restrict__ x_sc, + const int* __restrict__ y_qs, + const sycl::half2* __restrict__ y_ms, + const int& i, + const int& j, + const int& k); + + +template +static __dpct_inline__ void +allocate_tiles_q4_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_qs_q4_0, float *tile_x_d_q4_0) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_qs_q4_0; + *x_dm = (sycl::half2 *)tile_x_d_q4_0; +} + +template +static __dpct_inline__ void +load_tiles_q4_0(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_0; + const int kqsx = k % QI4_0; + + const block_q4_0 * bx0 = (const block_q4_0 *) vx; + + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; + } + + const int 
blocks_per_tile_x_row = WARP_SIZE / QI4_0; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { + int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d; + } +} + +static __dpct_inline__ float vec_dot_q4_0_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const float * x_dmf = (const float *) x_dm; + + int u[2*VDR_Q4_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; + } + + return vec_dot_q4_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +template +static __dpct_inline__ void +allocate_tiles_q4_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_qs_q4_1, sycl::half2 *tile_x_dm_q4_1) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_qs_q4_1; + *x_dm = tile_x_dm_q4_1; +} + + +template +static __dpct_inline__ void +load_tiles_q4_1(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_1; + 
const int kqsx = k % QI4_1; + + const block_q4_1 * bx0 = (const block_q4_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { + int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; + } +} + +static __dpct_inline__ float vec_dot_q4_1_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + + int u[2*VDR_Q4_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; + } + + return vec_dot_q4_1_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +template +static __dpct_inline__ void +allocate_tiles_q5_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q5_0, float *tile_x_d_q5_0) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_ql_q5_0; + *x_dm = (sycl::half2 *)tile_x_d_q5_0; +} + +template +static __dpct_inline__ void +load_tiles_q5_0(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ 
x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_0; + const int kqsx = k % QI5_0; + + const block_q5_0 * bx0 = (const block_q5_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx; + + const int ql = get_int_from_uint8(bxi->qs, kqsx); + const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + qs0 = dpct::vectorized_binary( + qs0, 0x10101010, dpct::sub_sat()); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + qs1 = dpct::vectorized_binary( + qs1, 0x10101010, dpct::sub_sat()); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; + const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { + int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d; + } +} + +static __dpct_inline__ float vec_dot_q5_0_q8_1_mul_mat( + const int *__restrict__ x_ql, const 
sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + int u[2*VDR_Q5_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; + } + + return vec_dot_q8_0_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +template +static __dpct_inline__ void +allocate_tiles_q5_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q5_1, sycl::half2 *tile_x_dm_q5_1) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_ql_q5_1; + *x_dm = tile_x_dm_q5_1; +} + +template +static __dpct_inline__ void +load_tiles_q5_1(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_1; + const int kqsx = k % QI5_1; + + const block_q5_1 * bx0 = (const block_q5_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % 
QI5_1)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) { + int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; + } +} + +static __dpct_inline__ float vec_dot_q5_1_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; + + int u[2*VDR_Q5_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; + } + + return vec_dot_q8_1_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +template +static __dpct_inline__ void +allocate_tiles_q8_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_qs_q8_0, float *tile_x_d_q8_0) { + (void)x_qh; 
(void)x_sc; + + *x_ql = tile_x_qs_q8_0; + *x_dm = (sycl::half2 *)tile_x_d_q8_0; +} + +template +static __dpct_inline__ void +load_tiles_q8_0(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI8_0; + const int kqsx = k % QI8_0; + float * x_dmf = (float *) x_dm; + + const block_q8_0 * bx0 = (const block_q8_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { + int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d; + } +} + +static __dpct_inline__ float vec_dot_q8_0_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; (void)x_sc; + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + return vec_dot_q8_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0], + y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]); +} + +template 
+static __dpct_inline__ void +allocate_tiles_q2_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q2_K, sycl::half2 *tile_x_dm_q2_K, + int *tile_x_sc_q2_K) { + (void)x_qh; + + *x_ql = tile_x_ql_q2_K; + *x_dm = tile_x_dm_q2_K; + *x_sc = tile_x_sc_q2_K; +} + +template +static __dpct_inline__ void +load_tiles_q2_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI2_K; + const int kqsx = k % QI2_K; + + const block_q2_K * bx0 = (const block_q2_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI2_K; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { + int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); + } +} + +#define VDR_Q2_K_Q8_1_MMQ 2 +// contiguous u/y values +static __dpct_inline__ 
float +vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, + const uint8_t *__restrict__ scales, + const sycl::half2 &dm2, const float &d8) { + + int sumi_d = 0; + int sumi_m = 0; + +#pragma unroll + for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) { + int sumi_d_sc = 0; + + const int sc = scales[i0 / (QI8_1/2)]; + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + +#pragma unroll + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_d_sc = dpct::dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product + sumi_m = dpct::dp4a(m, u[i], + sumi_m); // multiply sum of q8_1 values with m + } + + sumi_d += sumi_d_sc * (sc & 0xF); + } + + const sycl::float2 dm2f = + dm2.convert(); + + return d8 * (dm2f.x() * sumi_d - dm2f.y() * sumi_m); +} + +static __dpct_inline__ float vec_dot_q2_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; + + const int kbx = k / QI2_K; + const int ky = (k % QI2_K) * QR2_K; + const float * y_df = (const float *) y_ds; + + int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; + + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); + const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); + +#pragma unroll + for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) { + v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; + } + + const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; + + const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; + return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); +} + +template +static __dpct_inline__ void +allocate_tiles_q3_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q3_K, sycl::half2 *tile_x_dm_q3_K, + int 
*tile_x_qh_q3_K, int *tile_x_sc_q3_K) { + + *x_ql = tile_x_ql_q3_K; + *x_dm = tile_x_dm_q3_K; + *x_qh = tile_x_qh_q3_K; + *x_sc = tile_x_sc_q3_K; +} + +template +static __dpct_inline__ void +load_tiles_q3_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI3_K; + const int kqsx = k % QI3_K; + + const block_q3_K * bx0 = (const block_q3_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI3_K; + const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { + int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { + int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + if (need_check) 
{ + i = sycl::min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4); + + const int ksc = k % (QI3_K/4); + + const int ksc_low = ksc % (QI3_K/8); + const int shift_low = 4 * (ksc / (QI3_K/8)); + const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F; + + const int ksc_high = QI3_K/8; + const int shift_high = 2 * ksc; + const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030; + + const int sc = dpct::vectorized_binary( + sc_low | sc_high, 0x20202020, dpct::sub_sat()); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc; + } +} + +#define VDR_Q3_K_Q8_1_MMQ 2 +// contiguous u/y values +static __dpct_inline__ float +vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, + const int8_t *__restrict__ scales, const float &d3, + const float &d8) { + + int sumi = 0; + +#pragma unroll + for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) { + int sumi_sc = 0; + + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_sc = dpct::dp4a(v[i], u[i], sumi_sc); // SIMD dot product + } + + sumi += sumi_sc * scales[i0 / (QI8_1/2)]; + } + + return d3*d8 * sumi; +} + +static __dpct_inline__ float vec_dot_q3_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + + const int kbx = k / QI3_K; + const int ky = (k % QI3_K) * QR3_K; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; + + int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); + 
const int shift = 2 * ((ky % 32) / 8); + const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; + + const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); + const int vlh = (vh << 2) & 0x04040404; + + v[l] = dpct::vectorized_binary(vll, vlh, dpct::sub_sat()); + } + + const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; + return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); +} + +template +static __dpct_inline__ void +allocate_tiles_q4_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q4_K, sycl::half2 *tile_x_dm_q4_K, + int *tile_x_sc_q4_K) { + (void)x_qh; + + *x_ql = tile_x_ql_q4_K; + *x_dm = tile_x_dm_q4_K; + *x_sc = tile_x_sc_q4_K; +} + +template +static __dpct_inline__ void +load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_K; // == 0 if QK_K == 256 + const int kqsx = k % QI4_K; // == k if QK_K == 256 + + const block_q4_K * bx0 = (const block_q4_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { + int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = 
sycl::min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; + +#if QK_K == 256 + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; +#else + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]}; +#endif + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); + + const int * scales = (const int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + + +#define VDR_Q4_K_Q8_1_MMQ 8 + +// contiguous u/y values +static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq( + const int *__restrict__ v, const int *__restrict__ u, + const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, + const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) { + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = dpct::dp4a((v[j] >> (4 * i)) & 0x0F0F0F0F, + u[i * QI8_1 + j], sumi_d); // SIMD dot product + } + + const sycl::float2 ds8f = + ds8[i].convert(); + + sumf_d += ds8f.x() * (sc[i] * sumi_d); + sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val + } + + const sycl::float2 dm4f = + dm4.convert(); + + return dm4f.x() * sumf_d - dm4f.y() * sumf_m; +} + + +static __dpct_inline__ float vec_dot_q4_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, 
const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); + + const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; + return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); +} + +template +static __dpct_inline__ void +allocate_tiles_q5_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q5_K, sycl::half2 *tile_x_dm_q5_K, + int *tile_x_sc_q5_K) { + (void)x_qh; + + *x_ql = tile_x_ql_q5_K; + *x_dm = tile_x_dm_q5_K; + *x_sc = tile_x_sc_q5_K; +} + +template +static __dpct_inline__ void +load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_K; // == 0 if QK_K == 256 + const int kqsx = k % QI5_K; // == k if QK_K == 256 + + const block_q5_K * bx0 = (const block_q5_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR5_K*kqsx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4)); + const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010; + const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010; + + const int kq0 = ky - 
ky % (QI5_K/2) + k % (QI5_K/4) + 0; + const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4); + + x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0; + x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { + int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; + +#if QK_K == 256 + x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; +#endif + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); + + const int * scales = (const int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + +#define VDR_Q5_K_Q8_1_MMQ 8 + +// contiguous u/y values +static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq( + const int *__restrict__ v, const int *__restrict__ u, + const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, + const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) { + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = dpct::dp4a(v[i * QI8_1 + j], u[i * QI8_1 + j], + sumi_d); // SIMD dot product + } + 
+ const sycl::float2 ds8f = + ds8[i].convert(); + + sumf_d += ds8f.x() * (sc[i] * sumi_d); + sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val + } + + const sycl::float2 dm4f = + dm4.convert(); + + return dm4f.x() * sumf_d - dm4f.y() * sumf_m; +} + +static __dpct_inline__ float vec_dot_q5_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); + + const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; + const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; + return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); +} + +template +static __dpct_inline__ void +allocate_tiles_q6_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) { + (void)x_qh; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template +static __dpct_inline__ void +load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI6_K; // == 0 if QK_K == 256 + const int kqsx = k % QI6_K; // == k if QK_K == 256 + + const block_q6_K * bx0 = (const block_q6_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q6_K * bxi = bx0 + 
i*blocks_per_row + kbx; + const int ky = QR6_K*kqsx; + + const int ql = get_int_from_uint8(bxi->ql, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)); + const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030; + const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030; + + const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0; + const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2); + + x_ql[i * (2 * WARP_SIZE + 1) + kq0] = + dpct::vectorized_binary(ql0 | qh0, 0x20202020, + dpct::sub_sat()); + x_ql[i * (2 * WARP_SIZE + 1) + kq1] = + dpct::vectorized_binary(ql1 | qh1, 0x20202020, + dpct::sub_sat()); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) { + int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4; + + x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); + } +} + +#define VDR_Q6_K_Q8_1_MMQ 8 + +// contiguous u/y values +static __dpct_inline__ float +vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, + const int8_t *__restrict__ sc, const float &d6, + const float *__restrict__ d8) { + + float sumf_d = 0.0f; + +#pragma unroll + for (int i0 = 0; i0 < 
VDR_Q6_K_Q8_1_MMQ; i0 += 4) { + sycl::int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale + +#pragma unroll + for (int i = i0; i < i0 + 2; ++i) { + sumi_d.x() = dpct::dp4a(v[2 * i + 0], u[2 * i + 0], + sumi_d.x()); // SIMD dot product + sumi_d.x() = dpct::dp4a(v[2 * i + 1], u[2 * i + 1], + sumi_d.x()); // SIMD dot product + + sumi_d.y() = dpct::dp4a(v[2 * i + 4], u[2 * i + 4], + sumi_d.y()); // SIMD dot product + sumi_d.y() = dpct::dp4a(v[2 * i + 5], u[2 * i + 5], + sumi_d.y()); // SIMD dot product + } + + sumf_d += d8[i0 / 4] * + (sc[i0 / 2 + 0] * sumi_d.x() + sc[i0 / 2 + 1] * sumi_d.y()); + } + + return d6 * sumf_d; +} + +static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]); + + const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; + const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE; + return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); +} + +template +/* +DPCT1110:8: The total declared local variable size in device function mul_mat_q +exceeds 128 bytes and may cause high register pressure. Consult with your +hardware vendor to find the total register size available and adjust the code, +or use smaller sub-group size to avoid high register pressure. 
+*/ +static __dpct_inline__ void +mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy, + float *__restrict__ dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, + int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_qh, + int *tile_x_sc, const sycl::nd_item<3> &item_ct1, int *tile_y_qs, + sycl::half2 *tile_y_ds) { + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + const int blocks_per_row_x = ncols_x / qk; + const int blocks_per_col_y = nrows_y / QK8_1; + const int blocks_per_warp = WARP_SIZE / qi; + + const int & ncols_dst = ncols_y; + + const int row_dst_0 = item_ct1.get_group(2) * mmq_y; + const int & row_x_0 = row_dst_0; + + const int col_dst_0 = item_ct1.get_group(1) * mmq_x; + const int & col_y_0 = col_dst_0; + + float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}}; + + for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { + + load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, + tile_x_qh, tile_x_sc, item_ct1.get_local_id(1), + nrows_x - row_x_0 - 1, item_ct1.get_local_id(2), + blocks_per_row_x); + +#pragma unroll + for (int ir = 0; ir < qr; ++ir) { + const int kqs = ir * WARP_SIZE + item_ct1.get_local_id(2); + const int kbxd = kqs / QI8_1; + +#pragma unroll + for (int i = 0; i < mmq_x; i += nwarps) { + const int col_y_eff = dpct::min( + (unsigned int)(col_y_0 + item_ct1.get_local_id(1) + i), + ncols_y - 1); // to prevent out-of-bounds memory accesses + + const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd]; + + const int index_y = (item_ct1.get_local_id(1) + i) * WARP_SIZE + + kqs % WARP_SIZE; + tile_y_qs[index_y] = get_int_from_int8_aligned( + by0->qs, item_ct1.get_local_id(2) % QI8_1); + } + +#pragma unroll + for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { + const int ids = + (ids0 + item_ct1.get_local_id(1) * QI8_1 + + item_ct1.get_local_id(2) / (WARP_SIZE / QI8_1)) 
% + mmq_x; + const int kby = item_ct1.get_local_id(2) % (WARP_SIZE / QI8_1); + const int col_y_eff = sycl::min(col_y_0 + ids, ncols_y - 1); + + // if the sum is not needed it's faster to transform the scale to f32 ahead of time + const sycl::half2 *dsi_src = + &y[col_y_eff * blocks_per_col_y + ib0 * (qk / QK8_1) + + ir * (WARP_SIZE / QI8_1) + kby] + .ds; + sycl::half2 *dsi_dst = + &tile_y_ds[ids * (WARP_SIZE / QI8_1) + kby]; + if (need_sum) { + *dsi_dst = *dsi_src; + } else { + float * dfi_dst = (float *) dsi_dst; + *dfi_dst = (*dsi_src)[0]; + } + } + + /* + DPCT1118:9: SYCL group functions and algorithms must be encountered + in converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:56: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + +// #pragma unroll // unrolling this loop causes too much register pressure + for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) { +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + sum[i / WARP_SIZE][j / nwarps] += vec_dot( + tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, + tile_y_qs, tile_y_ds, item_ct1.get_local_id(2) + i, + item_ct1.get_local_id(1) + j, k); + } + } + } + + /* + DPCT1118:10: SYCL group functions and algorithms must be encountered + in converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:57: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. 
+ */ + item_ct1.barrier(); + } + } + +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { + const int col_dst = col_dst_0 + j + item_ct1.get_local_id(1); + + if (col_dst >= ncols_dst) { + return; + } + +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + const int row_dst = row_dst_0 + item_ct1.get_local_id(2) + i; + + if (row_dst >= nrows_dst) { + continue; + } + + dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps]; + } + } +} + +#define MMQ_X_Q4_0_RDNA2 64 +#define MMQ_Y_Q4_0_RDNA2 128 +#define NWARPS_Q4_0_RDNA2 8 +#define MMQ_X_Q4_0_RDNA1 64 +#define MMQ_Y_Q4_0_RDNA1 64 +#define NWARPS_Q4_0_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q4_0_AMPERE 4 +#define MMQ_Y_Q4_0_AMPERE 32 +#define NWARPS_Q4_0_AMPERE 4 +#else +#define MMQ_X_Q4_0_AMPERE 64 +#define MMQ_Y_Q4_0_AMPERE 128 +#define NWARPS_Q4_0_AMPERE 4 +#endif +#define MMQ_X_Q4_0_PASCAL 64 +#define MMQ_Y_Q4_0_PASCAL 64 +#define NWARPS_Q4_0_PASCAL 8 + +template static void + mul_mat_q4_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_0, float *tile_x_d_q4_0, + int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + + const int mmq_x = MMQ_X_Q4_0_AMPERE; + const int mmq_y = MMQ_Y_Q4_0_AMPERE; + const int nwarps = NWARPS_Q4_0_AMPERE; + allocate_tiles_q4_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_qs_q4_0, tile_x_d_q4_0); + mul_mat_q, VDR_Q4_0_Q8_1_MMQ, + vec_dot_q4_0_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q4_1_RDNA2 64 +#define MMQ_Y_Q4_1_RDNA2 128 +#define NWARPS_Q4_1_RDNA2 8 +#define 
MMQ_X_Q4_1_RDNA1 64 +#define MMQ_Y_Q4_1_RDNA1 64 +#define NWARPS_Q4_1_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q4_1_AMPERE 4 +#define MMQ_Y_Q4_1_AMPERE 32 +#define NWARPS_Q4_1_AMPERE 4 +#else +#define MMQ_X_Q4_1_AMPERE 64 +#define MMQ_Y_Q4_1_AMPERE 128 +#define NWARPS_Q4_1_AMPERE 4 +#endif +#define MMQ_X_Q4_1_PASCAL 64 +#define MMQ_Y_Q4_1_PASCAL 64 +#define NWARPS_Q4_1_PASCAL 8 + +template static void + mul_mat_q4_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_1, + sycl::half2 *tile_x_dm_q4_1, int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q4_1_AMPERE; + const int mmq_y = MMQ_Y_Q4_1_AMPERE; + const int nwarps = NWARPS_Q4_1_AMPERE; + allocate_tiles_q4_1(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_qs_q4_1, tile_x_dm_q4_1); + mul_mat_q, VDR_Q4_1_Q8_1_MMQ, + vec_dot_q4_1_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q5_0_RDNA2 64 +#define MMQ_Y_Q5_0_RDNA2 128 +#define NWARPS_Q5_0_RDNA2 8 +#define MMQ_X_Q5_0_RDNA1 64 +#define MMQ_Y_Q5_0_RDNA1 64 +#define NWARPS_Q5_0_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q5_0_AMPERE 4 +#define MMQ_Y_Q5_0_AMPERE 32 +#define NWARPS_Q5_0_AMPERE 4 +#else +#define MMQ_X_Q5_0_AMPERE 128 +#define MMQ_Y_Q5_0_AMPERE 64 +#define NWARPS_Q5_0_AMPERE 4 +#endif +#define MMQ_X_Q5_0_PASCAL 64 +#define MMQ_Y_Q5_0_PASCAL 64 +#define NWARPS_Q5_0_PASCAL 8 + +template static void + mul_mat_q5_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int 
nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_0, float *tile_x_d_q5_0, + int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q5_0_AMPERE; + const int mmq_y = MMQ_Y_Q5_0_AMPERE; + const int nwarps = NWARPS_Q5_0_AMPERE; + allocate_tiles_q5_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q5_0, tile_x_d_q5_0); + mul_mat_q, VDR_Q5_0_Q8_1_MMQ, + vec_dot_q5_0_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q5_1_RDNA2 64 +#define MMQ_Y_Q5_1_RDNA2 128 +#define NWARPS_Q5_1_RDNA2 8 +#define MMQ_X_Q5_1_RDNA1 64 +#define MMQ_Y_Q5_1_RDNA1 64 +#define NWARPS_Q5_1_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q5_1_AMPERE 4 +#define MMQ_Y_Q5_1_AMPERE 32 +#define NWARPS_Q5_1_AMPERE 4 +#else +#define MMQ_X_Q5_1_AMPERE 128 +#define MMQ_Y_Q5_1_AMPERE 64 +#define NWARPS_Q5_1_AMPERE 4 +#endif +#define MMQ_X_Q5_1_PASCAL 64 +#define MMQ_Y_Q5_1_PASCAL 64 +#define NWARPS_Q5_1_PASCAL 8 + +template static void +mul_mat_q5_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_1, + sycl::half2 *tile_x_dm_q5_1, int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q5_1_AMPERE; + const int mmq_y = MMQ_Y_Q5_1_AMPERE; + const int nwarps = NWARPS_Q5_1_AMPERE; + allocate_tiles_q5_1(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + 
tile_x_ql_q5_1, tile_x_dm_q5_1); + mul_mat_q, VDR_Q5_1_Q8_1_MMQ, + vec_dot_q5_1_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q8_0_RDNA2 64 +#define MMQ_Y_Q8_0_RDNA2 128 +#define NWARPS_Q8_0_RDNA2 8 +#define MMQ_X_Q8_0_RDNA1 64 +#define MMQ_Y_Q8_0_RDNA1 64 +#define NWARPS_Q8_0_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q8_0_AMPERE 4 +#define MMQ_Y_Q8_0_AMPERE 32 +#define NWARPS_Q8_0_AMPERE 4 +#else +#define MMQ_X_Q8_0_AMPERE 128 +#define MMQ_Y_Q8_0_AMPERE 64 +#define NWARPS_Q8_0_AMPERE 4 +#endif +#define MMQ_X_Q8_0_PASCAL 64 +#define MMQ_Y_Q8_0_PASCAL 64 +#define NWARPS_Q8_0_PASCAL 8 + +template static void + mul_mat_q8_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q8_0, float *tile_x_d_q8_0, + int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q8_0_AMPERE; + const int mmq_y = MMQ_Y_Q8_0_AMPERE; + const int nwarps = NWARPS_Q8_0_AMPERE; + allocate_tiles_q8_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_qs_q8_0, tile_x_d_q8_0); + mul_mat_q, VDR_Q8_0_Q8_1_MMQ, + vec_dot_q8_0_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q2_K_RDNA2 64 +#define MMQ_Y_Q2_K_RDNA2 128 +#define NWARPS_Q2_K_RDNA2 8 +#define MMQ_X_Q2_K_RDNA1 128 +#define MMQ_Y_Q2_K_RDNA1 32 +#define NWARPS_Q2_K_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q2_K_AMPERE 4 +#define MMQ_Y_Q2_K_AMPERE 32 +#define NWARPS_Q2_K_AMPERE 4 +#else +#define 
MMQ_X_Q2_K_AMPERE 64 +#define MMQ_Y_Q2_K_AMPERE 128 +#define NWARPS_Q2_K_AMPERE 4 +#endif +#define MMQ_X_Q2_K_PASCAL 64 +#define MMQ_Y_Q2_K_PASCAL 64 +#define NWARPS_Q2_K_PASCAL 8 + +template static void +mul_mat_q2_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q2_K, + sycl::half2 *tile_x_dm_q2_K, int *tile_x_sc_q2_K, int *tile_y_qs, + sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q2_K_AMPERE; + const int mmq_y = MMQ_Y_Q2_K_AMPERE; + const int nwarps = NWARPS_Q2_K_AMPERE; + allocate_tiles_q2_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q2_K, tile_x_dm_q2_K, tile_x_sc_q2_K); + mul_mat_q, VDR_Q2_K_Q8_1_MMQ, + vec_dot_q2_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q3_K_RDNA2 128 +#define MMQ_Y_Q3_K_RDNA2 64 +#define NWARPS_Q3_K_RDNA2 8 +#define MMQ_X_Q3_K_RDNA1 32 +#define MMQ_Y_Q3_K_RDNA1 128 +#define NWARPS_Q3_K_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q3_K_AMPERE 4 +#define MMQ_Y_Q3_K_AMPERE 32 +#define NWARPS_Q3_K_AMPERE 4 +#else +#define MMQ_X_Q3_K_AMPERE 128 +#define MMQ_Y_Q3_K_AMPERE 128 +#define NWARPS_Q3_K_AMPERE 4 +#endif +#define MMQ_X_Q3_K_PASCAL 64 +#define MMQ_Y_Q3_K_PASCAL 64 +#define NWARPS_Q3_K_PASCAL 8 + +template static void +mul_mat_q3_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q3_K, + sycl::half2 *tile_x_dm_q3_K, int *tile_x_qh_q3_K, 
int *tile_x_sc_q3_K, + int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q3_K_AMPERE; + const int mmq_y = MMQ_Y_Q3_K_AMPERE; + const int nwarps = NWARPS_Q3_K_AMPERE; + allocate_tiles_q3_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q3_K, tile_x_dm_q3_K, tile_x_qh_q3_K, + tile_x_sc_q3_K); + mul_mat_q, VDR_Q3_K_Q8_1_MMQ, + vec_dot_q3_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q4_K_RDNA2 64 +#define MMQ_Y_Q4_K_RDNA2 128 +#define NWARPS_Q4_K_RDNA2 8 +#define MMQ_X_Q4_K_RDNA1 32 +#define MMQ_Y_Q4_K_RDNA1 64 +#define NWARPS_Q4_K_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q4_K_AMPERE 4 +#define MMQ_Y_Q4_K_AMPERE 32 +#define NWARPS_Q4_K_AMPERE 4 +#else +#define MMQ_X_Q4_K_AMPERE 64 +#define MMQ_Y_Q4_K_AMPERE 128 +#define NWARPS_Q4_K_AMPERE 4 +#endif +#define MMQ_X_Q4_K_PASCAL 64 +#define MMQ_Y_Q4_K_PASCAL 64 +#define NWARPS_Q4_K_PASCAL 8 + +template static void + mul_mat_q4_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q4_K, + sycl::half2 *tile_x_dm_q4_K, int *tile_x_sc_q4_K, int *tile_y_qs, + sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q4_K_AMPERE; + const int mmq_y = MMQ_Y_Q4_K_AMPERE; + const int nwarps = NWARPS_Q4_K_AMPERE; + allocate_tiles_q4_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q4_K, tile_x_dm_q4_K, tile_x_sc_q4_K); + mul_mat_q, 
VDR_Q4_K_Q8_1_MMQ, + vec_dot_q4_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q5_K_RDNA2 64 +#define MMQ_Y_Q5_K_RDNA2 128 +#define NWARPS_Q5_K_RDNA2 8 +#define MMQ_X_Q5_K_RDNA1 32 +#define MMQ_Y_Q5_K_RDNA1 64 +#define NWARPS_Q5_K_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q5_K_AMPERE 4 +#define MMQ_Y_Q5_K_AMPERE 32 +#define NWARPS_Q5_K_AMPERE 4 +#else +#define MMQ_X_Q5_K_AMPERE 64 +#define MMQ_Y_Q5_K_AMPERE 128 +#define NWARPS_Q5_K_AMPERE 4 +#endif +#define MMQ_X_Q5_K_PASCAL 64 +#define MMQ_Y_Q5_K_PASCAL 64 +#define NWARPS_Q5_K_PASCAL 8 + +template static void +mul_mat_q5_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_K, + sycl::half2 *tile_x_dm_q5_K, int *tile_x_sc_q5_K, int *tile_y_qs, + sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q5_K_AMPERE; + const int mmq_y = MMQ_Y_Q5_K_AMPERE; + const int nwarps = NWARPS_Q5_K_AMPERE; + allocate_tiles_q5_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q5_K, tile_x_dm_q5_K, tile_x_sc_q5_K); + mul_mat_q, VDR_Q5_K_Q8_1_MMQ, + vec_dot_q5_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q6_K_RDNA2 64 +#define MMQ_Y_Q6_K_RDNA2 128 +#define NWARPS_Q6_K_RDNA2 8 +#define MMQ_X_Q6_K_RDNA1 32 +#define MMQ_Y_Q6_K_RDNA1 64 +#define NWARPS_Q6_K_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q6_K_AMPERE 4 +#define MMQ_Y_Q6_K_AMPERE 32 +#define NWARPS_Q6_K_AMPERE 4 +#else +#define 
MMQ_X_Q6_K_AMPERE 64 +#define MMQ_Y_Q6_K_AMPERE 64 +#define NWARPS_Q6_K_AMPERE 4 +#endif +#define MMQ_X_Q6_K_PASCAL 64 +#define MMQ_Y_Q6_K_PASCAL 64 +#define NWARPS_Q6_K_PASCAL 8 + +template static void + mul_mat_q6_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm, + int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) { + // int * tile_x_ql = nullptr; + // sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + // int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q6_K_AMPERE; + const int mmq_y = MMQ_Y_Q6_K_AMPERE; + const int nwarps = NWARPS_Q6_K_AMPERE; + allocate_tiles_q6_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql, tile_x_dm, tile_x_sc); + mul_mat_q, VDR_Q6_K_Q8_1_MMQ, + vec_dot_q6_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q4_0_RDNA2; + mmq_y = MMQ_Y_Q4_0_RDNA2; + nwarps = NWARPS_Q4_0_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q4_0_RDNA1; + mmq_y = MMQ_Y_Q4_0_RDNA1; + nwarps = NWARPS_Q4_0_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q4_0_AMPERE; + mmq_y = MMQ_Y_Q4_0_AMPERE; + nwarps = NWARPS_Q4_0_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = 
MMQ_X_Q4_0_PASCAL; + mmq_y = MMQ_Y_Q4_0_PASCAL; + nwarps = NWARPS_Q4_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:20: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_qs_q4_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_d_q4_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_qs_q4_0_acc_ct1.get_pointer(), + tile_x_d_q4_0_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:21: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. 
+ */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_qs_q4_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_d_q4_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_qs_q4_0_acc_ct1.get_pointer(), + tile_x_d_q4_0_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q4_1_RDNA2; + mmq_y = MMQ_Y_Q4_1_RDNA2; + nwarps = NWARPS_Q4_1_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q4_1_RDNA1; + mmq_y = MMQ_Y_Q4_1_RDNA1; + nwarps = NWARPS_Q4_1_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q4_1_AMPERE; + mmq_y = MMQ_Y_Q4_1_AMPERE; + nwarps = NWARPS_Q4_1_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q4_1_PASCAL; + mmq_y = MMQ_Y_Q4_1_PASCAL; + nwarps = NWARPS_Q4_1_PASCAL; 
+ } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:22: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_qs_q4_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh); + sycl::local_accessor tile_x_dm_q4_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_1( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_qs_q4_1_acc_ct1.get_pointer(), + tile_x_dm_q4_1_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:23: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. 
+ */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_qs_q4_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh); + sycl::local_accessor tile_x_dm_q4_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_1( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_qs_q4_1_acc_ct1.get_pointer(), + tile_x_dm_q4_1_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q5_0_RDNA2; + mmq_y = MMQ_Y_Q5_0_RDNA2; + nwarps = NWARPS_Q5_0_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q5_0_RDNA1; + mmq_y = MMQ_Y_Q5_0_RDNA1; + nwarps = NWARPS_Q5_0_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q5_0_AMPERE; + mmq_y = MMQ_Y_Q5_0_AMPERE; + nwarps = NWARPS_Q5_0_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q5_0_PASCAL; + mmq_y = MMQ_Y_Q5_0_PASCAL; + nwarps = 
NWARPS_Q5_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:24: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q5_0_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_d_q5_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q5_0_acc_ct1.get_pointer(), + tile_x_d_q5_0_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:25: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. 
+ */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q5_0_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_d_q5_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q5_0_acc_ct1.get_pointer(), + tile_x_d_q5_0_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q5_1_RDNA2; + mmq_y = MMQ_Y_Q5_1_RDNA2; + nwarps = NWARPS_Q5_1_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q5_1_RDNA1; + mmq_y = MMQ_Y_Q5_1_RDNA1; + nwarps = NWARPS_Q5_1_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q5_1_AMPERE; + mmq_y = MMQ_Y_Q5_1_AMPERE; + nwarps = NWARPS_Q5_1_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q5_1_PASCAL; + mmq_y = MMQ_Y_Q5_1_PASCAL; + nwarps = 
NWARPS_Q5_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:26: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q5_1_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q5_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_1( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q5_1_acc_ct1.get_pointer(), + tile_x_dm_q5_1_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:27: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. 
+ */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q5_1_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q5_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_1( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q5_1_acc_ct1.get_pointer(), + tile_x_dm_q5_1_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q8_0_RDNA2; + mmq_y = MMQ_Y_Q8_0_RDNA2; + nwarps = NWARPS_Q8_0_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q8_0_RDNA1; + mmq_y = MMQ_Y_Q8_0_RDNA1; + nwarps = NWARPS_Q8_0_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q8_0_AMPERE; + mmq_y = MMQ_Y_Q8_0_AMPERE; + nwarps = NWARPS_Q8_0_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q8_0_PASCAL; + mmq_y = MMQ_Y_Q8_0_PASCAL; + nwarps = 
NWARPS_Q8_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:28: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_qs_q8_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_d_q8_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q8_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_qs_q8_0_acc_ct1.get_pointer(), + tile_x_d_q8_0_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:29: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. 
+ */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_qs_q8_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_d_q8_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q8_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_qs_q8_0_acc_ct1.get_pointer(), + tile_x_d_q8_0_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q2_K_RDNA2; + mmq_y = MMQ_Y_Q2_K_RDNA2; + nwarps = NWARPS_Q2_K_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q2_K_RDNA1; + mmq_y = MMQ_Y_Q2_K_RDNA1; + nwarps = NWARPS_Q2_K_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q2_K_AMPERE; + mmq_y = MMQ_Y_Q2_K_AMPERE; + nwarps = NWARPS_Q2_K_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q2_K_PASCAL; + mmq_y = MMQ_Y_Q2_K_PASCAL; + nwarps = NWARPS_Q2_K_PASCAL; 
+ } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:30: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K), + cgh); + sycl::local_accessor tile_x_sc_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q2_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q2_K_acc_ct1.get_pointer(), + tile_x_dm_q2_K_acc_ct1.get_pointer(), + tile_x_sc_q2_K_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:31: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. 
+ */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K), + cgh); + sycl::local_accessor tile_x_sc_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q2_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q2_K_acc_ct1.get_pointer(), + tile_x_dm_q2_K_acc_ct1.get_pointer(), + tile_x_sc_q2_K_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + +#if QK_K == 256 + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q3_K_RDNA2; + mmq_y = MMQ_Y_Q3_K_RDNA2; + nwarps = NWARPS_Q3_K_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q3_K_RDNA1; + mmq_y = MMQ_Y_Q3_K_RDNA1; + nwarps = NWARPS_Q3_K_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q3_K_AMPERE; + mmq_y = 
MMQ_Y_Q3_K_AMPERE; + nwarps = NWARPS_Q3_K_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q3_K_PASCAL; + mmq_y = MMQ_Y_Q3_K_PASCAL; + nwarps = NWARPS_Q3_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:32: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K), + cgh); + sycl::local_accessor tile_x_qh_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh); + sycl::local_accessor tile_x_sc_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q3_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q3_K_acc_ct1.get_pointer(), + tile_x_dm_q3_K_acc_ct1.get_pointer(), + tile_x_qh_q3_K_acc_ct1.get_pointer(), + tile_x_sc_q3_K_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:33: The work-group size 
passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K), + cgh); + sycl::local_accessor tile_x_qh_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh); + sycl::local_accessor tile_x_sc_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q3_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q3_K_acc_ct1.get_pointer(), + tile_x_dm_q3_K_acc_ct1.get_pointer(), + tile_x_qh_q3_K_acc_ct1.get_pointer(), + tile_x_sc_q3_K_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +#endif +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q4_K_RDNA2; + 
mmq_y = MMQ_Y_Q4_K_RDNA2; + nwarps = NWARPS_Q4_K_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q4_K_RDNA1; + mmq_y = MMQ_Y_Q4_K_RDNA1; + nwarps = NWARPS_Q4_K_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q4_K_AMPERE; + mmq_y = MMQ_Y_Q4_K_AMPERE; + nwarps = NWARPS_Q4_K_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q4_K_PASCAL; + mmq_y = MMQ_Y_Q4_K_PASCAL; + nwarps = NWARPS_Q4_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:34: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K), + cgh); + sycl::local_accessor tile_x_sc_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q4_K_acc_ct1.get_pointer(), + tile_x_dm_q4_K_acc_ct1.get_pointer(), + tile_x_sc_q4_K_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + 
tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:35: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K), + cgh); + sycl::local_accessor tile_x_sc_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q4_K_acc_ct1.get_pointer(), + tile_x_dm_q4_K_acc_ct1.get_pointer(), + tile_x_sc_q4_K_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q5_K_RDNA2; + mmq_y = 
MMQ_Y_Q5_K_RDNA2; + nwarps = NWARPS_Q5_K_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q5_K_RDNA1; + mmq_y = MMQ_Y_Q5_K_RDNA1; + nwarps = NWARPS_Q5_K_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q5_K_AMPERE; + mmq_y = MMQ_Y_Q5_K_AMPERE; + nwarps = NWARPS_Q5_K_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q5_K_PASCAL; + mmq_y = MMQ_Y_Q5_K_PASCAL; + nwarps = NWARPS_Q5_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:36: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K), + cgh); + sycl::local_accessor tile_x_sc_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q5_K_acc_ct1.get_pointer(), + tile_x_dm_q5_K_acc_ct1.get_pointer(), + tile_x_sc_q5_K_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + 
tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:37: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K), + cgh); + sycl::local_accessor tile_x_sc_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q5_K_acc_ct1.get_pointer(), + tile_x_dm_q5_K_acc_ct1.get_pointer(), + tile_x_sc_q5_K_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q6_K_RDNA2; + mmq_y = 
MMQ_Y_Q6_K_RDNA2; + nwarps = NWARPS_Q6_K_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q6_K_RDNA1; + mmq_y = MMQ_Y_Q6_K_RDNA1; + nwarps = NWARPS_Q6_K_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q6_K_AMPERE; + mmq_y = MMQ_Y_Q6_K_AMPERE; + nwarps = NWARPS_Q6_K_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q6_K_PASCAL; + mmq_y = MMQ_Y_Q6_K_PASCAL; + nwarps = NWARPS_Q6_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:38: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q6_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + 
}); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:39: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q6_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_op_mul_mat_q( + ggml_backend_sycl_context & ctx, + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream) try { + + const int64_t ne00 = src0->ne[0]; + + const int64_t ne10 = src1->ne[0]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne0 = dst->ne[0]; + + const int64_t 
row_diff = row_high - row_low; + + int device_id; + SYCL_CHECK( + CHECK_TRY_ERROR(device_id = get_current_device_id())); + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into + const int64_t nrows_dst = device_id == ctx.device ? ne0 : row_diff; + + switch (src0->type) { + case GGML_TYPE_Q4_0: + ggml_mul_mat_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q4_1: + ggml_mul_mat_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_0: + ggml_mul_mat_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_1: + ggml_mul_mat_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q8_0: + ggml_mul_mat_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q2_K: + ggml_mul_mat_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q3_K: + ggml_mul_mat_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q4_K: + ggml_mul_mat_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_K: + ggml_mul_mat_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q6_K: + ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, 
src1_padded_row_size, nrows_dst, stream); + break; + default: + GGML_ASSERT(false); + break; + } + + (void) src1; + (void) dst; + (void) src1_ddf_i; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} diff --git a/ggml-sycl/mmq.hpp b/ggml-sycl/mmq.hpp new file mode 100644 index 000000000..3f5297aaa --- /dev/null +++ b/ggml-sycl/mmq.hpp @@ -0,0 +1,33 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_MMQ_HPP +#define GGML_SYCL_MMQ_HPP + +#include "common.hpp" + +void ggml_sycl_op_mul_mat_q( + ggml_backend_sycl_context & ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, + ggml_tensor* dst, + const char* src0_dd_i, + const float* src1_ddf_i, + const char* src1_ddq_i, + float* dst_dd_i, + const int64_t row_low, + const int64_t row_high, + const int64_t src1_ncols, + const int64_t src1_padded_row_size, + const dpct::queue_ptr& stream); + +#endif // GGML_SYCL_MMQ_HPP diff --git a/ggml-sycl/mmvq.cpp b/ggml-sycl/mmvq.cpp new file mode 100644 index 000000000..23227649e --- /dev/null +++ b/ggml-sycl/mmvq.cpp @@ -0,0 +1,1024 @@ +#include "mmvq.hpp" +#include "vecdotq.hpp" + + +template +static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x 
= (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template +static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, 
mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template +static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template +static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = 
item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template +static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template +static void 
mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template +static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = 
row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template +static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template +static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const 
int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + + +template +static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr 
* + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_1 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK5_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / 
GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK5_1 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK8_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + 
const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> 
block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + + +static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq2_xxs_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> 
block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq2_xs_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq2_s_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq3_xxs_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void 
mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + auto iq3s_grid_ptr_ct1 = &iq3s_grid[0]; + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq3_s_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq1_s_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + 
[[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq1_m_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_NL == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq4_nl_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq4_xs_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +void ggml_sycl_op_mul_mat_vec_q( + ggml_backend_sycl_context & ctx, + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream) { + + const int64_t ne10 = src1->ne[0]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - 
row_low; + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the kernel writes into + const int64_t nrows_dst = id == ctx.device ? ne00 : row_diff; + + switch (src0->type) { + case GGML_TYPE_Q4_0: + mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ1_S: + mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ1_M: + mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ2_XXS: + mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ2_XS: + mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + 
break; + case GGML_TYPE_IQ2_S: + mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ3_XXS: + mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ3_S: + mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ4_NL: + mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ4_XS: + mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + default: + GGML_ASSERT(false); + break; + } + + (void) src1; + (void) dst; + (void) src1_ddf_i; + (void) src1_ncols; + (void) src1_padded_row_size; +} diff --git a/ggml-sycl/mmvq.hpp b/ggml-sycl/mmvq.hpp new file mode 100644 index 000000000..049b43d45 --- /dev/null +++ b/ggml-sycl/mmvq.hpp @@ -0,0 +1,27 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_MMVQ_HPP +#define GGML_SYCL_MMVQ_HPP + +#include "common.hpp" + + +void ggml_sycl_op_mul_mat_vec_q( + ggml_backend_sycl_context & ctx, + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream); + +#endif // GGML_SYCL_MMVQ_HPP diff --git a/ggml-sycl/presets.hpp b/ggml-sycl/presets.hpp index dcf026110..5e6b61813 100644 --- a/ggml-sycl/presets.hpp +++ b/ggml-sycl/presets.hpp @@ -18,8 +18,6 @@ #define GGML_SYCL_MAX_DEVICES 48 #define GGML_SYCL_NAME "SYCL" -// FIXME: 1024 from cuda -#define GROUP_SIZE 1024 #define WARP_SIZE 32 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses diff --git a/ggml-sycl/vecdotq.hpp b/ggml-sycl/vecdotq.hpp new file mode 100644 index 000000000..5e2e82546 --- /dev/null +++ b/ggml-sycl/vecdotq.hpp @@ -0,0 +1,1161 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_VECDOTQ_HPP +#define GGML_SYCL_VECDOTQ_HPP + +#include "dpct/helper.hpp" + +typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); + +static __dpct_inline__ int get_int_from_int8(const int8_t* x8, const int& i32) { + const uint16_t* x16 = + (const uint16_t*)(x8 + sizeof(int) * i32); // assume at least 2 byte + // alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __dpct_inline__ int get_int_from_uint8( + const uint8_t* x8, + const int& i32) { + const uint16_t* x16 = + (const uint16_t*)(x8 + sizeof(int) * i32); // assume at least 2 byte + // alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __dpct_inline__ int get_int_from_int8_aligned( + const int8_t* x8, + const int& i32) { + return *( + (const int*)(x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + +static __dpct_inline__ int get_int_from_uint8_aligned( + const uint8_t* x8, + const int& i32) { + return *( + (const int*)(x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + +static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4, + const uint8_t *values, + int &val1, int &val2) { + + uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32; + aux32 = q4 & 0x0f0f0f0f; + uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8); + uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8); + val1 = v1 | (v2 << 16); + aux32 = (q4 >> 4) & 0x0f0f0f0f; + v1 = values[q8[0]] | (values[q8[1]] << 8); + v2 = values[q8[2]] | (values[q8[3]] << 8); + val2 = v1 | (v2 << 16); +} + +#define VDR_Q2_K_Q8_1_MMVQ 1 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq( + const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales, + const sycl::half2 &dm2, const float *__restrict__ d8) { + + float sumf_d = 
0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR2_K; ++i) { + const int sc = scales[2*i]; + + const int vi = (v >> (2*i)) & 0x03030303; + + sumf_d += + d8[i] * (dpct::dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + sumf_m += d8[i] * + dpct::dp4a( + m, u[i], + 0); // multiply constant q2_K part with sum of q8_1 values + } + + const sycl::float2 dm2f = + dm2.convert(); + + return dm2f.x() * sumf_d - dm2f.y() * sumf_m; +} + + +#define VDR_Q3_K_Q8_1_MMVQ 1 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq( + const int &vl, const int &vh, const int *__restrict__ u, + const uint8_t *__restrict__ scales, const int &scale_offset, + const float &d3, const float *__restrict__ d8) { + + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + const int isc = scale_offset + 2*i; + + const int isc_low = isc % (QK_K/32); + const int sc_shift_low = 4 * (isc / (QK_K/32)); + const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF; + + const int isc_high = isc % (QK_K/64); + const int sc_shift_high = 2 * (isc / (QK_K/64)); + const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; + + const int sc = (sc_low | sc_high) - 32; + + const int vil = (vl >> (2*i)) & 0x03030303; + + const int vih = ((vh >> i) << 2) & 0x04040404; + + const int vi = + dpct::vectorized_binary(vil, vih, dpct::sub_sat()); + + sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d3 * sumf; +} + +#define VDR_Q4_K_Q8_1_MMVQ 2 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq( + const int *__restrict__ v, const int *__restrict__ u, + const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, + const sycl::half2 &dm4, const float *__restrict__ d8) { + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K; ++i) { + const int v0i 
= (v[0] >> (4*i)) & 0x0F0F0F0F; + const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F; + + const int dot1 = + dpct::dp4a(v1i, u[2 * i + 1], + dpct::dp4a(v0i, u[2 * i + 0], 0)); // SIMD dot product + const int dot2 = + dpct::dp4a(0x01010101, u[2 * i + 1], + dpct::dp4a(0x01010101, u[2 * i + 0], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values + } + + const sycl::float2 dm4f = + dm4.convert(); + + return dm4f.x() * sumf_d - dm4f.y() * sumf_m; +} + + +#define VDR_Q5_K_Q8_1_MMVQ 2 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq( + const int *__restrict__ vl, const int *__restrict__ vh, + const int *__restrict__ u, const uint8_t *__restrict__ sc, + const uint8_t *__restrict__ m, const sycl::half2 &dm5, + const float *__restrict__ d8) { + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F; + const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F; + + const int vh0i = ((vh[0] >> i) << 4) & 0x10101010; + const int vh1i = ((vh[1] >> i) << 4) & 0x10101010; + + const int v0i = vl0i | vh0i; + const int v1i = vl1i | vh1i; + + const int dot1 = + dpct::dp4a(v0i, u[2 * i + 0], + dpct::dp4a(v1i, u[2 * i + 1], 0)); // SIMD dot product + const int dot2 = + dpct::dp4a(0x01010101, u[2 * i + 0], + dpct::dp4a(0x01010101, u[2 * i + 1], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); + + } + + const sycl::float2 dm5f = + dm5.convert(); + + return dm5f.x() * sumf_d - dm5f.y() * sumf_m; +} + + +#define VDR_Q6_K_Q8_1_MMVQ 1 + +// contiguous v/x values +static __dpct_inline__ float +vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh, + const int *__restrict__ u, + const int8_t *__restrict__ scales, const float &d, + const float *__restrict__ d8) { + + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + const 
int sc = scales[4*i]; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> (4*i)) << 4) & 0x30303030; + + const int vi = dpct::vectorized_binary( + (vil | vih), 0x20202020, dpct::sub_sat()); // vi = (vil | vih) - 32 + + sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d*sumf; +} + +// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called +// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q + +#define VDR_Q4_0_Q8_1_MMVQ 2 +#define VDR_Q4_0_Q8_1_MMQ 4 + +template +static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u, + const float &d4, + const sycl::half2 &ds8) { + int sumi = 0; +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi); + sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi); + } + + const sycl::float2 ds8f = + ds8.convert(); + + // second part effectively subtracts 8 from each quant value + return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y()); +} + +#define VDR_Q4_1_Q8_1_MMVQ 2 +#define VDR_Q4_1_Q8_1_MMQ 4 + +template +static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u, + const sycl::half2 &dm4, + const sycl::half2 &ds8) { + + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi); + sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi); + } + +#ifdef GGML_SYCL_F16 + const sycl::float2 tmp = + (dm4 * ds8).convert(); + const float d4d8 = tmp.x(); + const float m4s8 = tmp.y(); +#else + const sycl::float2 dm4f = + dm4.convert(); + const sycl::float2 ds8f = + ds8.convert(); + const float d4d8 = dm4f.x() * ds8f.x(); + const float m4s8 = dm4f.y() * ds8f.y(); +#endif // 
GGML_SYCL_F16 + + // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it + return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); +} + +#define VDR_Q5_0_Q8_1_MMVQ 2 +#define VDR_Q5_0_Q8_1_MMQ 4 + +template +static __dpct_inline__ float +vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u, + const float &d5, const sycl::half2 &ds8) { + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = dpct::dp4a(vi0, u[2 * i + 0], + sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = dpct::dp4a(vi1, u[2 * i + 1], + sumi); // SIMD dot product of quantized values + } + + const sycl::float2 ds8f = + ds8.convert(); + + // second part effectively subtracts 16 from each quant value + return d5 * (sumi * ds8f.x() - (16 * vdr / QI5_0) * ds8f.y()); +} + +#define VDR_Q5_1_Q8_1_MMVQ 2 +#define VDR_Q5_1_Q8_1_MMQ 4 + +template +static __dpct_inline__ float +vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u, + const sycl::half2 &dm5, const sycl::half2 &ds8) { + + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = dpct::dp4a(vi0, u[2 * i + 0], + sumi); // SIMD dot 
product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = dpct::dp4a(vi1, u[2 * i + 1], + sumi); // SIMD dot product of quantized values + } + +#ifdef GGML_SYCL_F16 + const sycl::float2 tmp = + (dm5 * ds8).convert(); + const float d5d8 = tmp.x(); + const float m5s8 = tmp.y(); + + +#else + const sycl::float2 dm5f = + dm5.convert(); + const sycl::float2 ds8f = + ds8.convert(); + const float d5d8 = dm5f.x() * ds8f.x(); + const float m5s8 = dm5f.y() * ds8f.y(); +#endif // GGML_SYCL_F16 + + // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it + return sumi*d5d8 + m5s8 / (QI5_1 / vdr); +} + +#define VDR_Q8_0_Q8_1_MMVQ 2 +#define VDR_Q8_0_Q8_1_MMQ 8 + +template +static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u, + const float &d8_0, + const float &d8_1) { + + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = dpct::dp4a(v[i], u[i], sumi); + } + + return d8_0*d8_1 * sumi; +} + +template +static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u, + const sycl::half2 &dm8, + const sycl::half2 &ds8) { + + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = dpct::dp4a(v[i], u[i], sumi); + } + +#ifdef GGML_SYCL_F16 + const sycl::float2 tmp = + (dm8 * ds8).convert(); + const float d8d8 = tmp.x(); + const float m8s8 = tmp.y(); +#else + const sycl::float2 dm8f = + dm8.convert(); + const sycl::float2 ds8f = + ds8.convert(); + const float d8d8 = dm8f.x() * ds8f.x(); + const float m8s8 = dm8f.y() * ds8f.y(); +#endif // GGML_SYCL_F16 + + // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding 
it + return sumi*d8d8 + m8s8 / (QI8_1 / vdr); +} + +static __dpct_inline__ float +vec_dot_q4_0_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; + + int v[VDR_Q4_0_Q8_1_MMVQ]; + int u[2*VDR_Q4_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8(bq4_0->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0); + } + + return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); +} + +static __dpct_inline__ float +vec_dot_q4_1_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; + + int v[VDR_Q4_1_Q8_1_MMVQ]; + int u[2*VDR_Q4_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1); + } + + return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); +} + +static __dpct_inline__ float +vec_dot_q5_0_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; + + int vl[VDR_Q5_0_Q8_1_MMVQ]; + int vh[VDR_Q5_0_Q8_1_MMVQ]; + int u[2*VDR_Q5_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i); + vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0); + } + + return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); +} + +static __dpct_inline__ float +vec_dot_q5_1_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const 
int &iqs) { + + const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; + + int vl[VDR_Q5_1_Q8_1_MMVQ]; + int vh[VDR_Q5_1_Q8_1_MMVQ]; + int u[2*VDR_Q5_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i); + vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1); + } + + return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); +} + +static __dpct_inline__ float +vec_dot_q8_0_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; + + int v[VDR_Q8_0_Q8_1_MMVQ]; + int u[VDR_Q8_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_int8(bq8_0->qs, iqs + i); + u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + } + + return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, + bq8_1->ds[0]); +} + +static __dpct_inline__ float +vec_dot_q2_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q2_K * bq2_K = (const block_q2_K *) vbq; + + const int bq8_offset = QR2_K * (iqs / QI8_1); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const uint8_t * scales = bq2_K->scales + scale_offset; + + const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs); + int u[QR2_K]; + float d8[QR2_K]; + +#pragma unroll + for (int i = 0; i < QR2_K; ++ i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = bq8_1[bq8_offset + i].ds[0]; + } + + return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); +} + +static __dpct_inline__ float +vec_dot_q3_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q3_K * bq3_K = (const block_q3_K *) vbq; + + const int 
bq8_offset = QR3_K * (iqs / (QI3_K/2)); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const float d = bq3_K->d; + + const int vl = get_int_from_uint8(bq3_K->qs, iqs); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset; + + int u[QR3_K]; + float d8[QR3_K]; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = bq8_1[bq8_offset + i].ds[0]; + } + + return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); +} + +static __dpct_inline__ float +vec_dot_q4_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + +#ifndef GGML_QKK_64 + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + int v[2]; + int u[2*QR4_K]; + float d8[QR4_K]; + + // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6 + const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2)); + + // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12 + // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44 + // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76 + // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108 + + const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + v[0] = q4[0]; + v[1] = q4[4]; + + const uint16_t * scales = (const uint16_t *)bq4_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + + for (int i = 0; i < QR4_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = bq8i->ds[0]; + + const 
int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); + +#else + +#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + const uint16_t * a = (const uint16_t *)bq4_K->scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + + const float dall = bq4_K->dm[0]; + const float dmin = bq4_K->dm[1]; + + const float d8_1 = bq8_1[0].ds[0]; + const float d8_2 = bq8_1[1].ds[1]; + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * q4 = (const int *)bq4_K->qs + (iqs/2); + const int v1 = q4[0]; + const int v2 = q4[4]; + + const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0)); + const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0)); + const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0)); + const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0)); + + sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]); + sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]); + + return dall * sumf_d - dmin * sumf_m; + +#else + bad_arch(); +#endif // __SYCL_ARCH__ >= VER_4VEC + +#endif +} + +static __dpct_inline__ float +vec_dot_q5_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + +#ifndef GGML_QKK_64 + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + int vl[2]; + int vh[2]; + int u[2*QR5_K]; + float d8[QR5_K]; + + const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2)); + const int * ql = (const int 
*)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4)); + + vl[0] = ql[0]; + vl[1] = ql[4]; + + vh[0] = qh[0] >> bq8_offset; + vh[1] = qh[4] >> bq8_offset; + + const uint16_t * scales = (const uint16_t *)bq5_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = bq8i->ds[0]; + + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); + +#else + +#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + const int8_t * s = bq5_K->scales; + + const float d = bq5_K->d; + + const float d8_1 = bq8_1[0].ds[0]; + const float d8_2 = bq8_1[1].ds[1]; + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * ql = (const int *)bq5_K->qs + (iqs/2); + const int vl1 = ql[0]; + const int vl2 = ql[4]; + + const int step = 4 * (iqs/2); // 0, 4, 8, 12 + const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6 + const int in = step%8; // 0, 4, 0, 4 + const int vh = (*((const int *)(bq5_K->qh + in))) >> im; + + const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f); + const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f); + const int v3 = (((vh >> 
0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f); + const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f); + + const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1]) + + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]); + + return d * sumf_d; + +#else + bad_arch(); +#endif // __SYCL_ARCH__ >= VER_4VEC + +#endif +} + +static __dpct_inline__ float +vec_dot_q6_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q6_K * bq6_K = (const block_q6_K *) vbq; + + const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); + const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); + const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); + + const int vl = get_int_from_uint8(bq6_K->ql, iqs); + const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift; + + const int8_t * scales = bq6_K->scales + scale_offset; + + int u[QR6_K]; + float d8[QR6_K]; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); + d8[i] = bq8_1[bq8_offset + 2 * i].ds[0]; + } + + return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); +} + + +static __dpct_inline__ float +vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs, + const uint8_t *kmask_iq2xs) { +#if QK_K == 256 + const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq; + +#if QR2_XXS == 8 + const int ib32 = iqs; + const uint16_t * q2 = bq2->qs + 4*ib32; + const uint8_t * aux8 = (const uint8_t *)q2; + const int8_t * q8 = bq8_1[ib32].qs; + uint32_t aux32 = q2[2] | (q2[3] << 16); + int sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); 
+ const uint8_t signs = ksigns_iq2xs[aux32 & 127]; + for (int j = 0; j < 8; ++j) { + sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + aux32 >>= 7; + } + const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.25f; + return d * sumi; +#else + // iqs is 0...15 + const int ib32 = iqs/2; + const int il = iqs%2; + const uint16_t * q2 = bq2->qs + 4*ib32; + const uint8_t * aux8 = (const uint8_t *)q2; + const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]); + const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]); + const uint32_t aux32 = q2[2] | (q2[3] << 16); + const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * bq8_1[ib32].ds[0] * 0.25f; + const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127]; + const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127]; + const int8_t * q8 = bq8_1[ib32].qs + 16*il; + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1); + sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? 
-1 : 1); + } + return d * (sumi1 + sumi2); +#endif +#else + assert(false); + return 0.f; +#endif +} + +static __dpct_inline__ float +vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint64_t *iq2xs_grid, const uint64_t *ksigns64) { +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics +#if QK_K == 256 + const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq; + + const int ib32 = iqs; + const uint16_t * q2 = bq2->qs + 4*ib32; + const int8_t * q8 = bq8_1[ib32].qs; + const uint8_t ls1 = bq2->scales[ib32] & 0xf; + const uint8_t ls2 = bq2->scales[ib32] >> 4; + int sumi1 = 0; + for (int l = 0; l < 2; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511)); + const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9)); + const int grid_l = dpct::vectorized_binary( + grid[0] ^ signs[0], signs[0], std::minus<>()); + const int grid_h = dpct::vectorized_binary( + grid[1] ^ signs[1], signs[1], std::minus<>()); + sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1); + sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1); + q8 += 8; + } + int sumi2 = 0; + for (int l = 2; l < 4; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511)); + const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9)); + const int grid_l = dpct::vectorized_binary( + grid[0] ^ signs[0], signs[0], std::minus<>()); + const int grid_h = dpct::vectorized_binary( + grid[1] ^ signs[1], signs[1], std::minus<>()); + sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2); + sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2); + q8 += 8; + } + const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f; + return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); +#else + assert(false); + return 0.f; +#endif +#else + assert(false); + return 0.f; +#endif +} + +static __dpct_inline__ float +vec_dot_iq2_s_q8_1(const void 
*__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { +#if QK_K == 256 + const block_iq2_s * bq2 = (const block_iq2_s *) vbq; + + const int ib32 = iqs; + const int8_t * q8 = bq8_1[ib32].qs; + const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32; + const uint8_t ls1 = bq2->scales[ib32] & 0xf; + const uint8_t ls2 = bq2->scales[ib32] >> 4; + int sumi1 = 0; + for (int l = 0; l < 2; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); + const uint32_t signs0 = dpct::vectorized_binary( + ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, + std::equal_to<>()); + const uint32_t signs1 = dpct::vectorized_binary( + ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, + std::equal_to<>()); + const int grid_l = dpct::vectorized_binary( + grid[0] ^ signs0, signs0, std::minus<>()); + const int grid_h = dpct::vectorized_binary( + grid[1] ^ signs1, signs1, std::minus<>()); + sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1); + sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1); + q8 += 8; + } + int sumi2 = 0; + for (int l = 2; l < 4; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); + const uint32_t signs0 = dpct::vectorized_binary( + ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, + std::equal_to<>()); + const uint32_t signs1 = dpct::vectorized_binary( + ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, + std::equal_to<>()); + const int grid_l = dpct::vectorized_binary( + grid[0] ^ signs0, signs0, std::minus<>()); + const int grid_h = dpct::vectorized_binary( + grid[1] ^ signs1, signs1, std::minus<>()); + sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2); + sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2); + q8 += 8; + } + const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f; + return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); +#else + 
assert(false); +#endif +} + +static __dpct_inline__ float +vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) { +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics +#if QK_K == 256 + const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq; + + const int ib32 = iqs; + const uint8_t * q3 = bq2->qs + 8*ib32; + const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32; + const int8_t * q8 = bq8_1[ib32].qs; + uint32_t aux32 = gas[0] | (gas[1] << 16); + int sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0]; + const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1]; + const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127)); + const int grid_l = dpct::vectorized_binary( + grid1[0] ^ signs[0], signs[0], std::minus<>()); + const int grid_h = dpct::vectorized_binary( + grid2[0] ^ signs[1], signs[1], std::minus<>()); + sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi); + sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi); + q8 += 8; + aux32 >>= 7; + } + const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.5f; + return d * sumi; +#else + assert(false); + return 0.f; +#endif +#else + assert(false); + return 0.f; +#endif +} + +static __dpct_inline__ float +vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint32_t *iq3s_grid) { +#if QK_K == 256 + const block_iq3_s * bq2 = (const block_iq3_s *) vbq; + + const int ib32 = iqs; + const uint8_t * qs = bq2->qs + 8*ib32; + const int8_t * q8 = bq8_1[ib32].qs; + int sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); + const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)); + uint32_t signs0 = dpct::vectorized_binary( + 
((bq2->signs[4 * ib32 + l] & 0xf) * 0x01010101) & 0x08040201, + 0x08040201, std::equal_to<>()); + uint32_t signs1 = dpct::vectorized_binary( + ((bq2->signs[4 * ib32 + l] >> 4) * 0x01010101) & 0x08040201, + 0x08040201, std::equal_to<>()); + const int grid_l = dpct::vectorized_binary( + grid1[0] ^ signs0, signs0, std::minus<>()); + const int grid_h = dpct::vectorized_binary( + grid2[0] ^ signs1, signs1, std::minus<>()); + sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi); + sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi); + q8 += 8; + } + const float d = + (float)bq2->d * + (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) * + bq8_1[ib32].ds[0]; + return d * sumi; +#else + assert(false); +#endif +} + +static __dpct_inline__ float +vec_dot_iq1_s_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint32_t *iq1s_grid_gpu) { +#if QK_K == 256 + const block_iq1_s * bq1 = (const block_iq1_s *) vbq; + + const int ib32 = iqs; + int sumi = 0; + const int * q8 = (const int *)bq8_1[ib32].qs; + for (int l = 0; l < 4; ++l) { + const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8))); + int grid0 = grid[0] & 0x0f0f0f0f; + int grid1 = (grid[0] >> 4) & 0x0f0f0f0f; + sumi = dpct::dp4a(q8[2 * l + 1], grid1, + dpct::dp4a(q8[2 * l + 0], grid0, sumi)); + } + + const float delta = bq1->qh[ib32] & 0x8000 ? 
-1-IQ1S_DELTA : -1+IQ1S_DELTA; + const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1); + const float d = d1q * bq8_1[ib32].ds[0]; + const float m = d1q * bq8_1[ib32].ds[1]; + return d * sumi + m * delta; +#else + assert(false); +#endif +} + +static __dpct_inline__ float +vec_dot_iq1_m_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { +#if QK_K == 256 + const block_iq1_m * bq1 = (const block_iq1_m *) vbq; + + const int ib32 = iqs; + int sumi[2] = {0, 0}; + float sumf[2] = {0.f, 0.f}; + + const int * q8 = (const int *)bq8_1[ib32].qs; + for (int l = 0; l < 4; ++l) { + const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8))); + int grid0 = grid[0] & 0x0f0f0f0f; + int grid1 = (grid[0] >> 4) & 0x0f0f0f0f; + sumi[l / 2] = dpct::dp4a(q8[2 * l + 1], grid1, + dpct::dp4a(q8[2 * l + 0], grid0, sumi[l / 2])); + const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA; + const int sumy = dpct::dp4a(q8[2 * l + 1], 0x01010101, + dpct::dp4a(q8[2 * l + 0], 0x01010101, 0)); + sumf[l/2] += delta*sumy; + } + + iq1m_scale_t scale; + const uint16_t * sc = (const uint16_t *)bq1->scales; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const float d = (float)scale.f16 * bq8_1[ib32].ds[0]; + return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1)); +#else + assert(false); +#endif +} + + +static __dpct_inline__ float +vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_iq4_nl * bq = (const block_iq4_nl *) vbq; + + const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs; + const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs; + + const uint8_t * values = (const uint8_t *)kvalues_iq4nl; + + int v1, v2; + int sumi1 = 0, sumi2 = 
0; + for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) { + const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16); + get_int_from_table_16(aux, values, v1, v2); + sumi1 = dpct::dp4a(v1, q8[l + 0], sumi1); + sumi2 = dpct::dp4a(v2, q8[l + 4], sumi2); + } + + const float d = (float)bq->d * bq8_1->ds[0]; + return d * (sumi1 + sumi2); +} + + +static __dpct_inline__ float +vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + +#if QK_K == 256 + const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; + const uint8_t * values = (const uint8_t *)kvalues_iq4nl; + + // iqs is 0...7 + const int ib32 = iqs; + const int32_t * q8 = (const int *)bq8_1[ib32].qs; + const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32; + const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4); + const float d = (float)bq4->d * (ls - 32) * bq8_1[ib32].ds[0]; + int v1, v2; + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < 4; ++j) { + get_int_from_table_16(q4[j], values, v1, v2); + sumi1 = dpct::dp4a(v1, q8[j + 0], sumi1); + sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2); + } + return d * (sumi1 + sumi2); +#else + assert(false); +#endif +} + +#endif // GGML_SYCL_VECDOTQ_HPP diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index f389934ea..87af33b56 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1745,31 +1745,37 @@ void ggml_vk_instance_init() { // Default to using all dedicated GPUs for (size_t i = 0; i < devices.size(); i++) { - vk::PhysicalDeviceProperties props = devices[i].getProperties(); + vk::PhysicalDeviceProperties2 new_props; + vk::PhysicalDeviceDriverProperties new_driver; + vk::PhysicalDeviceIDProperties new_id; + new_props.pNext = &new_driver; + new_driver.pNext = &new_id; + devices[i].getProperties2(&new_props); - if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) { + if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) { // Check if there are two physical 
devices corresponding to the same GPU auto old_device = std::find_if( vk_instance.device_indices.begin(), vk_instance.device_indices.end(), - [&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; } + [&devices, &new_id](const size_t k){ + vk::PhysicalDeviceProperties2 old_props; + vk::PhysicalDeviceIDProperties old_id; + old_props.pNext = &old_id; + devices[k].getProperties2(&old_props); + return std::equal(std::begin(old_id.deviceUUID), std::end(old_id.deviceUUID), std::begin(new_id.deviceUUID)); + } ); if (old_device == vk_instance.device_indices.end()) { vk_instance.device_indices.push_back(i); } else { // There can be two physical devices corresponding to the same GPU if there are 2 different drivers // This can cause error when splitting layers aross the devices, need to keep only 1 - VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same device id"); + VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same deviceUUID"); - vk::PhysicalDeviceProperties2 old_prop; + vk::PhysicalDeviceProperties2 old_props; vk::PhysicalDeviceDriverProperties old_driver; - old_prop.pNext = &old_driver; - devices[*old_device].getProperties2(&old_prop); - - vk::PhysicalDeviceProperties2 new_prop; - vk::PhysicalDeviceDriverProperties new_driver; - new_prop.pNext = &new_driver; - devices[i].getProperties2(&new_prop); + old_props.pNext = &old_driver; + devices[*old_device].getProperties2(&old_props); std::map driver_priorities {}; int old_priority = std::numeric_limits::max(); @@ -1777,7 +1783,7 @@ void ggml_vk_instance_init() { // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id // Smaller number -> higher priority - switch (old_prop.properties.vendorID) { + switch (old_props.properties.vendorID) { case VK_VENDOR_ID_AMD: driver_priorities[vk::DriverId::eMesaRadv] = 1; driver_priorities[vk::DriverId::eAmdOpenSource] = 2; diff 
--git a/ggml.c b/ggml.c index dc25be746..87b726e9f 100644 --- a/ggml.c +++ b/ggml.c @@ -1761,9 +1761,8 @@ struct ggml_compute_state_shared { int n_threads; // synchronization primitives - atomic_int n_active; // num active threads - atomic_int node_n; // active graph node - atomic_int node_task; // active graph node task phase + atomic_int n_barrier; + atomic_int n_barrier_passed; ggml_abort_callback abort_callback; // abort ggml_graph_compute when true void* abort_callback_data; @@ -19027,47 +19026,49 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_ return n_tasks; } -static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) { - // wait for other threads to finish - const int last_node_n = * node_n; +#ifdef GGML_USE_OPENMP +static void ggml_barrier(struct ggml_compute_state * state) { + if (state->shared->n_threads == 1) { + return; + } - while (true) { - if (do_yield) { + #pragma omp barrier +} +#else +static void ggml_barrier(struct ggml_compute_state * state) { + if (state->shared->n_threads == 1) { + return; + } + + atomic_int * n_barrier = &state->shared->n_barrier; + atomic_int * n_barrier_passed = &state->shared->n_barrier_passed; + + int n_threads = state->shared->n_threads; + int passed_old = atomic_load(n_barrier_passed); + + if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) { + // last thread + atomic_store(n_barrier, 0); + atomic_fetch_add(n_barrier_passed, 1); + } else { + // wait for other threads + //while (atomic_load(n_barrier_passed) == passed_old) { + //} + const int n_spin_before_sleep = 100000; + while (true) { + for (int i = 0; i < n_spin_before_sleep; i++) { + if (atomic_load(n_barrier_passed) != passed_old) { + return; + } + #if defined(__SSE3__) + _mm_pause(); + #endif + } sched_yield(); } - - *node_n = atomic_load(&state->shared->node_n); - if (*node_n != last_node_n) { - break; - } - -#if defined(__SSE3__) - // Tell the processor we're 
spinning. It's a processor hint for spinlocks. - _mm_pause(); -#endif } } - -static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) { - // wait for other threads to finish - const int last_task_phase = *task_phase; - - while (true) { - if (do_yield) { - sched_yield(); - } - - *task_phase = atomic_load(&state->shared->node_task); - if (*task_phase != last_task_phase) { - break; - } - -#if defined(__SSE3__) - // Tell the processor we're spinning. It's a processor hint for spinlocks. - _mm_pause(); #endif - } -} static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; @@ -19075,136 +19076,54 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { const struct ggml_cgraph * cgraph = state->shared->cgraph; const struct ggml_cplan * cplan = state->shared->cplan; - const int n_threads = state->shared->n_threads; + const int ith = state->ith; + const int n_threads = state->shared->n_threads; - set_numa_thread_affinity(state->ith); + set_numa_thread_affinity(ith); - int node_n = -1; - int task_phase = GGML_TASK_TYPE_FINALIZE; + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_TYPE_INIT, + /*.ith =*/ ith, + /*.nth =*/ state->shared->n_threads, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; - while (true) { + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - state->shared->node_n += 1; state->ec = GGML_STATUS_ABORTED; return 0; } - if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { - // all other threads are finished and spinning - // do finalize and init here so we don't have synchronize again - struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_TYPE_FINALIZE, - /*.ith =*/ 0, - /*.nth =*/ 0, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, - }; - - if (node_n != 
-1) { - /* FINALIZE */ - struct ggml_tensor * node = cgraph->nodes[node_n]; - if (GGML_OP_HAS_FINALIZE[node->op]) { - params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); - ggml_compute_forward(¶ms, node, state); - } - ggml_graph_compute_perf_stats_node(node, state->shared); - } - - // distribute new work or execute it direct if 1T - while (++node_n < cgraph->n_nodes) { - GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); - struct ggml_tensor * node = cgraph->nodes[node_n]; - const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); - - state->shared->perf_node_start_cycles = ggml_perf_cycles(); - state->shared->perf_node_start_time_us = ggml_perf_time_us(); - - params.nth = n_tasks; - - if (n_tasks == 1) { - /* INIT */ - if (GGML_OP_HAS_INIT[node->op]) { - params.type = GGML_TASK_TYPE_INIT; - ggml_compute_forward(¶ms, node, state); - } - - // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, - // they do something more efficient than spinning (?) 
- params.type = GGML_TASK_TYPE_COMPUTE; - ggml_compute_forward(¶ms, node, state); - - if (GGML_OP_HAS_FINALIZE[node->op]) { - params.type = GGML_TASK_TYPE_FINALIZE; - ggml_compute_forward(¶ms, node, state); - } - - ggml_graph_compute_perf_stats_node(node, state->shared); - } else { - break; - } - - if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - break; - } - } - - task_phase = GGML_TASK_TYPE_INIT; - atomic_store(&state->shared->n_active, n_threads); - atomic_store(&state->shared->node_n, node_n); - atomic_store(&state->shared->node_task, task_phase); - } else { - ggml_graph_compute_thread_sync_node(&node_n, state, false); - ggml_graph_compute_thread_sync_task(&task_phase, state, false); - } - - // check if we should stop - if (node_n >= cgraph->n_nodes) break; - - /* INIT & COMPUTE */ struct ggml_tensor * node = cgraph->nodes[node_n]; const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); - struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_TYPE_INIT, - /*.ith =*/ state->ith, - /*.nth =*/ n_tasks, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, - }; + params.nth = n_tasks; - if (state->ith < n_tasks) { - if (GGML_OP_HAS_INIT[node->op]) { + /* INIT */ + if (GGML_OP_HAS_INIT[node->op]) { + if (ith < n_tasks) { + params.type = GGML_TASK_TYPE_INIT; ggml_compute_forward(¶ms, node, state); } + ggml_barrier(state); } - if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { - task_phase = GGML_TASK_TYPE_COMPUTE; - atomic_store(&state->shared->n_active, n_threads); - atomic_store(&state->shared->node_task, task_phase); - } - else { - // TODO: this sched_yield can have significant impact on the performance - either positive or negative - // depending on the workload and the operating system. 
- // since it is not clear what is the best approach, it should potentially become user-configurable - // ref: https://github.com/ggerganov/ggml/issues/291 - // UPD: adding the do_yield flag seems to resolve the issue universally - const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT; - ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield); - } - - if (state->ith < n_tasks) { + /* COMPUTE */ + if (ith < n_tasks) { params.type = GGML_TASK_TYPE_COMPUTE; ggml_compute_forward(¶ms, node, state); } - if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { - task_phase = GGML_TASK_TYPE_FINALIZE; - atomic_store(&state->shared->n_active, n_threads); - atomic_store(&state->shared->node_task, task_phase); - } - else { - ggml_graph_compute_thread_sync_task(&task_phase, state, false); + ggml_barrier(state); + + /* FINALIZE */ + if (GGML_OP_HAS_FINALIZE[node->op]) { + if (params.ith == 0) { + params.type = GGML_TASK_TYPE_FINALIZE; + ggml_compute_forward(¶ms, node, state); + } + ggml_barrier(state); } } @@ -19396,7 +19315,6 @@ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * // update the number of threads from the actual number of threads that we got from OpenMP n_threads = omp_get_num_threads(); workers[0].shared->n_threads = n_threads; - workers[0].shared->n_active = n_threads; } ggml_graph_compute_thread(&workers[omp_get_thread_num()]); } @@ -19459,9 +19377,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl /*.perf_node_start_cycles =*/ 0, /*.perf_node_start_time_us =*/ 0, /*.n_threads =*/ n_threads, - /*.n_active =*/ n_threads, - /*.node_n =*/ -1, - /*.node_task =*/ GGML_TASK_TYPE_FINALIZE, + /*.n_barrier =*/ 0, + /*.n_barrier_passed =*/ 0, /*.abort_callback =*/ NULL, /*.abort_callback_data =*/ NULL, /*.current_chunk; =*/ 0, diff --git a/ggml.h b/ggml.h index 4bcb97c86..781af7b95 100644 --- a/ggml.h +++ b/ggml.h @@ -319,6 +319,12 @@ GGML_TENSOR_LOCALS(int64_t, ne, dst, 
ne) \ GGML_TENSOR_LOCALS(size_t, nb, dst, nb) +#define GGML_TENSOR_BINARY_OP_LOCALS01 \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) + #ifdef __cplusplus extern "C" { #endif diff --git a/klite.embd b/klite.embd index 0b90c321d..612ab56c5 100644 --- a/klite.embd +++ b/klite.embd @@ -15744,6 +15744,7 @@ Current version: 147 +
Add Endpoint Version
diff --git a/llama.cpp b/llama.cpp index 1342b119b..a05b6973b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2321,6 +2321,8 @@ struct llama_vocab { enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; + int max_token_len = 0; // used for optimizing longest token search + std::unordered_map token_to_id; std::vector id_to_token; @@ -2338,21 +2340,23 @@ struct llama_vocab { id special_cls_id = -1; id special_mask_id = -1; - int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add. - int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add. - id linefeed_id = 13; id special_prefix_id = -1; id special_suffix_id = -1; id special_middle_id = -1; id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token - bool add_space_prefix = true; + // tokenizer flags + bool tokenizer_add_space_prefix = true; + bool tokenizer_add_bos = false; + bool tokenizer_add_eos = false; + bool tokenizer_ignore_merges = false; + int find_bpe_rank(std::string token_left, std::string token_right) const { - // GGML_ASSERT(token_left.find(" ") == std::string::npos); - // GGML_ASSERT(token_left.find("\n") == std::string::npos); - // GGML_ASSERT(token_right.find(" ") == std::string::npos); - // GGML_ASSERT(token_right.find("\n") == std::string::npos); + //GGML_ASSERT(token_left.find(' ') == std::string::npos); + //GGML_ASSERT(token_left.find('\n') == std::string::npos); + //GGML_ASSERT(token_right.find(' ') == std::string::npos); + //GGML_ASSERT(token_right.find('\n') == std::string::npos); //the above breaks gguf v1 falcons replace_all(token_left, " ", "\u0120"); replace_all(token_left, "\n", "\u010A"); @@ -4823,7 +4827,7 @@ static void llm_load_vocab( const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str()); if (add_space_prefix_keyidx != -1) { - vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx); + vocab.tokenizer_add_space_prefix = 
gguf_get_val_bool(ctx, add_space_prefix_keyidx); } // The default value of add_space_prefix is true. } else if (tokenizer_model == "bert") { vocab.type = LLAMA_VOCAB_TYPE_WPM; @@ -4836,13 +4840,13 @@ static void llm_load_vocab( vocab.special_pad_id = 0; vocab.special_cls_id = 101; vocab.special_mask_id = 103; - vocab.add_space_prefix = false; + vocab.tokenizer_add_space_prefix = false; } else if (tokenizer_model == "gpt2") { vocab.type = LLAMA_VOCAB_TYPE_BPE; const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str()); if (add_space_prefix_keyidx != -1) { - vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx); + vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx); } // read bpe merges and populate bpe ranks @@ -4909,6 +4913,8 @@ static void llm_load_vocab( tokenizer_pre == "llama-v3" || tokenizer_pre == "llama-bpe") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3; + vocab.tokenizer_ignore_merges = true; + vocab.tokenizer_add_bos = true; } else if ( tokenizer_pre == "deepseek-llm") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM; @@ -4959,6 +4965,14 @@ static void llm_load_vocab( } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } + } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; + vocab.tokenizer_add_bos = true; + vocab.tokenizer_add_eos = false; + } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; + vocab.tokenizer_add_bos = true; + vocab.tokenizer_add_eos = false; } else { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } @@ -4999,6 +5013,7 @@ static void llm_load_vocab( } vocab.token_to_id[word] = i; + vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size()); auto & token_data = vocab.id_to_token[i]; token_data.text = std::move(word); @@ -5112,10 +5127,10 @@ static void llm_load_vocab( bool temp = true; if 
(ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) { - vocab.special_add_bos = int(temp); + vocab.tokenizer_add_bos = temp; } if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) { - vocab.special_add_eos = int(temp); + vocab.tokenizer_add_eos = temp; } } @@ -5215,7 +5230,7 @@ static void llm_load_vocab( ); // set attributes by model/tokenizer name - if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) { + if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) { _set_token_attr("", LLAMA_TOKEN_ATTR_LSTRIP, true); } else if (_contains_any(model_name, {"phi-3", "phi3"})) { for (auto id : vocab.cache_special_tokens) { @@ -5309,6 +5324,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); } if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); } + LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len); + if (model.arch == LLM_ARCH_DEEPSEEK2) { LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); @@ -7716,6 +7733,50 @@ struct llm_build_context { return lctx.inp_s_seq; } + struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { + // find result_norm tensor for input + struct ggml_tensor * inp = nullptr; + for (int i = gf->n_nodes - 1; i >= 0; --i) { + inp = gf->nodes[i]; + if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { + break; + } else { + inp = nullptr; + } + } + GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor"); + + struct ggml_tensor * cur; + + switch (pooling_type) { + case LLAMA_POOLING_TYPE_MEAN: + { + struct ggml_tensor * 
inp_mean = build_inp_mean(); + cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean); + } break; + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + struct ggml_tensor * inp_cls = build_inp_cls(); + cur = ggml_get_rows(ctx0, inp, inp_cls); + } break; + case LLAMA_POOLING_TYPE_NONE: + { + cur = inp; + } break; + default: + { + GGML_ASSERT(false && "unknown pooling type"); + } break; + } + + cb(cur, "result_embd_pooled", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -8696,8 +8757,6 @@ struct llm_build_context { if (model.arch != LLM_ARCH_JINA_BERT_V2) { inp_pos = build_inp_pos(); } - struct ggml_tensor * inp_mean = build_inp_mean(); - struct ggml_tensor * inp_cls = build_inp_cls(); // construct input embeddings (token, type, position) inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); @@ -8872,28 +8931,6 @@ struct llm_build_context { cur = inpL; cb(cur, "result_embd", -1); - // pooling layer - switch (pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // nop - } break; - case LLAMA_POOLING_TYPE_MEAN: - { - cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean); - cb(cur, "result_embd_pooled", -1); - } break; - case LLAMA_POOLING_TYPE_CLS: - { - cur = ggml_get_rows(ctx0, cur, inp_cls); - cb(cur, "result_embd_pooled", -1); - } break; - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ASSERT(false && "Invalid pooling type"); - } break; - } - ggml_build_forward_expand(gf, cur); return gf; @@ -11978,6 +12015,11 @@ static struct ggml_cgraph * llama_build_graph( GGML_ASSERT(false); } + // add on pooling layer + if (lctx.cparams.embeddings) { + result = llm.append_pooling(result); + } + llm.free(); return result; @@ -12067,7 +12109,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { // (!a || b) is a logical 
implication (a -> b) // !hparams.causal_attn -> !cparams.causal_attn (hparams.causal_attn || !cparams.causal_attn) && - "causal attention with embedding models is not supported" + "causal attention is not supported by this model" ); if (lctx.inp_KQ_mask) { @@ -12199,6 +12241,37 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } + if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { + const int64_t n_tokens = batch.n_tokens; + + GGML_ASSERT(lctx.inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); + + uint32_t * data = (uint32_t *) lctx.inp_cls->data; + memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls)); + + std::vector last_pos(n_tokens, -1); + std::vector last_row(n_tokens, -1); + + for (int i = 0; i < n_tokens; ++i) { + const llama_seq_id seq_id = batch.seq_id[i][0]; + const llama_pos pos = batch.pos[i]; + + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); + + if (pos >= last_pos[seq_id]) { + last_pos[seq_id] = pos; + last_row[seq_id] = i; + } + } + + for (int i = 0; i < n_tokens; ++i) { + if (last_row[i] >= 0) { + data[i] = last_row[i]; + } + } + } + if (kv_self.recurrent) { const int64_t n_kv = kv_self.n; @@ -12260,8 +12333,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { const auto n_embd = hparams.n_embd; // TODO: use a per-batch flag for logits presence instead - const bool has_logits = cparams.causal_attn; - const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + const bool has_logits = !cparams.embeddings; + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; const size_t embd_size = has_embd ? 
n_embd*n_outputs_max : 0; @@ -12391,11 +12464,13 @@ static int llama_decode_internal( std::vector> seq_id; // count outputs - if (batch_all.logits) { + if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) { + n_outputs = n_tokens_all; + } else if (batch_all.logits) { for (uint32_t i = 0; i < n_tokens_all; ++i) { n_outputs += batch_all.logits[i] != 0; } - } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) { + } else if (lctx.logits_all) { n_outputs = n_tokens_all; } else { // keep last output only @@ -12526,30 +12601,13 @@ static int llama_decode_internal( // no output res = nullptr; embd = nullptr; - } else if (!hparams.causal_attn) { - res = nullptr; // do not extract logits for embedding models such as BERT - - // token or sequence embeddings - embd = gf->nodes[gf->n_nodes - 1]; - - GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0); } else if (cparams.embeddings) { - // the embeddings could be in the second to last tensor, or any of the previous tensors - int i_embd = gf->n_nodes - 2; - for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) { - i_embd = gf->n_nodes - i; - if (i_embd < 0) { break; } - embd = gf->nodes[i_embd]; - } - GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor"); - - // TODO: use a per-batch flag to know when to skip logits while keeping embeddings - if (!cparams.causal_attn) { - res = nullptr; // do not extract logits when not needed - // skip computing logits - // TODO: is this safe? 
- gf->n_nodes = i_embd + 1; + res = nullptr; // do not extract logits for embedding case + embd = gf->nodes[gf->n_nodes - 1]; + if (strcmp(embd->name, "result_embd_pooled") != 0) { + embd = gf->nodes[gf->n_nodes - 2]; } + GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); } else { embd = nullptr; // do not extract embeddings when not needed GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); @@ -12618,11 +12676,10 @@ static int llama_decode_internal( ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); } } break; - case LLAMA_POOLING_TYPE_CLS: case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: { - GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0); - // extract sequence embeddings auto & embd_seq_out = lctx.embd_seq; embd_seq_out.clear(); @@ -13457,112 +13514,142 @@ private: ///// end legacy functions for Falcon ////// struct llm_tokenizer_bpe { - llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {} + llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) { + GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE); + switch (vocab.type_pre) { + case LLAMA_VOCAB_PRE_TYPE_LLAMA3: + regex_exprs = { + // original regex from tokenizer.json + //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + + // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989 + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_DBRX: + case LLAMA_VOCAB_PRE_TYPE_SMAUG: + regex_exprs = { + // same as llama3 + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| 
?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: + regex_exprs = { + "[\r\n]", + "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", + "\\s?[!-/:-~!-/:-~‘-‟ -。]+", + "\\s+$", + "[一-龥ࠀ-一가-퟿]+", + "\\p{N}+", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: + regex_exprs = { + "[\r\n]", + "\\s?\\p{L}+", + "\\s?\\p{P}+", + "[一-龥ࠀ-一가-퟿]+", + "\\p{N}", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_FALCON: + regex_exprs = { + "[\\p{P}\\$\\+<=>\\^~\\|`]+", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + "[0-9][0-9][0-9]", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_MPT: + // TODO: MPT pre-tokenization regexes are unknown + // the following are close, but not exact. run the following: + // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf + GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed"); + regex_exprs = { + "\\s?\\p{L}+", + "\\s?\\p{P}+", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_STARCODER: + case LLAMA_VOCAB_PRE_TYPE_REFACT: + case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: + regex_exprs = { + "\\p{N}", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_GPT2: + case LLAMA_VOCAB_PRE_TYPE_OLMO: + regex_exprs = { + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_STABLELM2: + case LLAMA_VOCAB_PRE_TYPE_QWEN2: + regex_exprs = { + // original regex from tokenizer.json + // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + 
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_PORO: + regex_exprs = { + " ?[^(\\s|.,!?…。,、।۔،)]+", + }; + break; + default: + // default regex for BPE tokenization pre-processing + regex_exprs = { + "[\\p{P}\\$\\+<=>\\^~\\|]+", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + "\\p{N}+", + "[0-9][0-9][0-9]", + }; + break; + } + } + + void append(const llama_vocab::id token_id, std::vector & output) const { + output.push_back(token_id); + } + + bool append_bos(std::vector & output) const { + if (vocab.tokenizer_add_bos) { + GGML_ASSERT(vocab.special_bos_id != -1); + output.push_back(vocab.special_bos_id); + return true; + } + return false; + } + + bool append_eos(std::vector & output) const { + if (vocab.tokenizer_add_eos) { + GGML_ASSERT(vocab.special_eos_id != -1); + output.push_back(vocab.special_eos_id); + return true; + } + return false; + } + + void check_double_bos_eos(const std::vector & output) const { + if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) { + LLAMA_LOG_WARN( + "%s: Added a BOS token to the prompt as specified by the model but the prompt " + "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " + "Are you sure this is what you want?\n", __FUNCTION__); + } + if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) { + LLAMA_LOG_WARN( + "%s: Added a EOS token to the prompt as specified by the model but the prompt " + "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. 
" + "Are you sure this is what you want?\n", __FUNCTION__); + } + } void tokenize(const std::string & text, std::vector & output) { int final_prev_index = -1; - bool ignore_merges = false; - std::vector word_collection; - switch (vocab.type) { - case LLAMA_VOCAB_TYPE_BPE: - switch (vocab.type_pre) { - case LLAMA_VOCAB_PRE_TYPE_LLAMA3: - ignore_merges = true; - word_collection = unicode_regex_split(text, { - // original regex from tokenizer.json - //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", - - // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989 - "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_DBRX: - case LLAMA_VOCAB_PRE_TYPE_SMAUG: - word_collection = unicode_regex_split(text, { - // same as llama3 - "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: - word_collection = unicode_regex_split(text, { - "[\r\n]", - "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", - "\\s?[!-/:-~!-/:-~‘-‟ -。]+", - "\\s+$", - "[一-龥ࠀ-一가-퟿]+", - "\\p{N}+", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: - word_collection = unicode_regex_split(text, { - "[\r\n]", - "\\s?\\p{L}+", - "\\s?\\p{P}+", - "[一-龥ࠀ-一가-퟿]+", - "\\p{N}", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_FALCON: - word_collection = unicode_regex_split(text, { - "[\\p{P}\\$\\+<=>\\^~\\|]+", - "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", - 
"[0-9][0-9][0-9]", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_MPT: - // TODO: MPT pre-tokenization regexes are unknown - // the following are close, but not exact. run the following: - // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf - GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed"); - word_collection = unicode_regex_split(text, { - "\\s?\\p{L}+", - "\\s?\\p{P}+", - "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_STARCODER: - case LLAMA_VOCAB_PRE_TYPE_REFACT: - case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: - word_collection = unicode_regex_split(text, { - "\\p{N}", - "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_GPT2: - case LLAMA_VOCAB_PRE_TYPE_OLMO: - word_collection = unicode_regex_split(text, { - "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_STABLELM2: - case LLAMA_VOCAB_PRE_TYPE_QWEN2: - word_collection = unicode_regex_split(text, { - // original regex from tokenizer.json - // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" - "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", - }); - break; - case LLAMA_VOCAB_PRE_TYPE_PORO: - word_collection = unicode_regex_split(text, { - " ?[^(\\s|.,!?…。,、।۔،)]+", - }); - break; - default: - // default regex for BPE tokenization pre-processing - word_collection = unicode_regex_split(text, { - "[\\p{P}\\$\\+<=>\\^~\\|]+", - "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", - "\\p{N}+", - "[0-9][0-9][0-9]", - }); - break; - } - break; - default: - GGML_ASSERT(false); - break; - } + const auto word_collection = unicode_regex_split(text, regex_exprs); 
symbols_final.clear(); @@ -13573,7 +13660,7 @@ struct llm_tokenizer_bpe { int index = 0; size_t offset = 0; - if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) { + if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) { symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()}); offset = word.size(); } @@ -13654,10 +13741,9 @@ struct llm_tokenizer_bpe { for (auto j = str.begin(); j != str.end(); ++j) { std::string byte_str(1, *j); auto token_multibyte = vocab.token_to_id.find(byte_str); - if (token_multibyte == vocab.token_to_id.end()) { - throw std::runtime_error("ERROR: byte not found in vocab"); + if (token_multibyte != vocab.token_to_id.end()) { + output.push_back(token_multibyte->second); } - output.push_back((*token_multibyte).second); } } else { output.push_back((*token).second); @@ -13696,6 +13782,8 @@ private: const llama_vocab & vocab; + std::vector regex_exprs; + std::vector symbols; std::vector symbols_final; @@ -13705,7 +13793,7 @@ private: struct llm_tokenizer_wpm { llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {} - void tokenize(const std::string & text, std::vector & output) { + void tokenize(const std::string & text, std::vector & output) const { const auto & token_map = vocab.token_to_id; // normalize and split by whitespace @@ -13714,7 +13802,7 @@ struct llm_tokenizer_wpm { // bos token prepended already // find the longest tokens that form the words - for (const std::string &word : words) { + for (const std::string & word : words) { // skip empty words if (word.size() == 0) { continue; @@ -13731,7 +13819,7 @@ struct llm_tokenizer_wpm { for (int i = 0; i < n; ++i) { // loop through possible match length bool match = false; - for (int j = n; j > i; j--) { + for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) { auto it = token_map.find(word1.substr(i, j - i)); if (it != token_map.end()) { output.push_back(it->second); @@ -13754,7 +13842,8 @@ 
struct llm_tokenizer_wpm { } } - std::vector preprocess(const std::string & text) { + // TODO: reduce string copies by using cpts_offs array + std::vector preprocess(const std::string & text) const { const std::vector cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text)); std::vector words(1, ""); @@ -13976,7 +14065,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & bool is_prev_special = false; - if (add_special && vocab.special_add_bos != 0) { + if (add_special && vocab.tokenizer_add_bos) { GGML_ASSERT(vocab.special_bos_id != -1); output.push_back(vocab.special_bos_id); is_prev_special = true; @@ -13986,7 +14075,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); - if (vocab.add_space_prefix) { + if (vocab.tokenizer_add_space_prefix) { if (!output.size() || is_prev_special) { // prefix with space if first token raw_text = " " + raw_text; } @@ -14004,23 +14093,51 @@ static std::vector llama_tokenize_internal(const llama_vocab & } } - if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) { - // LLAMA_LOG_WARN( - // "%s: Added a BOS token to the prompt as specified by the model but the prompt " - // "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " - // "Are you sure this is what you want?\n", __FUNCTION__); + if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) { + // LLAMA_LOG_WARN( + // "%s: Added a BOS token to the prompt as specified by the model but the prompt " + // "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. 
" + // "Are you sure this is what you want?\n", __FUNCTION__); } - if (add_special && vocab.special_add_eos == 1) { + if (add_special && vocab.tokenizer_add_eos) { GGML_ASSERT(vocab.special_eos_id != -1); output.push_back(vocab.special_eos_id); } } break; case LLAMA_VOCAB_TYPE_BPE: { - if (add_special && vocab.special_add_bos != 0) { - GGML_ASSERT(vocab.special_bos_id != -1); - output.push_back(vocab.special_bos_id); + if (OldBPETokenizerMode) + { + if (add_special && vocab.tokenizer_add_bos != 0) + { + GGML_ASSERT(vocab.special_bos_id != -1); + output.push_back(vocab.special_bos_id); + } + for (const auto &fragment : fragment_buffer) + { + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) + { + auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); + llm_tokenizer_bpe_old tokenizer(vocab); + tokenizer.tokenize(raw_text, output); + } + else + { + output.push_back(fragment.token); + } + } + if (add_special && vocab.tokenizer_add_eos == 1) + { + output.push_back(vocab.special_eos_id); + } + break; + } + + llm_tokenizer_bpe tokenizer(vocab); + + if (add_special) { + tokenizer.append_bos(output); } for (const auto & fragment : fragment_buffer) { @@ -14031,32 +14148,15 @@ static std::vector llama_tokenize_internal(const llama_vocab & LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif - if(OldBPETokenizerMode) - { - llm_tokenizer_bpe_old tokenizer(vocab); - tokenizer.tokenize(raw_text, output); - } - else - { - llm_tokenizer_bpe tokenizer(vocab); - tokenizer.tokenize(raw_text, output); - } - + tokenizer.tokenize(raw_text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) - output.push_back(fragment.token); + tokenizer.append(fragment.token, output); } } - if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) { - // LLAMA_LOG_WARN( - // "%s: Added a BOS token to the prompt as specified by the 
model but the prompt " - // "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " - // "Are you sure this is what you want?\n", __FUNCTION__); - } - - if (add_special && vocab.special_add_eos == 1) { - GGML_ASSERT(vocab.special_add_eos != -1); - output.push_back(vocab.special_eos_id); + if (add_special) { + tokenizer.append_eos(output); + tokenizer.check_double_bos_eos(output); } } break; case LLAMA_VOCAB_TYPE_WPM: @@ -14066,6 +14166,8 @@ static std::vector llama_tokenize_internal(const llama_vocab & output.push_back(vocab.special_cls_id); } + llm_tokenizer_wpm tokenizer(vocab); + for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); @@ -14073,7 +14175,6 @@ static std::vector llama_tokenize_internal(const llama_vocab & #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif - llm_tokenizer_wpm tokenizer(vocab); tokenizer.tokenize(raw_text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); @@ -18399,6 +18500,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback) ctx->abort_callback_data = abort_callback_data; } +void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { + ctx->cparams.embeddings = embeddings; +} + void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) { ctx->cparams.causal_attn = causal_attn; } @@ -18642,11 +18747,11 @@ llama_token llama_token_nl(const struct llama_model * model) { } int32_t llama_add_bos_token(const struct llama_model * model) { - return model->vocab.special_add_bos; + return model->vocab.tokenizer_add_bos; } int32_t llama_add_eos_token(const struct llama_model * model) { - return model->vocab.special_add_eos; + return model->vocab.tokenizer_add_eos; } 
llama_token llama_token_prefix(const struct llama_model * model) { diff --git a/llama.h b/llama.h index 15e4bf660..6e096638d 100644 --- a/llama.h +++ b/llama.h @@ -174,6 +174,7 @@ extern "C" { LLAMA_POOLING_TYPE_NONE = 0, LLAMA_POOLING_TYPE_MEAN = 1, LLAMA_POOLING_TYPE_CLS = 2, + LLAMA_POOLING_TYPE_LAST = 3, }; enum llama_split_mode { @@ -293,7 +294,6 @@ extern "C" { enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id - // (ignored if no pooling layer) // ref: https://github.com/ggerganov/llama.cpp/pull/2054 float rope_freq_base; // RoPE base frequency, 0 = from model @@ -788,6 +788,10 @@ extern "C" { // Get the number of threads used for prompt and batch processing (multiple token). LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx); + // Set whether the model is in embeddings model or not + // If true, embeddings will be returned but logits will not + LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings); + // Set whether to use causal attention or not // If set to true, the model will only attend to the past tokens LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn); diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py index 744873c2a..890e4d7c2 100644 --- a/scripts/gen-unicode-data.py +++ b/scripts/gen-unicode-data.py @@ -1,83 +1,143 @@ -import regex -import ctypes +import array import unicodedata - - -class CoodepointFlags (ctypes.Structure): - _fields_ = [ # see definition in unicode.h - ("is_undefined", ctypes.c_uint16, 1), - ("is_number", ctypes.c_uint16, 1), # regex: \p{N} - ("is_letter", ctypes.c_uint16, 1), # regex: \p{L} - ("is_separator", ctypes.c_uint16, 1), # regex: \p{Z} - ("is_accent_mark", ctypes.c_uint16, 1), # regex: \p{M} - ("is_punctuation", ctypes.c_uint16, 1), # regex: \p{P} - ("is_symbol", 
ctypes.c_uint16, 1), # regex: \p{S} - ("is_control", ctypes.c_uint16, 1), # regex: \p{C} - ] - - -assert (ctypes.sizeof(CoodepointFlags) == 2) +import requests MAX_CODEPOINTS = 0x110000 -regex_number = regex.compile(r'\p{N}') -regex_letter = regex.compile(r'\p{L}') -regex_separator = regex.compile(r'\p{Z}') -regex_accent_mark = regex.compile(r'\p{M}') -regex_punctuation = regex.compile(r'\p{P}') -regex_symbol = regex.compile(r'\p{S}') -regex_control = regex.compile(r'\p{C}') -regex_whitespace = regex.compile(r'\s') +UNICODE_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt" -codepoint_flags = (CoodepointFlags * MAX_CODEPOINTS)() + +# see https://www.unicode.org/L2/L1999/UnicodeData.html +def unicode_data_iter(): + res = requests.get(UNICODE_DATA_URL) + res.raise_for_status() + data = res.content.decode() + + prev = [] + + for line in data.splitlines(): + # ej: 0000;;Cc;0;BN;;;;;N;NULL;;;; + line = line.split(";") + + cpt = int(line[0], base=16) + assert cpt < MAX_CODEPOINTS + + cpt_lower = int(line[-2] or "0", base=16) + assert cpt_lower < MAX_CODEPOINTS + + cpt_upper = int(line[-3] or "0", base=16) + assert cpt_upper < MAX_CODEPOINTS + + categ = line[2].strip() + assert len(categ) == 2 + + bidir = line[4].strip() + assert len(categ) == 2 + + name = line[1] + if name.endswith(", First>"): + prev = (cpt, cpt_lower, cpt_upper, categ, bidir) + continue + if name.endswith(", Last>"): + assert prev[1:] == (0, 0, categ, bidir) + for c in range(prev[0], cpt): + yield (c, cpt_lower, cpt_upper, categ, bidir) + + yield (cpt, cpt_lower, cpt_upper, categ, bidir) + + +# see definition in unicode.h +CODEPOINT_FLAG_UNDEFINED = 0x0001 # +CODEPOINT_FLAG_NUMBER = 0x0002 # \p{N} +CODEPOINT_FLAG_LETTER = 0x0004 # \p{L} +CODEPOINT_FLAG_SEPARATOR = 0x0008 # \p{Z} +CODEPOINT_FLAG_MARK = 0x0010 # \p{M} +CODEPOINT_FLAG_PUNCTUATION = 0x0020 # \p{P} +CODEPOINT_FLAG_SYMBOL = 0x0040 # \p{S} +CODEPOINT_FLAG_CONTROL = 0x0080 # \p{C} + +UNICODE_CATEGORY_TO_FLAG = { + "Cn": 
CODEPOINT_FLAG_UNDEFINED, # Undefined + "Cc": CODEPOINT_FLAG_CONTROL, # Control + "Cf": CODEPOINT_FLAG_CONTROL, # Format + "Co": CODEPOINT_FLAG_CONTROL, # Private Use + "Cs": CODEPOINT_FLAG_CONTROL, # Surrrogate + "Ll": CODEPOINT_FLAG_LETTER, # Lowercase Letter + "Lm": CODEPOINT_FLAG_LETTER, # Modifier Letter + "Lo": CODEPOINT_FLAG_LETTER, # Other Letter + "Lt": CODEPOINT_FLAG_LETTER, # Titlecase Letter + "Lu": CODEPOINT_FLAG_LETTER, # Uppercase Letter + "L&": CODEPOINT_FLAG_LETTER, # Cased Letter + "Mc": CODEPOINT_FLAG_MARK, # Spacing Mark + "Me": CODEPOINT_FLAG_MARK, # Enclosing Mark + "Mn": CODEPOINT_FLAG_MARK, # Nonspacing Mark + "Nd": CODEPOINT_FLAG_NUMBER, # Decimal Number + "Nl": CODEPOINT_FLAG_NUMBER, # Letter Number + "No": CODEPOINT_FLAG_NUMBER, # Other Number + "Pc": CODEPOINT_FLAG_PUNCTUATION, # Connector Punctuation + "Pd": CODEPOINT_FLAG_PUNCTUATION, # Dash Punctuation + "Pe": CODEPOINT_FLAG_PUNCTUATION, # Close Punctuation + "Pf": CODEPOINT_FLAG_PUNCTUATION, # Final Punctuation + "Pi": CODEPOINT_FLAG_PUNCTUATION, # Initial Punctuation + "Po": CODEPOINT_FLAG_PUNCTUATION, # Other Punctuation + "Ps": CODEPOINT_FLAG_PUNCTUATION, # Open Punctuation + "Sc": CODEPOINT_FLAG_SYMBOL, # Currency Symbol + "Sk": CODEPOINT_FLAG_SYMBOL, # Modifier Symbol + "Sm": CODEPOINT_FLAG_SYMBOL, # Math Symbol + "So": CODEPOINT_FLAG_SYMBOL, # Other Symbol + "Zl": CODEPOINT_FLAG_SEPARATOR, # Line Separator + "Zp": CODEPOINT_FLAG_SEPARATOR, # Paragraph Separator + "Zs": CODEPOINT_FLAG_SEPARATOR, # Space Separator +} + + +codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS table_whitespace = [] table_lowercase = [] table_uppercase = [] table_nfd = [] -for codepoint in range(MAX_CODEPOINTS): +for (cpt, cpt_lower, cpt_upper, categ, bidir) in unicode_data_iter(): # convert codepoint to unicode character - char = chr(codepoint) + char = chr(cpt) - # regex categories - flags = codepoint_flags[codepoint] - flags.is_number = bool(regex_number.match(char)) - 
flags.is_letter = bool(regex_letter.match(char)) - flags.is_separator = bool(regex_separator.match(char)) - flags.is_accent_mark = bool(regex_accent_mark.match(char)) - flags.is_punctuation = bool(regex_punctuation.match(char)) - flags.is_symbol = bool(regex_symbol.match(char)) - flags.is_control = bool(regex_control.match(char)) - flags.is_undefined = bytes(flags)[0] == 0 - assert (not flags.is_undefined) - - # whitespaces - if bool(regex_whitespace.match(char)): - table_whitespace.append(codepoint) + # codepoint category flags + codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ] # lowercase conversion - lower = ord(char.lower()[0]) - if codepoint != lower: - table_lowercase.append((codepoint, lower)) + if cpt_lower: + table_lowercase.append((cpt, cpt_lower)) # uppercase conversion - upper = ord(char.upper()[0]) - if codepoint != upper: - table_uppercase.append((codepoint, upper)) + if cpt_upper: + table_uppercase.append((cpt, cpt_upper)) # NFD normalization norm = ord(unicodedata.normalize('NFD', char)[0]) - if codepoint != norm: - table_nfd.append((codepoint, norm)) + if cpt != norm: + table_nfd.append((cpt, norm)) + + +# whitespaces, see "" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt +table_whitespace.extend(range(0x0009, 0x000D + 1)) +table_whitespace.extend(range(0x2000, 0x200A + 1)) +table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000]) + + +# sort by codepoint +table_whitespace.sort() +table_lowercase.sort() +table_uppercase.sort() +table_nfd.sort() # group ranges with same flags ranges_flags = [(0, codepoint_flags[0])] # start, flags for codepoint, flags in enumerate(codepoint_flags): - if bytes(flags) != bytes(ranges_flags[-1][1]): + if flags != ranges_flags[-1][1]: ranges_flags.append((codepoint, flags)) -ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags())) +ranges_flags.append((MAX_CODEPOINTS, 0x0000)) # group ranges with same nfd @@ -90,8 +150,8 @@ for codepoint, norm in table_nfd: 
ranges_nfd[-1] = (start, codepoint, norm) -# Generate 'unicode-data.cpp' - +# Generate 'unicode-data.cpp': +# python ./scripts//gen-unicode-data.py > unicode-data.cpp def out(line=""): print(line, end='\n') # noqa @@ -110,12 +170,12 @@ out("""\ out("const std::vector> unicode_ranges_flags = { // start, flags // last=next_start-1") for codepoint, flags in ranges_flags: - flags = int.from_bytes(bytes(flags), "little") out("{0x%06X, 0x%04X}," % (codepoint, flags)) out("};\n") out("const std::unordered_set unicode_set_whitespace = {") -out(", ".join("0x%06X" % cpt for cpt in table_whitespace)) +for codepoint in table_whitespace: + out("0x%06X," % codepoint) out("};\n") out("const std::unordered_map unicode_map_lowercase = {") diff --git a/sgemm.cpp b/sgemm.cpp index 40ba9d7e9..bbe263ddd 100644 --- a/sgemm.cpp +++ b/sgemm.cpp @@ -43,8 +43,10 @@ // [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online]. // Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024]. +#if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wpedantic" #pragma GCC diagnostic ignored "-Wignored-attributes" +#endif #include "sgemm.h" #include "ggml-impl.h" diff --git a/unicode-data.cpp b/unicode-data.cpp index d7c1c898d..4a939898b 100644 --- a/unicode-data.cpp +++ b/unicode-data.cpp @@ -68,36 +68,36 @@ const std::vector> unicode_ranges_flags = { // st {0x000370, 0x0004}, {0x000375, 0x0040}, {0x000376, 0x0004}, -{0x000378, 0x0080}, +{0x000378, 0x0001}, {0x00037A, 0x0004}, {0x00037E, 0x0020}, {0x00037F, 0x0004}, -{0x000380, 0x0080}, +{0x000380, 0x0001}, {0x000384, 0x0040}, {0x000386, 0x0004}, {0x000387, 0x0020}, {0x000388, 0x0004}, -{0x00038B, 0x0080}, +{0x00038B, 0x0001}, {0x00038C, 0x0004}, -{0x00038D, 0x0080}, +{0x00038D, 0x0001}, {0x00038E, 0x0004}, -{0x0003A2, 0x0080}, +{0x0003A2, 0x0001}, {0x0003A3, 0x0004}, {0x0003F6, 0x0040}, {0x0003F7, 0x0004}, {0x000482, 0x0040}, {0x000483, 0x0010}, {0x00048A, 0x0004}, -{0x000530, 0x0080}, +{0x000530, 0x0001}, {0x000531, 0x0004}, 
-{0x000557, 0x0080}, +{0x000557, 0x0001}, {0x000559, 0x0004}, {0x00055A, 0x0020}, {0x000560, 0x0004}, {0x000589, 0x0020}, -{0x00058B, 0x0080}, +{0x00058B, 0x0001}, {0x00058D, 0x0040}, -{0x000590, 0x0080}, +{0x000590, 0x0001}, {0x000591, 0x0010}, {0x0005BE, 0x0020}, {0x0005BF, 0x0010}, @@ -107,12 +107,13 @@ const std::vector> unicode_ranges_flags = { // st {0x0005C4, 0x0010}, {0x0005C6, 0x0020}, {0x0005C7, 0x0010}, -{0x0005C8, 0x0080}, +{0x0005C8, 0x0001}, {0x0005D0, 0x0004}, -{0x0005EB, 0x0080}, +{0x0005EB, 0x0001}, {0x0005EF, 0x0004}, {0x0005F3, 0x0020}, -{0x0005F5, 0x0080}, +{0x0005F5, 0x0001}, +{0x000600, 0x0080}, {0x000606, 0x0040}, {0x000609, 0x0020}, {0x00060B, 0x0040}, @@ -145,16 +146,17 @@ const std::vector> unicode_ranges_flags = { // st {0x0006FD, 0x0040}, {0x0006FF, 0x0004}, {0x000700, 0x0020}, -{0x00070E, 0x0080}, +{0x00070E, 0x0001}, +{0x00070F, 0x0080}, {0x000710, 0x0004}, {0x000711, 0x0010}, {0x000712, 0x0004}, {0x000730, 0x0010}, -{0x00074B, 0x0080}, +{0x00074B, 0x0001}, {0x00074D, 0x0004}, {0x0007A6, 0x0010}, {0x0007B1, 0x0004}, -{0x0007B2, 0x0080}, +{0x0007B2, 0x0001}, {0x0007C0, 0x0002}, {0x0007CA, 0x0004}, {0x0007EB, 0x0010}, @@ -162,7 +164,7 @@ const std::vector> unicode_ranges_flags = { // st {0x0007F6, 0x0040}, {0x0007F7, 0x0020}, {0x0007FA, 0x0004}, -{0x0007FB, 0x0080}, +{0x0007FB, 0x0001}, {0x0007FD, 0x0010}, {0x0007FE, 0x0040}, {0x000800, 0x0004}, @@ -173,20 +175,22 @@ const std::vector> unicode_ranges_flags = { // st {0x000825, 0x0010}, {0x000828, 0x0004}, {0x000829, 0x0010}, -{0x00082E, 0x0080}, +{0x00082E, 0x0001}, {0x000830, 0x0020}, -{0x00083F, 0x0080}, +{0x00083F, 0x0001}, {0x000840, 0x0004}, {0x000859, 0x0010}, -{0x00085C, 0x0080}, +{0x00085C, 0x0001}, {0x00085E, 0x0020}, -{0x00085F, 0x0080}, +{0x00085F, 0x0001}, {0x000860, 0x0004}, -{0x00086B, 0x0080}, +{0x00086B, 0x0001}, {0x000870, 0x0004}, {0x000888, 0x0040}, {0x000889, 0x0004}, -{0x00088F, 0x0080}, +{0x00088F, 0x0001}, +{0x000890, 0x0080}, +{0x000892, 0x0001}, {0x000898, 
0x0010}, {0x0008A0, 0x0004}, {0x0008CA, 0x0010}, @@ -205,35 +209,35 @@ const std::vector> unicode_ranges_flags = { // st {0x000970, 0x0020}, {0x000971, 0x0004}, {0x000981, 0x0010}, -{0x000984, 0x0080}, +{0x000984, 0x0001}, {0x000985, 0x0004}, -{0x00098D, 0x0080}, +{0x00098D, 0x0001}, {0x00098F, 0x0004}, -{0x000991, 0x0080}, +{0x000991, 0x0001}, {0x000993, 0x0004}, -{0x0009A9, 0x0080}, +{0x0009A9, 0x0001}, {0x0009AA, 0x0004}, -{0x0009B1, 0x0080}, +{0x0009B1, 0x0001}, {0x0009B2, 0x0004}, -{0x0009B3, 0x0080}, +{0x0009B3, 0x0001}, {0x0009B6, 0x0004}, -{0x0009BA, 0x0080}, +{0x0009BA, 0x0001}, {0x0009BC, 0x0010}, {0x0009BD, 0x0004}, {0x0009BE, 0x0010}, -{0x0009C5, 0x0080}, +{0x0009C5, 0x0001}, {0x0009C7, 0x0010}, -{0x0009C9, 0x0080}, +{0x0009C9, 0x0001}, {0x0009CB, 0x0010}, {0x0009CE, 0x0004}, -{0x0009CF, 0x0080}, +{0x0009CF, 0x0001}, {0x0009D7, 0x0010}, -{0x0009D8, 0x0080}, +{0x0009D8, 0x0001}, {0x0009DC, 0x0004}, -{0x0009DE, 0x0080}, +{0x0009DE, 0x0001}, {0x0009DF, 0x0004}, {0x0009E2, 0x0010}, -{0x0009E4, 0x0080}, +{0x0009E4, 0x0001}, {0x0009E6, 0x0002}, {0x0009F0, 0x0004}, {0x0009F2, 0x0040}, @@ -242,173 +246,173 @@ const std::vector> unicode_ranges_flags = { // st {0x0009FC, 0x0004}, {0x0009FD, 0x0020}, {0x0009FE, 0x0010}, -{0x0009FF, 0x0080}, +{0x0009FF, 0x0001}, {0x000A01, 0x0010}, -{0x000A04, 0x0080}, +{0x000A04, 0x0001}, {0x000A05, 0x0004}, -{0x000A0B, 0x0080}, +{0x000A0B, 0x0001}, {0x000A0F, 0x0004}, -{0x000A11, 0x0080}, +{0x000A11, 0x0001}, {0x000A13, 0x0004}, -{0x000A29, 0x0080}, +{0x000A29, 0x0001}, {0x000A2A, 0x0004}, -{0x000A31, 0x0080}, +{0x000A31, 0x0001}, {0x000A32, 0x0004}, -{0x000A34, 0x0080}, +{0x000A34, 0x0001}, {0x000A35, 0x0004}, -{0x000A37, 0x0080}, +{0x000A37, 0x0001}, {0x000A38, 0x0004}, -{0x000A3A, 0x0080}, +{0x000A3A, 0x0001}, {0x000A3C, 0x0010}, -{0x000A3D, 0x0080}, +{0x000A3D, 0x0001}, {0x000A3E, 0x0010}, -{0x000A43, 0x0080}, +{0x000A43, 0x0001}, {0x000A47, 0x0010}, -{0x000A49, 0x0080}, +{0x000A49, 0x0001}, {0x000A4B, 0x0010}, -{0x000A4E, 
0x0080}, +{0x000A4E, 0x0001}, {0x000A51, 0x0010}, -{0x000A52, 0x0080}, +{0x000A52, 0x0001}, {0x000A59, 0x0004}, -{0x000A5D, 0x0080}, +{0x000A5D, 0x0001}, {0x000A5E, 0x0004}, -{0x000A5F, 0x0080}, +{0x000A5F, 0x0001}, {0x000A66, 0x0002}, {0x000A70, 0x0010}, {0x000A72, 0x0004}, {0x000A75, 0x0010}, {0x000A76, 0x0020}, -{0x000A77, 0x0080}, +{0x000A77, 0x0001}, {0x000A81, 0x0010}, -{0x000A84, 0x0080}, +{0x000A84, 0x0001}, {0x000A85, 0x0004}, -{0x000A8E, 0x0080}, +{0x000A8E, 0x0001}, {0x000A8F, 0x0004}, -{0x000A92, 0x0080}, +{0x000A92, 0x0001}, {0x000A93, 0x0004}, -{0x000AA9, 0x0080}, +{0x000AA9, 0x0001}, {0x000AAA, 0x0004}, -{0x000AB1, 0x0080}, +{0x000AB1, 0x0001}, {0x000AB2, 0x0004}, -{0x000AB4, 0x0080}, +{0x000AB4, 0x0001}, {0x000AB5, 0x0004}, -{0x000ABA, 0x0080}, +{0x000ABA, 0x0001}, {0x000ABC, 0x0010}, {0x000ABD, 0x0004}, {0x000ABE, 0x0010}, -{0x000AC6, 0x0080}, +{0x000AC6, 0x0001}, {0x000AC7, 0x0010}, -{0x000ACA, 0x0080}, +{0x000ACA, 0x0001}, {0x000ACB, 0x0010}, -{0x000ACE, 0x0080}, +{0x000ACE, 0x0001}, {0x000AD0, 0x0004}, -{0x000AD1, 0x0080}, +{0x000AD1, 0x0001}, {0x000AE0, 0x0004}, {0x000AE2, 0x0010}, -{0x000AE4, 0x0080}, +{0x000AE4, 0x0001}, {0x000AE6, 0x0002}, {0x000AF0, 0x0020}, {0x000AF1, 0x0040}, -{0x000AF2, 0x0080}, +{0x000AF2, 0x0001}, {0x000AF9, 0x0004}, {0x000AFA, 0x0010}, -{0x000B00, 0x0080}, +{0x000B00, 0x0001}, {0x000B01, 0x0010}, -{0x000B04, 0x0080}, +{0x000B04, 0x0001}, {0x000B05, 0x0004}, -{0x000B0D, 0x0080}, +{0x000B0D, 0x0001}, {0x000B0F, 0x0004}, -{0x000B11, 0x0080}, +{0x000B11, 0x0001}, {0x000B13, 0x0004}, -{0x000B29, 0x0080}, +{0x000B29, 0x0001}, {0x000B2A, 0x0004}, -{0x000B31, 0x0080}, +{0x000B31, 0x0001}, {0x000B32, 0x0004}, -{0x000B34, 0x0080}, +{0x000B34, 0x0001}, {0x000B35, 0x0004}, -{0x000B3A, 0x0080}, +{0x000B3A, 0x0001}, {0x000B3C, 0x0010}, {0x000B3D, 0x0004}, {0x000B3E, 0x0010}, -{0x000B45, 0x0080}, +{0x000B45, 0x0001}, {0x000B47, 0x0010}, -{0x000B49, 0x0080}, +{0x000B49, 0x0001}, {0x000B4B, 0x0010}, -{0x000B4E, 0x0080}, +{0x000B4E, 
0x0001}, {0x000B55, 0x0010}, -{0x000B58, 0x0080}, +{0x000B58, 0x0001}, {0x000B5C, 0x0004}, -{0x000B5E, 0x0080}, +{0x000B5E, 0x0001}, {0x000B5F, 0x0004}, {0x000B62, 0x0010}, -{0x000B64, 0x0080}, +{0x000B64, 0x0001}, {0x000B66, 0x0002}, {0x000B70, 0x0040}, {0x000B71, 0x0004}, {0x000B72, 0x0002}, -{0x000B78, 0x0080}, +{0x000B78, 0x0001}, {0x000B82, 0x0010}, {0x000B83, 0x0004}, -{0x000B84, 0x0080}, +{0x000B84, 0x0001}, {0x000B85, 0x0004}, -{0x000B8B, 0x0080}, +{0x000B8B, 0x0001}, {0x000B8E, 0x0004}, -{0x000B91, 0x0080}, +{0x000B91, 0x0001}, {0x000B92, 0x0004}, -{0x000B96, 0x0080}, +{0x000B96, 0x0001}, {0x000B99, 0x0004}, -{0x000B9B, 0x0080}, +{0x000B9B, 0x0001}, {0x000B9C, 0x0004}, -{0x000B9D, 0x0080}, +{0x000B9D, 0x0001}, {0x000B9E, 0x0004}, -{0x000BA0, 0x0080}, +{0x000BA0, 0x0001}, {0x000BA3, 0x0004}, -{0x000BA5, 0x0080}, +{0x000BA5, 0x0001}, {0x000BA8, 0x0004}, -{0x000BAB, 0x0080}, +{0x000BAB, 0x0001}, {0x000BAE, 0x0004}, -{0x000BBA, 0x0080}, +{0x000BBA, 0x0001}, {0x000BBE, 0x0010}, -{0x000BC3, 0x0080}, +{0x000BC3, 0x0001}, {0x000BC6, 0x0010}, -{0x000BC9, 0x0080}, +{0x000BC9, 0x0001}, {0x000BCA, 0x0010}, -{0x000BCE, 0x0080}, +{0x000BCE, 0x0001}, {0x000BD0, 0x0004}, -{0x000BD1, 0x0080}, +{0x000BD1, 0x0001}, {0x000BD7, 0x0010}, -{0x000BD8, 0x0080}, +{0x000BD8, 0x0001}, {0x000BE6, 0x0002}, {0x000BF3, 0x0040}, -{0x000BFB, 0x0080}, +{0x000BFB, 0x0001}, {0x000C00, 0x0010}, {0x000C05, 0x0004}, -{0x000C0D, 0x0080}, +{0x000C0D, 0x0001}, {0x000C0E, 0x0004}, -{0x000C11, 0x0080}, +{0x000C11, 0x0001}, {0x000C12, 0x0004}, -{0x000C29, 0x0080}, +{0x000C29, 0x0001}, {0x000C2A, 0x0004}, -{0x000C3A, 0x0080}, +{0x000C3A, 0x0001}, {0x000C3C, 0x0010}, {0x000C3D, 0x0004}, {0x000C3E, 0x0010}, -{0x000C45, 0x0080}, +{0x000C45, 0x0001}, {0x000C46, 0x0010}, -{0x000C49, 0x0080}, +{0x000C49, 0x0001}, {0x000C4A, 0x0010}, -{0x000C4E, 0x0080}, +{0x000C4E, 0x0001}, {0x000C55, 0x0010}, -{0x000C57, 0x0080}, +{0x000C57, 0x0001}, {0x000C58, 0x0004}, -{0x000C5B, 0x0080}, +{0x000C5B, 0x0001}, {0x000C5D, 
0x0004}, -{0x000C5E, 0x0080}, +{0x000C5E, 0x0001}, {0x000C60, 0x0004}, {0x000C62, 0x0010}, -{0x000C64, 0x0080}, +{0x000C64, 0x0001}, {0x000C66, 0x0002}, -{0x000C70, 0x0080}, +{0x000C70, 0x0001}, {0x000C77, 0x0020}, {0x000C78, 0x0002}, {0x000C7F, 0x0040}, @@ -416,124 +420,124 @@ const std::vector> unicode_ranges_flags = { // st {0x000C81, 0x0010}, {0x000C84, 0x0020}, {0x000C85, 0x0004}, -{0x000C8D, 0x0080}, +{0x000C8D, 0x0001}, {0x000C8E, 0x0004}, -{0x000C91, 0x0080}, +{0x000C91, 0x0001}, {0x000C92, 0x0004}, -{0x000CA9, 0x0080}, +{0x000CA9, 0x0001}, {0x000CAA, 0x0004}, -{0x000CB4, 0x0080}, +{0x000CB4, 0x0001}, {0x000CB5, 0x0004}, -{0x000CBA, 0x0080}, +{0x000CBA, 0x0001}, {0x000CBC, 0x0010}, {0x000CBD, 0x0004}, {0x000CBE, 0x0010}, -{0x000CC5, 0x0080}, +{0x000CC5, 0x0001}, {0x000CC6, 0x0010}, -{0x000CC9, 0x0080}, +{0x000CC9, 0x0001}, {0x000CCA, 0x0010}, -{0x000CCE, 0x0080}, +{0x000CCE, 0x0001}, {0x000CD5, 0x0010}, -{0x000CD7, 0x0080}, +{0x000CD7, 0x0001}, {0x000CDD, 0x0004}, -{0x000CDF, 0x0080}, +{0x000CDF, 0x0001}, {0x000CE0, 0x0004}, {0x000CE2, 0x0010}, -{0x000CE4, 0x0080}, +{0x000CE4, 0x0001}, {0x000CE6, 0x0002}, -{0x000CF0, 0x0080}, +{0x000CF0, 0x0001}, {0x000CF1, 0x0004}, {0x000CF3, 0x0010}, -{0x000CF4, 0x0080}, +{0x000CF4, 0x0001}, {0x000D00, 0x0010}, {0x000D04, 0x0004}, -{0x000D0D, 0x0080}, +{0x000D0D, 0x0001}, {0x000D0E, 0x0004}, -{0x000D11, 0x0080}, +{0x000D11, 0x0001}, {0x000D12, 0x0004}, {0x000D3B, 0x0010}, {0x000D3D, 0x0004}, {0x000D3E, 0x0010}, -{0x000D45, 0x0080}, +{0x000D45, 0x0001}, {0x000D46, 0x0010}, -{0x000D49, 0x0080}, +{0x000D49, 0x0001}, {0x000D4A, 0x0010}, {0x000D4E, 0x0004}, {0x000D4F, 0x0040}, -{0x000D50, 0x0080}, +{0x000D50, 0x0001}, {0x000D54, 0x0004}, {0x000D57, 0x0010}, {0x000D58, 0x0002}, {0x000D5F, 0x0004}, {0x000D62, 0x0010}, -{0x000D64, 0x0080}, +{0x000D64, 0x0001}, {0x000D66, 0x0002}, {0x000D79, 0x0040}, {0x000D7A, 0x0004}, -{0x000D80, 0x0080}, +{0x000D80, 0x0001}, {0x000D81, 0x0010}, -{0x000D84, 0x0080}, +{0x000D84, 0x0001}, 
{0x000D85, 0x0004}, -{0x000D97, 0x0080}, +{0x000D97, 0x0001}, {0x000D9A, 0x0004}, -{0x000DB2, 0x0080}, +{0x000DB2, 0x0001}, {0x000DB3, 0x0004}, -{0x000DBC, 0x0080}, +{0x000DBC, 0x0001}, {0x000DBD, 0x0004}, -{0x000DBE, 0x0080}, +{0x000DBE, 0x0001}, {0x000DC0, 0x0004}, -{0x000DC7, 0x0080}, +{0x000DC7, 0x0001}, {0x000DCA, 0x0010}, -{0x000DCB, 0x0080}, +{0x000DCB, 0x0001}, {0x000DCF, 0x0010}, -{0x000DD5, 0x0080}, +{0x000DD5, 0x0001}, {0x000DD6, 0x0010}, -{0x000DD7, 0x0080}, +{0x000DD7, 0x0001}, {0x000DD8, 0x0010}, -{0x000DE0, 0x0080}, +{0x000DE0, 0x0001}, {0x000DE6, 0x0002}, -{0x000DF0, 0x0080}, +{0x000DF0, 0x0001}, {0x000DF2, 0x0010}, {0x000DF4, 0x0020}, -{0x000DF5, 0x0080}, +{0x000DF5, 0x0001}, {0x000E01, 0x0004}, {0x000E31, 0x0010}, {0x000E32, 0x0004}, {0x000E34, 0x0010}, -{0x000E3B, 0x0080}, +{0x000E3B, 0x0001}, {0x000E3F, 0x0040}, {0x000E40, 0x0004}, {0x000E47, 0x0010}, {0x000E4F, 0x0020}, {0x000E50, 0x0002}, {0x000E5A, 0x0020}, -{0x000E5C, 0x0080}, +{0x000E5C, 0x0001}, {0x000E81, 0x0004}, -{0x000E83, 0x0080}, +{0x000E83, 0x0001}, {0x000E84, 0x0004}, -{0x000E85, 0x0080}, +{0x000E85, 0x0001}, {0x000E86, 0x0004}, -{0x000E8B, 0x0080}, +{0x000E8B, 0x0001}, {0x000E8C, 0x0004}, -{0x000EA4, 0x0080}, +{0x000EA4, 0x0001}, {0x000EA5, 0x0004}, -{0x000EA6, 0x0080}, +{0x000EA6, 0x0001}, {0x000EA7, 0x0004}, {0x000EB1, 0x0010}, {0x000EB2, 0x0004}, {0x000EB4, 0x0010}, {0x000EBD, 0x0004}, -{0x000EBE, 0x0080}, +{0x000EBE, 0x0001}, {0x000EC0, 0x0004}, -{0x000EC5, 0x0080}, +{0x000EC5, 0x0001}, {0x000EC6, 0x0004}, -{0x000EC7, 0x0080}, +{0x000EC7, 0x0001}, {0x000EC8, 0x0010}, -{0x000ECF, 0x0080}, +{0x000ECF, 0x0001}, {0x000ED0, 0x0002}, -{0x000EDA, 0x0080}, +{0x000EDA, 0x0001}, {0x000EDC, 0x0004}, -{0x000EE0, 0x0080}, +{0x000EE0, 0x0001}, {0x000F00, 0x0004}, {0x000F01, 0x0040}, {0x000F04, 0x0020}, @@ -552,26 +556,26 @@ const std::vector> unicode_ranges_flags = { // st {0x000F3A, 0x0020}, {0x000F3E, 0x0010}, {0x000F40, 0x0004}, -{0x000F48, 0x0080}, +{0x000F48, 0x0001}, {0x000F49, 
0x0004}, -{0x000F6D, 0x0080}, +{0x000F6D, 0x0001}, {0x000F71, 0x0010}, {0x000F85, 0x0020}, {0x000F86, 0x0010}, {0x000F88, 0x0004}, {0x000F8D, 0x0010}, -{0x000F98, 0x0080}, +{0x000F98, 0x0001}, {0x000F99, 0x0010}, -{0x000FBD, 0x0080}, +{0x000FBD, 0x0001}, {0x000FBE, 0x0040}, {0x000FC6, 0x0010}, {0x000FC7, 0x0040}, -{0x000FCD, 0x0080}, +{0x000FCD, 0x0001}, {0x000FCE, 0x0040}, {0x000FD0, 0x0020}, {0x000FD5, 0x0040}, {0x000FD9, 0x0020}, -{0x000FDB, 0x0080}, +{0x000FDB, 0x0001}, {0x001000, 0x0004}, {0x00102B, 0x0010}, {0x00103F, 0x0004}, @@ -595,56 +599,56 @@ const std::vector> unicode_ranges_flags = { // st {0x00109A, 0x0010}, {0x00109E, 0x0040}, {0x0010A0, 0x0004}, -{0x0010C6, 0x0080}, +{0x0010C6, 0x0001}, {0x0010C7, 0x0004}, -{0x0010C8, 0x0080}, +{0x0010C8, 0x0001}, {0x0010CD, 0x0004}, -{0x0010CE, 0x0080}, +{0x0010CE, 0x0001}, {0x0010D0, 0x0004}, {0x0010FB, 0x0020}, {0x0010FC, 0x0004}, -{0x001249, 0x0080}, +{0x001249, 0x0001}, {0x00124A, 0x0004}, -{0x00124E, 0x0080}, +{0x00124E, 0x0001}, {0x001250, 0x0004}, -{0x001257, 0x0080}, +{0x001257, 0x0001}, {0x001258, 0x0004}, -{0x001259, 0x0080}, +{0x001259, 0x0001}, {0x00125A, 0x0004}, -{0x00125E, 0x0080}, +{0x00125E, 0x0001}, {0x001260, 0x0004}, -{0x001289, 0x0080}, +{0x001289, 0x0001}, {0x00128A, 0x0004}, -{0x00128E, 0x0080}, +{0x00128E, 0x0001}, {0x001290, 0x0004}, -{0x0012B1, 0x0080}, +{0x0012B1, 0x0001}, {0x0012B2, 0x0004}, -{0x0012B6, 0x0080}, +{0x0012B6, 0x0001}, {0x0012B8, 0x0004}, -{0x0012BF, 0x0080}, +{0x0012BF, 0x0001}, {0x0012C0, 0x0004}, -{0x0012C1, 0x0080}, +{0x0012C1, 0x0001}, {0x0012C2, 0x0004}, -{0x0012C6, 0x0080}, +{0x0012C6, 0x0001}, {0x0012C8, 0x0004}, -{0x0012D7, 0x0080}, +{0x0012D7, 0x0001}, {0x0012D8, 0x0004}, -{0x001311, 0x0080}, +{0x001311, 0x0001}, {0x001312, 0x0004}, -{0x001316, 0x0080}, +{0x001316, 0x0001}, {0x001318, 0x0004}, -{0x00135B, 0x0080}, +{0x00135B, 0x0001}, {0x00135D, 0x0010}, {0x001360, 0x0020}, {0x001369, 0x0002}, -{0x00137D, 0x0080}, +{0x00137D, 0x0001}, {0x001380, 0x0004}, 
{0x001390, 0x0040}, -{0x00139A, 0x0080}, +{0x00139A, 0x0001}, {0x0013A0, 0x0004}, -{0x0013F6, 0x0080}, +{0x0013F6, 0x0001}, {0x0013F8, 0x0004}, -{0x0013FE, 0x0080}, +{0x0013FE, 0x0001}, {0x001400, 0x0020}, {0x001401, 0x0004}, {0x00166D, 0x0040}, @@ -653,28 +657,28 @@ const std::vector> unicode_ranges_flags = { // st {0x001680, 0x0008}, {0x001681, 0x0004}, {0x00169B, 0x0020}, -{0x00169D, 0x0080}, +{0x00169D, 0x0001}, {0x0016A0, 0x0004}, {0x0016EB, 0x0020}, {0x0016EE, 0x0002}, {0x0016F1, 0x0004}, -{0x0016F9, 0x0080}, +{0x0016F9, 0x0001}, {0x001700, 0x0004}, {0x001712, 0x0010}, -{0x001716, 0x0080}, +{0x001716, 0x0001}, {0x00171F, 0x0004}, {0x001732, 0x0010}, {0x001735, 0x0020}, -{0x001737, 0x0080}, +{0x001737, 0x0001}, {0x001740, 0x0004}, {0x001752, 0x0010}, -{0x001754, 0x0080}, +{0x001754, 0x0001}, {0x001760, 0x0004}, -{0x00176D, 0x0080}, +{0x00176D, 0x0001}, {0x00176E, 0x0004}, -{0x001771, 0x0080}, +{0x001771, 0x0001}, {0x001772, 0x0010}, -{0x001774, 0x0080}, +{0x001774, 0x0001}, {0x001780, 0x0004}, {0x0017B4, 0x0010}, {0x0017D4, 0x0020}, @@ -683,80 +687,80 @@ const std::vector> unicode_ranges_flags = { // st {0x0017DB, 0x0040}, {0x0017DC, 0x0004}, {0x0017DD, 0x0010}, -{0x0017DE, 0x0080}, +{0x0017DE, 0x0001}, {0x0017E0, 0x0002}, -{0x0017EA, 0x0080}, +{0x0017EA, 0x0001}, {0x0017F0, 0x0002}, -{0x0017FA, 0x0080}, +{0x0017FA, 0x0001}, {0x001800, 0x0020}, {0x00180B, 0x0010}, {0x00180E, 0x0080}, {0x00180F, 0x0010}, {0x001810, 0x0002}, -{0x00181A, 0x0080}, +{0x00181A, 0x0001}, {0x001820, 0x0004}, -{0x001879, 0x0080}, +{0x001879, 0x0001}, {0x001880, 0x0004}, {0x001885, 0x0010}, {0x001887, 0x0004}, {0x0018A9, 0x0010}, {0x0018AA, 0x0004}, -{0x0018AB, 0x0080}, +{0x0018AB, 0x0001}, {0x0018B0, 0x0004}, -{0x0018F6, 0x0080}, +{0x0018F6, 0x0001}, {0x001900, 0x0004}, -{0x00191F, 0x0080}, +{0x00191F, 0x0001}, {0x001920, 0x0010}, -{0x00192C, 0x0080}, +{0x00192C, 0x0001}, {0x001930, 0x0010}, -{0x00193C, 0x0080}, +{0x00193C, 0x0001}, {0x001940, 0x0040}, -{0x001941, 0x0080}, +{0x001941, 
0x0001}, {0x001944, 0x0020}, {0x001946, 0x0002}, {0x001950, 0x0004}, -{0x00196E, 0x0080}, +{0x00196E, 0x0001}, {0x001970, 0x0004}, -{0x001975, 0x0080}, +{0x001975, 0x0001}, {0x001980, 0x0004}, -{0x0019AC, 0x0080}, +{0x0019AC, 0x0001}, {0x0019B0, 0x0004}, -{0x0019CA, 0x0080}, +{0x0019CA, 0x0001}, {0x0019D0, 0x0002}, -{0x0019DB, 0x0080}, +{0x0019DB, 0x0001}, {0x0019DE, 0x0040}, {0x001A00, 0x0004}, {0x001A17, 0x0010}, -{0x001A1C, 0x0080}, +{0x001A1C, 0x0001}, {0x001A1E, 0x0020}, {0x001A20, 0x0004}, {0x001A55, 0x0010}, -{0x001A5F, 0x0080}, +{0x001A5F, 0x0001}, {0x001A60, 0x0010}, -{0x001A7D, 0x0080}, +{0x001A7D, 0x0001}, {0x001A7F, 0x0010}, {0x001A80, 0x0002}, -{0x001A8A, 0x0080}, +{0x001A8A, 0x0001}, {0x001A90, 0x0002}, -{0x001A9A, 0x0080}, +{0x001A9A, 0x0001}, {0x001AA0, 0x0020}, {0x001AA7, 0x0004}, {0x001AA8, 0x0020}, -{0x001AAE, 0x0080}, +{0x001AAE, 0x0001}, {0x001AB0, 0x0010}, -{0x001ACF, 0x0080}, +{0x001ACF, 0x0001}, {0x001B00, 0x0010}, {0x001B05, 0x0004}, {0x001B34, 0x0010}, {0x001B45, 0x0004}, -{0x001B4D, 0x0080}, +{0x001B4D, 0x0001}, {0x001B50, 0x0002}, {0x001B5A, 0x0020}, {0x001B61, 0x0040}, {0x001B6B, 0x0010}, {0x001B74, 0x0040}, {0x001B7D, 0x0020}, -{0x001B7F, 0x0080}, +{0x001B7F, 0x0001}, {0x001B80, 0x0010}, {0x001B83, 0x0004}, {0x001BA1, 0x0010}, @@ -764,25 +768,25 @@ const std::vector> unicode_ranges_flags = { // st {0x001BB0, 0x0002}, {0x001BBA, 0x0004}, {0x001BE6, 0x0010}, -{0x001BF4, 0x0080}, +{0x001BF4, 0x0001}, {0x001BFC, 0x0020}, {0x001C00, 0x0004}, {0x001C24, 0x0010}, -{0x001C38, 0x0080}, +{0x001C38, 0x0001}, {0x001C3B, 0x0020}, {0x001C40, 0x0002}, -{0x001C4A, 0x0080}, +{0x001C4A, 0x0001}, {0x001C4D, 0x0004}, {0x001C50, 0x0002}, {0x001C5A, 0x0004}, {0x001C7E, 0x0020}, {0x001C80, 0x0004}, -{0x001C89, 0x0080}, +{0x001C89, 0x0001}, {0x001C90, 0x0004}, -{0x001CBB, 0x0080}, +{0x001CBB, 0x0001}, {0x001CBD, 0x0004}, {0x001CC0, 0x0020}, -{0x001CC8, 0x0080}, +{0x001CC8, 0x0001}, {0x001CD0, 0x0010}, {0x001CD3, 0x0020}, {0x001CD4, 0x0010}, @@ -793,50 +797,50 
@@ const std::vector> unicode_ranges_flags = { // st {0x001CF5, 0x0004}, {0x001CF7, 0x0010}, {0x001CFA, 0x0004}, -{0x001CFB, 0x0080}, +{0x001CFB, 0x0001}, {0x001D00, 0x0004}, {0x001DC0, 0x0010}, {0x001E00, 0x0004}, -{0x001F16, 0x0080}, +{0x001F16, 0x0001}, {0x001F18, 0x0004}, -{0x001F1E, 0x0080}, +{0x001F1E, 0x0001}, {0x001F20, 0x0004}, -{0x001F46, 0x0080}, +{0x001F46, 0x0001}, {0x001F48, 0x0004}, -{0x001F4E, 0x0080}, +{0x001F4E, 0x0001}, {0x001F50, 0x0004}, -{0x001F58, 0x0080}, +{0x001F58, 0x0001}, {0x001F59, 0x0004}, -{0x001F5A, 0x0080}, +{0x001F5A, 0x0001}, {0x001F5B, 0x0004}, -{0x001F5C, 0x0080}, +{0x001F5C, 0x0001}, {0x001F5D, 0x0004}, -{0x001F5E, 0x0080}, +{0x001F5E, 0x0001}, {0x001F5F, 0x0004}, -{0x001F7E, 0x0080}, +{0x001F7E, 0x0001}, {0x001F80, 0x0004}, -{0x001FB5, 0x0080}, +{0x001FB5, 0x0001}, {0x001FB6, 0x0004}, {0x001FBD, 0x0040}, {0x001FBE, 0x0004}, {0x001FBF, 0x0040}, {0x001FC2, 0x0004}, -{0x001FC5, 0x0080}, +{0x001FC5, 0x0001}, {0x001FC6, 0x0004}, {0x001FCD, 0x0040}, {0x001FD0, 0x0004}, -{0x001FD4, 0x0080}, +{0x001FD4, 0x0001}, {0x001FD6, 0x0004}, -{0x001FDC, 0x0080}, +{0x001FDC, 0x0001}, {0x001FDD, 0x0040}, {0x001FE0, 0x0004}, {0x001FED, 0x0040}, -{0x001FF0, 0x0080}, +{0x001FF0, 0x0001}, {0x001FF2, 0x0004}, -{0x001FF5, 0x0080}, +{0x001FF5, 0x0001}, {0x001FF6, 0x0004}, {0x001FFD, 0x0040}, -{0x001FFF, 0x0080}, +{0x001FFF, 0x0001}, {0x002000, 0x0008}, {0x00200B, 0x0080}, {0x002010, 0x0020}, @@ -850,9 +854,11 @@ const std::vector> unicode_ranges_flags = { // st {0x002053, 0x0020}, {0x00205F, 0x0008}, {0x002060, 0x0080}, +{0x002065, 0x0001}, +{0x002066, 0x0080}, {0x002070, 0x0002}, {0x002071, 0x0004}, -{0x002072, 0x0080}, +{0x002072, 0x0001}, {0x002074, 0x0002}, {0x00207A, 0x0040}, {0x00207D, 0x0020}, @@ -860,13 +866,13 @@ const std::vector> unicode_ranges_flags = { // st {0x002080, 0x0002}, {0x00208A, 0x0040}, {0x00208D, 0x0020}, -{0x00208F, 0x0080}, +{0x00208F, 0x0001}, {0x002090, 0x0004}, -{0x00209D, 0x0080}, +{0x00209D, 0x0001}, {0x0020A0, 0x0040}, 
-{0x0020C1, 0x0080}, +{0x0020C1, 0x0001}, {0x0020D0, 0x0010}, -{0x0020F1, 0x0080}, +{0x0020F1, 0x0001}, {0x002100, 0x0040}, {0x002102, 0x0004}, {0x002103, 0x0040}, @@ -898,15 +904,15 @@ const std::vector> unicode_ranges_flags = { // st {0x002183, 0x0004}, {0x002185, 0x0002}, {0x00218A, 0x0040}, -{0x00218C, 0x0080}, +{0x00218C, 0x0001}, {0x002190, 0x0040}, {0x002308, 0x0020}, {0x00230C, 0x0040}, {0x002329, 0x0020}, {0x00232B, 0x0040}, -{0x002427, 0x0080}, +{0x002427, 0x0001}, {0x002440, 0x0040}, -{0x00244B, 0x0080}, +{0x00244B, 0x0001}, {0x002460, 0x0002}, {0x00249C, 0x0040}, {0x0024EA, 0x0002}, @@ -924,62 +930,62 @@ const std::vector> unicode_ranges_flags = { // st {0x0029DC, 0x0040}, {0x0029FC, 0x0020}, {0x0029FE, 0x0040}, -{0x002B74, 0x0080}, +{0x002B74, 0x0001}, {0x002B76, 0x0040}, -{0x002B96, 0x0080}, +{0x002B96, 0x0001}, {0x002B97, 0x0040}, {0x002C00, 0x0004}, {0x002CE5, 0x0040}, {0x002CEB, 0x0004}, {0x002CEF, 0x0010}, {0x002CF2, 0x0004}, -{0x002CF4, 0x0080}, +{0x002CF4, 0x0001}, {0x002CF9, 0x0020}, {0x002CFD, 0x0002}, {0x002CFE, 0x0020}, {0x002D00, 0x0004}, -{0x002D26, 0x0080}, +{0x002D26, 0x0001}, {0x002D27, 0x0004}, -{0x002D28, 0x0080}, +{0x002D28, 0x0001}, {0x002D2D, 0x0004}, -{0x002D2E, 0x0080}, +{0x002D2E, 0x0001}, {0x002D30, 0x0004}, -{0x002D68, 0x0080}, +{0x002D68, 0x0001}, {0x002D6F, 0x0004}, {0x002D70, 0x0020}, -{0x002D71, 0x0080}, +{0x002D71, 0x0001}, {0x002D7F, 0x0010}, {0x002D80, 0x0004}, -{0x002D97, 0x0080}, +{0x002D97, 0x0001}, {0x002DA0, 0x0004}, -{0x002DA7, 0x0080}, +{0x002DA7, 0x0001}, {0x002DA8, 0x0004}, -{0x002DAF, 0x0080}, +{0x002DAF, 0x0001}, {0x002DB0, 0x0004}, -{0x002DB7, 0x0080}, +{0x002DB7, 0x0001}, {0x002DB8, 0x0004}, -{0x002DBF, 0x0080}, +{0x002DBF, 0x0001}, {0x002DC0, 0x0004}, -{0x002DC7, 0x0080}, +{0x002DC7, 0x0001}, {0x002DC8, 0x0004}, -{0x002DCF, 0x0080}, +{0x002DCF, 0x0001}, {0x002DD0, 0x0004}, -{0x002DD7, 0x0080}, +{0x002DD7, 0x0001}, {0x002DD8, 0x0004}, -{0x002DDF, 0x0080}, +{0x002DDF, 0x0001}, {0x002DE0, 0x0010}, {0x002E00, 
0x0020}, {0x002E2F, 0x0004}, {0x002E30, 0x0020}, {0x002E50, 0x0040}, {0x002E52, 0x0020}, -{0x002E5E, 0x0080}, +{0x002E5E, 0x0001}, {0x002E80, 0x0040}, -{0x002E9A, 0x0080}, +{0x002E9A, 0x0001}, {0x002E9B, 0x0040}, -{0x002EF4, 0x0080}, +{0x002EF4, 0x0001}, {0x002F00, 0x0040}, -{0x002FD6, 0x0080}, +{0x002FD6, 0x0001}, {0x002FF0, 0x0040}, {0x003000, 0x0008}, {0x003001, 0x0020}, @@ -999,9 +1005,9 @@ const std::vector> unicode_ranges_flags = { // st {0x00303B, 0x0004}, {0x00303D, 0x0020}, {0x00303E, 0x0040}, -{0x003040, 0x0080}, +{0x003040, 0x0001}, {0x003041, 0x0004}, -{0x003097, 0x0080}, +{0x003097, 0x0001}, {0x003099, 0x0010}, {0x00309B, 0x0040}, {0x00309D, 0x0004}, @@ -1009,21 +1015,21 @@ const std::vector> unicode_ranges_flags = { // st {0x0030A1, 0x0004}, {0x0030FB, 0x0020}, {0x0030FC, 0x0004}, -{0x003100, 0x0080}, +{0x003100, 0x0001}, {0x003105, 0x0004}, -{0x003130, 0x0080}, +{0x003130, 0x0001}, {0x003131, 0x0004}, -{0x00318F, 0x0080}, +{0x00318F, 0x0001}, {0x003190, 0x0040}, {0x003192, 0x0002}, {0x003196, 0x0040}, {0x0031A0, 0x0004}, {0x0031C0, 0x0040}, -{0x0031E4, 0x0080}, +{0x0031E4, 0x0001}, {0x0031EF, 0x0040}, {0x0031F0, 0x0004}, {0x003200, 0x0040}, -{0x00321F, 0x0080}, +{0x00321F, 0x0001}, {0x003220, 0x0002}, {0x00322A, 0x0040}, {0x003248, 0x0002}, @@ -1037,9 +1043,9 @@ const std::vector> unicode_ranges_flags = { // st {0x003400, 0x0004}, {0x004DC0, 0x0040}, {0x004E00, 0x0004}, -{0x00A48D, 0x0080}, +{0x00A48D, 0x0001}, {0x00A490, 0x0040}, -{0x00A4C7, 0x0080}, +{0x00A4C7, 0x0001}, {0x00A4D0, 0x0004}, {0x00A4FE, 0x0020}, {0x00A500, 0x0004}, @@ -1047,7 +1053,7 @@ const std::vector> unicode_ranges_flags = { // st {0x00A610, 0x0004}, {0x00A620, 0x0002}, {0x00A62A, 0x0004}, -{0x00A62C, 0x0080}, +{0x00A62C, 0x0001}, {0x00A640, 0x0004}, {0x00A66F, 0x0010}, {0x00A673, 0x0020}, @@ -1059,20 +1065,20 @@ const std::vector> unicode_ranges_flags = { // st {0x00A6E6, 0x0002}, {0x00A6F0, 0x0010}, {0x00A6F2, 0x0020}, -{0x00A6F8, 0x0080}, +{0x00A6F8, 0x0001}, {0x00A700, 
0x0040}, {0x00A717, 0x0004}, {0x00A720, 0x0040}, {0x00A722, 0x0004}, {0x00A789, 0x0040}, {0x00A78B, 0x0004}, -{0x00A7CB, 0x0080}, +{0x00A7CB, 0x0001}, {0x00A7D0, 0x0004}, -{0x00A7D2, 0x0080}, +{0x00A7D2, 0x0001}, {0x00A7D3, 0x0004}, -{0x00A7D4, 0x0080}, +{0x00A7D4, 0x0001}, {0x00A7D5, 0x0004}, -{0x00A7DA, 0x0080}, +{0x00A7DA, 0x0001}, {0x00A7F2, 0x0004}, {0x00A802, 0x0010}, {0x00A803, 0x0004}, @@ -1083,20 +1089,20 @@ const std::vector> unicode_ranges_flags = { // st {0x00A823, 0x0010}, {0x00A828, 0x0040}, {0x00A82C, 0x0010}, -{0x00A82D, 0x0080}, +{0x00A82D, 0x0001}, {0x00A830, 0x0002}, {0x00A836, 0x0040}, -{0x00A83A, 0x0080}, +{0x00A83A, 0x0001}, {0x00A840, 0x0004}, {0x00A874, 0x0020}, -{0x00A878, 0x0080}, +{0x00A878, 0x0001}, {0x00A880, 0x0010}, {0x00A882, 0x0004}, {0x00A8B4, 0x0010}, -{0x00A8C6, 0x0080}, +{0x00A8C6, 0x0001}, {0x00A8CE, 0x0020}, {0x00A8D0, 0x0002}, -{0x00A8DA, 0x0080}, +{0x00A8DA, 0x0001}, {0x00A8E0, 0x0010}, {0x00A8F2, 0x0004}, {0x00A8F8, 0x0020}, @@ -1110,35 +1116,35 @@ const std::vector> unicode_ranges_flags = { // st {0x00A92E, 0x0020}, {0x00A930, 0x0004}, {0x00A947, 0x0010}, -{0x00A954, 0x0080}, +{0x00A954, 0x0001}, {0x00A95F, 0x0020}, {0x00A960, 0x0004}, -{0x00A97D, 0x0080}, +{0x00A97D, 0x0001}, {0x00A980, 0x0010}, {0x00A984, 0x0004}, {0x00A9B3, 0x0010}, {0x00A9C1, 0x0020}, -{0x00A9CE, 0x0080}, +{0x00A9CE, 0x0001}, {0x00A9CF, 0x0004}, {0x00A9D0, 0x0002}, -{0x00A9DA, 0x0080}, +{0x00A9DA, 0x0001}, {0x00A9DE, 0x0020}, {0x00A9E0, 0x0004}, {0x00A9E5, 0x0010}, {0x00A9E6, 0x0004}, {0x00A9F0, 0x0002}, {0x00A9FA, 0x0004}, -{0x00A9FF, 0x0080}, +{0x00A9FF, 0x0001}, {0x00AA00, 0x0004}, {0x00AA29, 0x0010}, -{0x00AA37, 0x0080}, +{0x00AA37, 0x0001}, {0x00AA40, 0x0004}, {0x00AA43, 0x0010}, {0x00AA44, 0x0004}, {0x00AA4C, 0x0010}, -{0x00AA4E, 0x0080}, +{0x00AA4E, 0x0001}, {0x00AA50, 0x0002}, -{0x00AA5A, 0x0080}, +{0x00AA5A, 0x0001}, {0x00AA5C, 0x0020}, {0x00AA60, 0x0004}, {0x00AA77, 0x0040}, @@ -1155,7 +1161,7 @@ const std::vector> unicode_ranges_flags = { // 
st {0x00AAC0, 0x0004}, {0x00AAC1, 0x0010}, {0x00AAC2, 0x0004}, -{0x00AAC3, 0x0080}, +{0x00AAC3, 0x0001}, {0x00AADB, 0x0004}, {0x00AADE, 0x0020}, {0x00AAE0, 0x0004}, @@ -1163,90 +1169,93 @@ const std::vector> unicode_ranges_flags = { // st {0x00AAF0, 0x0020}, {0x00AAF2, 0x0004}, {0x00AAF5, 0x0010}, -{0x00AAF7, 0x0080}, +{0x00AAF7, 0x0001}, {0x00AB01, 0x0004}, -{0x00AB07, 0x0080}, +{0x00AB07, 0x0001}, {0x00AB09, 0x0004}, -{0x00AB0F, 0x0080}, +{0x00AB0F, 0x0001}, {0x00AB11, 0x0004}, -{0x00AB17, 0x0080}, +{0x00AB17, 0x0001}, {0x00AB20, 0x0004}, -{0x00AB27, 0x0080}, +{0x00AB27, 0x0001}, {0x00AB28, 0x0004}, -{0x00AB2F, 0x0080}, +{0x00AB2F, 0x0001}, {0x00AB30, 0x0004}, {0x00AB5B, 0x0040}, {0x00AB5C, 0x0004}, {0x00AB6A, 0x0040}, -{0x00AB6C, 0x0080}, +{0x00AB6C, 0x0001}, {0x00AB70, 0x0004}, {0x00ABE3, 0x0010}, {0x00ABEB, 0x0020}, {0x00ABEC, 0x0010}, -{0x00ABEE, 0x0080}, +{0x00ABEE, 0x0001}, {0x00ABF0, 0x0002}, -{0x00ABFA, 0x0080}, +{0x00ABFA, 0x0001}, {0x00AC00, 0x0004}, -{0x00D7A4, 0x0080}, +{0x00D7A4, 0x0001}, {0x00D7B0, 0x0004}, -{0x00D7C7, 0x0080}, +{0x00D7C7, 0x0001}, {0x00D7CB, 0x0004}, -{0x00D7FC, 0x0080}, +{0x00D7FC, 0x0001}, +{0x00D800, 0x0080}, {0x00F900, 0x0004}, -{0x00FA6E, 0x0080}, +{0x00FA6E, 0x0001}, {0x00FA70, 0x0004}, -{0x00FADA, 0x0080}, +{0x00FADA, 0x0001}, {0x00FB00, 0x0004}, -{0x00FB07, 0x0080}, +{0x00FB07, 0x0001}, {0x00FB13, 0x0004}, -{0x00FB18, 0x0080}, +{0x00FB18, 0x0001}, {0x00FB1D, 0x0004}, {0x00FB1E, 0x0010}, {0x00FB1F, 0x0004}, {0x00FB29, 0x0040}, {0x00FB2A, 0x0004}, -{0x00FB37, 0x0080}, +{0x00FB37, 0x0001}, {0x00FB38, 0x0004}, -{0x00FB3D, 0x0080}, +{0x00FB3D, 0x0001}, {0x00FB3E, 0x0004}, -{0x00FB3F, 0x0080}, +{0x00FB3F, 0x0001}, {0x00FB40, 0x0004}, -{0x00FB42, 0x0080}, +{0x00FB42, 0x0001}, {0x00FB43, 0x0004}, -{0x00FB45, 0x0080}, +{0x00FB45, 0x0001}, {0x00FB46, 0x0004}, {0x00FBB2, 0x0040}, -{0x00FBC3, 0x0080}, +{0x00FBC3, 0x0001}, {0x00FBD3, 0x0004}, {0x00FD3E, 0x0020}, {0x00FD40, 0x0040}, {0x00FD50, 0x0004}, -{0x00FD90, 0x0080}, +{0x00FD90, 
0x0001}, {0x00FD92, 0x0004}, -{0x00FDC8, 0x0080}, +{0x00FDC8, 0x0001}, {0x00FDCF, 0x0040}, -{0x00FDD0, 0x0080}, +{0x00FDD0, 0x0001}, {0x00FDF0, 0x0004}, {0x00FDFC, 0x0040}, {0x00FE00, 0x0010}, {0x00FE10, 0x0020}, -{0x00FE1A, 0x0080}, +{0x00FE1A, 0x0001}, {0x00FE20, 0x0010}, {0x00FE30, 0x0020}, -{0x00FE53, 0x0080}, +{0x00FE53, 0x0001}, {0x00FE54, 0x0020}, {0x00FE62, 0x0040}, {0x00FE63, 0x0020}, {0x00FE64, 0x0040}, -{0x00FE67, 0x0080}, +{0x00FE67, 0x0001}, {0x00FE68, 0x0020}, {0x00FE69, 0x0040}, {0x00FE6A, 0x0020}, -{0x00FE6C, 0x0080}, +{0x00FE6C, 0x0001}, {0x00FE70, 0x0004}, -{0x00FE75, 0x0080}, +{0x00FE75, 0x0001}, {0x00FE76, 0x0004}, -{0x00FEFD, 0x0080}, +{0x00FEFD, 0x0001}, +{0x00FEFF, 0x0080}, +{0x00FF00, 0x0001}, {0x00FF01, 0x0020}, {0x00FF04, 0x0040}, {0x00FF05, 0x0020}, @@ -1268,260 +1277,261 @@ const std::vector> unicode_ranges_flags = { // st {0x00FF5E, 0x0040}, {0x00FF5F, 0x0020}, {0x00FF66, 0x0004}, -{0x00FFBF, 0x0080}, +{0x00FFBF, 0x0001}, {0x00FFC2, 0x0004}, -{0x00FFC8, 0x0080}, +{0x00FFC8, 0x0001}, {0x00FFCA, 0x0004}, -{0x00FFD0, 0x0080}, +{0x00FFD0, 0x0001}, {0x00FFD2, 0x0004}, -{0x00FFD8, 0x0080}, +{0x00FFD8, 0x0001}, {0x00FFDA, 0x0004}, -{0x00FFDD, 0x0080}, +{0x00FFDD, 0x0001}, {0x00FFE0, 0x0040}, -{0x00FFE7, 0x0080}, +{0x00FFE7, 0x0001}, {0x00FFE8, 0x0040}, -{0x00FFEF, 0x0080}, +{0x00FFEF, 0x0001}, +{0x00FFF9, 0x0080}, {0x00FFFC, 0x0040}, -{0x00FFFE, 0x0080}, +{0x00FFFE, 0x0001}, {0x010000, 0x0004}, -{0x01000C, 0x0080}, +{0x01000C, 0x0001}, {0x01000D, 0x0004}, -{0x010027, 0x0080}, +{0x010027, 0x0001}, {0x010028, 0x0004}, -{0x01003B, 0x0080}, +{0x01003B, 0x0001}, {0x01003C, 0x0004}, -{0x01003E, 0x0080}, +{0x01003E, 0x0001}, {0x01003F, 0x0004}, -{0x01004E, 0x0080}, +{0x01004E, 0x0001}, {0x010050, 0x0004}, -{0x01005E, 0x0080}, +{0x01005E, 0x0001}, {0x010080, 0x0004}, -{0x0100FB, 0x0080}, +{0x0100FB, 0x0001}, {0x010100, 0x0020}, -{0x010103, 0x0080}, +{0x010103, 0x0001}, {0x010107, 0x0002}, -{0x010134, 0x0080}, +{0x010134, 0x0001}, {0x010137, 0x0040}, 
{0x010140, 0x0002}, {0x010179, 0x0040}, {0x01018A, 0x0002}, {0x01018C, 0x0040}, -{0x01018F, 0x0080}, +{0x01018F, 0x0001}, {0x010190, 0x0040}, -{0x01019D, 0x0080}, +{0x01019D, 0x0001}, {0x0101A0, 0x0040}, -{0x0101A1, 0x0080}, +{0x0101A1, 0x0001}, {0x0101D0, 0x0040}, {0x0101FD, 0x0010}, -{0x0101FE, 0x0080}, +{0x0101FE, 0x0001}, {0x010280, 0x0004}, -{0x01029D, 0x0080}, +{0x01029D, 0x0001}, {0x0102A0, 0x0004}, -{0x0102D1, 0x0080}, +{0x0102D1, 0x0001}, {0x0102E0, 0x0010}, {0x0102E1, 0x0002}, -{0x0102FC, 0x0080}, +{0x0102FC, 0x0001}, {0x010300, 0x0004}, {0x010320, 0x0002}, -{0x010324, 0x0080}, +{0x010324, 0x0001}, {0x01032D, 0x0004}, {0x010341, 0x0002}, {0x010342, 0x0004}, {0x01034A, 0x0002}, -{0x01034B, 0x0080}, +{0x01034B, 0x0001}, {0x010350, 0x0004}, {0x010376, 0x0010}, -{0x01037B, 0x0080}, +{0x01037B, 0x0001}, {0x010380, 0x0004}, -{0x01039E, 0x0080}, +{0x01039E, 0x0001}, {0x01039F, 0x0020}, {0x0103A0, 0x0004}, -{0x0103C4, 0x0080}, +{0x0103C4, 0x0001}, {0x0103C8, 0x0004}, {0x0103D0, 0x0020}, {0x0103D1, 0x0002}, -{0x0103D6, 0x0080}, +{0x0103D6, 0x0001}, {0x010400, 0x0004}, -{0x01049E, 0x0080}, +{0x01049E, 0x0001}, {0x0104A0, 0x0002}, -{0x0104AA, 0x0080}, +{0x0104AA, 0x0001}, {0x0104B0, 0x0004}, -{0x0104D4, 0x0080}, +{0x0104D4, 0x0001}, {0x0104D8, 0x0004}, -{0x0104FC, 0x0080}, +{0x0104FC, 0x0001}, {0x010500, 0x0004}, -{0x010528, 0x0080}, +{0x010528, 0x0001}, {0x010530, 0x0004}, -{0x010564, 0x0080}, +{0x010564, 0x0001}, {0x01056F, 0x0020}, {0x010570, 0x0004}, -{0x01057B, 0x0080}, +{0x01057B, 0x0001}, {0x01057C, 0x0004}, -{0x01058B, 0x0080}, +{0x01058B, 0x0001}, {0x01058C, 0x0004}, -{0x010593, 0x0080}, +{0x010593, 0x0001}, {0x010594, 0x0004}, -{0x010596, 0x0080}, +{0x010596, 0x0001}, {0x010597, 0x0004}, -{0x0105A2, 0x0080}, +{0x0105A2, 0x0001}, {0x0105A3, 0x0004}, -{0x0105B2, 0x0080}, +{0x0105B2, 0x0001}, {0x0105B3, 0x0004}, -{0x0105BA, 0x0080}, +{0x0105BA, 0x0001}, {0x0105BB, 0x0004}, -{0x0105BD, 0x0080}, +{0x0105BD, 0x0001}, {0x010600, 0x0004}, -{0x010737, 0x0080}, 
+{0x010737, 0x0001}, {0x010740, 0x0004}, -{0x010756, 0x0080}, +{0x010756, 0x0001}, {0x010760, 0x0004}, -{0x010768, 0x0080}, +{0x010768, 0x0001}, {0x010780, 0x0004}, -{0x010786, 0x0080}, +{0x010786, 0x0001}, {0x010787, 0x0004}, -{0x0107B1, 0x0080}, +{0x0107B1, 0x0001}, {0x0107B2, 0x0004}, -{0x0107BB, 0x0080}, +{0x0107BB, 0x0001}, {0x010800, 0x0004}, -{0x010806, 0x0080}, +{0x010806, 0x0001}, {0x010808, 0x0004}, -{0x010809, 0x0080}, +{0x010809, 0x0001}, {0x01080A, 0x0004}, -{0x010836, 0x0080}, +{0x010836, 0x0001}, {0x010837, 0x0004}, -{0x010839, 0x0080}, +{0x010839, 0x0001}, {0x01083C, 0x0004}, -{0x01083D, 0x0080}, +{0x01083D, 0x0001}, {0x01083F, 0x0004}, -{0x010856, 0x0080}, +{0x010856, 0x0001}, {0x010857, 0x0020}, {0x010858, 0x0002}, {0x010860, 0x0004}, {0x010877, 0x0040}, {0x010879, 0x0002}, {0x010880, 0x0004}, -{0x01089F, 0x0080}, +{0x01089F, 0x0001}, {0x0108A7, 0x0002}, -{0x0108B0, 0x0080}, +{0x0108B0, 0x0001}, {0x0108E0, 0x0004}, -{0x0108F3, 0x0080}, +{0x0108F3, 0x0001}, {0x0108F4, 0x0004}, -{0x0108F6, 0x0080}, +{0x0108F6, 0x0001}, {0x0108FB, 0x0002}, {0x010900, 0x0004}, {0x010916, 0x0002}, -{0x01091C, 0x0080}, +{0x01091C, 0x0001}, {0x01091F, 0x0020}, {0x010920, 0x0004}, -{0x01093A, 0x0080}, +{0x01093A, 0x0001}, {0x01093F, 0x0020}, -{0x010940, 0x0080}, +{0x010940, 0x0001}, {0x010980, 0x0004}, -{0x0109B8, 0x0080}, +{0x0109B8, 0x0001}, {0x0109BC, 0x0002}, {0x0109BE, 0x0004}, {0x0109C0, 0x0002}, -{0x0109D0, 0x0080}, +{0x0109D0, 0x0001}, {0x0109D2, 0x0002}, {0x010A00, 0x0004}, {0x010A01, 0x0010}, -{0x010A04, 0x0080}, +{0x010A04, 0x0001}, {0x010A05, 0x0010}, -{0x010A07, 0x0080}, +{0x010A07, 0x0001}, {0x010A0C, 0x0010}, {0x010A10, 0x0004}, -{0x010A14, 0x0080}, +{0x010A14, 0x0001}, {0x010A15, 0x0004}, -{0x010A18, 0x0080}, +{0x010A18, 0x0001}, {0x010A19, 0x0004}, -{0x010A36, 0x0080}, +{0x010A36, 0x0001}, {0x010A38, 0x0010}, -{0x010A3B, 0x0080}, +{0x010A3B, 0x0001}, {0x010A3F, 0x0010}, {0x010A40, 0x0002}, -{0x010A49, 0x0080}, +{0x010A49, 0x0001}, {0x010A50, 0x0020}, 
-{0x010A59, 0x0080}, +{0x010A59, 0x0001}, {0x010A60, 0x0004}, {0x010A7D, 0x0002}, {0x010A7F, 0x0020}, {0x010A80, 0x0004}, {0x010A9D, 0x0002}, -{0x010AA0, 0x0080}, +{0x010AA0, 0x0001}, {0x010AC0, 0x0004}, {0x010AC8, 0x0040}, {0x010AC9, 0x0004}, {0x010AE5, 0x0010}, -{0x010AE7, 0x0080}, +{0x010AE7, 0x0001}, {0x010AEB, 0x0002}, {0x010AF0, 0x0020}, -{0x010AF7, 0x0080}, +{0x010AF7, 0x0001}, {0x010B00, 0x0004}, -{0x010B36, 0x0080}, +{0x010B36, 0x0001}, {0x010B39, 0x0020}, {0x010B40, 0x0004}, -{0x010B56, 0x0080}, +{0x010B56, 0x0001}, {0x010B58, 0x0002}, {0x010B60, 0x0004}, -{0x010B73, 0x0080}, +{0x010B73, 0x0001}, {0x010B78, 0x0002}, {0x010B80, 0x0004}, -{0x010B92, 0x0080}, +{0x010B92, 0x0001}, {0x010B99, 0x0020}, -{0x010B9D, 0x0080}, +{0x010B9D, 0x0001}, {0x010BA9, 0x0002}, -{0x010BB0, 0x0080}, +{0x010BB0, 0x0001}, {0x010C00, 0x0004}, -{0x010C49, 0x0080}, +{0x010C49, 0x0001}, {0x010C80, 0x0004}, -{0x010CB3, 0x0080}, +{0x010CB3, 0x0001}, {0x010CC0, 0x0004}, -{0x010CF3, 0x0080}, +{0x010CF3, 0x0001}, {0x010CFA, 0x0002}, {0x010D00, 0x0004}, {0x010D24, 0x0010}, -{0x010D28, 0x0080}, +{0x010D28, 0x0001}, {0x010D30, 0x0002}, -{0x010D3A, 0x0080}, +{0x010D3A, 0x0001}, {0x010E60, 0x0002}, -{0x010E7F, 0x0080}, +{0x010E7F, 0x0001}, {0x010E80, 0x0004}, -{0x010EAA, 0x0080}, +{0x010EAA, 0x0001}, {0x010EAB, 0x0010}, {0x010EAD, 0x0020}, -{0x010EAE, 0x0080}, +{0x010EAE, 0x0001}, {0x010EB0, 0x0004}, -{0x010EB2, 0x0080}, +{0x010EB2, 0x0001}, {0x010EFD, 0x0010}, {0x010F00, 0x0004}, {0x010F1D, 0x0002}, {0x010F27, 0x0004}, -{0x010F28, 0x0080}, +{0x010F28, 0x0001}, {0x010F30, 0x0004}, {0x010F46, 0x0010}, {0x010F51, 0x0002}, {0x010F55, 0x0020}, -{0x010F5A, 0x0080}, +{0x010F5A, 0x0001}, {0x010F70, 0x0004}, {0x010F82, 0x0010}, {0x010F86, 0x0020}, -{0x010F8A, 0x0080}, +{0x010F8A, 0x0001}, {0x010FB0, 0x0004}, {0x010FC5, 0x0002}, -{0x010FCC, 0x0080}, +{0x010FCC, 0x0001}, {0x010FE0, 0x0004}, -{0x010FF7, 0x0080}, +{0x010FF7, 0x0001}, {0x011000, 0x0010}, {0x011003, 0x0004}, {0x011038, 0x0010}, {0x011047, 
0x0020}, -{0x01104E, 0x0080}, +{0x01104E, 0x0001}, {0x011052, 0x0002}, {0x011070, 0x0010}, {0x011071, 0x0004}, {0x011073, 0x0010}, {0x011075, 0x0004}, -{0x011076, 0x0080}, +{0x011076, 0x0001}, {0x01107F, 0x0010}, {0x011083, 0x0004}, {0x0110B0, 0x0010}, @@ -1529,26 +1539,28 @@ const std::vector> unicode_ranges_flags = { // st {0x0110BD, 0x0080}, {0x0110BE, 0x0020}, {0x0110C2, 0x0010}, -{0x0110C3, 0x0080}, +{0x0110C3, 0x0001}, +{0x0110CD, 0x0080}, +{0x0110CE, 0x0001}, {0x0110D0, 0x0004}, -{0x0110E9, 0x0080}, +{0x0110E9, 0x0001}, {0x0110F0, 0x0002}, -{0x0110FA, 0x0080}, +{0x0110FA, 0x0001}, {0x011100, 0x0010}, {0x011103, 0x0004}, {0x011127, 0x0010}, -{0x011135, 0x0080}, +{0x011135, 0x0001}, {0x011136, 0x0002}, {0x011140, 0x0020}, {0x011144, 0x0004}, {0x011145, 0x0010}, {0x011147, 0x0004}, -{0x011148, 0x0080}, +{0x011148, 0x0001}, {0x011150, 0x0004}, {0x011173, 0x0010}, {0x011174, 0x0020}, {0x011176, 0x0004}, -{0x011177, 0x0080}, +{0x011177, 0x0001}, {0x011180, 0x0010}, {0x011183, 0x0004}, {0x0111B3, 0x0010}, @@ -1562,159 +1574,159 @@ const std::vector> unicode_ranges_flags = { // st {0x0111DB, 0x0020}, {0x0111DC, 0x0004}, {0x0111DD, 0x0020}, -{0x0111E0, 0x0080}, +{0x0111E0, 0x0001}, {0x0111E1, 0x0002}, -{0x0111F5, 0x0080}, +{0x0111F5, 0x0001}, {0x011200, 0x0004}, -{0x011212, 0x0080}, +{0x011212, 0x0001}, {0x011213, 0x0004}, {0x01122C, 0x0010}, {0x011238, 0x0020}, {0x01123E, 0x0010}, {0x01123F, 0x0004}, {0x011241, 0x0010}, -{0x011242, 0x0080}, +{0x011242, 0x0001}, {0x011280, 0x0004}, -{0x011287, 0x0080}, +{0x011287, 0x0001}, {0x011288, 0x0004}, -{0x011289, 0x0080}, +{0x011289, 0x0001}, {0x01128A, 0x0004}, -{0x01128E, 0x0080}, +{0x01128E, 0x0001}, {0x01128F, 0x0004}, -{0x01129E, 0x0080}, +{0x01129E, 0x0001}, {0x01129F, 0x0004}, {0x0112A9, 0x0020}, -{0x0112AA, 0x0080}, +{0x0112AA, 0x0001}, {0x0112B0, 0x0004}, {0x0112DF, 0x0010}, -{0x0112EB, 0x0080}, +{0x0112EB, 0x0001}, {0x0112F0, 0x0002}, -{0x0112FA, 0x0080}, +{0x0112FA, 0x0001}, {0x011300, 0x0010}, -{0x011304, 0x0080}, 
+{0x011304, 0x0001}, {0x011305, 0x0004}, -{0x01130D, 0x0080}, +{0x01130D, 0x0001}, {0x01130F, 0x0004}, -{0x011311, 0x0080}, +{0x011311, 0x0001}, {0x011313, 0x0004}, -{0x011329, 0x0080}, +{0x011329, 0x0001}, {0x01132A, 0x0004}, -{0x011331, 0x0080}, +{0x011331, 0x0001}, {0x011332, 0x0004}, -{0x011334, 0x0080}, +{0x011334, 0x0001}, {0x011335, 0x0004}, -{0x01133A, 0x0080}, +{0x01133A, 0x0001}, {0x01133B, 0x0010}, {0x01133D, 0x0004}, {0x01133E, 0x0010}, -{0x011345, 0x0080}, +{0x011345, 0x0001}, {0x011347, 0x0010}, -{0x011349, 0x0080}, +{0x011349, 0x0001}, {0x01134B, 0x0010}, -{0x01134E, 0x0080}, +{0x01134E, 0x0001}, {0x011350, 0x0004}, -{0x011351, 0x0080}, +{0x011351, 0x0001}, {0x011357, 0x0010}, -{0x011358, 0x0080}, +{0x011358, 0x0001}, {0x01135D, 0x0004}, {0x011362, 0x0010}, -{0x011364, 0x0080}, +{0x011364, 0x0001}, {0x011366, 0x0010}, -{0x01136D, 0x0080}, +{0x01136D, 0x0001}, {0x011370, 0x0010}, -{0x011375, 0x0080}, +{0x011375, 0x0001}, {0x011400, 0x0004}, {0x011435, 0x0010}, {0x011447, 0x0004}, {0x01144B, 0x0020}, {0x011450, 0x0002}, {0x01145A, 0x0020}, -{0x01145C, 0x0080}, +{0x01145C, 0x0001}, {0x01145D, 0x0020}, {0x01145E, 0x0010}, {0x01145F, 0x0004}, -{0x011462, 0x0080}, +{0x011462, 0x0001}, {0x011480, 0x0004}, {0x0114B0, 0x0010}, {0x0114C4, 0x0004}, {0x0114C6, 0x0020}, {0x0114C7, 0x0004}, -{0x0114C8, 0x0080}, +{0x0114C8, 0x0001}, {0x0114D0, 0x0002}, -{0x0114DA, 0x0080}, +{0x0114DA, 0x0001}, {0x011580, 0x0004}, {0x0115AF, 0x0010}, -{0x0115B6, 0x0080}, +{0x0115B6, 0x0001}, {0x0115B8, 0x0010}, {0x0115C1, 0x0020}, {0x0115D8, 0x0004}, {0x0115DC, 0x0010}, -{0x0115DE, 0x0080}, +{0x0115DE, 0x0001}, {0x011600, 0x0004}, {0x011630, 0x0010}, {0x011641, 0x0020}, {0x011644, 0x0004}, -{0x011645, 0x0080}, +{0x011645, 0x0001}, {0x011650, 0x0002}, -{0x01165A, 0x0080}, +{0x01165A, 0x0001}, {0x011660, 0x0020}, -{0x01166D, 0x0080}, +{0x01166D, 0x0001}, {0x011680, 0x0004}, {0x0116AB, 0x0010}, {0x0116B8, 0x0004}, {0x0116B9, 0x0020}, -{0x0116BA, 0x0080}, +{0x0116BA, 0x0001}, {0x0116C0, 
0x0002}, -{0x0116CA, 0x0080}, +{0x0116CA, 0x0001}, {0x011700, 0x0004}, -{0x01171B, 0x0080}, +{0x01171B, 0x0001}, {0x01171D, 0x0010}, -{0x01172C, 0x0080}, +{0x01172C, 0x0001}, {0x011730, 0x0002}, {0x01173C, 0x0020}, {0x01173F, 0x0040}, {0x011740, 0x0004}, -{0x011747, 0x0080}, +{0x011747, 0x0001}, {0x011800, 0x0004}, {0x01182C, 0x0010}, {0x01183B, 0x0020}, -{0x01183C, 0x0080}, +{0x01183C, 0x0001}, {0x0118A0, 0x0004}, {0x0118E0, 0x0002}, -{0x0118F3, 0x0080}, +{0x0118F3, 0x0001}, {0x0118FF, 0x0004}, -{0x011907, 0x0080}, +{0x011907, 0x0001}, {0x011909, 0x0004}, -{0x01190A, 0x0080}, +{0x01190A, 0x0001}, {0x01190C, 0x0004}, -{0x011914, 0x0080}, +{0x011914, 0x0001}, {0x011915, 0x0004}, -{0x011917, 0x0080}, +{0x011917, 0x0001}, {0x011918, 0x0004}, {0x011930, 0x0010}, -{0x011936, 0x0080}, +{0x011936, 0x0001}, {0x011937, 0x0010}, -{0x011939, 0x0080}, +{0x011939, 0x0001}, {0x01193B, 0x0010}, {0x01193F, 0x0004}, {0x011940, 0x0010}, {0x011941, 0x0004}, {0x011942, 0x0010}, {0x011944, 0x0020}, -{0x011947, 0x0080}, +{0x011947, 0x0001}, {0x011950, 0x0002}, -{0x01195A, 0x0080}, +{0x01195A, 0x0001}, {0x0119A0, 0x0004}, -{0x0119A8, 0x0080}, +{0x0119A8, 0x0001}, {0x0119AA, 0x0004}, {0x0119D1, 0x0010}, -{0x0119D8, 0x0080}, +{0x0119D8, 0x0001}, {0x0119DA, 0x0010}, {0x0119E1, 0x0004}, {0x0119E2, 0x0020}, {0x0119E3, 0x0004}, {0x0119E4, 0x0010}, -{0x0119E5, 0x0080}, +{0x0119E5, 0x0001}, {0x011A00, 0x0004}, {0x011A01, 0x0010}, {0x011A0B, 0x0004}, @@ -1723,7 +1735,7 @@ const std::vector> unicode_ranges_flags = { // st {0x011A3B, 0x0010}, {0x011A3F, 0x0020}, {0x011A47, 0x0010}, -{0x011A48, 0x0080}, +{0x011A48, 0x0001}, {0x011A50, 0x0004}, {0x011A51, 0x0010}, {0x011A5C, 0x0004}, @@ -1731,117 +1743,117 @@ const std::vector> unicode_ranges_flags = { // st {0x011A9A, 0x0020}, {0x011A9D, 0x0004}, {0x011A9E, 0x0020}, -{0x011AA3, 0x0080}, +{0x011AA3, 0x0001}, {0x011AB0, 0x0004}, -{0x011AF9, 0x0080}, +{0x011AF9, 0x0001}, {0x011B00, 0x0020}, -{0x011B0A, 0x0080}, +{0x011B0A, 0x0001}, {0x011C00, 0x0004}, 
-{0x011C09, 0x0080}, +{0x011C09, 0x0001}, {0x011C0A, 0x0004}, {0x011C2F, 0x0010}, -{0x011C37, 0x0080}, +{0x011C37, 0x0001}, {0x011C38, 0x0010}, {0x011C40, 0x0004}, {0x011C41, 0x0020}, -{0x011C46, 0x0080}, +{0x011C46, 0x0001}, {0x011C50, 0x0002}, -{0x011C6D, 0x0080}, +{0x011C6D, 0x0001}, {0x011C70, 0x0020}, {0x011C72, 0x0004}, -{0x011C90, 0x0080}, +{0x011C90, 0x0001}, {0x011C92, 0x0010}, -{0x011CA8, 0x0080}, +{0x011CA8, 0x0001}, {0x011CA9, 0x0010}, -{0x011CB7, 0x0080}, +{0x011CB7, 0x0001}, {0x011D00, 0x0004}, -{0x011D07, 0x0080}, +{0x011D07, 0x0001}, {0x011D08, 0x0004}, -{0x011D0A, 0x0080}, +{0x011D0A, 0x0001}, {0x011D0B, 0x0004}, {0x011D31, 0x0010}, -{0x011D37, 0x0080}, +{0x011D37, 0x0001}, {0x011D3A, 0x0010}, -{0x011D3B, 0x0080}, +{0x011D3B, 0x0001}, {0x011D3C, 0x0010}, -{0x011D3E, 0x0080}, +{0x011D3E, 0x0001}, {0x011D3F, 0x0010}, {0x011D46, 0x0004}, {0x011D47, 0x0010}, -{0x011D48, 0x0080}, +{0x011D48, 0x0001}, {0x011D50, 0x0002}, -{0x011D5A, 0x0080}, +{0x011D5A, 0x0001}, {0x011D60, 0x0004}, -{0x011D66, 0x0080}, +{0x011D66, 0x0001}, {0x011D67, 0x0004}, -{0x011D69, 0x0080}, +{0x011D69, 0x0001}, {0x011D6A, 0x0004}, {0x011D8A, 0x0010}, -{0x011D8F, 0x0080}, +{0x011D8F, 0x0001}, {0x011D90, 0x0010}, -{0x011D92, 0x0080}, +{0x011D92, 0x0001}, {0x011D93, 0x0010}, {0x011D98, 0x0004}, -{0x011D99, 0x0080}, +{0x011D99, 0x0001}, {0x011DA0, 0x0002}, -{0x011DAA, 0x0080}, +{0x011DAA, 0x0001}, {0x011EE0, 0x0004}, {0x011EF3, 0x0010}, {0x011EF7, 0x0020}, -{0x011EF9, 0x0080}, +{0x011EF9, 0x0001}, {0x011F00, 0x0010}, {0x011F02, 0x0004}, {0x011F03, 0x0010}, {0x011F04, 0x0004}, -{0x011F11, 0x0080}, +{0x011F11, 0x0001}, {0x011F12, 0x0004}, {0x011F34, 0x0010}, -{0x011F3B, 0x0080}, +{0x011F3B, 0x0001}, {0x011F3E, 0x0010}, {0x011F43, 0x0020}, {0x011F50, 0x0002}, -{0x011F5A, 0x0080}, +{0x011F5A, 0x0001}, {0x011FB0, 0x0004}, -{0x011FB1, 0x0080}, +{0x011FB1, 0x0001}, {0x011FC0, 0x0002}, {0x011FD5, 0x0040}, -{0x011FF2, 0x0080}, +{0x011FF2, 0x0001}, {0x011FFF, 0x0020}, {0x012000, 0x0004}, 
-{0x01239A, 0x0080}, +{0x01239A, 0x0001}, {0x012400, 0x0002}, -{0x01246F, 0x0080}, +{0x01246F, 0x0001}, {0x012470, 0x0020}, -{0x012475, 0x0080}, +{0x012475, 0x0001}, {0x012480, 0x0004}, -{0x012544, 0x0080}, +{0x012544, 0x0001}, {0x012F90, 0x0004}, {0x012FF1, 0x0020}, -{0x012FF3, 0x0080}, +{0x012FF3, 0x0001}, {0x013000, 0x0004}, {0x013430, 0x0080}, {0x013440, 0x0010}, {0x013441, 0x0004}, {0x013447, 0x0010}, -{0x013456, 0x0080}, +{0x013456, 0x0001}, {0x014400, 0x0004}, -{0x014647, 0x0080}, +{0x014647, 0x0001}, {0x016800, 0x0004}, -{0x016A39, 0x0080}, +{0x016A39, 0x0001}, {0x016A40, 0x0004}, -{0x016A5F, 0x0080}, +{0x016A5F, 0x0001}, {0x016A60, 0x0002}, -{0x016A6A, 0x0080}, +{0x016A6A, 0x0001}, {0x016A6E, 0x0020}, {0x016A70, 0x0004}, -{0x016ABF, 0x0080}, +{0x016ABF, 0x0001}, {0x016AC0, 0x0002}, -{0x016ACA, 0x0080}, +{0x016ACA, 0x0001}, {0x016AD0, 0x0004}, -{0x016AEE, 0x0080}, +{0x016AEE, 0x0001}, {0x016AF0, 0x0010}, {0x016AF5, 0x0020}, -{0x016AF6, 0x0080}, +{0x016AF6, 0x0001}, {0x016B00, 0x0004}, {0x016B30, 0x0010}, {0x016B37, 0x0020}, @@ -1849,81 +1861,82 @@ const std::vector> unicode_ranges_flags = { // st {0x016B40, 0x0004}, {0x016B44, 0x0020}, {0x016B45, 0x0040}, -{0x016B46, 0x0080}, +{0x016B46, 0x0001}, {0x016B50, 0x0002}, -{0x016B5A, 0x0080}, +{0x016B5A, 0x0001}, {0x016B5B, 0x0002}, -{0x016B62, 0x0080}, +{0x016B62, 0x0001}, {0x016B63, 0x0004}, -{0x016B78, 0x0080}, +{0x016B78, 0x0001}, {0x016B7D, 0x0004}, -{0x016B90, 0x0080}, +{0x016B90, 0x0001}, {0x016E40, 0x0004}, {0x016E80, 0x0002}, {0x016E97, 0x0020}, -{0x016E9B, 0x0080}, +{0x016E9B, 0x0001}, {0x016F00, 0x0004}, -{0x016F4B, 0x0080}, +{0x016F4B, 0x0001}, {0x016F4F, 0x0010}, {0x016F50, 0x0004}, {0x016F51, 0x0010}, -{0x016F88, 0x0080}, +{0x016F88, 0x0001}, {0x016F8F, 0x0010}, {0x016F93, 0x0004}, -{0x016FA0, 0x0080}, +{0x016FA0, 0x0001}, {0x016FE0, 0x0004}, {0x016FE2, 0x0020}, {0x016FE3, 0x0004}, {0x016FE4, 0x0010}, -{0x016FE5, 0x0080}, +{0x016FE5, 0x0001}, {0x016FF0, 0x0010}, -{0x016FF2, 0x0080}, +{0x016FF2, 
0x0001}, {0x017000, 0x0004}, -{0x0187F8, 0x0080}, +{0x0187F8, 0x0001}, {0x018800, 0x0004}, -{0x018CD6, 0x0080}, +{0x018CD6, 0x0001}, {0x018D00, 0x0004}, -{0x018D09, 0x0080}, +{0x018D09, 0x0001}, {0x01AFF0, 0x0004}, -{0x01AFF4, 0x0080}, +{0x01AFF4, 0x0001}, {0x01AFF5, 0x0004}, -{0x01AFFC, 0x0080}, +{0x01AFFC, 0x0001}, {0x01AFFD, 0x0004}, -{0x01AFFF, 0x0080}, +{0x01AFFF, 0x0001}, {0x01B000, 0x0004}, -{0x01B123, 0x0080}, +{0x01B123, 0x0001}, {0x01B132, 0x0004}, -{0x01B133, 0x0080}, +{0x01B133, 0x0001}, {0x01B150, 0x0004}, -{0x01B153, 0x0080}, +{0x01B153, 0x0001}, {0x01B155, 0x0004}, -{0x01B156, 0x0080}, +{0x01B156, 0x0001}, {0x01B164, 0x0004}, -{0x01B168, 0x0080}, +{0x01B168, 0x0001}, {0x01B170, 0x0004}, -{0x01B2FC, 0x0080}, +{0x01B2FC, 0x0001}, {0x01BC00, 0x0004}, -{0x01BC6B, 0x0080}, +{0x01BC6B, 0x0001}, {0x01BC70, 0x0004}, -{0x01BC7D, 0x0080}, +{0x01BC7D, 0x0001}, {0x01BC80, 0x0004}, -{0x01BC89, 0x0080}, +{0x01BC89, 0x0001}, {0x01BC90, 0x0004}, -{0x01BC9A, 0x0080}, +{0x01BC9A, 0x0001}, {0x01BC9C, 0x0040}, {0x01BC9D, 0x0010}, {0x01BC9F, 0x0020}, {0x01BCA0, 0x0080}, +{0x01BCA4, 0x0001}, {0x01CF00, 0x0010}, -{0x01CF2E, 0x0080}, +{0x01CF2E, 0x0001}, {0x01CF30, 0x0010}, -{0x01CF47, 0x0080}, +{0x01CF47, 0x0001}, {0x01CF50, 0x0040}, -{0x01CFC4, 0x0080}, +{0x01CFC4, 0x0001}, {0x01D000, 0x0040}, -{0x01D0F6, 0x0080}, +{0x01D0F6, 0x0001}, {0x01D100, 0x0040}, -{0x01D127, 0x0080}, +{0x01D127, 0x0001}, {0x01D129, 0x0040}, {0x01D165, 0x0010}, {0x01D16A, 0x0040}, @@ -1935,57 +1948,57 @@ const std::vector> unicode_ranges_flags = { // st {0x01D18C, 0x0040}, {0x01D1AA, 0x0010}, {0x01D1AE, 0x0040}, -{0x01D1EB, 0x0080}, +{0x01D1EB, 0x0001}, {0x01D200, 0x0040}, {0x01D242, 0x0010}, {0x01D245, 0x0040}, -{0x01D246, 0x0080}, +{0x01D246, 0x0001}, {0x01D2C0, 0x0002}, -{0x01D2D4, 0x0080}, +{0x01D2D4, 0x0001}, {0x01D2E0, 0x0002}, -{0x01D2F4, 0x0080}, +{0x01D2F4, 0x0001}, {0x01D300, 0x0040}, -{0x01D357, 0x0080}, +{0x01D357, 0x0001}, {0x01D360, 0x0002}, -{0x01D379, 0x0080}, +{0x01D379, 0x0001}, 
{0x01D400, 0x0004}, -{0x01D455, 0x0080}, +{0x01D455, 0x0001}, {0x01D456, 0x0004}, -{0x01D49D, 0x0080}, +{0x01D49D, 0x0001}, {0x01D49E, 0x0004}, -{0x01D4A0, 0x0080}, +{0x01D4A0, 0x0001}, {0x01D4A2, 0x0004}, -{0x01D4A3, 0x0080}, +{0x01D4A3, 0x0001}, {0x01D4A5, 0x0004}, -{0x01D4A7, 0x0080}, +{0x01D4A7, 0x0001}, {0x01D4A9, 0x0004}, -{0x01D4AD, 0x0080}, +{0x01D4AD, 0x0001}, {0x01D4AE, 0x0004}, -{0x01D4BA, 0x0080}, +{0x01D4BA, 0x0001}, {0x01D4BB, 0x0004}, -{0x01D4BC, 0x0080}, +{0x01D4BC, 0x0001}, {0x01D4BD, 0x0004}, -{0x01D4C4, 0x0080}, +{0x01D4C4, 0x0001}, {0x01D4C5, 0x0004}, -{0x01D506, 0x0080}, +{0x01D506, 0x0001}, {0x01D507, 0x0004}, -{0x01D50B, 0x0080}, +{0x01D50B, 0x0001}, {0x01D50D, 0x0004}, -{0x01D515, 0x0080}, +{0x01D515, 0x0001}, {0x01D516, 0x0004}, -{0x01D51D, 0x0080}, +{0x01D51D, 0x0001}, {0x01D51E, 0x0004}, -{0x01D53A, 0x0080}, +{0x01D53A, 0x0001}, {0x01D53B, 0x0004}, -{0x01D53F, 0x0080}, +{0x01D53F, 0x0001}, {0x01D540, 0x0004}, -{0x01D545, 0x0080}, +{0x01D545, 0x0001}, {0x01D546, 0x0004}, -{0x01D547, 0x0080}, +{0x01D547, 0x0001}, {0x01D54A, 0x0004}, -{0x01D551, 0x0080}, +{0x01D551, 0x0001}, {0x01D552, 0x0004}, -{0x01D6A6, 0x0080}, +{0x01D6A6, 0x0001}, {0x01D6A8, 0x0004}, {0x01D6C1, 0x0040}, {0x01D6C2, 0x0004}, @@ -2007,7 +2020,7 @@ const std::vector> unicode_ranges_flags = { // st {0x01D7AA, 0x0004}, {0x01D7C3, 0x0040}, {0x01D7C4, 0x0004}, -{0x01D7CC, 0x0080}, +{0x01D7CC, 0x0001}, {0x01D7CE, 0x0002}, {0x01D800, 0x0040}, {0x01DA00, 0x0010}, @@ -2019,251 +2032,283 @@ const std::vector> unicode_ranges_flags = { // st {0x01DA84, 0x0010}, {0x01DA85, 0x0040}, {0x01DA87, 0x0020}, -{0x01DA8C, 0x0080}, +{0x01DA8C, 0x0001}, {0x01DA9B, 0x0010}, -{0x01DAA0, 0x0080}, +{0x01DAA0, 0x0001}, {0x01DAA1, 0x0010}, -{0x01DAB0, 0x0080}, +{0x01DAB0, 0x0001}, {0x01DF00, 0x0004}, -{0x01DF1F, 0x0080}, +{0x01DF1F, 0x0001}, {0x01DF25, 0x0004}, -{0x01DF2B, 0x0080}, +{0x01DF2B, 0x0001}, {0x01E000, 0x0010}, -{0x01E007, 0x0080}, +{0x01E007, 0x0001}, {0x01E008, 0x0010}, -{0x01E019, 
0x0080}, +{0x01E019, 0x0001}, {0x01E01B, 0x0010}, -{0x01E022, 0x0080}, +{0x01E022, 0x0001}, {0x01E023, 0x0010}, -{0x01E025, 0x0080}, +{0x01E025, 0x0001}, {0x01E026, 0x0010}, -{0x01E02B, 0x0080}, +{0x01E02B, 0x0001}, {0x01E030, 0x0004}, -{0x01E06E, 0x0080}, +{0x01E06E, 0x0001}, {0x01E08F, 0x0010}, -{0x01E090, 0x0080}, +{0x01E090, 0x0001}, {0x01E100, 0x0004}, -{0x01E12D, 0x0080}, +{0x01E12D, 0x0001}, {0x01E130, 0x0010}, {0x01E137, 0x0004}, -{0x01E13E, 0x0080}, +{0x01E13E, 0x0001}, {0x01E140, 0x0002}, -{0x01E14A, 0x0080}, +{0x01E14A, 0x0001}, {0x01E14E, 0x0004}, {0x01E14F, 0x0040}, -{0x01E150, 0x0080}, +{0x01E150, 0x0001}, {0x01E290, 0x0004}, {0x01E2AE, 0x0010}, -{0x01E2AF, 0x0080}, +{0x01E2AF, 0x0001}, {0x01E2C0, 0x0004}, {0x01E2EC, 0x0010}, {0x01E2F0, 0x0002}, -{0x01E2FA, 0x0080}, +{0x01E2FA, 0x0001}, {0x01E2FF, 0x0040}, -{0x01E300, 0x0080}, +{0x01E300, 0x0001}, {0x01E4D0, 0x0004}, {0x01E4EC, 0x0010}, {0x01E4F0, 0x0002}, -{0x01E4FA, 0x0080}, +{0x01E4FA, 0x0001}, {0x01E7E0, 0x0004}, -{0x01E7E7, 0x0080}, +{0x01E7E7, 0x0001}, {0x01E7E8, 0x0004}, -{0x01E7EC, 0x0080}, +{0x01E7EC, 0x0001}, {0x01E7ED, 0x0004}, -{0x01E7EF, 0x0080}, +{0x01E7EF, 0x0001}, {0x01E7F0, 0x0004}, -{0x01E7FF, 0x0080}, +{0x01E7FF, 0x0001}, {0x01E800, 0x0004}, -{0x01E8C5, 0x0080}, +{0x01E8C5, 0x0001}, {0x01E8C7, 0x0002}, {0x01E8D0, 0x0010}, -{0x01E8D7, 0x0080}, +{0x01E8D7, 0x0001}, {0x01E900, 0x0004}, {0x01E944, 0x0010}, {0x01E94B, 0x0004}, -{0x01E94C, 0x0080}, +{0x01E94C, 0x0001}, {0x01E950, 0x0002}, -{0x01E95A, 0x0080}, +{0x01E95A, 0x0001}, {0x01E95E, 0x0020}, -{0x01E960, 0x0080}, +{0x01E960, 0x0001}, {0x01EC71, 0x0002}, {0x01ECAC, 0x0040}, {0x01ECAD, 0x0002}, {0x01ECB0, 0x0040}, {0x01ECB1, 0x0002}, -{0x01ECB5, 0x0080}, +{0x01ECB5, 0x0001}, {0x01ED01, 0x0002}, {0x01ED2E, 0x0040}, {0x01ED2F, 0x0002}, -{0x01ED3E, 0x0080}, +{0x01ED3E, 0x0001}, {0x01EE00, 0x0004}, -{0x01EE04, 0x0080}, +{0x01EE04, 0x0001}, {0x01EE05, 0x0004}, -{0x01EE20, 0x0080}, +{0x01EE20, 0x0001}, {0x01EE21, 0x0004}, -{0x01EE23, 
0x0080}, +{0x01EE23, 0x0001}, {0x01EE24, 0x0004}, -{0x01EE25, 0x0080}, +{0x01EE25, 0x0001}, {0x01EE27, 0x0004}, -{0x01EE28, 0x0080}, +{0x01EE28, 0x0001}, {0x01EE29, 0x0004}, -{0x01EE33, 0x0080}, +{0x01EE33, 0x0001}, {0x01EE34, 0x0004}, -{0x01EE38, 0x0080}, +{0x01EE38, 0x0001}, {0x01EE39, 0x0004}, -{0x01EE3A, 0x0080}, +{0x01EE3A, 0x0001}, {0x01EE3B, 0x0004}, -{0x01EE3C, 0x0080}, +{0x01EE3C, 0x0001}, {0x01EE42, 0x0004}, -{0x01EE43, 0x0080}, +{0x01EE43, 0x0001}, {0x01EE47, 0x0004}, -{0x01EE48, 0x0080}, +{0x01EE48, 0x0001}, {0x01EE49, 0x0004}, -{0x01EE4A, 0x0080}, +{0x01EE4A, 0x0001}, {0x01EE4B, 0x0004}, -{0x01EE4C, 0x0080}, +{0x01EE4C, 0x0001}, {0x01EE4D, 0x0004}, -{0x01EE50, 0x0080}, +{0x01EE50, 0x0001}, {0x01EE51, 0x0004}, -{0x01EE53, 0x0080}, +{0x01EE53, 0x0001}, {0x01EE54, 0x0004}, -{0x01EE55, 0x0080}, +{0x01EE55, 0x0001}, {0x01EE57, 0x0004}, -{0x01EE58, 0x0080}, +{0x01EE58, 0x0001}, {0x01EE59, 0x0004}, -{0x01EE5A, 0x0080}, +{0x01EE5A, 0x0001}, {0x01EE5B, 0x0004}, -{0x01EE5C, 0x0080}, +{0x01EE5C, 0x0001}, {0x01EE5D, 0x0004}, -{0x01EE5E, 0x0080}, +{0x01EE5E, 0x0001}, {0x01EE5F, 0x0004}, -{0x01EE60, 0x0080}, +{0x01EE60, 0x0001}, {0x01EE61, 0x0004}, -{0x01EE63, 0x0080}, +{0x01EE63, 0x0001}, {0x01EE64, 0x0004}, -{0x01EE65, 0x0080}, +{0x01EE65, 0x0001}, {0x01EE67, 0x0004}, -{0x01EE6B, 0x0080}, +{0x01EE6B, 0x0001}, {0x01EE6C, 0x0004}, -{0x01EE73, 0x0080}, +{0x01EE73, 0x0001}, {0x01EE74, 0x0004}, -{0x01EE78, 0x0080}, +{0x01EE78, 0x0001}, {0x01EE79, 0x0004}, -{0x01EE7D, 0x0080}, +{0x01EE7D, 0x0001}, {0x01EE7E, 0x0004}, -{0x01EE7F, 0x0080}, +{0x01EE7F, 0x0001}, {0x01EE80, 0x0004}, -{0x01EE8A, 0x0080}, +{0x01EE8A, 0x0001}, {0x01EE8B, 0x0004}, -{0x01EE9C, 0x0080}, +{0x01EE9C, 0x0001}, {0x01EEA1, 0x0004}, -{0x01EEA4, 0x0080}, +{0x01EEA4, 0x0001}, {0x01EEA5, 0x0004}, -{0x01EEAA, 0x0080}, +{0x01EEAA, 0x0001}, {0x01EEAB, 0x0004}, -{0x01EEBC, 0x0080}, +{0x01EEBC, 0x0001}, {0x01EEF0, 0x0040}, -{0x01EEF2, 0x0080}, +{0x01EEF2, 0x0001}, {0x01F000, 0x0040}, -{0x01F02C, 0x0080}, 
+{0x01F02C, 0x0001}, {0x01F030, 0x0040}, -{0x01F094, 0x0080}, +{0x01F094, 0x0001}, {0x01F0A0, 0x0040}, -{0x01F0AF, 0x0080}, +{0x01F0AF, 0x0001}, {0x01F0B1, 0x0040}, -{0x01F0C0, 0x0080}, +{0x01F0C0, 0x0001}, {0x01F0C1, 0x0040}, -{0x01F0D0, 0x0080}, +{0x01F0D0, 0x0001}, {0x01F0D1, 0x0040}, -{0x01F0F6, 0x0080}, +{0x01F0F6, 0x0001}, {0x01F100, 0x0002}, {0x01F10D, 0x0040}, -{0x01F1AE, 0x0080}, +{0x01F1AE, 0x0001}, {0x01F1E6, 0x0040}, -{0x01F203, 0x0080}, +{0x01F203, 0x0001}, {0x01F210, 0x0040}, -{0x01F23C, 0x0080}, +{0x01F23C, 0x0001}, {0x01F240, 0x0040}, -{0x01F249, 0x0080}, +{0x01F249, 0x0001}, {0x01F250, 0x0040}, -{0x01F252, 0x0080}, +{0x01F252, 0x0001}, {0x01F260, 0x0040}, -{0x01F266, 0x0080}, +{0x01F266, 0x0001}, {0x01F300, 0x0040}, -{0x01F6D8, 0x0080}, +{0x01F6D8, 0x0001}, {0x01F6DC, 0x0040}, -{0x01F6ED, 0x0080}, +{0x01F6ED, 0x0001}, {0x01F6F0, 0x0040}, -{0x01F6FD, 0x0080}, +{0x01F6FD, 0x0001}, {0x01F700, 0x0040}, -{0x01F777, 0x0080}, +{0x01F777, 0x0001}, {0x01F77B, 0x0040}, -{0x01F7DA, 0x0080}, +{0x01F7DA, 0x0001}, {0x01F7E0, 0x0040}, -{0x01F7EC, 0x0080}, +{0x01F7EC, 0x0001}, {0x01F7F0, 0x0040}, -{0x01F7F1, 0x0080}, +{0x01F7F1, 0x0001}, {0x01F800, 0x0040}, -{0x01F80C, 0x0080}, +{0x01F80C, 0x0001}, {0x01F810, 0x0040}, -{0x01F848, 0x0080}, +{0x01F848, 0x0001}, {0x01F850, 0x0040}, -{0x01F85A, 0x0080}, +{0x01F85A, 0x0001}, {0x01F860, 0x0040}, -{0x01F888, 0x0080}, +{0x01F888, 0x0001}, {0x01F890, 0x0040}, -{0x01F8AE, 0x0080}, +{0x01F8AE, 0x0001}, {0x01F8B0, 0x0040}, -{0x01F8B2, 0x0080}, +{0x01F8B2, 0x0001}, {0x01F900, 0x0040}, -{0x01FA54, 0x0080}, +{0x01FA54, 0x0001}, {0x01FA60, 0x0040}, -{0x01FA6E, 0x0080}, +{0x01FA6E, 0x0001}, {0x01FA70, 0x0040}, -{0x01FA7D, 0x0080}, +{0x01FA7D, 0x0001}, {0x01FA80, 0x0040}, -{0x01FA89, 0x0080}, +{0x01FA89, 0x0001}, {0x01FA90, 0x0040}, -{0x01FABE, 0x0080}, +{0x01FABE, 0x0001}, {0x01FABF, 0x0040}, -{0x01FAC6, 0x0080}, +{0x01FAC6, 0x0001}, {0x01FACE, 0x0040}, -{0x01FADC, 0x0080}, +{0x01FADC, 0x0001}, {0x01FAE0, 0x0040}, -{0x01FAE9, 
0x0080}, +{0x01FAE9, 0x0001}, {0x01FAF0, 0x0040}, -{0x01FAF9, 0x0080}, +{0x01FAF9, 0x0001}, {0x01FB00, 0x0040}, -{0x01FB93, 0x0080}, +{0x01FB93, 0x0001}, {0x01FB94, 0x0040}, -{0x01FBCB, 0x0080}, +{0x01FBCB, 0x0001}, {0x01FBF0, 0x0002}, -{0x01FBFA, 0x0080}, +{0x01FBFA, 0x0001}, {0x020000, 0x0004}, -{0x02A6E0, 0x0080}, +{0x02A6E0, 0x0001}, {0x02A700, 0x0004}, -{0x02B73A, 0x0080}, +{0x02B73A, 0x0001}, {0x02B740, 0x0004}, -{0x02B81E, 0x0080}, +{0x02B81E, 0x0001}, {0x02B820, 0x0004}, -{0x02CEA2, 0x0080}, +{0x02CEA2, 0x0001}, {0x02CEB0, 0x0004}, -{0x02EBE1, 0x0080}, +{0x02EBE1, 0x0001}, {0x02EBF0, 0x0004}, -{0x02EE5E, 0x0080}, +{0x02EE5E, 0x0001}, {0x02F800, 0x0004}, -{0x02FA1E, 0x0080}, +{0x02FA1E, 0x0001}, {0x030000, 0x0004}, -{0x03134B, 0x0080}, +{0x03134B, 0x0001}, {0x031350, 0x0004}, -{0x0323B0, 0x0080}, +{0x0323B0, 0x0001}, +{0x0E0001, 0x0080}, +{0x0E0002, 0x0001}, +{0x0E0020, 0x0080}, +{0x0E0080, 0x0001}, {0x0E0100, 0x0010}, -{0x0E01F0, 0x0080}, +{0x0E01F0, 0x0001}, +{0x0F0000, 0x0080}, +{0x0FFFFE, 0x0001}, +{0x100000, 0x0080}, +{0x10FFFE, 0x0001}, {0x110000, 0x0000}, }; const std::unordered_set unicode_set_whitespace = { -0x000009, 0x00000A, 0x00000B, 0x00000C, 0x00000D, 0x000020, 0x000085, 0x0000A0, 0x001680, 0x002000, 0x002001, 0x002002, 0x002003, 0x002004, 0x002005, 0x002006, 0x002007, 0x002008, 0x002009, 0x00200A, 0x002028, 0x002029, 0x00202F, 0x00205F, 0x003000 +0x000009, +0x00000A, +0x00000B, +0x00000C, +0x00000D, +0x000020, +0x000085, +0x0000A0, +0x001680, +0x002000, +0x002001, +0x002002, +0x002003, +0x002004, +0x002005, +0x002006, +0x002007, +0x002008, +0x002009, +0x00200A, +0x002028, +0x002029, +0x00202F, +0x00205F, +0x003000, }; const std::unordered_map unicode_map_lowercase = { @@ -3222,6 +3267,7 @@ const std::unordered_map unicode_map_lowercase = { {0x002C2C, 0x002C5C}, {0x002C2D, 0x002C5D}, {0x002C2E, 0x002C5E}, +{0x002C2F, 0x002C5F}, {0x002C60, 0x002C61}, {0x002C62, 0x00026B}, {0x002C63, 0x001D7D}, @@ -3402,12 +3448,16 @@ const std::unordered_map 
unicode_map_lowercase = { {0x00A7BA, 0x00A7BB}, {0x00A7BC, 0x00A7BD}, {0x00A7BE, 0x00A7BF}, +{0x00A7C0, 0x00A7C1}, {0x00A7C2, 0x00A7C3}, {0x00A7C4, 0x00A794}, {0x00A7C5, 0x000282}, {0x00A7C6, 0x001D8E}, {0x00A7C7, 0x00A7C8}, {0x00A7C9, 0x00A7CA}, +{0x00A7D0, 0x00A7D1}, +{0x00A7D6, 0x00A7D7}, +{0x00A7D8, 0x00A7D9}, {0x00A7F5, 0x00A7F6}, {0x00FF21, 0x00FF41}, {0x00FF22, 0x00FF42}, @@ -3511,6 +3561,41 @@ const std::unordered_map unicode_map_lowercase = { {0x0104D1, 0x0104F9}, {0x0104D2, 0x0104FA}, {0x0104D3, 0x0104FB}, +{0x010570, 0x010597}, +{0x010571, 0x010598}, +{0x010572, 0x010599}, +{0x010573, 0x01059A}, +{0x010574, 0x01059B}, +{0x010575, 0x01059C}, +{0x010576, 0x01059D}, +{0x010577, 0x01059E}, +{0x010578, 0x01059F}, +{0x010579, 0x0105A0}, +{0x01057A, 0x0105A1}, +{0x01057C, 0x0105A3}, +{0x01057D, 0x0105A4}, +{0x01057E, 0x0105A5}, +{0x01057F, 0x0105A6}, +{0x010580, 0x0105A7}, +{0x010581, 0x0105A8}, +{0x010582, 0x0105A9}, +{0x010583, 0x0105AA}, +{0x010584, 0x0105AB}, +{0x010585, 0x0105AC}, +{0x010586, 0x0105AD}, +{0x010587, 0x0105AE}, +{0x010588, 0x0105AF}, +{0x010589, 0x0105B0}, +{0x01058A, 0x0105B1}, +{0x01058C, 0x0105B3}, +{0x01058D, 0x0105B4}, +{0x01058E, 0x0105B5}, +{0x01058F, 0x0105B6}, +{0x010590, 0x0105B7}, +{0x010591, 0x0105B8}, +{0x010592, 0x0105B9}, +{0x010594, 0x0105BB}, +{0x010595, 0x0105BC}, {0x010C80, 0x010CC0}, {0x010C81, 0x010CC1}, {0x010C82, 0x010CC2}, @@ -3690,7 +3775,6 @@ const std::unordered_map unicode_map_uppercase = { {0x000079, 0x000059}, {0x00007A, 0x00005A}, {0x0000B5, 0x00039C}, -{0x0000DF, 0x000053}, {0x0000E0, 0x0000C0}, {0x0000E1, 0x0000C1}, {0x0000E2, 0x0000C2}, @@ -3758,7 +3842,6 @@ const std::unordered_map unicode_map_uppercase = { {0x000144, 0x000143}, {0x000146, 0x000145}, {0x000148, 0x000147}, -{0x000149, 0x0002BC}, {0x00014B, 0x00014A}, {0x00014D, 0x00014C}, {0x00014F, 0x00014E}, @@ -3831,7 +3914,6 @@ const std::unordered_map unicode_map_uppercase = { {0x0001EB, 0x0001EA}, {0x0001ED, 0x0001EC}, {0x0001EF, 0x0001EE}, -{0x0001F0, 
0x00004A}, {0x0001F2, 0x0001F1}, {0x0001F3, 0x0001F1}, {0x0001F5, 0x0001F4}, @@ -3917,12 +3999,10 @@ const std::unordered_map unicode_map_uppercase = { {0x00037B, 0x0003FD}, {0x00037C, 0x0003FE}, {0x00037D, 0x0003FF}, -{0x000390, 0x000399}, {0x0003AC, 0x000386}, {0x0003AD, 0x000388}, {0x0003AE, 0x000389}, {0x0003AF, 0x00038A}, -{0x0003B0, 0x0003A5}, {0x0003B1, 0x000391}, {0x0003B2, 0x000392}, {0x0003B3, 0x000393}, @@ -4163,7 +4243,6 @@ const std::unordered_map unicode_map_uppercase = { {0x000584, 0x000554}, {0x000585, 0x000555}, {0x000586, 0x000556}, -{0x000587, 0x000535}, {0x0010D0, 0x001C90}, {0x0010D1, 0x001C91}, {0x0010D2, 0x001C92}, @@ -4303,11 +4382,6 @@ const std::unordered_map unicode_map_uppercase = { {0x001E91, 0x001E90}, {0x001E93, 0x001E92}, {0x001E95, 0x001E94}, -{0x001E96, 0x000048}, -{0x001E97, 0x000054}, -{0x001E98, 0x000057}, -{0x001E99, 0x000059}, -{0x001E9A, 0x000041}, {0x001E9B, 0x001E60}, {0x001EA1, 0x001EA0}, {0x001EA3, 0x001EA2}, @@ -4393,13 +4467,9 @@ const std::unordered_map unicode_map_uppercase = { {0x001F43, 0x001F4B}, {0x001F44, 0x001F4C}, {0x001F45, 0x001F4D}, -{0x001F50, 0x0003A5}, {0x001F51, 0x001F59}, -{0x001F52, 0x0003A5}, {0x001F53, 0x001F5B}, -{0x001F54, 0x0003A5}, {0x001F55, 0x001F5D}, -{0x001F56, 0x0003A5}, {0x001F57, 0x001F5F}, {0x001F60, 0x001F68}, {0x001F61, 0x001F69}, @@ -4423,89 +4493,41 @@ const std::unordered_map unicode_map_uppercase = { {0x001F7B, 0x001FEB}, {0x001F7C, 0x001FFA}, {0x001F7D, 0x001FFB}, -{0x001F80, 0x001F08}, -{0x001F81, 0x001F09}, -{0x001F82, 0x001F0A}, -{0x001F83, 0x001F0B}, -{0x001F84, 0x001F0C}, -{0x001F85, 0x001F0D}, -{0x001F86, 0x001F0E}, -{0x001F87, 0x001F0F}, -{0x001F88, 0x001F08}, -{0x001F89, 0x001F09}, -{0x001F8A, 0x001F0A}, -{0x001F8B, 0x001F0B}, -{0x001F8C, 0x001F0C}, -{0x001F8D, 0x001F0D}, -{0x001F8E, 0x001F0E}, -{0x001F8F, 0x001F0F}, -{0x001F90, 0x001F28}, -{0x001F91, 0x001F29}, -{0x001F92, 0x001F2A}, -{0x001F93, 0x001F2B}, -{0x001F94, 0x001F2C}, -{0x001F95, 0x001F2D}, -{0x001F96, 
0x001F2E}, -{0x001F97, 0x001F2F}, -{0x001F98, 0x001F28}, -{0x001F99, 0x001F29}, -{0x001F9A, 0x001F2A}, -{0x001F9B, 0x001F2B}, -{0x001F9C, 0x001F2C}, -{0x001F9D, 0x001F2D}, -{0x001F9E, 0x001F2E}, -{0x001F9F, 0x001F2F}, -{0x001FA0, 0x001F68}, -{0x001FA1, 0x001F69}, -{0x001FA2, 0x001F6A}, -{0x001FA3, 0x001F6B}, -{0x001FA4, 0x001F6C}, -{0x001FA5, 0x001F6D}, -{0x001FA6, 0x001F6E}, -{0x001FA7, 0x001F6F}, -{0x001FA8, 0x001F68}, -{0x001FA9, 0x001F69}, -{0x001FAA, 0x001F6A}, -{0x001FAB, 0x001F6B}, -{0x001FAC, 0x001F6C}, -{0x001FAD, 0x001F6D}, -{0x001FAE, 0x001F6E}, -{0x001FAF, 0x001F6F}, +{0x001F80, 0x001F88}, +{0x001F81, 0x001F89}, +{0x001F82, 0x001F8A}, +{0x001F83, 0x001F8B}, +{0x001F84, 0x001F8C}, +{0x001F85, 0x001F8D}, +{0x001F86, 0x001F8E}, +{0x001F87, 0x001F8F}, +{0x001F90, 0x001F98}, +{0x001F91, 0x001F99}, +{0x001F92, 0x001F9A}, +{0x001F93, 0x001F9B}, +{0x001F94, 0x001F9C}, +{0x001F95, 0x001F9D}, +{0x001F96, 0x001F9E}, +{0x001F97, 0x001F9F}, +{0x001FA0, 0x001FA8}, +{0x001FA1, 0x001FA9}, +{0x001FA2, 0x001FAA}, +{0x001FA3, 0x001FAB}, +{0x001FA4, 0x001FAC}, +{0x001FA5, 0x001FAD}, +{0x001FA6, 0x001FAE}, +{0x001FA7, 0x001FAF}, {0x001FB0, 0x001FB8}, {0x001FB1, 0x001FB9}, -{0x001FB2, 0x001FBA}, -{0x001FB3, 0x000391}, -{0x001FB4, 0x000386}, -{0x001FB6, 0x000391}, -{0x001FB7, 0x000391}, -{0x001FBC, 0x000391}, +{0x001FB3, 0x001FBC}, {0x001FBE, 0x000399}, -{0x001FC2, 0x001FCA}, -{0x001FC3, 0x000397}, -{0x001FC4, 0x000389}, -{0x001FC6, 0x000397}, -{0x001FC7, 0x000397}, -{0x001FCC, 0x000397}, +{0x001FC3, 0x001FCC}, {0x001FD0, 0x001FD8}, {0x001FD1, 0x001FD9}, -{0x001FD2, 0x000399}, -{0x001FD3, 0x000399}, -{0x001FD6, 0x000399}, -{0x001FD7, 0x000399}, {0x001FE0, 0x001FE8}, {0x001FE1, 0x001FE9}, -{0x001FE2, 0x0003A5}, -{0x001FE3, 0x0003A5}, -{0x001FE4, 0x0003A1}, {0x001FE5, 0x001FEC}, -{0x001FE6, 0x0003A5}, -{0x001FE7, 0x0003A5}, -{0x001FF2, 0x001FFA}, -{0x001FF3, 0x0003A9}, -{0x001FF4, 0x00038F}, -{0x001FF6, 0x0003A9}, -{0x001FF7, 0x0003A9}, -{0x001FFC, 0x0003A9}, +{0x001FF3, 
0x001FFC}, {0x00214E, 0x002132}, {0x002170, 0x002160}, {0x002171, 0x002161}, @@ -4597,6 +4619,7 @@ const std::unordered_map unicode_map_uppercase = { {0x002C5C, 0x002C2C}, {0x002C5D, 0x002C2D}, {0x002C5E, 0x002C2E}, +{0x002C5F, 0x002C2F}, {0x002C61, 0x002C60}, {0x002C65, 0x00023A}, {0x002C66, 0x00023E}, @@ -4800,9 +4823,13 @@ const std::unordered_map unicode_map_uppercase = { {0x00A7BB, 0x00A7BA}, {0x00A7BD, 0x00A7BC}, {0x00A7BF, 0x00A7BE}, +{0x00A7C1, 0x00A7C0}, {0x00A7C3, 0x00A7C2}, {0x00A7C8, 0x00A7C7}, {0x00A7CA, 0x00A7C9}, +{0x00A7D1, 0x00A7D0}, +{0x00A7D7, 0x00A7D6}, +{0x00A7D9, 0x00A7D8}, {0x00A7F6, 0x00A7F5}, {0x00AB53, 0x00A7B3}, {0x00AB70, 0x0013A0}, @@ -4885,18 +4912,6 @@ const std::unordered_map unicode_map_uppercase = { {0x00ABBD, 0x0013ED}, {0x00ABBE, 0x0013EE}, {0x00ABBF, 0x0013EF}, -{0x00FB00, 0x000046}, -{0x00FB01, 0x000046}, -{0x00FB02, 0x000046}, -{0x00FB03, 0x000046}, -{0x00FB04, 0x000046}, -{0x00FB05, 0x000053}, -{0x00FB06, 0x000053}, -{0x00FB13, 0x000544}, -{0x00FB14, 0x000544}, -{0x00FB15, 0x000544}, -{0x00FB16, 0x00054E}, -{0x00FB17, 0x000544}, {0x00FF41, 0x00FF21}, {0x00FF42, 0x00FF22}, {0x00FF43, 0x00FF23}, @@ -4999,6 +5014,41 @@ const std::unordered_map unicode_map_uppercase = { {0x0104F9, 0x0104D1}, {0x0104FA, 0x0104D2}, {0x0104FB, 0x0104D3}, +{0x010597, 0x010570}, +{0x010598, 0x010571}, +{0x010599, 0x010572}, +{0x01059A, 0x010573}, +{0x01059B, 0x010574}, +{0x01059C, 0x010575}, +{0x01059D, 0x010576}, +{0x01059E, 0x010577}, +{0x01059F, 0x010578}, +{0x0105A0, 0x010579}, +{0x0105A1, 0x01057A}, +{0x0105A3, 0x01057C}, +{0x0105A4, 0x01057D}, +{0x0105A5, 0x01057E}, +{0x0105A6, 0x01057F}, +{0x0105A7, 0x010580}, +{0x0105A8, 0x010581}, +{0x0105A9, 0x010582}, +{0x0105AA, 0x010583}, +{0x0105AB, 0x010584}, +{0x0105AC, 0x010585}, +{0x0105AD, 0x010586}, +{0x0105AE, 0x010587}, +{0x0105AF, 0x010588}, +{0x0105B0, 0x010589}, +{0x0105B1, 0x01058A}, +{0x0105B3, 0x01058C}, +{0x0105B4, 0x01058D}, +{0x0105B5, 0x01058E}, +{0x0105B6, 0x01058F}, +{0x0105B7, 
0x010590}, +{0x0105B8, 0x010591}, +{0x0105B9, 0x010592}, +{0x0105BB, 0x010594}, +{0x0105BC, 0x010595}, {0x010CC0, 0x010C80}, {0x010CC1, 0x010C81}, {0x010CC2, 0x010C82}, diff --git a/unicode.cpp b/unicode.cpp index 2f8d73832..c0b76bf20 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -226,8 +226,9 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t assert(offset_end <= cpts.size()); start = offset_end; + static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; auto _get_cpt = [&] (const size_t pos) -> uint32_t { - return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0; + return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; }; auto _get_flags = [&] (const size_t pos) -> codepoint_flags { @@ -309,7 +310,7 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t } // regex: \s+(?!\S) - if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) { + if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) { pos += num_whitespaces - 1; _add_token(pos); continue; @@ -344,8 +345,9 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & assert(offset_end <= cpts.size()); start = offset_end; + static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; auto _get_cpt = [&] (const size_t pos) -> uint32_t { - return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0; + return (offset_ini <= pos && pos < offset_end) ? 
cpts[pos] : OUT_OF_RANGE; }; auto _get_flags = [&] (const size_t pos) -> codepoint_flags { @@ -450,7 +452,7 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & } // regex: \s+(?!\S) - if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) { + if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) { pos += num_whitespaces - 1; _add_token(pos); continue; @@ -594,6 +596,7 @@ std::vector unicode_cpts_normalize_nfd(const std::vector & c std::vector unicode_cpts_from_utf8(const std::string & utf8) { std::vector result; + result.reserve(utf8.size()); size_t offset = 0; while (offset < utf8.size()) { result.push_back(unicode_cpt_from_utf8(utf8, offset)); @@ -679,10 +682,14 @@ std::vector unicode_regex_split(const std::string & text, const std continue; } - const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag(); + const auto flags = unicode_cpt_flags(cpts[i]); - if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) { - text_collapsed[i] = k_ucat_cpt.at(cpt_flag); + if (flags.is_whitespace) { + //NOTE: C++ std::regex \s does not match 0x85, Rust and Python regex do.
+ //text_collapsed[i] = (char) 0x85; // as whitespace fallback + text_collapsed[i] = (char) 0x0B; // as whitespace fallback + } else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) { + text_collapsed[i] = k_ucat_cpt.at(flags.category_flag()); } else { text_collapsed[i] = (char) 0xD0; // fallback } @@ -766,9 +773,16 @@ std::vector unicode_regex_split(const std::string & text, const std bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets); } else { // no unicode category used, we can use std::wregex directly - const std::wstring wtext = unicode_wstring_from_utf8(text); const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr); + // std::wregex \s does not match non-ASCII whitespace, using 0x0B as fallback + std::wstring wtext(cpts.begin(), cpts.end()); + for (size_t i = 0; i < wtext.size(); ++i) { + if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) { + wtext[i] = 0x0B; + } + } + //printf("text: %s\n", text.c_str()); //printf("regex_expr: %s\n", regex_expr.c_str()); bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);