From b526ad2668944a7b2b1721f60679153646313831 Mon Sep 17 00:00:00 2001 From: rspOverflow <217881046+rspOverflow@users.noreply.github.com> Date: Sun, 20 Jul 2025 23:55:32 +0700 Subject: [PATCH 1/7] Documentation: Further revisions to the Vulkan section in build.md (#14785) * Documentation: Revised and further improved the Vulkan instructions for Linux users in build.md. * Minor: Revise step 2 of the Vulkan instructions for Linux users in build.md --- docs/build.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/build.md b/docs/build.md index 50dbba486..849c82526 100644 --- a/docs/build.md +++ b/docs/build.md @@ -387,12 +387,12 @@ docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/ren ### For Linux users: -First, follow the the official [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide. +First, follow the official LunarG instructions for the installation and setup of the Vulkan SDK in the [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide. > [!IMPORTANT] > After completing the first step, ensure that you have used the `source` command on the `setup_env.sh` file inside of the Vulkan SDK in your current terminal session. Otherwise, the build won't work. Additionally, if you close out of your terminal, you must perform this step again if you intend to perform a build. However, there are ways to make this persistent. Refer to the Vulkan SDK guide linked in the first step for more information about any of this. -Second, after verifying that you have done everything in the Vulkan SDK guide provided in the first step, run the following command to verify that everything is set up correctly: +Second, after verifying that you have followed all of the SDK installation/setup steps, use this command to make sure before proceeding: ```bash vulkaninfo ``` @@ -403,10 +403,11 @@ cmake -B build -DGGML_VULKAN=1 cmake --build build --config Release ``` -Finally, after finishing your build, you should be able to do this: +Finally, after finishing your build, you should be able to do something like this: ```bash -# Test the output binary (with "-ngl 33" to offload all layers to GPU) -./build/bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 +# Test the output binary +# "-ngl 99" should offload all of the layers to GPU for most (if not all) models. +./build/bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -ngl 99 # You should see in the output, ggml_vulkan detected your GPU. For example: # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32 From 2be60cbc2707359241c2784f9d2e30d8fc7cdabb Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 21 Jul 2025 02:13:47 +0800 Subject: [PATCH 2/7] docs : fix link for tools/perplexity in README.md (#14780) --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index edde61238..6768d5a3d 100644 --- a/README.md +++ b/README.md @@ -436,7 +436,7 @@ To learn more about model quantization, [read this documentation](tools/quantize ## [`llama-perplexity`](tools/perplexity) -#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text. +#### A tool for measuring the [perplexity](tools/perplexity/README.md) [^1] (and other quality metrics) of a model over a given text. -
Measure the perplexity over a text file @@ -459,8 +459,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
-[^1]: [tools/perplexity/README.md](./tools/perplexity/README.md) -[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity) +[^1]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity) ## [`llama-bench`](tools/llama-bench) From b4efd77f8ab407836ca73a5176f041650c5b2411 Mon Sep 17 00:00:00 2001 From: IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com> Date: Mon, 21 Jul 2025 09:24:51 +0200 Subject: [PATCH 3/7] server : add parse_special option to /tokenize endpoint (#14783) --- tools/server/README.md | 2 ++ tools/server/server.cpp | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/server/README.md b/tools/server/README.md index e29511cb1..aa07f1ef5 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -575,6 +575,8 @@ These words will not be included in the completion, so make sure to add them to `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false` +`parse_special`: (Optional) Boolean indicating if special tokens should be tokenized. When `false` special tokens are treated as plaintext. Default: `true` + `with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false` **Response:** diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 0afe213af..256a2928b 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -4516,9 +4516,10 @@ int main(int argc, char ** argv) { json tokens_response = json::array(); if (body.count("content") != 0) { const bool add_special = json_value(body, "add_special", false); + const bool parse_special = json_value(body, "parse_special", true); const bool with_pieces = json_value(body, "with_pieces", false); - llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, true); + llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, parse_special); if (with_pieces) { for (const auto& token : tokens) { From c82d48ec23fb8749c341d0838f6891fd5f6b6da0 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Mon, 21 Jul 2025 17:38:36 +0800 Subject: [PATCH 4/7] llama : fix `--reverse-prompt` crashing issue (#14794) Signed-off-by: Molly Sophia --- tools/main/main.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 516bf0965..eb36c6884 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -785,14 +785,17 @@ int main(int argc, char ** argv) { } // check for reverse prompt using special tokens - llama_token last_token = common_sampler_last(smpl); - for (auto token : antiprompt_token) { - if (token == last_token) { - if (params.interactive) { - is_interacting = true; + // avoid calling common_sampler_last() if last_output is empty + if (!last_output.empty()) { + llama_token last_token = common_sampler_last(smpl); + for (auto token : antiprompt_token) { + if (token == last_token) { + if (params.interactive) { + is_interacting = true; + } + is_antiprompt = true; + break; } - is_antiprompt = true; - break; } } From c2e058f1b4e799f1be085560c1bcef95b7b5ed02 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Mon, 21 Jul 2025 06:35:40 -0500 Subject: [PATCH 5/7] vulkan/cuda: Fix im2col when KW!=KH (#14789) The tid is decomposed into "ow + ky*OW + kx*OW*KH". Change "ksize" to match. --- ggml/src/ggml-cuda/im2col.cu | 2 +- ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp | 6 ++---- tests/test-backend-ops.cpp | 1 + 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu index 86a54e42b..5bb85b480 100644 --- a/ggml/src/ggml-cuda/im2col.cu +++ b/ggml/src/ggml-cuda/im2col.cu @@ -10,7 +10,7 @@ static __global__ void im2col_kernel( return; } - const int64_t ksize = OW * (KH > 1 ? KW : 1); + const int64_t ksize = OW * KH; const int64_t kx = i / ksize; const int64_t kd = kx * ksize; const int64_t ky = (i - kd) / OW; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp index 17c7ccb90..fdbcf7eba 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp @@ -40,12 +40,10 @@ void main() { const uint src_base = ic * p.offset_delta + batch * p.batch_offset; const uint dst_base = ((batch * p.OH + oh) * p.OW) * p.CHW + ic * (p.KW * p.KH); const int oh_s1 = int(oh) * p.s1; - const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1); + const uint ksize = p.OW * p.KH; const uint base_linear_idx = gidx * NUM_ITER; - const uint max_ky = ksize / p.OW; - uint current_kx = base_linear_idx / ksize; const uint rem = base_linear_idx - (current_kx * ksize); uint current_ky = rem / p.OW; @@ -76,7 +74,7 @@ void main() { if (++current_ix == p.OW) { current_ix = 0; - if (++current_ky == max_ky) { + if (++current_ky == p.KH) { current_ky = 0; current_kx++; } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 731b4980a..a6d00542d 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -5093,6 +5093,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2048}, {3, 3, 2, 2048}, 1, 1, 1, 1, 1, 1, true)); test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2560}, {3, 3, 1, 2560}, 1, 1, 1, 1, 1, 1, true)); test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {5, 5, 1, 32}, {3, 4, 1, 32}, 1, 1, 0, 0, 1, 1, true)); // Conv_2D test cases #ifdef DETAILED_TESTS From 2ba1333b35e471b344974dde553db11bf1c2836f Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Mon, 21 Jul 2025 15:03:49 +0300 Subject: [PATCH 6/7] docs : fix backends table in README.md (#14796) --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 6768d5a3d..9b2e0f851 100644 --- a/README.md +++ b/README.md @@ -270,7 +270,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo | [CANN](docs/build.md#cann) | Ascend NPU | | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU | | [WebGPU [In Progress]](docs/build.md#webgpu) | All | - | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All | ## Obtaining and quantizing models From 922042601b8a16877ccb1c2afaa2071f76734f10 Mon Sep 17 00:00:00 2001 From: Charles Xu Date: Mon, 21 Jul 2025 15:49:52 +0200 Subject: [PATCH 7/7] kleidiai: add support for get_rows (#14676) * kleidiai: add support for get_rows * apply fixes based on code review * apply more fixes based on code review --- ggml/src/ggml-cpu/CMakeLists.txt | 4 +- ggml/src/ggml-cpu/kleidiai/kernels.cpp | 121 +++++++++++++++++++++--- ggml/src/ggml-cpu/kleidiai/kernels.h | 3 + ggml/src/ggml-cpu/kleidiai/kleidiai.cpp | 98 +++++++++++++++++-- 4 files changed, 202 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 66a5ad8d2..d9590b9d0 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -494,9 +494,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # Fetch KleidiAI sources: include(FetchContent) - set(KLEIDIAI_COMMIT_TAG "v1.9.0") + set(KLEIDIAI_COMMIT_TAG "v1.11.0") set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz") - set(KLEIDIAI_ARCHIVE_MD5 "2a8e1bb55d201557553545536489a017") + set(KLEIDIAI_ARCHIVE_MD5 "3fe9e5ab964c375c53839296eb71eaa2") if (POLICY CMP0135) cmake_policy(SET CMP0135 NEW) diff --git a/ggml/src/ggml-cpu/kleidiai/kernels.cpp b/ggml/src/ggml-cpu/kleidiai/kernels.cpp index 910fd0ee4..ddd29d002 100644 --- a/ggml/src/ggml-cpu/kleidiai/kernels.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kernels.cpp @@ -22,9 +22,94 @@ #include "kai_common.h" +#include "simd-mappings.h" + #include "kernels.h" #define NELEMS(x) sizeof(x) / sizeof(*x) + +static const size_t INT4_PER_BYTE = 2; +static const size_t INT4_BITS = 4; +static const int Q4_0_ZERO_POINT = 8; +const size_t INT4_PER_UINT16 = 4; + +static void dequantize_row_qsi4c32pscalef16( + const void *packed_data, + int32_t row_idx, + int64_t nc, + float *out, + size_t nr_pack, + size_t packed_row_stride, + size_t kr, + size_t bl, + size_t num_bytes_multiplier +) { + size_t group_idx = row_idx / nr_pack; + size_t row_in_group = row_idx % nr_pack; + const uint8_t *packed_group = (const uint8_t *)packed_data + group_idx * packed_row_stride; + size_t num_blocks = nc / bl; + const uint8_t *block_ptr = packed_group; + + for (size_t b = 0; b < num_blocks; ++b) { + uint16_t scale_f16 = *((const uint16_t *)(block_ptr + row_in_group * num_bytes_multiplier)); + float scale = GGML_CPU_FP16_TO_FP32(scale_f16); + + const uint8_t *segment_ptr = block_ptr + nr_pack * num_bytes_multiplier; + size_t num_segments = bl / kr; + size_t num_bytes_per_segment = kr / INT4_PER_BYTE; + + for (size_t s = 0; s < num_segments; ++s) { + const uint8_t *seg_base = segment_ptr + s * nr_pack * num_bytes_per_segment; + const uint8_t *qbytes = seg_base + row_in_group * num_bytes_per_segment; + for (size_t k = 0; k < num_bytes_per_segment; ++k) { + uint8_t byte = qbytes[k] ^ 0x88; + int x0 = (byte & 0x0F) - Q4_0_ZERO_POINT; + int x1 = (byte >> INT4_BITS) - Q4_0_ZERO_POINT; + out[b * bl + s * num_bytes_per_segment + k] = x0 * scale; + out[b * bl + s * num_bytes_per_segment + k + bl/2] = x1 * scale; + } + } + block_ptr += nr_pack * num_bytes_multiplier + num_segments * nr_pack * num_bytes_per_segment; + } +} + +static void dequantize_row_qsi4c32ps1s0scalef16( + const void *packed_data, + int32_t row_idx, + int64_t k, + float *out, + size_t nr, + size_t packed_row_stride, + size_t kr, + size_t bl, + size_t num_bytes_multiplier +) { + const size_t num_blocks = k / bl; + const size_t bl4 = bl / INT4_PER_UINT16; + + size_t group_idx = row_idx / nr; + size_t row_in_group = row_idx % nr; + + const uint8_t *packed_group = (const uint8_t *)packed_data + group_idx * packed_row_stride; + const uint16_t *qdata = (const uint16_t *)packed_group; + const uint16_t *scales = (const uint16_t *)(packed_group + packed_row_stride - (nr * num_blocks * num_bytes_multiplier)); + + for (size_t block_idx = 0; block_idx < num_blocks; ++block_idx) { + uint16_t scale_f16 = scales[row_in_group + block_idx * nr]; + float scale = GGML_CPU_FP16_TO_FP32(scale_f16); + + for (size_t bl4_idx = 0; bl4_idx < bl4; ++bl4_idx) { + uint16_t q = qdata[(block_idx * bl4 + bl4_idx) * nr + row_in_group]; + + for (size_t qidx = 0; qidx < INT4_PER_UINT16; ++qidx) { + int v = ((q >> (qidx * 4)) & 0xF) - Q4_0_ZERO_POINT; + out[block_idx * bl + bl4_idx * INT4_BITS + qidx] = v * scale; + } + } + } + GGML_UNUSED(kr); +} + static ggml_kleidiai_kernels gemm_gemv_kernels[] = { #if defined(__ARM_FEATURE_SME) { @@ -63,8 +148,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon, }, /* .rhs_info = */ { - /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon, - /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon, + /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon, + /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon, + /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon, + /* .to_float = */ dequantize_row_qsi4c32ps1s0scalef16, }, /* .required_cpu = */ CPU_FEATURE_SME, /* .lhs_type = */ GGML_TYPE_F32, @@ -107,8 +194,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .pack_func = */ kai_run_lhs_pack_bf16p2vlx2_f32_sme, }, /* .rhs_info = */ { - /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme, - /* .pack_func = */ kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme, + /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme, + /* .packed_stride = */ NULL, + /* .pack_func = */ kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme, + /* .to_float = */ NULL, }, /* .required_cpu = */ CPU_FEATURE_SME, /* .lhs_type = */ GGML_TYPE_F32, @@ -154,8 +243,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, }, /* .rhs_info = */ { - /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, - /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .to_float = */ dequantize_row_qsi4c32pscalef16, }, /* .required_cpu = */ CPU_FEATURE_DOTPROD, /* .lhs_type = */ GGML_TYPE_F32, @@ -200,8 +291,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, }, /* .rhs_info = */ { - /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, - /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .to_float = */ dequantize_row_qsi4c32pscalef16, }, /* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM, /* .lhs_type = */ GGML_TYPE_F32, @@ -247,8 +340,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, }, /* .rhs_info = */ { - /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, - /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .to_float = */ dequantize_row_qsi4c32pscalef16, }, /* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM, /* .lhs_type = */ GGML_TYPE_F32, @@ -293,8 +388,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, }, /* .rhs_info = */ { - /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, - /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .to_float = */ dequantize_row_qsi4c32pscalef16, }, /* .required_cpu = */ CPU_FEATURE_DOTPROD, /* .lhs_type = */ GGML_TYPE_F32, diff --git a/ggml/src/ggml-cpu/kleidiai/kernels.h b/ggml/src/ggml-cpu/kleidiai/kernels.h index 3b268d4a2..bc8f33405 100644 --- a/ggml/src/ggml-cpu/kleidiai/kernels.h +++ b/ggml/src/ggml-cpu/kleidiai/kernels.h @@ -71,12 +71,15 @@ struct rhs_packing_info { std::function, std::function > packed_size; + size_t (*packed_stride)(size_t k, size_t nr, size_t kr, size_t bl); std::variant< std::function, std::function > pack_func; + void (*to_float)(const void *packed_data, int32_t row_idx, int64_t nc, float *out, size_t nr_pack, size_t packed_row_stride, + size_t kr, size_t bl, size_t num_bytes_multiplier); }; struct ggml_kleidiai_kernels { diff --git a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp index fafe45e6c..3a513a55d 100644 --- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp @@ -40,6 +40,17 @@ struct ggml_kleidiai_context { ggml_kleidiai_kernels * kernels; } static ctx = { CPU_FEATURE_NONE, NULL }; +static const char* cpu_feature_to_string(cpu_feature f) { + switch (f) { + case CPU_FEATURE_NONE: return "NONE"; + case CPU_FEATURE_DOTPROD: return "DOTPROD"; + case CPU_FEATURE_I8MM: return "I8MM"; + case CPU_FEATURE_SVE: return "SVE"; + case CPU_FEATURE_SME: return "SME"; + default: return "UNKNOWN"; + } +} + static void init_kleidiai_context(void) { ggml_critical_section_start(); @@ -62,6 +73,11 @@ static void init_kleidiai_context(void) { ctx.features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE; } ctx.kernels = ggml_kleidiai_select_kernels_q4_0(ctx.features); +#ifndef NDEBUG + if (ctx.kernels) { + GGML_LOG_DEBUG("kleidiai: using kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels->required_cpu)); + } +#endif } ggml_critical_section_end(); } @@ -102,6 +118,9 @@ static void transpose_f32kxn_f16nxk(size_t n, size_t k, float * dst, const uint1 class tensor_traits : public ggml::cpu::tensor_traits { bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override { + if (op->op != GGML_OP_MUL_MAT) { + return false; + } ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, op); GGML_ASSERT(kernels); kernel_info * kernel = op->src[1]->ne[1] == 1 ? &kernels->gemv : &kernels->gemm; @@ -135,6 +154,10 @@ class tensor_traits : public ggml::cpu::tensor_traits { } else if (dst->src[0]->type == GGML_TYPE_F16) { return compute_forward_kv_cache(params, dst); } + } else if (dst->op == GGML_OP_GET_ROWS) { + if (dst->src[0]->type == GGML_TYPE_Q4_0) { + return compute_forward_get_rows(params, dst); + } } return false; } @@ -270,6 +293,8 @@ class tensor_traits : public ggml::cpu::tensor_traits { } bool compute_forward_q4_0(struct ggml_compute_params * params, struct ggml_tensor * dst) { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q4_0); + const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; @@ -342,8 +367,49 @@ class tensor_traits : public ggml::cpu::tensor_traits { return true; } + bool compute_forward_get_rows(struct ggml_compute_params * params, struct ggml_tensor * dst) { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q4_0); + GGML_ASSERT(ctx.kernels); + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + rhs_packing_info * rhs_info = &ctx.kernels->rhs_info; + kernel_info * kernel = &ctx.kernels->gemm; + + const int64_t nc = ne00; + const int64_t nr = ggml_nelements(src1); + + const size_t block_rows = kernel->get_nr(); + const size_t kr = kernel->get_kr(); + + const size_t num_bytes_multiplier = sizeof(uint16_t); + const size_t packed_stride = rhs_info->packed_stride(nc, block_rows, kr, QK4_0); + + const int ith = params->ith; + const int nth = params->nth; + + const int dr = (nr + nth - 1) / nth; + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int64_t i = ir0; i < ir1; ++i) { + GGML_ASSERT(src1->type == GGML_TYPE_I32); + int64_t row_idx = ((const int32_t *)src1->data)[i]; + GGML_ASSERT(row_idx >= 0 && row_idx < src0->ne[1]); + + float *out = (float *)((char *)dst->data + i * nb1); + rhs_info->to_float(src0->data, row_idx, nc, out, block_rows, packed_stride, kr, QK4_0, num_bytes_multiplier); + } + + return true; + } + public: int repack(struct ggml_tensor * tensor, const void * data, size_t data_size) { + GGML_ASSERT(tensor->type == GGML_TYPE_Q4_0); GGML_ASSERT(ctx.kernels); const size_t n = tensor->ne[1]; const size_t k = tensor->ne[0]; @@ -351,17 +417,12 @@ public: size_t kr = ctx.kernels->gemm.get_kr(); size_t sr = ctx.kernels->gemm.get_sr(); -#ifndef NDEBUG - const size_t repacked_size = variant_call(ctx.kernels->rhs_info.packed_size, n, k, nr, kr, QK4_0); - GGML_ASSERT(repacked_size <= data_size && "repacked size larger than the packed size!"); -#endif struct kai_rhs_pack_qs4cxs1s0_param params; params.lhs_zero_point = 1; params.rhs_zero_point = 8; variant_call(ctx.kernels->rhs_info.pack_func, 1, n, k, nr, kr, sr, QK4_0, (const uint8_t*)data, nullptr, tensor->data, 0, ¶ms); return 0; - GGML_UNUSED(data_size); } }; @@ -375,8 +436,8 @@ static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struc static enum ggml_status ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { tensor->extra = (void *) ggml::cpu::kleidiai::get_tensor_traits(buffer, tensor); - GGML_UNUSED(buffer); return GGML_STATUS_SUCCESS; + GGML_UNUSED(buffer); } static void ggml_backend_cpu_kleidiai_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, @@ -418,18 +479,35 @@ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_b GGML_UNUSED(buft); } +static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) { + GGML_ASSERT(tensor->type == GGML_TYPE_Q4_0); + GGML_ASSERT(ctx.kernels); + + const size_t n = tensor->ne[1]; + const size_t k = tensor->ne[0]; + const size_t nr = ctx.kernels->gemm.get_nr(); + const size_t kr = ctx.kernels->gemm.get_kr(); + + return variant_call(ctx.kernels->rhs_info.packed_size, n, k, nr, kr, QK4_0); + + GGML_UNUSED(buft); +} + namespace ggml::cpu::kleidiai { class extra_buffer_type : ggml::cpu::extra_buffer_type { bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { - if (op->op == GGML_OP_MUL_MAT && + if ((op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) && op->src[0]->type == GGML_TYPE_Q4_0 && op->src[0]->buffer && (ggml_n_dims(op->src[0]) == 2) && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels) { + if (op->op == GGML_OP_GET_ROWS && op->src[1]->ne[0] != 8) { + return false; + } if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { return false; } - if (op->src[1]->type == GGML_TYPE_F32 && + if ((op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_I32) && ggml_ne(op->src[1], 2) == 1 && ggml_ne(op->src[1], 3) == 1) { return true; } @@ -438,7 +516,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type { } ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { - if (op->op == GGML_OP_MUL_MAT) { + if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) { if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) { return (ggml::cpu::tensor_traits *) op->src[0]->extra; } @@ -469,7 +547,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(void) { /* .alloc_buffer = */ ggml_backend_cpu_kleidiai_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_kleidiai_buffer_type_get_alignment, /* .get_max_size = */ nullptr, // defaults to SIZE_MAX - /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .get_alloc_size = */ ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size, /* .is_host = */ nullptr, }, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),