From 6380d6a3e709cb02a8695afdd96b40e674477332 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sun, 14 Sep 2025 13:37:03 +0800 Subject: [PATCH 01/35] ggml-zdnn: rm user mapped buffers (#15965) * ggml-zdnn: rm user mapped buffers Signed-off-by: Aaron Teo * ggml-zdnn: rm dead code Signed-off-by: Aaron Teo * ggml-zdnn: attempt to fix missing extra data buffer free Signed-off-by: Aaron Teo --------- Signed-off-by: Aaron Teo --- ggml/src/ggml-zdnn/ggml-zdnn.cpp | 87 ++++---------------------------- 1 file changed, 11 insertions(+), 76 deletions(-) diff --git a/ggml/src/ggml-zdnn/ggml-zdnn.cpp b/ggml/src/ggml-zdnn/ggml-zdnn.cpp index 750717c0b..9ba23a330 100644 --- a/ggml/src/ggml-zdnn/ggml-zdnn.cpp +++ b/ggml/src/ggml-zdnn/ggml-zdnn.cpp @@ -127,11 +127,6 @@ static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_ten const int64_t output_rows = ne1; const int64_t output_cols = ne0; - // TODO: Weights are somehow not going through `ggml_backend_zdnn_buffer_set_tensor` during model loading. - // So we need to load the weights here. Remove this when the issue is fixed. - // Problem might be residing in `ggml_backend_zdnn_device_supports_buft`. - if (weights_extra->ztensor.is_transformed == false) ggml_zdnn_load_tensor(weights_extra->ztensor, weights->data); - // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n", // __func__, weights_extra->name, // weights->ne[3], weights->ne[2], weights->ne[1], weights->ne[0], @@ -355,6 +350,9 @@ static void ggml_backend_zdnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { for (const auto & buf_ptr : ctx->buffers) { ggml_backend_zdnn_buffer * buf = buf_ptr.get(); + + // Free any extra buffer allocated for the tensor. E.g., bias for GGML_OP_MUL_MAT + if (buf->extra != nullptr) free(buf->extra->data); if (buf->ztensor.buffer_size > 0) ZDNN_CHECK(zdnn_free_ztensor_buffer(&buf->ztensor)); } @@ -432,8 +430,11 @@ static void ggml_backend_zdnn_buffer_set_tensor(ggml_backend_buffer_t buffer, gg memcpy((char *)tensor->data + offset, data, size); ggml_backend_zdnn_buffer * extra = (ggml_backend_zdnn_buffer *)tensor->extra; - if (extra->ztensor.is_transformed) zdnn_reset_ztensor(&extra->ztensor); - ggml_zdnn_load_tensor(extra->ztensor, tensor->data); + + // Fixes the LLAMA_SET_ROWS bug + // see: https://github.com/ggml-org/llama.cpp/issues/15414 + if (tensor->buffer->usage == GGML_BACKEND_BUFFER_USAGE_COMPUTE && extra->ztensor.is_transformed) zdnn_reset_ztensor(&extra->ztensor); + if (extra->ztensor.is_transformed == false) ggml_zdnn_load_tensor(extra->ztensor, tensor->data); GGML_UNUSED(buffer); } @@ -538,29 +539,6 @@ ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void) { return &ggml_backend_buffer_type_zdnn; } -static const char * ggml_backend_zdnn_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) { - return GGML_ZDNN_NAME "_Mapped"; - - GGML_UNUSED(buft); -} - -static ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_from_ptr_type(void) { - static ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_zdnn = { - /* .iface = */ { - /* .get_name = */ ggml_backend_zdnn_buffer_from_ptr_type_get_name, - /* .alloc_buffer = */ ggml_backend_zdnn_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_zdnn_buffer_type_get_alignment, - /* .get_max_size = */ NULL, - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_zdnn_buffer_type_is_host, - }, - /* .device = */ &g_ggml_backend_zdnn_device, - /* .context = */ NULL, 
- }; - - return &ggml_backend_buffer_from_ptr_type_zdnn; -} - // // backend // @@ -648,7 +626,7 @@ static void ggml_backend_zdnn_device_get_props(ggml_backend_dev_t dev, ggml_back props->caps = (ggml_backend_dev_caps) { /* .async = */ false, /* .host_buffer = */ false, - /* .buffer_from_host_ptr = */ true, + /* .buffer_from_host_ptr = */ false, /* .events = */ false }; } @@ -679,48 +657,6 @@ static ggml_backend_buffer_type_t ggml_backend_zdnn_device_get_buffer_type(ggml_ GGML_UNUSED(dev); } -static ggml_backend_buffer_t ggml_backend_zdnn_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { - ggml_backend_zdnn_buffer_context * ctx = new ggml_backend_zdnn_buffer_context(); - - ctx->all_data = ptr; - ctx->all_size = size; - ctx->owned = false; - ctx->n_buffers = 0; - - const size_t size_page = sysconf(_SC_PAGESIZE); - - // page-align the data ptr - { - const uintptr_t offs = (uintptr_t) ptr % size_page; - ptr = (void *)((char *)ptr - offs); - size += offs; - } - - size_t size_aligned = size; - if ((size_aligned % size_page) != 0) { - size_aligned += size_page - (size_aligned % size_page); - } - - ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *)dev->context; - - GGML_ASSERT(ctx_dev->zdnn_device >= 0); - int device = ctx_dev->zdnn_device; GGML_UNUSED(device); - - std::unique_ptr zdnn_buffer = std::make_unique(); - zdnn_buffer->data = ptr; - zdnn_buffer->size = size; - ctx->buffers.push_back(std::move(zdnn_buffer)); - - GGML_LOG_INFO("%s: allocated buffer, size = %8.2f MiB\n", - __func__, size_aligned / 1024.0 / 1024.0); - - ++ctx->n_buffers; - - return ggml_backend_buffer_init(ggml_backend_zdnn_buffer_from_ptr_type(), ggml_backend_zdnn_buffer_i, ctx, size); - - GGML_UNUSED(max_tensor_size); -} - static bool ggml_backend_zdnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *) dev->context; @@ -729,8 +665,7 @@ static bool ggml_backend_zdnn_device_supports_op(ggml_backend_dev_t dev, const g static bool ggml_backend_zdnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { return - buft->iface.get_name == ggml_backend_zdnn_buffer_type_get_name || - buft->iface.get_name == ggml_backend_zdnn_buffer_from_ptr_type_get_name; + buft->iface.get_name == ggml_backend_zdnn_buffer_type_get_name; GGML_UNUSED(dev); } @@ -744,7 +679,7 @@ static ggml_backend_device_i ggml_backend_zdnn_device_i = { /* .init_backend = */ ggml_backend_zdnn_device_init, /* .get_buffer_type = */ ggml_backend_zdnn_device_get_buffer_type, /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ ggml_backend_zdnn_device_buffer_from_ptr, + /* .buffer_from_host_ptr = */ NULL, /* .supports_op = */ ggml_backend_zdnn_device_supports_op, /* .supports_buft = */ ggml_backend_zdnn_device_supports_buft, /* .offload_op = */ NULL, From d1c6f11f47bae063a68a6be8e4830c060a11b7bd Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Sun, 14 Sep 2025 12:10:07 +0300 Subject: [PATCH 02/35] doc : update documentation for --tensor-split (#15980) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * doc : update documentation for --tensor-split * Update tools/main/README.md Co-authored-by: Johannes Gäßler * Update tools/main/README.md Co-authored-by: Diego Devesa --------- Co-authored-by: Johannes Gäßler Co-authored-by: Diego Devesa --- tools/main/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/tools/main/README.md b/tools/main/README.md index 4f16ad6b2..54e582de0 100644 --- a/tools/main/README.md +++ b/tools/main/README.md @@ -384,5 +384,5 @@ These options provide extra functionality and customization when running the LLa - `--verbose-prompt`: Print the prompt before generating text. - `--no-display-prompt`: Don't print prompt at generation. - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. -- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. +- `-ts SPLIT, --tensor-split SPLIT`: When using multiple devices this option controls how tensors should be split across devices. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each device should get in order. For example, "3,2" will assign 60% of the data to device 0 and 40% to device 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance. The list of devices being used is printed on startup and may differ from the device list given by `--list-devices` or e.g. `nvidia-smi`. - `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache.
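For illustration, a minimal usage sketch of the option documented above (the model path, layer count, and prompt are placeholders, not part of the patch). With `--tensor-split 3,2` the values are normalized, so device 0 receives 3/(3+2) = 60% of the data and device 1 the remaining 2/(3+2) = 40%:

```sh
# Hypothetical invocation: offload layers to two devices and split the
# tensors 60/40 between them. The split actually chosen is printed on
# startup, which is where to verify the mapping against --list-devices.
./llama-cli -m model.gguf -ngl 99 --tensor-split 3,2 -p "Hello"
```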
From 9ecb88434644c865232bb665d2f6f05049fc6456 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Sun, 14 Sep 2025 02:21:59 -0700 Subject: [PATCH 03/35] releases : update ROCM, add gfx1200, gfx1201, gfx1151 (#15972) * releases : update ROCM, add gfx1200, gfx1201, gfx1151 * releases : set target to 13.3 for macos-x64 * add hipblaslt.dll to release * add hipblaslt/library to release --- .github/workflows/release.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 701811eeb..553463c41 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -108,7 +108,8 @@ jobs: -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DGGML_METAL=OFF \ - -DGGML_RPC=ON + -DGGML_RPC=ON \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3 cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Determine tag name @@ -532,7 +533,7 @@ jobs: matrix: include: - name: "radeon" - gpu_targets: "gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" + gpu_targets: "gfx1200;gfx1201;gfx1151;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" steps: - name: Clone @@ -565,7 +566,7 @@ jobs: run: | $ErrorActionPreference = "Stop" write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-25.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP SDK" $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru $completed = $proc.WaitForExit(600000) @@ -610,9 +611,12 @@ jobs: -DLLAMA_CURL=OFF cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS} md "build\bin\rocblas\library\" + md "build\bin\hipblaslt\library" cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\" cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" + cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\" - name: Pack artifacts id: pack_artifacts From 918b26f197f55d5d562446dfc876d0e637929d07 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Sun, 14 Sep 2025 12:28:18 +0300 Subject: [PATCH 04/35] rpc : fix regression when --device is used (#15981) Fix regression introduced with commit 50f4281a6 --- tools/rpc/rpc-server.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/rpc/rpc-server.cpp b/tools/rpc/rpc-server.cpp index f7568980d..dc8e077f3 100644 --- a/tools/rpc/rpc-server.cpp +++ b/tools/rpc/rpc-server.cpp @@ -227,7 +227,9 @@ static ggml_backend_t create_backend(const rpc_server_params & params) { } } - backend = ggml_backend_init_best(); + if (!backend) { + backend = ggml_backend_init_best(); + } if (backend) { fprintf(stderr, "%s: using %s backend\n", __func__, ggml_backend_name(backend)); From a14bd350141fb42b8bf2dd2342cebc27bfdce399 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 14 Sep 2025 15:33:22 +0300 Subject: [PATCH 05/35] metal : fix kernel requirements (#15983) * metal : fix kernel requirements ggml-ci * cont : fix supports_op * cont : fix supports_op for ARGMAX --- ggml/src/ggml-metal/ggml-metal.m | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git 
a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 82d8077a0..13f9de297 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -1219,10 +1219,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL, set_rows_iq4_nl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_L2_NORM, l2_norm, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, has_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32, ssm_conv_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32, ssm_scan_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32_GROUP, ssm_scan_f32_group, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32, ssm_scan_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32_GROUP, ssm_scan_f32_group, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32, rwkv_wkv6_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32, rwkv_wkv7_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, has_simdgroup_reduction); @@ -1443,9 +1443,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SWIGLU_OAI, swiglu_oai, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GEGLU_ERF, geglu_erf, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GEGLU_QUICK, geglu_quick, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN, mean, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN, mean, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true); } @@ -1982,7 +1982,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_L2_NORM: return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0])); case GGML_OP_ARGMAX: - return true; + return has_simdgroup_reduction; case GGML_OP_NORM: return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0])); case GGML_OP_ROPE: @@ -2028,6 +2028,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex return has_simdgroup_mm; // TODO: over-restricted for vec-kernels case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: + return has_simdgroup_reduction; case GGML_OP_RWKV_WKV6: case GGML_OP_RWKV_WKV7: return true; From a0e13dcbe5bae7025660349ef3e4ead060e507f2 Mon Sep 17 00:00:00 2001 From: lcy Date: Sun, 14 Sep 2025 22:20:35 +0800 Subject: [PATCH 06/35] build: fix the build failures of Windows HIP release job (#15984) * build: fix the cache keys for Windows HIP release job Update the cache keys to include the HIP SDK version, preventing the use of outdated ROCm installation caches. 
* build: sync changes from release.yml to build.yml - Update HIP SDK version to 25.Q3 and ROCm version to 6.4.2 - Update the cache keys to reflect the new versions * build: remove Windows HIP release for gfx1151 since the current stable rocWMMA does not support gfx1151. --- .github/workflows/build.yml | 17 ++++++++++------- .github/workflows/release.yml | 17 ++++++++++------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 20f956a12..1bd35627d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -127,7 +127,8 @@ jobs: -DCMAKE_BUILD_RPATH="@loader_path" \ -DLLAMA_FATAL_WARNINGS=ON \ -DGGML_METAL=OFF \ - -DGGML_RPC=ON + -DGGML_RPC=ON \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3 cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -1051,9 +1052,13 @@ jobs: run: examples/sycl/win-build-sycl.bat windows-latest-cmake-hip: - if: ${{ github.event.inputs.create_release != 'true' }} runs-on: windows-2022 + env: + # The ROCm version must correspond to the version used in the HIP SDK. + ROCM_VERSION: "6.4.2" + HIPSDK_INSTALLER_VERSION: "25.Q3" + steps: - name: Clone id: checkout @@ -1062,16 +1067,14 @@ jobs: - name: Clone rocWMMA repository id: clone_rocwmma run: | - git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 + git clone https://github.com/rocm/rocwmma --branch rocm-${{ env.ROCM_VERSION }} --depth 1 - name: Cache ROCm Installation id: cache-rocm uses: actions/cache@v4 with: path: C:\Program Files\AMD\ROCm - key: rocm-6.1-${{ runner.os }}-v1 - restore-keys: | - rocm-6.1-${{ runner.os }}- + key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }} - name: Install ROCm if: steps.cache-rocm.outputs.cache-hit != 'true' @@ -1079,7 +1082,7 @@ jobs: run: | $ErrorActionPreference = "Stop" write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP SDK" $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru $completed = $proc.WaitForExit(600000) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 553463c41..e959209ab 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -529,11 +529,16 @@ jobs: windows-hip: runs-on: windows-2022 + env: + # The ROCm version must correspond to the version used in the HIP SDK. 
+ ROCM_VERSION: "6.4.2" + HIPSDK_INSTALLER_VERSION: "25.Q3" + strategy: matrix: include: - name: "radeon" - gpu_targets: "gfx1200;gfx1201;gfx1151;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" + gpu_targets: "gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" steps: - name: Clone @@ -543,21 +548,19 @@ jobs: - name: Clone rocWMMA repository id: clone_rocwmma run: | - git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 + git clone https://github.com/rocm/rocwmma --branch rocm-${{ env.ROCM_VERSION }} --depth 1 - name: Cache ROCm Installation id: cache-rocm uses: actions/cache@v4 with: path: C:\Program Files\AMD\ROCm - key: rocm-6.1-${{ runner.os }}-v1 - restore-keys: | - rocm-6.1-${{ runner.os }}- + key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }} - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: windows-latest-cmake-hip-${{ matrix.name }}-x64 + key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64 evict-old-files: 1d - name: Install ROCm @@ -566,7 +569,7 @@ jobs: run: | $ErrorActionPreference = "Stop" write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-25.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP SDK" $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru $completed = $proc.WaitForExit(600000) From 261e6a20ffdb79c4875e674b4f6b514bc73cff8f Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Sun, 14 Sep 2025 16:56:28 +0200 Subject: [PATCH 07/35] Vulkan: Clean up mul_mm shader (#15987) * vulkan: move mul_mm dequantization steps into a separate file and functions * improve mul_mm vector load code * fix debug mode issues and warnings --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 12 +- .../ggml-vulkan/vulkan-shaders/mul_mm.comp | 547 +---------------- .../vulkan-shaders/mul_mm_funcs.comp | 568 ++++++++++++++++++ .../src/ggml-vulkan/vulkan-shaders/types.comp | 6 - .../vulkan-shaders/vulkan-shaders-gen.cpp | 107 +++- 5 files changed, 663 insertions(+), 577 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 4ccc498f3..60a99dc78 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1231,8 +1231,6 @@ static std::string format_size(size_t size) { return oss.str(); } -static std::mutex log_mutex; - class vk_memory_logger { public: vk_memory_logger(): total_device(0), total_host(0) {} @@ -1422,6 +1420,8 @@ struct ggml_backend_vk_buffer_context { }; #ifdef GGML_VULKAN_MEMORY_DEBUG +static std::mutex log_mutex; + void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) { std::lock_guard guard(log_mutex); vk_buffer buf = buf_ref.lock(); @@ -13138,16 +13138,16 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * } else if (tensor->op == GGML_OP_IM2COL_3D) { const int32_t s0 = tensor->op_params[0]; const int32_t s1 = tensor->op_params[1]; - const int32_t s1 = tensor->op_params[2]; + const int32_t s2 = tensor->op_params[2]; const int32_t p0 = tensor->op_params[3]; const int32_t p1 = 
tensor->op_params[4]; - const int32_t p1 = tensor->op_params[5]; + const int32_t p2 = tensor->op_params[5]; const int32_t d0 = tensor->op_params[6]; const int32_t d1 = tensor->op_params[7]; - const int32_t d1 = tensor->op_params[8]; + const int32_t d2 = tensor->op_params[8]; const int32_t IC = tensor->op_params[9]; - tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type); + tensor_clone = ggml_im2col_3d(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type); } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) { const int32_t dim = tensor->op_params[0]; const int32_t max_period = tensor->op_params[1]; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index f6a7761ff..193429089 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -183,6 +183,8 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) { shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS]; #endif +#include "mul_mm_funcs.comp" + void main() { #ifdef NEEDS_INIT_IQ_SHMEM init_iq_shmem(gl_WorkGroupSize); @@ -310,550 +312,13 @@ void main() { for (uint block = start_k; block < end_k; block += BK) { [[unroll]] for (uint l = 0; l < BM; l += loadstride_a) { - -#if defined(DATA_A_F32) || defined(DATA_A_F16) -#if LOAD_VEC_A == 8 - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - A_TYPE32 aa = A_TYPE32(data_a[idx]); - buf_a[buf_idx ] = FLOAT_TYPE(aa[0].x); - buf_a[buf_idx + 1] = FLOAT_TYPE(aa[0].y); - buf_a[buf_idx + 2] = FLOAT_TYPE(aa[0].z); - buf_a[buf_idx + 3] = FLOAT_TYPE(aa[0].w); - buf_a[buf_idx + 4] = FLOAT_TYPE(aa[1].x); - buf_a[buf_idx + 5] = FLOAT_TYPE(aa[1].y); - buf_a[buf_idx + 6] = FLOAT_TYPE(aa[1].z); - buf_a[buf_idx + 7] = FLOAT_TYPE(aa[1].w); -#elif LOAD_VEC_A == 4 - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - A_TYPE32 aa = A_TYPE32(data_a[idx]); - buf_a[buf_idx ] = FLOAT_TYPE(aa.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(aa.y); - buf_a[buf_idx + 2] = FLOAT_TYPE(aa.z); - buf_a[buf_idx + 3] = FLOAT_TYPE(aa.w); -#else - if (ir * BM + loadc_a + l < p.M && block + loadr_a < end_k) { - buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]); - } else { - buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(0.0f); - } -#endif -#elif defined(DATA_A_BF16) -#if LOAD_VEC_A == 4 - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - buf_a[buf_idx ] = TO_FLOAT_TYPE(data_a[idx].x); - buf_a[buf_idx + 1] = TO_FLOAT_TYPE(data_a[idx].y); - buf_a[buf_idx + 2] = TO_FLOAT_TYPE(data_a[idx].z); - buf_a[buf_idx + 3] = TO_FLOAT_TYPE(data_a[idx].w); -#else - if (ir * BM + loadc_a + l < p.M && block + loadr_a < end_k) { - buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = TO_FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]); - } else { - buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = TO_FLOAT_TYPE(uint16_t(0)); - } -#endif -#elif defined(DATA_A_Q4_0) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 4 * loadr_a; - - const uint ib = idx / 4; - const uint iqs 
= idx & 0x03; - - const float d = float(data_a_packed16[ib].d); - const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16); - const vec4 v0 = (vec4(unpack8(vui & 0x0F0F0F0F)) - 8.0f) * d; - const vec4 v1 = (vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) - 8.0f) * d; - - buf_a[buf_idx ] = FLOAT_TYPE(v0.x); - buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y); - buf_a[buf_idx + 2 ] = FLOAT_TYPE(v0.z); - buf_a[buf_idx + 3 ] = FLOAT_TYPE(v0.w); - buf_a[buf_idx + 16] = FLOAT_TYPE(v1.x); - buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y); - buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z); - buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w); -#elif defined(DATA_A_Q4_1) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 4 * loadr_a; - - const uint ib = idx / 4; - const uint iqs = idx & 0x03; - - const float d = float(data_a_packed16[ib].d); - const float m = float(data_a_packed16[ib].m); - const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16); - const vec4 v0 = vec4(unpack8(vui & 0x0F0F0F0F)) * d + m; - const vec4 v1 = vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) * d + m; - - buf_a[buf_idx ] = FLOAT_TYPE(v0.x); - buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y); - buf_a[buf_idx + 2 ] = FLOAT_TYPE(v0.z); - buf_a[buf_idx + 3 ] = FLOAT_TYPE(v0.w); - buf_a[buf_idx + 16] = FLOAT_TYPE(v1.x); - buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y); - buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z); - buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w); -#elif defined(DATA_A_Q5_0) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a; - - const uint ib = idx / 8; - const uint iqs = idx & 0x07; - - const float d = float(data_a_packed16[ib].d); - const uint uint_qh = uint(data_a_packed16[ib].qh[1]) << 16 | uint(data_a_packed16[ib].qh[0]); - const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10); - const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10); - - const uint vui = uint(data_a_packed16[ib].qs[iqs]); - const vec4 v = (vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) - 16.0f) * d; - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1 ] = FLOAT_TYPE(v.z); - buf_a[buf_idx + 16] = FLOAT_TYPE(v.y); - buf_a[buf_idx + 17] = FLOAT_TYPE(v.w); -#elif defined(DATA_A_Q5_1) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a; - - const uint ib = idx / 8; - const uint iqs = idx & 0x07; - - const float d = float(data_a_packed16[ib].d); - const float m = float(data_a_packed16[ib].m); - const uint uint_qh = data_a_packed16[ib].qh; - const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10); - const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10); - - const uint vui = uint(data_a_packed16[ib].qs[iqs]); - const vec4 v = vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) * d + m; - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1 ] = FLOAT_TYPE(v.z); - buf_a[buf_idx + 16] = FLOAT_TYPE(v.y); - buf_a[buf_idx + 17] = FLOAT_TYPE(v.w); -#elif defined(DATA_A_Q8_0) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * 
LOAD_VEC_A; - - const uint ib = idx / 8; - const uint iqs = idx & 0x07; - - const float d = float(data_a_packed16[ib].d); - const i8vec2 v0 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs])).xy; // vec4 used due to #12147 - const i8vec2 v1 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs + 1])).xy; - const vec4 v = vec4(v0.x, v0.y, v1.x, v1.y) * d; - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); - buf_a[buf_idx + 2] = FLOAT_TYPE(v.z); - buf_a[buf_idx + 3] = FLOAT_TYPE(v.w); -#elif defined(DATA_A_Q2_K) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint iqs = idx % 128; // 0..127 - - const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // 0,2,4..30 - const uint scalesi = iqs / 8; // 0..15 - const uint qsshift = ((iqs % 64) / 16) * 2; // 0,2,4,6 - - const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]); - const uint scales = data_a[ib].scales[scalesi]; - const vec2 d = vec2(data_a[ib].d); - - const vec2 v = d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4); - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); -#elif defined(DATA_A_Q3_K) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint iqs = idx % 128; // 0..127 - - const uint n = iqs / 64; // 0,1 - const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..62 - const uint hmi = (iqs % 16) * 2; // 0,2,4..30 - const uint j = (iqs % 64) / 4; // 0..3 - const uint is = iqs / 8; // 0..15 - const uint halfsplit = ((iqs % 64) / 16); // 0,1,2,3 - const uint qsshift = halfsplit * 2; // 0,2,4,6 - const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128 - - const int8_t us = int8_t(((data_a[ib].scales[is % 8] >> (4 * int(is / 8))) & 0xF) - | (((data_a[ib].scales[8 + (is % 4)] >> (2 * int(is / 4))) & 3) << 4)); - const float dl = float(data_a[ib].d) * float(us - 32); - - buf_a[buf_idx ] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi ] & m) != 0) ? 0 : 4))); - buf_a[buf_idx + 1] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4))); -#elif defined(DATA_A_Q4_K) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint iqs = idx % 128; // 0..127 - - const uint n = iqs / 32; // 0,1,2,3 - const uint b = (iqs % 32) / 16; // 0,1 - const uint is = 2 * n + b; // 0..7 - const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126 - - const vec2 loadd = vec2(data_a[ib].d); - - const uint scidx0 = (is < 4) ? is : (is + 4); - const uint scidx1 = (is < 4) ? is : (is - 4); - const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0; - const uint scidxshift1 = (is < 4) ? 0 : 2; - const uint mbidx0 = is + 4; - const uint mbidx1 = (is < 4) ? is + 4 : is; - const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0; - const uint mbidxshift0 = (is < 4) ? 0 : 4; - const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0; - const uint mbidxshift1 = (is < 4) ? 
0 : 2; - - const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1)); - const uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1)); - - const float d = loadd.x * sc; - const float m = -loadd.y * mbyte; - - buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF), m)); - buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m)); -#elif defined(DATA_A_Q5_K) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint iqs = idx % 128; // 0..127 - - const uint n = iqs / 32; // 0,1,2,3 - const uint b = (iqs % 32) / 16; // 0,1 - const uint is = 2 * n + b; // 0..7 - const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126 - const uint qhi = (iqs % 16) * 2; // 0,2,4..30 - - const uint8_t hm = uint8_t(1 << (iqs / 16)); - - const vec2 loadd = vec2(data_a[ib].d); - - const uint scidx0 = (is < 4) ? is : (is + 4); - const uint scidx1 = (is < 4) ? is : (is - 4); - const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0; - const uint scidxshift1 = (is < 4) ? 0 : 2; - const uint mbidx0 = is + 4; - const uint mbidx1 = (is < 4) ? is + 4 : is; - const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0; - const uint mbidxshift0 = (is < 4) ? 0 : 4; - const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0; - const uint mbidxshift1 = (is < 4) ? 0 : 2; - - const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1)); - const uint8_t mbyte = uint8_t(((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1)); - - const float d = loadd.x * sc; - const float m = -loadd.y * mbyte; - - buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0), m)); - buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 
16 : 0), m)); -#elif defined(DATA_A_Q6_K) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint iqs = idx % 128; // 0..127 - - const uint n = iqs / 64; // 0,1 - const uint b = (iqs % 64) / 32; // 0,1 - const uint is_b = (iqs % 16) / 8; // 0,1 - const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6 - const uint is = 8 * n + qhshift + is_b; // 0..15 - const uint qsi = n * 64 + (iqs % 32) * 2; // 0,2,4..126 - const uint qhi = n * 32 + (iqs % 16) * 2; // 0,2,4..62 - - const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]); - - buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32)); - buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32)); -#elif defined(DATA_A_IQ1_S) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 32; // 8 values per idx - const uint ib32 = (idx % 32) / 4; // 0..7 - const uint ib8 = idx % 32; - - const float d = float(data_a[ib].d); - const uint qh = data_a[ib].qh[ib32]; - const uint qs = data_a[ib].qs[ib8]; - const float dl = d * (2 * bitfieldExtract(qh, 12, 3) + 1); - const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA; - const int16_t grid = int16_t(iq1s_grid[qs | (bitfieldExtract(qh, 3 * int(ib8 & 3), 3) << 8)]); - - [[unroll]] for (int k = 0; k < 8; ++k) { - buf_a[buf_idx + k] = FLOAT_TYPE(dl * (bitfieldExtract(grid, 2 * k, 2) + delta)); - } -#elif defined(DATA_A_IQ1_M) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 32; // 8 values per idx - const uint ib8 = idx % 32; - const uint ib16 = ib8 / 2; - - const uint16_t[4] scales = data_a[ib].scales; - const u16vec4 s = u16vec4(scales[0], scales[1], scales[2], scales[3]) >> 12; - const float d = float(unpackHalf2x16(s.x | (s.y << 4) | (s.z << 8) | (s.w << 12)).x); - const uint sc = scales[ib8 / 8]; - const uint qs = data_a[ib].qs[ib8]; - const uint qh = data_a[ib].qh[ib16] >> (4 * (ib8 & 1)); - const float dl = d * (2 * bitfieldExtract(sc, 3 * int(ib16 & 3), 3) + 1); - const float delta = ((qh & 8) != 0) ? 
-IQ1M_DELTA : IQ1M_DELTA; - const int16_t grid = int16_t(iq1s_grid[qs | ((qh & 7) << 8)]); - - [[unroll]] for (int k = 0; k < 8; ++k) { - buf_a[buf_idx + k] = FLOAT_TYPE(dl * (bitfieldExtract(grid, 2 * k, 2) + delta)); - } -#elif defined(DATA_A_IQ2_XXS) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 32; // 8 values per idx - const uint ib32 = (idx % 32) / 4; // 0..7 - const uint ib8 = idx % 4; - - const float d = float(data_a[ib].d); - const uint qs = data_a[ib].qs[8 * ib32 + ib8]; - const uint signs = pack32(u8vec4( - data_a[ib].qs[8*ib32 + 4], - data_a[ib].qs[8*ib32 + 5], - data_a[ib].qs[8*ib32 + 6], - data_a[ib].qs[8*ib32 + 7] - )); - const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + (signs >> 28))); - const uint32_t sign7 = bitfieldExtract(signs, 7 * int(ib8), 7); - const uint sign = sign7 | (bitCount(sign7) << 7); - const uvec2 grid = iq2xxs_grid[qs]; - const vec4 grid0 = vec4(unpack8(grid.x)); - const vec4 grid1 = vec4(unpack8(grid.y)); - - buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x); - buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y); - buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z); - buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w); - buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x); - buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y); - buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z); - buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w); -#elif defined(DATA_A_IQ2_XS) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 32; // 8 values per idx - const uint ib32 = (idx % 32) / 4; // 0..7 - const uint ib8 = idx % 4; // 0..3 - - const float d = float(data_a[ib].d); - const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf; - const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + scale)); - const uint qs = data_a[ib].qs[4 * ib32 + ib8]; - const uint sign7 = qs >> 9; - const uint sign = sign7 | (bitCount(sign7) << 7); - const uvec2 grid = iq2xs_grid[qs & 511]; - const vec4 grid0 = vec4(unpack8(grid.x)); - const vec4 grid1 = vec4(unpack8(grid.y)); - - buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x); - buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y); - buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z); - buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w); - buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x); - buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y); - buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z); - buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? 
-grid1.w : grid1.w); -#elif defined(DATA_A_IQ2_S) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 32; // 8 values per idx - const uint ib8 = idx % 32; // 0..31 - const uint ib32 = ib8 / 4; // 0..7 - - const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf; - const uint qs = data_a[ib].qs[ib8]; - const uint qh = data_a[ib].qh[ib32]; - const uint qhshift = 2 * (ib8 % 4); - const uint sign = data_a[ib].qs[QUANT_K / 8 + ib8]; - - const float d = float(data_a[ib].d); - const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + scale)); - const uvec2 grid = iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)]; - const vec4 grid0 = vec4(unpack8(grid.x)); - const vec4 grid1 = vec4(unpack8(grid.y)); - - buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x); - buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y); - buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z); - buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w); - buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x); - buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y); - buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z); - buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w); -#elif defined(DATA_A_IQ3_XXS) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 64; // 4 values per idx - const uint iqs = idx % 64; // 0..63 - const uint is = QUANT_K / 4 + 4 * (iqs / 8); // 8 values - - const float d = float(data_a[ib].d); - const uint qs = data_a[ib].qs[iqs]; - const uint signs = pack32(u8vec4( - data_a[ib].qs[is+0], - data_a[ib].qs[is+1], - data_a[ib].qs[is+2], - data_a[ib].qs[is+3] - )); - const float db = d * 0.5 * (0.5 + (signs >> 28)); - const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7); - const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (4 * (idx % 2)); - const uint grid = iq3xxs_grid[qs]; - const vec4 v = db * vec4(unpack8(grid)); - - buf_a[buf_idx ] = FLOAT_TYPE((sign & 1) != 0 ? -v.x : v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE((sign & 2) != 0 ? -v.y : v.y); - buf_a[buf_idx + 2] = FLOAT_TYPE((sign & 4) != 0 ? -v.z : v.z); - buf_a[buf_idx + 3] = FLOAT_TYPE((sign & 8) != 0 ? -v.w : v.w); -#elif defined(DATA_A_IQ3_S) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 64; // 4 values per idx - const uint iqs = idx % 64; // 0..63 - const uint iqh = iqs / 8; - - const float d = float(data_a[ib].d); - const uint qs = data_a[ib].qs[iqs]; - const uint qh = data_a[ib].qh[iqh]; - const int8_t sign = int8_t(data_a[ib].signs[iqs / 2] >> (4 * (idx % 2))); - const uint scale = data_a[ib].scales[iqs / 16]; - const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(sign << 1, sign))); - const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf)); - const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)]; - const vec4 v = db * vec4(unpack8(grid)); - - buf_a[buf_idx ] = FLOAT_TYPE((sign & 1) != 0 ? -v.x : v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE((sign & 2) != 0 ? -v.y : v.y); - buf_a[buf_idx + 2] = FLOAT_TYPE((sign & 4) != 0 ? 
-v.z : v.z); - buf_a[buf_idx + 3] = FLOAT_TYPE((sign & 8) != 0 ? -v.w : v.w); -#elif defined(DATA_A_IQ4_XS) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint ib32 = (idx % 128) / 16; // 0..7 - const uint iq = 16 * ib32 + 2 * (idx % 8); - - const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF; - const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3; - const uint qshift = (idx & 8) >> 1; - u8vec2 qs = u8vec2(data_a[ib].qs[iq], data_a[ib].qs[iq + 1]); - qs = (qs >> qshift) & uint8_t(0xF); - - const float d = float(data_a[ib].d); - const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]); - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); -#elif defined(DATA_A_IQ4_NL) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a; - - const uint ib = idx / 8; - const uint iqs = idx & 0x07; - - const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib].d); - const uint vui = uint(data_a_packed16[ib].qs[iqs]); - - buf_a[buf_idx ] = FLOAT_TYPE(kvalues_iq4nl[vui & 0xF]) * d; - buf_a[buf_idx + 1 ] = FLOAT_TYPE(kvalues_iq4nl[bitfieldExtract(vui, 8, 4)]) * d; - buf_a[buf_idx + 16] = FLOAT_TYPE(kvalues_iq4nl[bitfieldExtract(vui, 4, 4)]) * d; - buf_a[buf_idx + 17] = FLOAT_TYPE(kvalues_iq4nl[vui >> 12]) * d; -#elif defined(DATA_A_MXFP4) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a; - - const uint ib = idx / 8; - const uint iqs = (idx & 0x07) * 2; - - const float d = e8m0_to_fp32(data_a[ib].e); - const uint vui = uint(data_a[ib].qs[iqs]); - const uint vui2 = uint(data_a[ib].qs[iqs+1]); - - buf_a[buf_idx ] = FLOAT_TYPE(kvalues_mxfp4[vui & 0xF] * d); - buf_a[buf_idx + 16] = FLOAT_TYPE(kvalues_mxfp4[vui >> 4] * d); - buf_a[buf_idx + 1] = FLOAT_TYPE(kvalues_mxfp4[vui2 & 0xF] * d); - buf_a[buf_idx + 17] = FLOAT_TYPE(kvalues_mxfp4[vui2 >> 4] * d); -#endif + load_a_to_shmem(pos_a, loadr_a, loadc_a + l, ir * BM + loadc_a + l, block + loadr_a, end_k); } [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) { -#if LOAD_VEC_B == 8 -#ifdef MUL_MAT_ID - const u16vec2 row_idx = row_ids[loadc_b + l]; - const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + loadr_b; +#if !defined(MUL_MAT_ID) + load_b_to_shmem(pos_b, loadr_b, loadc_b + l, ic * BN + loadc_b + l, block + loadr_b, end_k); #else - const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b; -#endif - const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B; -#if defined(DATA_B_BF16) - B_TYPE32 bb = TO_FLOAT_TYPE(data_b[idx]); -#else - B_TYPE32 bb = B_TYPE32(data_b[idx]); -#endif - buf_b[buf_idx + 0] = FLOAT_TYPE(bb[0].x); - buf_b[buf_idx + 1] = FLOAT_TYPE(bb[0].y); - buf_b[buf_idx + 2] = FLOAT_TYPE(bb[0].z); - buf_b[buf_idx + 3] = FLOAT_TYPE(bb[0].w); - buf_b[buf_idx + 4] = FLOAT_TYPE(bb[1].x); - buf_b[buf_idx + 5] = FLOAT_TYPE(bb[1].y); - buf_b[buf_idx + 6] = FLOAT_TYPE(bb[1].z); - buf_b[buf_idx + 7] = FLOAT_TYPE(bb[1].w); -#elif LOAD_VEC_B == 4 -#ifdef MUL_MAT_ID - const u16vec2 row_idx = row_ids[loadc_b + l]; - const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + loadr_b; 
-#else - const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b; -#endif - const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B; -#if defined(DATA_B_BF16) - B_TYPE32 bb = TO_FLOAT_TYPE(data_b[idx]); -#else - B_TYPE32 bb = B_TYPE32(data_b[idx]); -#endif - buf_b[buf_idx + 0] = FLOAT_TYPE(bb.x); - buf_b[buf_idx + 1] = FLOAT_TYPE(bb.y); - buf_b[buf_idx + 2] = FLOAT_TYPE(bb.z); - buf_b[buf_idx + 3] = FLOAT_TYPE(bb.w); -#elif !MUL_MAT_ID - if (ic * BN + loadc_b + l < p.N && block + loadr_b < end_k) { - buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = TO_FLOAT_TYPE(data_b[pos_b + (loadc_b + l) * p.stride_b + loadr_b]); - } else { - buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f); - } -#else - const uint row_i = ic * BN + loadc_b + l; - if (row_i < _ne1 && block + loadr_b < end_k) { - const u16vec2 row_idx = row_ids[loadc_b + l]; - buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = TO_FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + loadr_b]); - } else { - buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f); - } + load_b_to_shmem(pos_b, loadr_b, loadc_b + l, ic, _ne1, block + loadr_b, end_k); #endif } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp new file mode 100644 index 000000000..fe0750f92 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp @@ -0,0 +1,568 @@ +void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uint idx_m, const uint idx_k, const uint end_k) { +#if defined(DATA_A_F32) || defined(DATA_A_F16) +#if LOAD_VEC_A == 8 + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + FLOAT_TYPE_VEC8 aa = FLOAT_TYPE_VEC8(data_a[idx]); + buf_a[buf_idx ] = aa[0].x; + buf_a[buf_idx + 1] = aa[0].y; + buf_a[buf_idx + 2] = aa[0].z; + buf_a[buf_idx + 3] = aa[0].w; + buf_a[buf_idx + 4] = aa[1].x; + buf_a[buf_idx + 5] = aa[1].y; + buf_a[buf_idx + 6] = aa[1].z; + buf_a[buf_idx + 7] = aa[1].w; +#elif LOAD_VEC_A == 4 + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + FLOAT_TYPE_VEC4 aa = FLOAT_TYPE_VEC4(data_a[idx]); + buf_a[buf_idx ] = aa.x; + buf_a[buf_idx + 1] = aa.y; + buf_a[buf_idx + 2] = aa.z; + buf_a[buf_idx + 3] = aa.w; +#else + if (idx_m < p.M && idx_k < end_k) { + buf_a[col * SHMEM_STRIDE + row] = FLOAT_TYPE(data_a[pos_a + col * p.stride_a + row]); + } else { + buf_a[col * SHMEM_STRIDE + row] = FLOAT_TYPE(0.0f); + } +#endif +#elif defined(DATA_A_BF16) +#if LOAD_VEC_A == 4 + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + FLOAT_TYPE_VEC4 aa = FLOAT_TYPE_VEC4(TO_FLOAT_TYPE(data_a[idx])); + buf_a[buf_idx ] = aa.x; + buf_a[buf_idx + 1] = aa.y; + buf_a[buf_idx + 2] = aa.z; + buf_a[buf_idx + 3] = aa.w; +#else + if (idx_m < p.M && idx_k < end_k) { + buf_a[col * SHMEM_STRIDE + row] = TO_FLOAT_TYPE(data_a[pos_a + col * p.stride_a + row]); + } else { + buf_a[col * SHMEM_STRIDE + row] = TO_FLOAT_TYPE(uint16_t(0)); + } +#endif +#elif defined(DATA_A_Q4_0) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + 4 * row; + + const uint ib = idx / 4; + const uint iqs = idx & 0x03; + + const float d = float(data_a_packed16[ib].d); + const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | 
(uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16); + const vec4 v0 = (vec4(unpack8(vui & 0x0F0F0F0F)) - 8.0f) * d; + const vec4 v1 = (vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) - 8.0f) * d; + + buf_a[buf_idx ] = FLOAT_TYPE(v0.x); + buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y); + buf_a[buf_idx + 2 ] = FLOAT_TYPE(v0.z); + buf_a[buf_idx + 3 ] = FLOAT_TYPE(v0.w); + buf_a[buf_idx + 16] = FLOAT_TYPE(v1.x); + buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y); + buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z); + buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w); +#elif defined(DATA_A_Q4_1) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + 4 * row; + + const uint ib = idx / 4; + const uint iqs = idx & 0x03; + + const float d = float(data_a_packed16[ib].d); + const float m = float(data_a_packed16[ib].m); + const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16); + const vec4 v0 = vec4(unpack8(vui & 0x0F0F0F0F)) * d + m; + const vec4 v1 = vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) * d + m; + + buf_a[buf_idx ] = FLOAT_TYPE(v0.x); + buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y); + buf_a[buf_idx + 2 ] = FLOAT_TYPE(v0.z); + buf_a[buf_idx + 3 ] = FLOAT_TYPE(v0.w); + buf_a[buf_idx + 16] = FLOAT_TYPE(v1.x); + buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y); + buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z); + buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w); +#elif defined(DATA_A_Q5_0) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + 2 * row; + + const uint ib = idx / 8; + const uint iqs = idx & 0x07; + + const float d = float(data_a_packed16[ib].d); + const uint uint_qh = uint(data_a_packed16[ib].qh[1]) << 16 | uint(data_a_packed16[ib].qh[0]); + const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10); + const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10); + + const uint vui = uint(data_a_packed16[ib].qs[iqs]); + const vec4 v = (vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) - 16.0f) * d; + + buf_a[buf_idx ] = FLOAT_TYPE(v.x); + buf_a[buf_idx + 1 ] = FLOAT_TYPE(v.z); + buf_a[buf_idx + 16] = FLOAT_TYPE(v.y); + buf_a[buf_idx + 17] = FLOAT_TYPE(v.w); +#elif defined(DATA_A_Q5_1) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + 2 * row; + + const uint ib = idx / 8; + const uint iqs = idx & 0x07; + + const float d = float(data_a_packed16[ib].d); + const float m = float(data_a_packed16[ib].m); + const uint uint_qh = data_a_packed16[ib].qh; + const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10); + const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10); + + const uint vui = uint(data_a_packed16[ib].qs[iqs]); + const vec4 v = vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) * d + m; + + buf_a[buf_idx ] = FLOAT_TYPE(v.x); + buf_a[buf_idx + 1 ] = FLOAT_TYPE(v.z); + buf_a[buf_idx + 16] = FLOAT_TYPE(v.y); + buf_a[buf_idx + 17] = FLOAT_TYPE(v.w); +#elif defined(DATA_A_Q8_0) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + + const uint ib = idx / 8; + const uint iqs = idx & 0x07; + + const float d = float(data_a_packed16[ib].d); + const i8vec2 v0 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs])).xy; // vec4 used due to #12147 + const 
i8vec2 v1 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs + 1])).xy; + const vec4 v = vec4(v0.x, v0.y, v1.x, v1.y) * d; + + buf_a[buf_idx ] = FLOAT_TYPE(v.x); + buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); + buf_a[buf_idx + 2] = FLOAT_TYPE(v.z); + buf_a[buf_idx + 3] = FLOAT_TYPE(v.w); +#elif defined(DATA_A_Q2_K) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = idx % 128; // 0..127 + + const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // 0,2,4..30 + const uint scalesi = iqs / 8; // 0..15 + const uint qsshift = ((iqs % 64) / 16) * 2; // 0,2,4,6 + + const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]); + const uint scales = data_a[ib].scales[scalesi]; + const vec2 d = vec2(data_a[ib].d); + + const vec2 v = d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4); + + buf_a[buf_idx ] = FLOAT_TYPE(v.x); + buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); +#elif defined(DATA_A_Q3_K) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = idx % 128; // 0..127 + + const uint n = iqs / 64; // 0,1 + const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..62 + const uint hmi = (iqs % 16) * 2; // 0,2,4..30 + const uint j = (iqs % 64) / 4; // 0..3 + const uint is = iqs / 8; // 0..15 + const uint halfsplit = ((iqs % 64) / 16); // 0,1,2,3 + const uint qsshift = halfsplit * 2; // 0,2,4,6 + const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128 + + const int8_t us = int8_t(((data_a[ib].scales[is % 8] >> (4 * int(is / 8))) & 0xF) + | (((data_a[ib].scales[8 + (is % 4)] >> (2 * int(is / 4))) & 3) << 4)); + const float dl = float(data_a[ib].d) * float(us - 32); + + buf_a[buf_idx ] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi ] & m) != 0) ? 0 : 4))); + buf_a[buf_idx + 1] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4))); +#elif defined(DATA_A_Q4_K) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = idx % 128; // 0..127 + + const uint n = iqs / 32; // 0,1,2,3 + const uint b = (iqs % 32) / 16; // 0,1 + const uint is = 2 * n + b; // 0..7 + const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126 + + const vec2 loadd = vec2(data_a[ib].d); + + const uint scidx0 = (is < 4) ? is : (is + 4); + const uint scidx1 = (is < 4) ? is : (is - 4); + const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0; + const uint scidxshift1 = (is < 4) ? 0 : 2; + const uint mbidx0 = is + 4; + const uint mbidx1 = (is < 4) ? is + 4 : is; + const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0; + const uint mbidxshift0 = (is < 4) ? 0 : 4; + const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0; + const uint mbidxshift1 = (is < 4) ? 
0 : 2; + + const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1)); + const uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1)); + + const float d = loadd.x * sc; + const float m = -loadd.y * mbyte; + + buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF), m)); + buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m)); +#elif defined(DATA_A_Q5_K) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = idx % 128; // 0..127 + + const uint n = iqs / 32; // 0,1,2,3 + const uint b = (iqs % 32) / 16; // 0,1 + const uint is = 2 * n + b; // 0..7 + const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126 + const uint qhi = (iqs % 16) * 2; // 0,2,4..30 + + const uint8_t hm = uint8_t(1 << (iqs / 16)); + + const vec2 loadd = vec2(data_a[ib].d); + + const uint scidx0 = (is < 4) ? is : (is + 4); + const uint scidx1 = (is < 4) ? is : (is - 4); + const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0; + const uint scidxshift1 = (is < 4) ? 0 : 2; + const uint mbidx0 = is + 4; + const uint mbidx1 = (is < 4) ? is + 4 : is; + const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0; + const uint mbidxshift0 = (is < 4) ? 0 : 4; + const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0; + const uint mbidxshift1 = (is < 4) ? 0 : 2; + + const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1)); + const uint8_t mbyte = uint8_t(((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1)); + + const float d = loadd.x * sc; + const float m = -loadd.y * mbyte; + + buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0), m)); + buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 
16 : 0), m)); +#elif defined(DATA_A_Q6_K) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = idx % 128; // 0..127 + + const uint n = iqs / 64; // 0,1 + const uint b = (iqs % 64) / 32; // 0,1 + const uint is_b = (iqs % 16) / 8; // 0,1 + const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6 + const uint is = 8 * n + qhshift + is_b; // 0..15 + const uint qsi = n * 64 + (iqs % 32) * 2; // 0,2,4..126 + const uint qhi = n * 32 + (iqs % 16) * 2; // 0,2,4..62 + + const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]); + + buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32)); + buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32)); +#elif defined(DATA_A_IQ1_S) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + + const uint ib = idx / 32; // 8 values per idx + const uint ib32 = (idx % 32) / 4; // 0..7 + const uint ib8 = idx % 32; + + const float d = float(data_a[ib].d); + const uint qh = data_a[ib].qh[ib32]; + const uint qs = data_a[ib].qs[ib8]; + const float dl = d * (2 * bitfieldExtract(qh, 12, 3) + 1); + const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA; + const int16_t grid = int16_t(iq1s_grid[qs | (bitfieldExtract(qh, 3 * int(ib8 & 3), 3) << 8)]); + + [[unroll]] for (int k = 0; k < 8; ++k) { + buf_a[buf_idx + k] = FLOAT_TYPE(dl * (bitfieldExtract(grid, 2 * k, 2) + delta)); + } +#elif defined(DATA_A_IQ1_M) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + + const uint ib = idx / 32; // 8 values per idx + const uint ib8 = idx % 32; + const uint ib16 = ib8 / 2; + + const uint16_t[4] scales = data_a[ib].scales; + const u16vec4 s = u16vec4(scales[0], scales[1], scales[2], scales[3]) >> 12; + const float d = float(unpackHalf2x16(s.x | (s.y << 4) | (s.z << 8) | (s.w << 12)).x); + const uint sc = scales[ib8 / 8]; + const uint qs = data_a[ib].qs[ib8]; + const uint qh = data_a[ib].qh[ib16] >> (4 * (ib8 & 1)); + const float dl = d * (2 * bitfieldExtract(sc, 3 * int(ib16 & 3), 3) + 1); + const float delta = ((qh & 8) != 0) ? 
-IQ1M_DELTA : IQ1M_DELTA; + const int16_t grid = int16_t(iq1s_grid[qs | ((qh & 7) << 8)]); + + [[unroll]] for (int k = 0; k < 8; ++k) { + buf_a[buf_idx + k] = FLOAT_TYPE(dl * (bitfieldExtract(grid, 2 * k, 2) + delta)); + } +#elif defined(DATA_A_IQ2_XXS) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + + const uint ib = idx / 32; // 8 values per idx + const uint ib32 = (idx % 32) / 4; // 0..7 + const uint ib8 = idx % 4; + + const float d = float(data_a[ib].d); + const uint qs = data_a[ib].qs[8 * ib32 + ib8]; + const uint signs = pack32(u8vec4( + data_a[ib].qs[8*ib32 + 4], + data_a[ib].qs[8*ib32 + 5], + data_a[ib].qs[8*ib32 + 6], + data_a[ib].qs[8*ib32 + 7] + )); + const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + (signs >> 28))); + const uint32_t sign7 = bitfieldExtract(signs, 7 * int(ib8), 7); + const uint sign = sign7 | (bitCount(sign7) << 7); + const uvec2 grid = iq2xxs_grid[qs]; + const vec4 grid0 = vec4(unpack8(grid.x)); + const vec4 grid1 = vec4(unpack8(grid.y)); + + buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x); + buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y); + buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z); + buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w); + buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x); + buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y); + buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z); + buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w); +#elif defined(DATA_A_IQ2_XS) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + + const uint ib = idx / 32; // 8 values per idx + const uint ib32 = (idx % 32) / 4; // 0..7 + const uint ib8 = idx % 4; // 0..3 + + const float d = float(data_a[ib].d); + const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf; + const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + scale)); + const uint qs = data_a[ib].qs[4 * ib32 + ib8]; + const uint sign7 = qs >> 9; + const uint sign = sign7 | (bitCount(sign7) << 7); + const uvec2 grid = iq2xs_grid[qs & 511]; + const vec4 grid0 = vec4(unpack8(grid.x)); + const vec4 grid1 = vec4(unpack8(grid.y)); + + buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x); + buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y); + buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z); + buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w); + buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x); + buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y); + buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z); + buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? 
-grid1.w : grid1.w); +#elif defined(DATA_A_IQ2_S) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + + const uint ib = idx / 32; // 8 values per idx + const uint ib8 = idx % 32; // 0..31 + const uint ib32 = ib8 / 4; // 0..7 + + const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf; + const uint qs = data_a[ib].qs[ib8]; + const uint qh = data_a[ib].qh[ib32]; + const uint qhshift = 2 * (ib8 % 4); + const uint sign = data_a[ib].qs[QUANT_K / 8 + ib8]; + + const float d = float(data_a[ib].d); + const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + scale)); + const uvec2 grid = iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)]; + const vec4 grid0 = vec4(unpack8(grid.x)); + const vec4 grid1 = vec4(unpack8(grid.y)); + + buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x); + buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y); + buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z); + buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w); + buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x); + buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y); + buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z); + buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w); +#elif defined(DATA_A_IQ3_XXS) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + + const uint ib = idx / 64; // 4 values per idx + const uint iqs = idx % 64; // 0..63 + const uint is = QUANT_K / 4 + 4 * (iqs / 8); // 8 values + + const float d = float(data_a[ib].d); + const uint qs = data_a[ib].qs[iqs]; + const uint signs = pack32(u8vec4( + data_a[ib].qs[is+0], + data_a[ib].qs[is+1], + data_a[ib].qs[is+2], + data_a[ib].qs[is+3] + )); + const float db = d * 0.5 * (0.5 + (signs >> 28)); + const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7); + const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (4 * (idx % 2)); + const uint grid = iq3xxs_grid[qs]; + const vec4 v = db * vec4(unpack8(grid)); + + buf_a[buf_idx ] = FLOAT_TYPE((sign & 1) != 0 ? -v.x : v.x); + buf_a[buf_idx + 1] = FLOAT_TYPE((sign & 2) != 0 ? -v.y : v.y); + buf_a[buf_idx + 2] = FLOAT_TYPE((sign & 4) != 0 ? -v.z : v.z); + buf_a[buf_idx + 3] = FLOAT_TYPE((sign & 8) != 0 ? -v.w : v.w); +#elif defined(DATA_A_IQ3_S) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + + const uint ib = idx / 64; // 4 values per idx + const uint iqs = idx % 64; // 0..63 + const uint iqh = iqs / 8; + + const float d = float(data_a[ib].d); + const uint qs = data_a[ib].qs[iqs]; + const uint qh = data_a[ib].qh[iqh]; + const int8_t sign = int8_t(data_a[ib].signs[iqs / 2] >> (4 * (idx % 2))); + const uint scale = data_a[ib].scales[iqs / 16]; + const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(sign << 1, sign))); + const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf)); + const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)]; + const vec4 v = db * vec4(unpack8(grid)); + + buf_a[buf_idx ] = FLOAT_TYPE((sign & 1) != 0 ? -v.x : v.x); + buf_a[buf_idx + 1] = FLOAT_TYPE((sign & 2) != 0 ? -v.y : v.y); + buf_a[buf_idx + 2] = FLOAT_TYPE((sign & 4) != 0 ? -v.z : v.z); + buf_a[buf_idx + 3] = FLOAT_TYPE((sign & 8) != 0 ? 
-v.w : v.w); +#elif defined(DATA_A_IQ4_XS) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A; + + const uint ib = idx / 128; // 2 values per idx + const uint ib32 = (idx % 128) / 16; // 0..7 + const uint iq = 16 * ib32 + 2 * (idx % 8); + + const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF; + const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3; + const uint qshift = (idx & 8) >> 1; + u8vec2 qs = u8vec2(data_a[ib].qs[iq], data_a[ib].qs[iq + 1]); + qs = (qs >> qshift) & uint8_t(0xF); + + const float d = float(data_a[ib].d); + const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]); + + buf_a[buf_idx ] = FLOAT_TYPE(v.x); + buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); +#elif defined(DATA_A_IQ4_NL) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + 2 * row; + + const uint ib = idx / 8; + const uint iqs = idx & 0x07; + + const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib].d); + const uint vui = uint(data_a_packed16[ib].qs[iqs]); + + buf_a[buf_idx ] = FLOAT_TYPE(kvalues_iq4nl[vui & 0xF]) * d; + buf_a[buf_idx + 1 ] = FLOAT_TYPE(kvalues_iq4nl[bitfieldExtract(vui, 8, 4)]) * d; + buf_a[buf_idx + 16] = FLOAT_TYPE(kvalues_iq4nl[bitfieldExtract(vui, 4, 4)]) * d; + buf_a[buf_idx + 17] = FLOAT_TYPE(kvalues_iq4nl[vui >> 12]) * d; +#elif defined(DATA_A_MXFP4) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + 2 * row; + + const uint ib = idx / 8; + const uint iqs = (idx & 0x07) * 2; + + const float d = e8m0_to_fp32(data_a[ib].e); + const uint vui = uint(data_a[ib].qs[iqs]); + const uint vui2 = uint(data_a[ib].qs[iqs+1]); + + buf_a[buf_idx ] = FLOAT_TYPE(kvalues_mxfp4[vui & 0xF] * d); + buf_a[buf_idx + 16] = FLOAT_TYPE(kvalues_mxfp4[vui >> 4] * d); + buf_a[buf_idx + 1] = FLOAT_TYPE(kvalues_mxfp4[vui2 & 0xF] * d); + buf_a[buf_idx + 17] = FLOAT_TYPE(kvalues_mxfp4[vui2 >> 4] * d); +#endif +} + +#if !defined(MUL_MAT_ID) +void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uint idx_n, const uint idx_k, const uint end_k) { +#if LOAD_VEC_B == 8 + // Not supported for b_type bf16 because bf16mat2x4 does not exist + const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B; + FLOAT_TYPE_VEC8 bb = FLOAT_TYPE_VEC8(data_b[idx]); + buf_b[buf_idx + 0] = bb[0].x; + buf_b[buf_idx + 1] = bb[0].y; + buf_b[buf_idx + 2] = bb[0].z; + buf_b[buf_idx + 3] = bb[0].w; + buf_b[buf_idx + 4] = bb[1].x; + buf_b[buf_idx + 5] = bb[1].y; + buf_b[buf_idx + 6] = bb[1].z; + buf_b[buf_idx + 7] = bb[1].w; +#elif LOAD_VEC_B == 4 + const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B; +#if defined(DATA_B_BF16) + FLOAT_TYPE_VEC4 bb = FLOAT_TYPE_VEC4(TO_FLOAT_TYPE(data_b[idx])); +#else + FLOAT_TYPE_VEC4 bb = FLOAT_TYPE_VEC4(data_b[idx]); +#endif + buf_b[buf_idx + 0] = bb.x; + buf_b[buf_idx + 1] = bb.y; + buf_b[buf_idx + 2] = bb.z; + buf_b[buf_idx + 3] = bb.w; +#else // LOAD_VEC_B == 1 + if (idx_n < p.N && idx_k < end_k) { + buf_b[col * SHMEM_STRIDE + row] = TO_FLOAT_TYPE(data_b[pos_b + col * p.stride_b + row]); + } else { + buf_b[col * SHMEM_STRIDE + row] = FLOAT_TYPE(0.0f); + } +#endif +} +#else +void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uint ic, const uint _ne1, const uint idx_k, const uint end_k) { +#if 
LOAD_VEC_B == 8 + // Not supported for b_type bf16 because bf16mat2x4 does not exist + const u16vec2 row_idx = row_ids[col]; + const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B; + FLOAT_TYPE_VEC8 bb = FLOAT_TYPE_VEC8(data_b[idx]); + buf_b[buf_idx + 0] = bb[0].x; + buf_b[buf_idx + 1] = bb[0].y; + buf_b[buf_idx + 2] = bb[0].z; + buf_b[buf_idx + 3] = bb[0].w; + buf_b[buf_idx + 4] = bb[1].x; + buf_b[buf_idx + 5] = bb[1].y; + buf_b[buf_idx + 6] = bb[1].z; + buf_b[buf_idx + 7] = bb[1].w; +#elif LOAD_VEC_B == 4 + const u16vec2 row_idx = row_ids[col]; + const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B; +#if defined(DATA_B_BF16) + FLOAT_TYPE_VEC4 bb = FLOAT_TYPE_VEC4(TO_FLOAT_TYPE(data_b[idx])); +#else + FLOAT_TYPE_VEC4 bb = FLOAT_TYPE_VEC4(data_b[idx]); +#endif + buf_b[buf_idx + 0] = bb.x; + buf_b[buf_idx + 1] = bb.y; + buf_b[buf_idx + 2] = bb.z; + buf_b[buf_idx + 3] = bb.w; +#else // LOAD_VEC_B == 1 + const uint row_i = ic * BN + col; + if (row_i < _ne1 && idx_k < end_k) { + const u16vec2 row_idx = row_ids[col]; + buf_b[col * SHMEM_STRIDE + row] = TO_FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row]); + } else { + buf_b[col * SHMEM_STRIDE + row] = FLOAT_TYPE(0.0f); + } +#endif +} +#endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index c2acc803f..b4b7a126a 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -13,13 +13,10 @@ #if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1 #define A_TYPE float -#define A_TYPE32 float #elif LOAD_VEC_A == 4 #define A_TYPE vec4 -#define A_TYPE32 vec4 #elif LOAD_VEC_A == 8 #define A_TYPE mat2x4 -#define A_TYPE32 mat2x4 #endif #endif @@ -29,13 +26,10 @@ #if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1 #define A_TYPE float16_t -#define A_TYPE32 float #elif LOAD_VEC_A == 4 #define A_TYPE f16vec4 -#define A_TYPE32 vec4 #elif LOAD_VEC_A == 8 #define A_TYPE f16mat2x4 -#define A_TYPE32 mat2x4 #endif #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index b6570e020..e818166d1 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -320,9 +320,7 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c std::string aligned_b_type_f32 = coopmat2 ? "float" : fp16 ? "mat2x4" : "vec4"; std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4"; - std::map<std::string, std::string> base_dict = { - {"FLOAT_TYPE_VEC2", (coopmat2 || fp16) ? "f16vec2" : "vec2"}, - }; + std::map<std::string, std::string> base_dict; std::string shader_name = "matmul"; if (matmul_id_type == MatMulIdType::DEFAULT) { @@ -349,26 +347,74 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c const std::string source_name = coopmat2 ?
"mul_mm_cm2.comp" : "mul_mm.comp"; - auto const &FLOAT_TYPE = [&](const std::string &t) -> std::string { - if (t == "bf16") { - // scalar path promotes to float - if (!coopmat && !coopmat2) { - return "float"; + auto const &FLOAT_TYPE = [&](int vec, const std::string &t) -> std::string { + switch (vec) { + case 1: + if (t == "bf16") { + // scalar path promotes to float + if (!coopmat && !coopmat2) { + return "float"; + } + return "bfloat16_t"; } - return "bfloat16_t"; + if (coopmat2 || fp16) { + return "float16_t"; + } + return "float"; + case 2: + if (t == "bf16") { + // scalar path promotes to float + if (!coopmat && !coopmat2) { + return "vec2"; + } + return "bf16vec2"; + } + if (coopmat2 || fp16) { + return "f16vec2"; + } + return "vec2"; + case 4: + if (t == "bf16") { + // scalar path promotes to float + if (!coopmat && !coopmat2) { + return "vec4"; + } + return "bf16vec4"; + } + if (coopmat2 || fp16) { + return "f16vec4"; + } + return "vec4"; + case 8: + if (t == "bf16") { + // scalar path promotes to float + if (!coopmat && !coopmat2) { + return "mat2x4"; + } + throw std::runtime_error("bf16 vec8 not supported"); + } + if (coopmat2 || fp16) { + return "f16mat2x4"; + } + return "mat2x4"; + default: + throw std::runtime_error("invalid vector size"); } - if (coopmat2 || fp16) { - return "float16_t"; - } - return "float"; + }; + + const std::map float_type_dict_f16 = { + {"FLOAT_TYPE", FLOAT_TYPE(1, "f16")}, + {"FLOAT_TYPE_VEC2", FLOAT_TYPE(2, "f16")}, + {"FLOAT_TYPE_VEC4", FLOAT_TYPE(4, "f16")}, + {"FLOAT_TYPE_VEC8", FLOAT_TYPE(8, "f16")}, }; // Shaders with f16 B_TYPE - string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc); - string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); - string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); - string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_f16", source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F16", 
"1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); // bf16 { @@ -379,13 +425,19 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c // scalar path promotes to float std::string to_float_type = (coopmat || coopmat2) ? "uintBitsToBFloat16EXT" : "bf16_to_fp32"; + const std::map float_type_dict_bf16 = { + {"FLOAT_TYPE", FLOAT_TYPE(1, "bf16")}, + {"FLOAT_TYPE_VEC2", FLOAT_TYPE(2, "bf16")}, + {"FLOAT_TYPE_VEC4", FLOAT_TYPE(4, "bf16")}, + }; + // If bfloat16 is not supported, then only compile the scalar (promote to fp32) shader #if !defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) if (!(coopmat || coopmat2)) #endif { - string_to_spv(shader_name + "_bf16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("bf16")}, {"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", "4"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "u16vec4"}, {"B_TYPE32", "vec4"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); - string_to_spv(shader_name + "_bf16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("bf16")}, {"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "uint16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_bf16_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict_bf16), {{"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", "4"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "u16vec4"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_bf16", source_name, merge_maps(merge_maps(base_dict, float_type_dict_bf16), {{"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "uint16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}}), fp16, coopmat, coopmat2, f16acc); } } @@ -406,20 +458,27 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c // For aligned matmul loads std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16" || tname == "bf16") ? 
load_vec : load_vec_quant; + const std::map<std::string, std::string> float_type_dict = { + {"FLOAT_TYPE", FLOAT_TYPE(1, tname)}, + {"FLOAT_TYPE_VEC2", FLOAT_TYPE(2, tname)}, + {"FLOAT_TYPE_VEC4", FLOAT_TYPE(4, tname)}, + {"FLOAT_TYPE_VEC8", FLOAT_TYPE(8, tname)}, + }; + // don't generate f32 variants for coopmat2 if (!coopmat2) { - string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); - string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); } if (tname != "f16" && tname != "f32") { - string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); - string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); } #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) if (!coopmat && !coopmat2 && matmul_id_type == MatMulIdType::NONE && is_legacy_quant(tname)) { - string_to_spv(shader_name + "_" + tname + "_q8_1", "mul_mmq.comp", merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"D_TYPE", "float"},}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_q8_1", "mul_mmq.comp", merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"D_TYPE", "float"},}), fp16, coopmat, coopmat2, f16acc); } #endif } From 0fa154e3502e940df914f03b41475a2b80b985b0 Mon Sep 17 00:00:00 2001 From: Adam Date: Mon, 15 Sep 2025 04:43:54 +1000 Subject: [PATCH 08/35] rocm.Dockerfile: added gfx1200,gfx1201 architectures to support AMD Radeon RX 9000 series (#15994) * rocm.Dockerfile: added gfx1200,gfx1201 architectures to support AMD
Radeon RX 9000 series https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html#rdna-os states the Radeon RX 9000 series is supported from Ubuntu 24.04.2, and the dockerfile is using 24.04, which is ROCm 6.4. This fixed the `ROCm error: invalid device function` I was getting when trying to use the rocm container. --- .devops/rocm.Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index cf19e6e02..221077197 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -4,7 +4,7 @@ ARG UBUNTU_VERSION=24.04 ARG ROCM_VERSION=6.4 ARG AMDGPU_VERSION=6.4 -# Target the CUDA build image +# Target the ROCm build image ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete ### Build image @@ -15,12 +15,12 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build # This is mostly tied to rocBLAS supported archs. # gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported # gfx906 is deprecated -#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html +#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html -ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102' +ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201' #ARG ROCM_DOCKER_ARCH=gfx1100 -# Set nvcc architectured +# Set ROCm architectures ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} # Enable ROCm # ENV CC=/opt/rocm/llvm/bin/clang From 9dcd200d57bc6f05a59a6a8df361d5d183af4124 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 14 Sep 2025 22:02:32 +0300 Subject: [PATCH 09/35] metal : remove memory pools (#15966) * metal : remove mem pool usage ggml-ci * metal : remove mem pool implementation ggml-ci * metal : take into account the actual allocated memory of the tensor ggml-ci * cont : use ggml_backend_buft_get_alloc_size ggml-ci * cont : improve, comments ggml-ci * cont : add functions for the extra tensor sizes * metal : add comments ggml-ci * metal : implement .get_alloc_size for the rest of the buffer types ggml-ci * metal : remove ggml_metal_heap ggml-ci --- ggml/src/ggml-metal/ggml-metal-common.cpp | 67 +-- ggml/src/ggml-metal/ggml-metal.m | 479 +++++----------------- 2 files changed, 144 insertions(+), 402 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-common.cpp b/ggml/src/ggml-metal/ggml-metal-common.cpp index 6a869ff24..cb39e5b2a 100644 --- a/ggml/src/ggml-metal/ggml-metal-common.cpp +++ b/ggml/src/ggml-metal/ggml-metal-common.cpp @@ -1,9 +1,12 @@ #include "ggml-metal-common.h" #include "ggml-impl.h" +#include "ggml-backend-impl.h" #include <vector> +// represents a memory range (i.e. an interval from a starting address p0 to an ending address p1 in a given buffer pb) +// the type indicates whether it is a source range (i.e. ops read data from it) or a destination range (i.e.
ops write data to it) struct ggml_mem_range { uint64_t pb; // buffer id @@ -36,8 +39,8 @@ void ggml_mem_ranges_reset(ggml_mem_ranges * mrs) { mrs->ranges.clear(); } -static bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, ggml_mem_range mrp) { - mrs->ranges.push_back(mrp); +static bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, ggml_mem_range mr) { + mrs->ranges.push_back(mr); return true; } @@ -48,20 +51,24 @@ static ggml_mem_range ggml_mem_range_from_tensor(const ggml_tensor * tensor, ggm GGML_ASSERT(!tensor->view_src); - ggml_mem_range mrp; + ggml_mem_range mr; if (tensor->buffer) { - // when the tensor is allocated, use the actual memory address range of the buffer - mrp = { + // when the tensor is allocated, use the actual memory address range in the buffer + // + // take the actual allocated size with ggml_backend_buft_get_alloc_size() + // this can be larger than the tensor size if the buffer type allocates extra memory + // ref: https://github.com/ggml-org/llama.cpp/pull/15966 + mr = { /*.pb =*/ (uint64_t) tensor->buffer, /*.p0 =*/ (uint64_t) tensor->data, - /*.p1 =*/ (uint64_t) tensor->data + ggml_nbytes(tensor), + /*.p1 =*/ (uint64_t) tensor->data + ggml_backend_buft_get_alloc_size(tensor->buffer->buft, tensor), /*.pt =*/ pt, }; } else { - // otherwise, the tensor ptr is used as an unique id of the memory ranges + // otherwise, the pointer address is used as an unique id of the memory ranges // that the tensor will be using when it is allocated - mrp = { + mr = { /*.pb =*/ (uint64_t) tensor, /*.p0 =*/ 0, // /*.p1 =*/ 1024, // [0, 1024) is a dummy range, not used @@ -69,7 +76,7 @@ static ggml_mem_range ggml_mem_range_from_tensor(const ggml_tensor * tensor, ggm }; }; - return mrp; + return mr; } static ggml_mem_range ggml_mem_range_from_tensor_src(const ggml_tensor * tensor) { @@ -83,25 +90,25 @@ static ggml_mem_range ggml_mem_range_from_tensor_dst(const ggml_tensor * tensor) static bool ggml_mem_ranges_add_src(ggml_mem_ranges * mrs, const ggml_tensor * tensor) { GGML_ASSERT(tensor); - ggml_mem_range mrp = ggml_mem_range_from_tensor_src(tensor); + ggml_mem_range mr = ggml_mem_range_from_tensor_src(tensor); if (mrs->debug > 2) { - GGML_LOG_DEBUG("%s: add src range buf=%lld, [%lld, %lld)\n", __func__, mrp.pb, mrp.p0, mrp.p1); + GGML_LOG_DEBUG("%s: add src range buf=%lld, [%lld, %lld)\n", __func__, mr.pb, mr.p0, mr.p1); } - return ggml_mem_ranges_add(mrs, mrp); + return ggml_mem_ranges_add(mrs, mr); } static bool ggml_mem_ranges_add_dst(ggml_mem_ranges * mrs, const ggml_tensor * tensor) { GGML_ASSERT(tensor); - ggml_mem_range mrp = ggml_mem_range_from_tensor_dst(tensor); + ggml_mem_range mr = ggml_mem_range_from_tensor_dst(tensor); if (mrs->debug > 2) { - GGML_LOG_DEBUG("%s: add dst range buf=%lld, [%lld, %lld)\n", __func__, mrp.pb, mrp.p0, mrp.p1); + GGML_LOG_DEBUG("%s: add dst range buf=%lld, [%lld, %lld)\n", __func__, mr.pb, mr.p0, mr.p1); } - return ggml_mem_ranges_add(mrs, mrp); + return ggml_mem_ranges_add(mrs, mr); } bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, const ggml_tensor * tensor) { @@ -114,24 +121,26 @@ bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, const ggml_tensor * tensor) { return ggml_mem_ranges_add_dst(mrs, tensor); } -static bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, ggml_mem_range mrp) { +static bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, ggml_mem_range mr) { for (size_t i = 0; i < mrs->ranges.size(); i++) { const auto & cmp = mrs->ranges[i]; - if (mrp.pb != cmp.pb) { + // two memory ranges cannot intersect if they are in different 
buffers + if (mr.pb != cmp.pb) { continue; } - if (mrp.pt == MEM_RANGE_TYPE_SRC && cmp.pt == MEM_RANGE_TYPE_SRC) { + // intersecting source ranges are allowed + if (mr.pt == MEM_RANGE_TYPE_SRC && cmp.pt == MEM_RANGE_TYPE_SRC) { continue; } - if (mrp.p0 < cmp.p1 && mrp.p1 >= cmp.p0) { + if (mr.p0 < cmp.p1 && mr.p1 >= cmp.p0) { if (mrs->debug > 2) { GGML_LOG_DEBUG("%s: the %s range buf=%lld, [%lld, %lld) overlaps with a previous %s range buf=%lld, [%lld, %lld)\n", __func__, - mrp.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst", - mrp.pb, mrp.p0, mrp.p1, + mr.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst", + mr.pb, mr.p0, mr.p1, cmp.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst", cmp.pb, cmp.p0, cmp.p1); } @@ -146,9 +155,9 @@ static bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, ggml_mem_range mr static bool ggml_mem_ranges_check_src(const ggml_mem_ranges * mrs, const ggml_tensor * tensor) { GGML_ASSERT(tensor); - ggml_mem_range mrp = ggml_mem_range_from_tensor_src(tensor); + ggml_mem_range mr = ggml_mem_range_from_tensor_src(tensor); - const bool res = ggml_mem_ranges_check(mrs, mrp); + const bool res = ggml_mem_ranges_check(mrs, mr); return res; } @@ -156,9 +165,9 @@ static bool ggml_mem_ranges_check_src(const ggml_mem_ranges * mrs, const ggml_te static bool ggml_mem_ranges_check_dst(const ggml_mem_ranges * mrs, const ggml_tensor * tensor) { GGML_ASSERT(tensor); - ggml_mem_range mrp = ggml_mem_range_from_tensor_dst(tensor); + ggml_mem_range mr = ggml_mem_range_from_tensor_dst(tensor); - const bool res = ggml_mem_ranges_check(mrs, mrp); + const bool res = ggml_mem_ranges_check(mrs, mr); return res; } @@ -222,6 +231,7 @@ static std::vector ggml_metal_graph_optimize_reorder(const std::vectorsrc[i]) { @@ -290,7 +300,10 @@ static std::vector ggml_metal_graph_optimize_reorder(const std::vector used(n, false); + // the memory ranges for the set of currently concurrent nodes ggml_mem_ranges * mrs0 = ggml_mem_ranges_init(0); + + // the memory ranges for the set of nodes that haven't been processed yet, when looking forward for a node to reorder ggml_mem_ranges * mrs1 = ggml_mem_ranges_init(0); for (int i0 = 0; i0 < n; i0++) { @@ -329,7 +342,7 @@ static std::vector ggml_metal_graph_optimize_reorder(const std::vector order(nodes.size()); diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 13f9de297..2243c174f 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -532,261 +532,9 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_COUNT }; -// -// ggml_metal_heap -// - -struct ggml_metal_heap { - // number of times the heap was unused - int n_unused; - - // total number of buffer allocations in this heap across all computes - int64_t n_alloc; - - // current offset in the heap - we reset this after each node in order to reuse the memory - size_t offs; - - // the currently allocated MTLBuffer objects in this heap - id<MTLHeap> obj; - - NSMutableArray * bufs; -}; - -static struct ggml_metal_heap * ggml_metal_heap_init(id<MTLDevice> device, size_t size) { - struct ggml_metal_heap * heap = calloc(1, sizeof(struct ggml_metal_heap)); - - MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; - desc.storageMode = MTLStorageModePrivate; - desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; - desc.type = MTLHeapTypePlacement; - desc.size = size; - - heap->n_unused = 0; - heap->n_alloc = 0; - - heap->obj = [device newHeapWithDescriptor:desc]; - if (!heap->obj) { - GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size); - - free(heap); - - return false; - } - - [desc release]; - - heap->bufs = [[NSMutableArray alloc] init]; - - return heap; -} - -static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { - heap->offs = 0; - - // count how many graph computes the heap ended up being unused - if ([heap->bufs count] > 0) { - heap->n_unused = 0; - } else { - heap->n_unused++; - } - - for (id<MTLBuffer> buf in heap->bufs) { - [buf release]; - } - [heap->bufs removeAllObjects]; - - // tell the OS that it can reuse this memory if needed - // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc - [heap->obj setPurgeableState:MTLPurgeableStateVolatile]; -} - -static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { - if (heap == nil) { - return; - } - - ggml_metal_heap_reset(heap); - - [heap->obj release]; - [heap->bufs release]; - - free(heap); -} - -@interface ggml_metal_heap_ptr : NSObject - -@property (nonatomic, assign) struct ggml_metal_heap * data; - -@end - -@implementation ggml_metal_heap_ptr - @end - -// -// ggml_metal_mem_pool [TAG_MEM_POOL_REMOVE] -// - -struct ggml_metal_mem_pool { - id<MTLDevice> device; - - int n_heaps; // total number of heaps ever created (including those that were removed) - - NSMutableArray * heaps; - NSMutableArray * heaps_to_remove; -}; - -static struct ggml_metal_mem_pool * ggml_metal_mem_pool_init(void) { - struct ggml_metal_mem_pool * mem_pool = calloc(1, sizeof(struct ggml_metal_mem_pool)); - - mem_pool->n_heaps = 0; - - mem_pool->heaps = [[NSMutableArray alloc] init]; - mem_pool->heaps_to_remove = [[NSMutableArray alloc] init]; - - return mem_pool; -} - -static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) { - GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu (total = %d)\n", __func__, [mem_pool->heaps count], mem_pool->n_heaps); - - size_t size_all = 0; - size_t size_cur = 0; - - for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { - GGML_LOG_DEBUG("%s: heap: %p\n", __func__, (void *) ptr.data); - GGML_LOG_DEBUG("%s: n_alloc: %" PRId64 "\n", __func__, ptr.data->n_alloc); - GGML_LOG_DEBUG("%s: n_unused: %d\n", __func__, ptr.data->n_unused); - GGML_LOG_DEBUG("%s: size: %.2f MiB\n", __func__, [ptr.data->obj size] / 1024.0 / 1024.0); - GGML_LOG_DEBUG("%s: bufs: %zu\n", __func__, [ptr.data->bufs count]); - - if ([ptr.data->bufs count] > 0) { - size_cur += [ptr.data->obj size]; - } - size_all += [ptr.data->obj size]; - - ggml_metal_heap_free(ptr.data); - [ptr release]; - } - [mem_pool->heaps release]; - [mem_pool->heaps_to_remove release]; - - if (size_all > 0) { - GGML_LOG_DEBUG("%s: size_all: %.2f MiB\n", __func__, size_all / 1024.0 / 1024.0); - GGML_LOG_DEBUG("%s: size_cur: %.2f MiB\n", __func__, size_cur / 1024.0 / 1024.0); - } - - free(mem_pool); -} - -static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) { - for (NSUInteger i = 0; i < [mem_pool->heaps count]; i++) { - ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:i]; - - struct ggml_metal_heap * heap = ptr.data; - ggml_metal_heap_reset(heap); - - // if the heap hasn't been used for a while, remove it - if (heap->n_unused >= 128) { - [mem_pool->heaps_to_remove addObject:@(i)]; - } - } - - if (mem_pool->heaps_to_remove.count > 0) { - // remove in reverse order - for (NSUInteger i = [mem_pool->heaps_to_remove count] - 1; ; --i) { - NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue]; - ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index]; - - struct ggml_metal_heap * heap = ptr.data; - ggml_metal_heap_free(heap); -
[mem_pool->heaps removeObjectAtIndex:index]; - [ptr release]; - - if (i == 0) { - break; - } - } - - [mem_pool->heaps_to_remove removeAllObjects]; - } -} - -static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { - for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { - ptr.data->offs = 0; - } -} - -static id<MTLBuffer> ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) { - const size_t alignment = 256; - - const size_t size_aligned = GGML_PAD(size, alignment); - - // try one of the existing heaps - for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { - struct ggml_metal_heap * heap = ptr.data; - if (heap->offs + size_aligned <= [heap->obj size]) { - // if this is the first buffer in the heap for the current command buffer, tell the OS that - // it cannot free the memory used by the heap - // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc - if ([heap->bufs count] == 0) { - [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; - } - - id<MTLBuffer> buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; - if (buf == nil) { - GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); - return nil; - } - - heap->n_alloc++; - heap->offs += size_aligned; - - [heap->bufs addObject:buf]; - - return buf; - } - } - - // create a new heap that can fit this buffer - ggml_metal_heap_ptr * heap_ptr = [ggml_metal_heap_ptr new]; - - struct ggml_metal_heap * heap = ggml_metal_heap_init(mem_pool->device, size_aligned); - if (heap == NULL) { - GGML_LOG_ERROR("%s: error: failed to create heap of size %zu\n", __func__, size_aligned); - return NULL; - } - - //GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]); - - heap_ptr.data = heap; - ggml_metal_heap_reset(heap); - - [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; - id<MTLBuffer> buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; - if (buf == nil) { - GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); - return NULL; - } - - heap->n_alloc++; - heap->offs += size_aligned; - - [heap->bufs addObject:buf]; - - [mem_pool->heaps addObject:heap_ptr]; - mem_pool->n_heaps++; - - return buf; -} - struct ggml_metal_command_buffer { id<MTLCommandBuffer> obj; - // each command buffer has a memory pool from which it can allocate temporary buffers during the compute - struct ggml_metal_mem_pool * mem_pool; - // used to enable concurrent execution of ops in the command buffers struct ggml_mem_ranges * mem_ranges; }; @@ -1103,9 +851,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { ctx->cmd_bufs[i].obj = nil; - ctx->cmd_bufs[i].mem_pool = ggml_metal_mem_pool_init(); - ctx->cmd_bufs[i].mem_pool->device = device; - if (ctx_dev->use_concurrency) { ctx->cmd_bufs[i].mem_ranges = ggml_mem_ranges_init(ctx_dev->debug_graph); } @@ -1510,6 +1255,52 @@ static id<MTLComputePipelineState> ggml_metal_compile_kernel(ggml_backend_t back return res; } +// tokens per expert +static size_t ggml_metal_mul_mat_id_extra_tpe(const struct ggml_tensor * op) { + assert(op->op == GGML_OP_MUL_MAT_ID); + + const int64_t ne02 = op->src[0]->ne[2]; // n_expert + + return ggml_type_size(GGML_TYPE_I32)*ne02; +} + +// id map [n_tokens, n_expert] +static size_t ggml_metal_mul_mat_id_extra_ids(const struct ggml_tensor * op) { + assert(op->op == GGML_OP_MUL_MAT_ID); + + const int64_t ne02 = op->src[0]->ne[2]; // n_expert + const int64_t ne21 = op->src[2]->ne[1]; // n_token + + return ggml_type_size(GGML_TYPE_I32)*ne02*ne21; +} + +// return true if we should use the FA vector kernel for this op +static bool ggml_metal_flash_attn_ext_use_vec(const struct ggml_tensor * op) { + assert(op->op == GGML_OP_FLASH_ATTN_EXT); + + const int64_t ne00 = op->src[0]->ne[0]; // head size + const int64_t ne01 = op->src[0]->ne[1]; // batch size + + // use vec kernel if the batch size is small and if the head size is supported + return (ne01 < 20) && (ne00 % 32 == 0); +} + +static size_t ggml_metal_flash_attn_ext_extra_tmp(const struct ggml_tensor * op) { + assert(op->op == GGML_OP_FLASH_ATTN_EXT); + + const int64_t nwg = 32; + + const int64_t ne01 = op->src[0]->ne[1]; + const int64_t ne02 = op->src[0]->ne[2]; + const int64_t ne03 = op->src[0]->ne[3]; + const int64_t ne20 = op->src[2]->ne[0]; + + // temp buffer for writing the results from each workgroup + // - ne20: the size of the Value head + // - + 2: the S and M values for each intermediate result + return ggml_type_size(GGML_TYPE_F32)*(ne01*ne02*ne03*nwg*(ne20 + 2)); +} + static id<MTLComputePipelineState> ggml_metal_get_pipeline_flash_attn_ext( ggml_backend_t backend, struct ggml_tensor * op, bool has_mask, @@ -1760,8 +1551,6 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { [ctx->cmd_bufs[i].obj release]; } - ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); - if (ctx->cmd_bufs[i].mem_ranges) { ggml_mem_ranges_free(ctx->cmd_bufs[i].mem_ranges); } @@ -2127,8 +1916,6 @@ struct ggml_metal_encode_context { id<MTLComputeCommandEncoder> encoder; - struct ggml_metal_mem_pool * mem_pool; - struct ggml_mem_ranges * mem_ranges; }; @@ -2165,8 +1952,6 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in id<MTLComputeCommandEncoder> encoder = ctx_enc->encoder; - struct ggml_metal_mem_pool * mem_pool = ctx_enc->mem_pool; - struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -2207,8 +1992,6 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in GGML_ABORT("unsupported op"); } - ggml_metal_mem_pool_clear(mem_pool); - const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ?
src0->ne[2] : 0; @@ -2522,7 +2305,6 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in /*.nb02 =*/ nb02, /*.nb11 =*/ nb11, /*.nb21 =*/ nb21, - }; [encoder setComputePipelineState:pipeline]; @@ -3167,54 +2949,8 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); -// use this branch to test the ggml_metal_mem_pool functionality -#if 0 - // cpy to tmp buffer in MTLHeap - - id<MTLBuffer> h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0)); - if (!h_src0) { - GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0)); - return 0; - } - - offs_src0 = 0; - - ggml_metal_kargs_cpy args_cpy = { - /*.ne00 =*/ ne00, - /*.ne01 =*/ ne01, - /*.ne02 =*/ ne02, - /*.ne03 =*/ ne03, - /*.nb00 =*/ nb00, - /*.nb01 =*/ nb01, - /*.nb02 =*/ nb02, - /*.nb03 =*/ nb03, - /*.ne0 =*/ ne00, - /*.ne1 =*/ ne01, - /*.ne2 =*/ ne02, - /*.ne3 =*/ ne03, - /*.nb0 =*/ nb00, - /*.nb1 =*/ nb01, - /*.nb2 =*/ nb02, - /*.nb3 =*/ nb03, - }; - - if (src0->type == GGML_TYPE_F16) { - [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline]; - } else { - [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline]; - } - [encoder setBytes:&args_cpy length:sizeof(args_cpy) atIndex:0]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:h_src0 offset:0 atIndex:2]; - - GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); - int nth_cpy = MIN(1024, ne00 / ggml_blck_size(src0->type)); - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth_cpy, 1, 1)]; - -#else id<MTLBuffer> h_src0 = id_src0; -#endif + // softmax ggml_metal_kargs_soft_max args = { @@ -4093,28 +3829,9 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in default: break; } - // TODO: using mem pool allocations with enabled concurrency is not safe because the mem pool - // reuses buffers. this can result in 2 concurrent MUL_MAT_ID ops using the same mem pool buffer. - // so we add this extra barrier to prevent the race. - // the correct solution is to remove mem pools and then remove this barrier [TAG_MEM_POOL_REMOVE] - ggml_metal_encode_concurrency_reset(ctx_enc); - - // tokens per expert - const size_t s_tpe = ggml_type_size(GGML_TYPE_I32)*ne02; - id<MTLBuffer> h_tpe = ggml_metal_mem_pool_alloc(mem_pool, s_tpe); - if (!h_tpe) { - GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_tpe); - return 0; - } - - // id map - // [n_tokens, n_expert] - const size_t s_ids = ggml_type_size(GGML_TYPE_I32)*ne21*ne02; - id<MTLBuffer> h_ids = ggml_metal_mem_pool_alloc(mem_pool, s_ids); - if (!h_ids) { - GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_ids); - return 0; - } + // extra buffers for intermediate id mapping + size_t offs_tpe = offs_dst + ggml_nbytes(dst); + size_t offs_ids = offs_tpe + ggml_metal_mul_mat_id_extra_tpe(dst); { ggml_metal_kargs_mul_mm_id_map0 args = { @@ -4152,8 +3869,8 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in [encoder setComputePipelineState:pipeline]; [encoder setBytes:&args length:sizeof(args) atIndex:0]; [encoder setBuffer:id_src2 offset:offs_src2 atIndex:1]; - [encoder setBuffer: h_tpe offset:0 atIndex:2]; - [encoder setBuffer: h_ids offset:0 atIndex:3]; + [encoder setBuffer:id_dst offset:offs_tpe atIndex:2]; + [encoder setBuffer:id_dst offset:offs_ids atIndex:3]; [encoder setThreadgroupMemoryLength:smem atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(ne02, 1, 1)]; @@ -4215,8 +3932,8 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in [encoder setBytes:&args length:sizeof(args) atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; - [encoder setBuffer: h_tpe offset:0 atIndex:3]; - [encoder setBuffer: h_ids offset:0 atIndex:4]; + [encoder setBuffer:id_dst offset:offs_tpe atIndex:3]; + [encoder setBuffer:id_dst offset:offs_ids atIndex:4]; [encoder setBuffer:id_dst offset:offs_dst atIndex:5]; [encoder setThreadgroupMemoryLength:8192 atIndex:0]; @@ -5306,8 +5023,7 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in GGML_ASSERT(ne01 < 65536); - // use non-vec kernel if the batch size is large or if the vec-kernel is not supported for this head size - if (ne01 >= 20 || (ne00 % 32 != 0)) { + if (!ggml_metal_flash_attn_ext_use_vec(dst)) { // half8x8 kernel const int64_t nqptg = 8; // queries per threadgroup !! sync with kernel template arguments !! const int64_t ncpsg = 64; // cache values per simdgroup !! sync with kernel template arguments !!
@@ -5532,34 +5248,20 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in GGML_ASSERT(ne01*ne02*ne03 == ne1*ne2*ne3); GGML_ASSERT(ne1*ne2*ne3 <= (1u << 31)); - // using mem pool allocations with enabled concurrency is not safe [TAG_MEM_POOL_REMOVE] - // still, we assume that concurrent FA won't happen before we do the refactor - //ggml_metal_encode_concurrency_reset(ctx_enc); - - const int32_t nrows = ne1*ne2*ne3; - - // temp buffer for writing the results from each workgroup - // - ne20: the size of the head vector - // - + 2: the S and M values for each intermediate result - const size_t s_tmp = ggml_type_size(GGML_TYPE_F32)*(nrows*nwg*(ne20 + 2)); - id<MTLBuffer> h_tmp = ggml_metal_mem_pool_alloc(mem_pool, s_tmp); - if (!h_tmp) { - GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_tmp); - return 0; - } - - //printf("ne01 = %d, ne02 = %d, ne03 = %d, ne20 = %d\n", ne01, ne02, ne03, ne20); - //printf("needed memory: %.3f MiB\n", (float) (ne01*ne02*ne03*ne20*sizeof(float))/1024.0f/1024.0f); - - [encoder setBuffer:h_tmp offset:0 atIndex:6]; + // write the results from each workgroup into a temp buffer + const size_t offs_tmp = offs_dst + ggml_nbytes(dst); + [encoder setBuffer:id_dst offset:offs_tmp atIndex:6]; [encoder setThreadgroupMemoryLength:smem atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)]; + // sync the 2 kernels ggml_metal_encode_concurrency_reset(ctx_enc); // reduce the results from the workgroups { + const int32_t nrows = ne1*ne2*ne3; + ggml_metal_kargs_flash_attn_ext_vec_reduce args0 = { nrows, }; @@ -5568,7 +5270,7 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in [encoder setComputePipelineState:pipeline0]; [encoder setBytes:&args0 length:sizeof(args0) atIndex:0]; - [encoder setBuffer:h_tmp offset:0 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_tmp atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; //printf("ne1 = %d, ne2 = %d, ne3 = %d, ne20 = %d\n", ne1, ne2, ne3, ne20); @@ -5895,12 +5597,7 @@ static enum ggml_status ggml_metal_graph_compute( // the main thread commits the first few commands immediately // cmd_buf[n_cb] { - // cannot use commandBufferWithUnretainedReferences because the buffers from the memory pool can get destroyed - // TODO: when the memory pools are removed, we can again use commandBufferWithUnretainedReferences - // https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2334215009 - // [TAG_MEM_POOL_REMOVE] - //id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; - id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer]; + id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; [cmd_buf retain]; if (ctx->cmd_bufs[n_cb].obj) { @@ -5919,8 +5616,7 @@ static enum ggml_status ggml_metal_graph_compute( // prepare the rest of the command buffers asynchronously (optional) // cmd_buf[0.. n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - //id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; - id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer]; + id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; [cmd_buf retain]; if (ctx->cmd_bufs[cb_idx].obj) { @@ -6377,6 +6073,31 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba return ggml_backend_buffer_init(buft, buf_i, ctx, size); } +static size_t ggml_backend_metal_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) { + size_t res = ggml_nbytes(tensor); + + // some operations require additional memory for fleeting data: + switch (tensor->op) { + case GGML_OP_MUL_MAT_ID: + { + res += ggml_metal_mul_mat_id_extra_tpe(tensor); + res += ggml_metal_mul_mat_id_extra_ids(tensor); + } break; + case GGML_OP_FLASH_ATTN_EXT: + { + if (ggml_metal_flash_attn_ext_use_vec(tensor)) { + res += ggml_metal_flash_attn_ext_extra_tmp(tensor); + } + } break; + default: + break; + } + + return res; + + GGML_UNUSED(buft); +} + // default (shared) buffer type static const char * ggml_backend_metal_buffer_type_shared_get_name(ggml_backend_buffer_type_t buft) { @@ -6401,6 +6122,10 @@ static size_t ggml_backend_metal_buffer_type_shared_get_max_size(ggml_backend_bu return max_size; } +static size_t ggml_backend_metal_buffer_type_shared_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) { + return ggml_backend_metal_buffer_type_get_alloc_size(buft, tensor); +} + static bool ggml_backend_metal_buffer_type_shared_is_host(ggml_backend_buffer_type_t buft) { return false; @@ -6414,7 +6139,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_shared(void) { /* .alloc_buffer = */ ggml_backend_metal_buffer_type_shared_alloc_buffer, /* .get_alignment = */ ggml_backend_metal_buffer_type_shared_get_alignment, /* .get_max_size = */ ggml_backend_metal_buffer_type_shared_get_max_size, - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .get_alloc_size = */ ggml_backend_metal_buffer_type_shared_get_alloc_size, /* .is_host = */ ggml_backend_metal_buffer_type_shared_is_host, }, /* .device = */ &g_ggml_backend_metal_device, @@ -6448,6 +6173,10 @@ static size_t ggml_backend_metal_buffer_type_private_get_max_size(ggml_backend_b return max_size; } +static size_t ggml_backend_metal_buffer_type_private_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) { + return ggml_backend_metal_buffer_type_get_alloc_size(buft, tensor); +} + static bool ggml_backend_metal_buffer_type_private_is_host(ggml_backend_buffer_type_t buft) { return false; @@ -6461,7 +6190,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_private(void) { /* .alloc_buffer = */ ggml_backend_metal_buffer_type_private_alloc_buffer, /* .get_alignment = */ ggml_backend_metal_buffer_type_private_get_alignment, /* .get_max_size = */ ggml_backend_metal_buffer_type_private_get_max_size, - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .get_alloc_size = */ ggml_backend_metal_buffer_type_private_get_alloc_size, /* .is_host = */ ggml_backend_metal_buffer_type_private_is_host, }, /* .device = */ &g_ggml_backend_metal_device, @@ -6496,6 +6225,10 @@ static size_t ggml_backend_metal_buffer_type_mapped_get_max_size(ggml_backend_bu return max_size; } +static size_t ggml_backend_metal_buffer_type_mapped_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) { + return
ggml_backend_metal_buffer_type_get_alloc_size(buft, tensor); +} + static bool ggml_backend_metal_buffer_type_mapped_is_host(ggml_backend_buffer_type_t buft) { return false; @@ -6511,7 +6244,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(void) { /* .alloc_buffer = */ ggml_backend_metal_buffer_type_mapped_alloc_buffer, /* .get_alignment = */ ggml_backend_metal_buffer_type_mapped_get_alignment, /* .get_max_size = */ ggml_backend_metal_buffer_type_mapped_get_max_size, - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .get_alloc_size = */ ggml_backend_metal_buffer_type_mapped_get_alloc_size, /* .is_host = */ ggml_backend_metal_buffer_type_mapped_is_host, }, /* .device = */ &g_ggml_backend_metal_device, @@ -6711,11 +6444,8 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; id cmd_buf = ctx->cmd_bufs[cb_idx].obj; - struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; struct ggml_mem_ranges * mem_ranges = ctx->cmd_bufs[cb_idx].mem_ranges; - ggml_metal_mem_pool_reset(mem_pool); - if (mem_ranges) { ggml_mem_ranges_reset(mem_ranges); } @@ -6743,7 +6473,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { struct ggml_metal_encode_context ctx_enc = { /*.backend =*/ backend, /*.encoder =*/ encoder, - /*.mem_pool =*/ mem_pool, /*.mem_ranges =*/ mem_ranges, }; From 6c019cb04e86e2dacfe62ce7666c64e9717dde1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 14 Sep 2025 21:17:04 +0200 Subject: [PATCH 10/35] server : only attempt to enable thinking if using jinja (#15967) --- tools/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 160b97cf7..519704fad 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2313,7 +2313,7 @@ struct server_context { // thinking is enabled if: // 1. It's not explicitly disabled (reasoning_budget == 0) // 2. The chat template supports it - const bool enable_thinking = params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get()); + const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get()); SRV_INF("Enable thinking? 
%d\n", enable_thinking); oai_parser_opt = { From b8e09f08b9a91c0401bc67d17a17c90756420346 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 14 Sep 2025 23:00:59 +0200 Subject: [PATCH 11/35] model : add grok-2 support (#15539) * add grok-2 support * type fix * type fix * type fix * "fix" vocab for invalid sequences * fix expert tensor mapping and spaces in vocab * add chat template * fix norm tensor mapping * rename layer_out_norm to ffn_post_norm * ensure ffn_post_norm is mapped * fix experts merging * remove erroneous FFN_GATE entry * concatenate split tensors and add more metadata * process all expert layers and try cat instead of hstack * add support for community BPE vocab * fix expert feed forward length and ffn_down concat * commit this too * add ffn_up/gate/down, unsure if sequence is right * add ffn_gate/down/up to tensor names * correct residual moe (still not working) * mess-- * fix embedding scale being applied twice * add built in chat template * change beta fast for grok if default value * remove spm vocab in favor of community bpe vocab * change attention temp length metadata type to integer * update attention temp length metadata * remove comment * replace M_SQRT2 with std::sqrt(2) * add yarn metadata, move defaults to hparams --- common/common.h | 6 +- convert_hf_to_gguf.py | 111 ++++++++++++++++++++++++--------- convert_hf_to_gguf_update.py | 1 + gguf-py/gguf/constants.py | 26 +++++--- gguf-py/gguf/gguf_writer.py | 21 +++++++ gguf-py/gguf/tensor_mapping.py | 10 ++- src/llama-arch.cpp | 31 ++++++--- src/llama-arch.h | 7 +++ src/llama-chat.cpp | 17 +++++ src/llama-chat.h | 1 + src/llama-context.cpp | 14 ++--- src/llama-graph.cpp | 6 +- src/llama-hparams.h | 14 ++++- src/llama-model.cpp | 100 ++++++++++++++++++++--------- src/llama-vocab.cpp | 11 ++++ src/llama-vocab.h | 1 + 16 files changed, 281 insertions(+), 96 deletions(-) diff --git a/common/common.h b/common/common.h index cf57d4841..5063d73f9 100644 --- a/common/common.h +++ b/common/common.h @@ -288,9 +288,9 @@ struct common_params { float rope_freq_base = 0.0f; // RoPE base frequency float rope_freq_scale = 0.0f; // RoPE frequency scaling factor float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor - float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor - float yarn_beta_fast = 32.0f; // YaRN low correction dim - float yarn_beta_slow = 1.0f; // YaRN high correction dim + float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor + float yarn_beta_fast = -1.0f; // YaRN low correction dim + float yarn_beta_slow = -1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length // offload params diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index bbc21813f..855789f1b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -735,6 +735,9 @@ class TextModel(ModelBase): if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c": # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B res = "qwen2" + if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273": + # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer + res = "grok-2" if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B res = "llama-bpe" @@ -2682,12 +2685,20 @@ class BitnetModel(TextModel): yield (new_name, data_torch) -@ModelBase.register("GrokForCausalLM") +@ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM") class 
GrokModel(TextModel): model_arch = gguf.MODEL_ARCH.GROK def set_vocab(self): - self._set_vocab_sentencepiece() + if (self.dir_model / 'tokenizer.model').is_file(): + self._set_vocab_sentencepiece() + return + + if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file(): + logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer') + sys.exit(1) + + self._set_vocab_gpt2() def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -2695,11 +2706,46 @@ class GrokModel(TextModel): def set_gguf_parameters(self): super().set_gguf_parameters() - _experts: list[dict[str, Tensor]] | None = None + self.gguf_writer.add_attn_logit_softcapping(self.hparams.get("attn_logit_softcapping", 30.0)) + self.gguf_writer.add_router_logit_softcapping(self.hparams.get("router_logit_softcapping", 30.0)) + if (final_logit_softcap := self.hparams.get("final_logit_softcapping")): + self.gguf_writer.add_final_logit_softcapping(final_logit_softcap) + + if (rope_dim := self.hparams.get("head_dim")) is None: + rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + + # Treat "original" as "yarn", seems to have been a mistake + if self.hparams.get("rope_type") in ("yarn", "original"): + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"]) + self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"]) + self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"]) + self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"]) + self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"]) + + if temp_len := self.hparams.get("attn_temperature_len"): + self.gguf_writer.add_attn_temperature_length(temp_len) + + self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5)) + self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"]) + self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"]) + + _experts: list[dict[str, list[Tensor]]] | None = None + _cur_expert = "" def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + tensors: list[tuple[str, Tensor]] = [] + is_expert = ".moe." in name or ".block_sparse_moe.experts." 
in name + + if not is_expert: + tensors.append((self.map_tensor_name(name), data_torch)) + # process the experts separately - if name.find(".moe.") != -1: + if is_expert or self._cur_expert: n_experts = self.hparams["num_local_experts"] assert bid is not None @@ -2707,32 +2753,41 @@ class GrokModel(TextModel): if self._experts is None: self._experts = [{} for _ in range(self.block_count)] - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for wid in ["linear", "linear_1", "linear_v"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: + # concatenate split tensors + if name in self._experts[bid]: + self._cur_expert = name + self._experts[bid][name].append(data_torch) return [] + elif is_expert: + self._cur_expert = name + self._experts[bid][name] = [data_torch] + return [] + else: + self._cur_expert = "" - return [(self.map_tensor_name(name), data_torch)] + for bid in range(self.block_count): + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for wid in [("linear", "w1", 0), ("linear_1", "w2", 1), ("linear_v", "w3", 0)]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight" + if ename not in self._experts[bid]: + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight" + tensor_list = self._experts[bid][ename] + datas.append(torch.cat(tensor_list, dim=wid[2]) if len(tensor_list) > 1 else tensor_list[0]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight" + + new_name = self.map_tensor_name(merged_name) + + yield (new_name, data_torch) + + yield from tensors @ModelBase.register("DbrxForCausalLM") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 575e05e19..eb8fdfa7e 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -158,6 +158,7 @@ pre_computed_hashes = [ {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"}, {"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"}, {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"}, + {"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"}, ] diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 1e88b6505..c7edef919 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -111,6 +111,7 @@ class Keys: DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" DECODER_BLOCK_COUNT = "{arch}.decoder_block_count" 
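# The *_LOGIT_SOFTCAPPING keys below all parameterize the same squashing
# transform; a minimal Python sketch of it, matching the scale -> tanh -> scale
# sequence the grok graph code applies later in this patch:
#
#     import math
#
#     def soft_cap(x: float, cap: float) -> float:
#         return cap * math.tanh(x / cap)  # maps any real x into (-cap, cap)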
ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" + ROUTER_LOGIT_SOFTCAPPING = "{arch}.router_logit_softcapping" FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" SWIN_NORM = "{arch}.swin_norm" RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers" @@ -146,21 +147,27 @@ class Keys: REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" SLIDING_WINDOW = "{arch}.attention.sliding_window" SCALE = "{arch}.attention.scale" + OUTPUT_SCALE = "{arch}.attention.output_scale" + TEMPERATURE_LENGTH = "{arch}.attention.temperature_length" KEY_LENGTH_MLA = "{arch}.attention.key_length_mla" VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla" SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers" SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern" class Rope: - DIMENSION_COUNT = "{arch}.rope.dimension_count" - DIMENSION_SECTIONS = "{arch}.rope.dimension_sections" - FREQ_BASE = "{arch}.rope.freq_base" - SCALING_TYPE = "{arch}.rope.scaling.type" - SCALING_FACTOR = "{arch}.rope.scaling.factor" - SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" - SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" - SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" - SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier" + DIMENSION_COUNT = "{arch}.rope.dimension_count" + DIMENSION_SECTIONS = "{arch}.rope.dimension_sections" + FREQ_BASE = "{arch}.rope.freq_base" + SCALING_TYPE = "{arch}.rope.scaling.type" + SCALING_FACTOR = "{arch}.rope.scaling.factor" + SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" + SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" + SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" + SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier" + SCALING_YARN_EXT_FACTOR = "{arch}.rope.scaling.yarn_ext_factor" + SCALING_YARN_ATTN_FACTOR = "{arch}.rope.scaling.yarn_attn_factor" + SCALING_YARN_BETA_FAST = "{arch}.rope.scaling.yarn_beta_fast" + SCALING_YARN_BETA_SLOW = "{arch}.rope.scaling.yarn_beta_slow" class Split: LLM_KV_SPLIT_NO = "split.no" @@ -1114,6 +1121,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_GATE_EXP, MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_POST_NORM, MODEL_TENSOR.LAYER_OUT_NORM, ], MODEL_ARCH.GPTNEOX: [ diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 7ff12f7f5..d925fca7e 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -733,6 +733,9 @@ class GGUFWriter: def add_attn_logit_softcapping(self, value: float) -> None: self.add_float32(Keys.LLM.ATTN_LOGIT_SOFTCAPPING.format(arch=self.arch), value) + def add_router_logit_softcapping(self, value: float) -> None: + self.add_float32(Keys.LLM.ROUTER_LOGIT_SOFTCAPPING.format(arch=self.arch), value) + def add_final_logit_softcapping(self, value: float) -> None: self.add_float32(Keys.LLM.FINAL_LOGIT_SOFTCAPPING.format(arch=self.arch), value) @@ -829,6 +832,12 @@ class GGUFWriter: def add_attention_scale(self, value: float) -> None: self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value) + def add_attn_output_scale(self, value: float) -> None: + self.add_float32(Keys.Attention.OUTPUT_SCALE.format(arch=self.arch), value) + + def add_attn_temperature_length(self, value: int) -> None: + self.add_uint32(Keys.Attention.TEMPERATURE_LENGTH.format(arch=self.arch), value) + def add_pooling_type(self, value: PoolingType) -> None: self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) @@ -859,6 +868,18 @@ 
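# Converter-side usage sketch for the four new YaRN writer methods (mirroring
# what GrokModel.set_gguf_parameters does above, with the same hparams names):
#
#     self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"])
#     self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"])
#     self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"])
#     self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"])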
class GGUFWriter: def add_rope_scaling_yarn_log_mul(self, value: float) -> None: self.add_float32(Keys.Rope.SCALING_YARN_LOG_MUL.format(arch=self.arch), value) + def add_rope_scaling_yarn_ext_factor(self, value: float) -> None: + self.add_float32(Keys.Rope.SCALING_YARN_EXT_FACTOR.format(arch=self.arch), value) + + def add_rope_scaling_yarn_attn_factor(self, value: float) -> None: + self.add_float32(Keys.Rope.SCALING_YARN_ATTN_FACTOR.format(arch=self.arch), value) + + def add_rope_scaling_yarn_beta_fast(self, value: float) -> None: + self.add_float32(Keys.Rope.SCALING_YARN_BETA_FAST.format(arch=self.arch), value) + + def add_rope_scaling_yarn_beta_slow(self, value: float) -> None: + self.add_float32(Keys.Rope.SCALING_YARN_BETA_SLOW.format(arch=self.arch), value) + def add_ssm_conv_kernel(self, value: int) -> None: self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index b0c3d65e9..8fd9e454e 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -136,6 +136,7 @@ class TensorNameMap: "model.layers.{bid}.norm", # mamba-qbert "backbone.layers.{bid}.norm", # mamba "transformer.decoder_layer.{bid}.rms_norm", # Grok + "model.layers.{bid}.pre_attn_norm", # grok-2 "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx "encoder.layers.{bid}.input_layernorm", # chatglm "transformer.layers.{bid}.attn_norm", # openelm @@ -278,6 +279,7 @@ class TensorNameMap: "transformer.layer.{bid}.sa_layer_norm", # distillbert "encoder.layers.{bid}.norm1", # nomic-bert "transformer.decoder_layer.{bid}.rms_norm_1", # Grok + "model.layers.{bid}.post_attn_norm", # grok-2 "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx ), @@ -313,6 +315,7 @@ class TensorNameMap: "h.{bid}.ln_2", # gpt2 "model.layers.{bid}.ffn_norm", # internlm2 "transformer.decoder_layer.{bid}.rms_norm_2", # Grok + "model.layers.{bid}.pre_moe_norm", # grok-2 "encoder.layers.{bid}.post_attention_layernorm", # chatglm "transformer.layers.{bid}.ffn_norm", # openelm "model.layers.{bid}.pre_ff_layernorm", # jamba granite-hybrid @@ -333,11 +336,12 @@ class TensorNameMap: # Post feed-forward norm MODEL_TENSOR.FFN_POST_NORM: ( - "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2 - "layers.{bid}.post_feedforward_layernorm", # embeddinggemma - "model.layers.{bid}.post_mlp_layernorm", # glm-4-0414 + "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2 + "layers.{bid}.post_feedforward_layernorm", # embeddinggemma + "model.layers.{bid}.post_mlp_layernorm", # glm-4-0414 "model.layers.layers.{bid}.post_mlp_norm.weight", # plamo2 "model.layers.{bid}.feed_forward.up_proj", + "model.layers.{bid}.post_moe_norm", # grok-2 ), MODEL_TENSOR.FFN_GATE_INP: ( diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 81f974681..3122331d8 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -139,6 +139,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, { LLM_KV_DECODER_BLOCK_COUNT, "%s.decoder_block_count" }, { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" }, + { LLM_KV_ROUTER_LOGIT_SOFTCAPPING, "%s.router_logit_softcapping" }, { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" }, { LLM_KV_SWIN_NORM, "%s.swin_norm" }, { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" }, @@ -169,19 +170,25 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { 
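// ("%s" is substituted with the architecture name at load time, so for grok the
//  new key added below resolves to "grok.attention.output_scale" -- the C++ twin
//  of the "{arch}.attention.output_scale" template written by gguf-py.)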
LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, + { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" }, + { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" }, { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, - { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, - { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, - { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, - { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, - { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, - { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, - { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, - { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, - { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, - { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" }, + { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, + { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, + { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, + { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, + { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, + { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, + { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" }, + { LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, "%s.rope.scaling.yarn_ext_factor" }, + { LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor" }, + { LLM_KV_ROPE_SCALING_YARN_BETA_FAST, "%s.rope.scaling.yarn_beta_fast" }, + { LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, "%s.rope.scaling.yarn_beta_slow" }, { LLM_KV_SPLIT_NO, "split.no" }, { LLM_KV_SPLIT_COUNT, "split.count" }, @@ -398,12 +405,16 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 6ee3707dc..a4ac28b52 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -143,6 +143,7 @@ enum llm_kv { LLM_KV_DECODER_START_TOKEN_ID, LLM_KV_DECODER_BLOCK_COUNT, LLM_KV_ATTN_LOGIT_SOFTCAPPING, + LLM_KV_ROUTER_LOGIT_SOFTCAPPING, LLM_KV_FINAL_LOGIT_SOFTCAPPING, LLM_KV_SWIN_NORM, LLM_KV_RESCALE_EVERY_N_LAYERS, @@ -173,6 +174,8 @@ enum llm_kv { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SCALE, + LLM_KV_ATTENTION_OUTPUT_SCALE, + LLM_KV_ATTENTION_TEMPERATURE_LENGTH, LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA, @@ 
-186,6 +189,10 @@ enum llm_kv { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, LLM_KV_ROPE_SCALING_FINETUNED, LLM_KV_ROPE_SCALING_YARN_LOG_MUL, + LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, + LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, + LLM_KV_ROPE_SCALING_YARN_BETA_FAST, + LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, LLM_KV_SPLIT_NO, LLM_KV_SPLIT_COUNT, diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 9d8e57eac..66e6c6a38 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -70,6 +70,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE }, { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 }, { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS }, + { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 }, }; llm_chat_template llm_chat_template_from_str(const std::string & name) { @@ -204,6 +205,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { return LLM_CHAT_TEMPLATE_KIMI_K2; } else if (tmpl_contains("")) { return LLM_CHAT_TEMPLATE_SEED_OSS; + } else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) { + return LLM_CHAT_TEMPLATE_GROK_2; } return LLM_CHAT_TEMPLATE_UNKNOWN; } @@ -763,6 +766,20 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "assistant\n"; } + } else if (tmpl == LLM_CHAT_TEMPLATE_GROK_2) { + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << "System: " << trim(message->content) << "<|separator|>\n\n"; + } else if (role == "user") { + ss << "Human: " << trim(message->content) << "<|separator|>\n\n"; + } else if (role == "assistant") { + ss << "Assistant: " << message->content << "<|separator|>\n\n"; + } + } + if (add_ass) { + ss << "Assistant:"; + } } else { // template not supported return -1; diff --git a/src/llama-chat.h b/src/llama-chat.h index 21d53ed08..5a87d9ab6 100644 --- a/src/llama-chat.h +++ b/src/llama-chat.h @@ -50,6 +50,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_HUNYUAN_DENSE, LLM_CHAT_TEMPLATE_KIMI_K2, LLM_CHAT_TEMPLATE_SEED_OSS, + LLM_CHAT_TEMPLATE_GROK_2, LLM_CHAT_TEMPLATE_UNKNOWN, }; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 289a32b6d..e6f76421c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -35,10 +35,10 @@ llama_context::llama_context( cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch; - cparams.yarn_ext_factor = params.yarn_ext_factor; - cparams.yarn_attn_factor = params.yarn_attn_factor; - cparams.yarn_beta_fast = params.yarn_beta_fast; - cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? 
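        // -1.0f now means "unset" for all four YaRN context params (see the
        // matching common.h and llama_context_default_params changes): a negative
        // value falls back to the model's hparams -- e.g. the grok default of
        // yarn_beta_fast = 8.0f set below -- while any explicit non-negative
        // user value still wins.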
params.yarn_beta_slow : hparams.yarn_beta_slow; cparams.embeddings = params.embeddings; cparams.offload_kqv = params.offload_kqv; cparams.no_perf = params.no_perf; @@ -2263,9 +2263,9 @@ llama_context_params llama_context_default_params() { /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, /*.yarn_ext_factor =*/ -1.0f, - /*.yarn_attn_factor =*/ 1.0f, - /*.yarn_beta_fast =*/ 32.0f, - /*.yarn_beta_slow =*/ 1.0f, + /*.yarn_attn_factor =*/ -1.0f, + /*.yarn_beta_fast =*/ -1.0f, + /*.yarn_beta_slow =*/ -1.0f, /*.yarn_orig_ctx =*/ 0, /*.defrag_thold =*/ -1.0f, /*.cb_eval =*/ nullptr, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index ddc772b17..9f2e417f1 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1335,14 +1335,14 @@ ggml_tensor * llm_graph_context::build_attn_mha( if (arch == LLM_ARCH_GROK) { // need to do the following: - // multiply by attn_output_multiplyer of 0.08838834764831845 + // multiply by attn_output_multiplier // and then : // kq = 30 * tanh(kq / 30) // before the softmax below - kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f)); + kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, hparams.f_attn_out_scale / hparams.f_attn_logit_softcapping)); cb(kq, "kq_tanh", il); - kq = ggml_scale(ctx0, kq, 30); + kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping); cb(kq, "kq_scaled", il); } diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 4dca2ca41..116d728e8 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -82,8 +82,9 @@ struct llama_hparams { float f_norm_rms_eps; float f_norm_group_eps; - float f_attn_logit_softcapping = 50.0f; - float f_final_logit_softcapping = 30.0f; + float f_attn_logit_softcapping = 50.0f; + float f_router_logit_softcapping = 30.0f; + float f_final_logit_softcapping = 30.0f; // for RWKV uint32_t rescale_every_n_layers = 0; @@ -104,6 +105,11 @@ struct llama_hparams { uint32_t n_ctx_orig_yarn; float rope_yarn_log_mul = 0.0f; + float yarn_ext_factor = -1.0f; + float yarn_attn_factor = 1.0f; + float yarn_beta_fast = 32.0f; + float yarn_beta_slow = 1.0f; + std::array rope_sections; // Sliding Window Attention (SWA) @@ -136,6 +142,10 @@ struct llama_hparams { float f_embedding_scale = 0.0f; float f_attention_scale = 0.0f; + // grok-2 + float f_attn_out_scale = 0.0f; + uint32_t attn_temp_length = 0; + bool causal_attn = true; bool use_alibi = false; bool attn_soft_cap = false; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 818b20964..4864ed8e7 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -685,7 +685,30 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_GROK: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // defaults for old GGUFs + hparams.yarn_beta_fast = 8.0f; + hparams.f_logit_scale = 0.5773502691896257f; + hparams.f_embedding_scale = 78.38367176906169f; + hparams.f_attn_out_scale = 0.08838834764831845f; + hparams.f_attn_logit_softcapping = 30.0f; + hparams.f_router_logit_softcapping = 30.0f; + // no final_logit_softcapping in grok-1 + hparams.f_final_logit_softcapping = 0.0f; + + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false); + ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false); + ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale, false); + ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, 
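            // Sanity check on the fallback above: 0.08838834764831845 == 128**-0.5,
            // i.e. rope_dim**-0.5 for grok-1's 128-dim heads, matching the
            // converter's add_attn_output_scale(hparams.get("attn_output_multiplier",
            // rope_dim**-0.5)) default.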
hparams.f_attn_logit_softcapping, false); + ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping, false); + ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); + + ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length, false); + ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false); + ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false); + ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false); + ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false); switch (hparams.n_layer) { case 64: type = LLM_TYPE_314B; break; @@ -2540,6 +2563,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } + const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -2554,12 +2578,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); - layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED); - layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0); - layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, TENSOR_NOT_REQUIRED); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); - layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0); + + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); + if (!layer.ffn_post_norm) { + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); + } } } break; case LLM_ARCH_DBRX: @@ -7028,9 +7059,6 @@ struct llm_build_grok : public llm_graph_context { inpL = build_inp_embd(model.tok_embd); - // multiply by embedding_multiplier_scale of 78.38367176906169 - inpL = ggml_scale(ctx0, inpL, 78.38367176906169f); - // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); @@ -7102,26 +7130,22 @@ struct llm_build_grok : public llm_graph_context { inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - // Grok - // if attn_out_norm is present then apply it before adding the input - if (model.layers[il].attn_out_norm) { - cur = build_norm(cur, - model.layers[il].attn_out_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_out_norm", il); - } + cur = build_norm(cur, + 
model.layers[il].attn_out_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_out_norm", il); ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward network - // MoE branch cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = build_moe_ffn(cur, + // MoE branch + ggml_tensor * moe_out = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -7132,18 +7156,28 @@ struct llm_build_grok : public llm_graph_context { false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); - cb(cur, "ffn_moe_out", il); + cb(moe_out, "ffn_moe_out", il); - // Grok - // if layer_out_norm is present then apply it before adding the input - // Idea: maybe ffn_out_norm is a better name - if (model.layers[il].layer_out_norm) { - cur = build_norm(cur, - model.layers[il].layer_out_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "layer_out_norm", il); + if (model.layers[il].ffn_up) { + ggml_tensor * ffn_out = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_PAR, il); + cb(ffn_out, "ffn_out", il); + + cur = ggml_scale(ctx0, ggml_add(ctx0, ffn_out, moe_out), std::sqrt(2) / 2); + cb(cur, "ffn_out", il); + } else { + cur = moe_out; } + cur = build_norm(cur, + model.layers[il].ffn_post_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_post_norm", il); + cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); @@ -7166,10 +7200,14 @@ struct llm_build_grok : public llm_graph_context { // lm_head cur = build_lora_mm(model.output, cur); - // Grok - // multiply logits by output_multiplier_scale of 0.5773502691896257 + cur = ggml_scale(ctx0, cur, hparams.f_logit_scale); - cur = ggml_scale(ctx0, cur, 0.5773502691896257f); + // final logit soft-capping + if (hparams.f_final_logit_softcapping) { + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); + cur = ggml_tanh(ctx0, cur); + cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping); + } cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index ca02b63a5..b551253af 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -434,6 +434,13 @@ struct llm_tokenizer_bpe : llm_tokenizer { "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }; break; + case LLAMA_VOCAB_PRE_TYPE_GROK_2: + regex_exprs = { + // original regex from tokenizer.json + // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }; + break; default: // default regex for BPE tokenization pre-processing regex_exprs = { @@ -1974,6 +1981,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "kimi-k2") { pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2; clean_spaces = false; + } else if ( + tokenizer_pre == "grok-2") { + pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2; + clean_spaces = false; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 61b812421..0d2f28c36 100644 --- a/src/llama-vocab.h +++ 
b/src/llama-vocab.h @@ -47,6 +47,7 @@ enum llama_vocab_pre_type { LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36, LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37, LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38, + LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39, }; struct LLM_KV; From a68f31edd71cc39141113f05f7133a3e9ece8c61 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 15 Sep 2025 02:54:57 -0500 Subject: [PATCH 12/35] fix KLD percentile output (#15999) In `llama-perplexity`, when using `--kl-divergence`, the KL divergence statistics output mistakenly displays the 99th percentile twice. This change fixes that and correctly displays the 90th percentile as originally intended (presumably). --- tools/perplexity/perplexity.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perplexity/perplexity.cpp b/tools/perplexity/perplexity.cpp index 80cbb095d..c9fd082db 100644 --- a/tools/perplexity/perplexity.cpp +++ b/tools/perplexity/perplexity.cpp @@ -1931,7 +1931,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { LOG("Maximum KLD: %10.6f\n", kld_values.back()); LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f)); LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); - LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); + LOG("90.0%% KLD: %10.6f\n", percentile(kld_values, 0.900f)); LOG("Median KLD: %10.6f\n", kld_median); LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f)); LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f)); From 106220562aca42b6738b8f51acfce0db1b8a2fb6 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 15 Sep 2025 17:35:11 +0800 Subject: [PATCH 13/35] CUDA: some micro-optimizations in mmf.cuh for mul_mat_id (#15926) --- ggml/src/ggml-cuda/mmf.cuh | 58 +++++++++++++++----------------------- 1 file changed, 23 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh index bf724bc57..61e3bf301 100644 --- a/ggml/src/ggml-cuda/mmf.cuh +++ b/ggml/src/ggml-cuda/mmf.cuh @@ -57,31 +57,33 @@ static __global__ void mul_mat_f( T * tile_xy = (T *) compute_base + threadIdx.y*(tile_A::I * tile_k_padded); if constexpr (has_ids) { - __shared__ int has_any; - if (threadIdx.y == 0) { - int local_has_any = 0; - for (int j = threadIdx.x; j < cols_per_block; j += warp_size) { - int slot = -1; - for (int k = 0; k < nchannels_dst; ++k) { - const int idv = ids[j*stride_row_id + k*stride_col_id]; - if (idv == expert_idx) { - slot = k; - break; - } - } - if (j < cols_per_block) { - local_has_any |= (slot >= 0); - slot_map[j] = slot; + int found = 0; + + for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) { + const int j = j0 + threadIdx.y; + const int32_t * __restrict__ id_row = ids + j*stride_row_id; + + if (threadIdx.x == 0) { + slot_map[j] = -1; + } + + for (int k = threadIdx.x; k < nchannels_dst; k += warp_size) { + int match = id_row[k*stride_col_id] == expert_idx; + + if (match) { + slot_map[j] = k; + found = 1; + break; } } - has_any = warp_reduce_any(local_has_any); } - __syncthreads(); - if (has_any == 0) { + + if (!__syncthreads_or(found)) { return; } } + for (int col = threadIdx.y*warp_size + threadIdx.x; col < ncols; col += nwarps*warp_size) { tile_A A[ntA][warp_size / tile_A::J]; #pragma unroll @@ -106,14 +108,7 @@ static __global__ void mul_mat_f( if constexpr (!has_ids) { tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? 
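                    // (columns past cols_per_block still populate the shared-memory
                    //  tile, just with 0.0f, so every thread executes the same tile
                    //  loads and the mma input stays well-defined)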
y[j*stride_col_y + col] : 0.0f; } else { - float val = 0.0f; - if (j < cols_per_block) { - const int slot = slot_map[j]; - if (slot >= 0) { - val = y[slot*stride_channel_y + j*stride_col_y + col]; - } - } - tile_xy[j0*tile_k_padded + threadIdx.x] = val; + tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[slot_map[j]*stride_channel_y + j*stride_col_y + col] : 0.0f; } } } else if constexpr (std::is_same_v || std::is_same_v) { @@ -125,14 +120,7 @@ static __global__ void mul_mat_f( const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f); tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y}; } else { - float2 tmp = make_float2(0.0f, 0.0f); - if (j < cols_per_block) { - const int slot = slot_map[j]; - if (slot >= 0) { - const float2 * y2_slot = (const float2 *)(y + slot*stride_channel_y); - tmp = y2_slot[j*stride_col_y + col]; - } - } + float2 tmp = j < cols_per_block && slot_map[j] >= 0 ? *(const float2*) &y[slot_map[j]*stride_channel_y + 2*(j*stride_col_y + col)] : make_float2(0.0f, 0.0f); tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y}; } } @@ -221,7 +209,7 @@ static inline void mul_mat_f_switch_ids( const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared_total, cudaStream_t stream) { if (ids) { mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + (x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } else { From 28c39da7c645185ade5436767929d7ec33006033 Mon Sep 17 00:00:00 2001 From: Nikolay Popov <131475237+npopov-vst@users.noreply.github.com> Date: Mon, 15 Sep 2025 13:08:30 +0300 Subject: [PATCH 14/35] llama-run: Fix model download on Windows (#15988) * llama-run: Fix model download on Windows * fix SSL error (SSL peer certificate or SSH remote key was not OK) * fix program crash on std::filesystem::rename * llama-run: create a separate method to utilize RAII * llama-run: handle rename exception --- tools/run/run.cpp | 72 +++++++++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 25 deletions(-) diff --git a/tools/run/run.cpp b/tools/run/run.cpp index 6fe728c68..772d66c92 100644 --- a/tools/run/run.cpp +++ b/tools/run/run.cpp @@ -407,39 +407,22 @@ class HttpClient { } std::string output_file_partial; - curl = curl_easy_init(); - if (!curl) { - return 1; - } - progress_data data; - File out; if (!output_file.empty()) { output_file_partial = output_file + ".partial"; - if (!out.open(output_file_partial, "ab")) { - printe("Failed to open file for writing\n"); - - return 1; - } - - if (out.lock()) { - printe("Failed to exclusively lock file\n"); - - return 1; - } } - set_write_options(response_str, out); - data.file_size = set_resume_point(output_file_partial); - set_progress_options(progress, data); - set_headers(headers); - CURLcode res = perform(url); - if (res != CURLE_OK){ - printe("Fetching resource '%s' failed: %s\n", url.c_str(), curl_easy_strerror(res)); + if (download(url, headers, output_file_partial, progress, response_str)) { return 1; } + if (!output_file.empty()) { - std::filesystem::rename(output_file_partial, output_file); + try { + std::filesystem::rename(output_file_partial, output_file); + } catch (const std::filesystem::filesystem_error & e) { + printe("Failed to rename '%s' to '%s': %s\n", output_file_partial.c_str(), 
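                // (Presumably the Windows crash fixed here: the partial file must be
                //  closed and unlocked before std::filesystem::rename can succeed,
                //  which is why the CURL handle and the locked File now live inside
                //  download() and are destroyed before the rename runs.)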
output_file.c_str(), e.what()); + return 1; + } } return 0; @@ -459,6 +442,42 @@ class HttpClient { CURL * curl = nullptr; struct curl_slist * chunk = nullptr; + int download(const std::string & url, const std::vector & headers, const std::string & output_file, + const bool progress, std::string * response_str = nullptr) { + curl = curl_easy_init(); + if (!curl) { + return 1; + } + + progress_data data; + File out; + if (!output_file.empty()) { + if (!out.open(output_file, "ab")) { + printe("Failed to open file for writing\n"); + + return 1; + } + + if (out.lock()) { + printe("Failed to exclusively lock file\n"); + + return 1; + } + } + + set_write_options(response_str, out); + data.file_size = set_resume_point(output_file); + set_progress_options(progress, data); + set_headers(headers); + CURLcode res = perform(url); + if (res != CURLE_OK){ + printe("Fetching resource '%s' failed: %s\n", url.c_str(), curl_easy_strerror(res)); + return 1; + } + + return 0; + } + void set_write_options(std::string * response_str, const File & out) { if (response_str) { curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, capture_data); @@ -507,6 +526,9 @@ class HttpClient { curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https"); curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L); +#ifdef _WIN32 + curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); +#endif return curl_easy_perform(curl); } From b907255f4bd169b0dc7dca9553b4c54af5170865 Mon Sep 17 00:00:00 2001 From: yael-works <106673277+yael-works@users.noreply.github.com> Date: Mon, 15 Sep 2025 19:51:35 +0300 Subject: [PATCH 15/35] SYCL: Add COUNT_EQUAL operator support (#15991) * SYCL: Add COUNT_EQUAL operator support (rebased on master) * SYCL: remove duplicate op_count_equal definition * tests: remove test_count_equal_typed and use test_count_equal for all cases * tests: keep only I32 case for COUNT_EQUAL as suggested * tests: keep only I32 case for COUNT_EQUAL as requested --- ggml/src/ggml-sycl/binbcast.cpp | 9 +++++++++ ggml/src/ggml-sycl/binbcast.hpp | 6 ++++++ ggml/src/ggml-sycl/ggml-sycl.cpp | 4 ++++ 3 files changed, 19 insertions(+) diff --git a/ggml/src/ggml-sycl/binbcast.cpp b/ggml/src/ggml-sycl/binbcast.cpp index 0a3883ae1..e0a1de0f3 100644 --- a/ggml/src/ggml-sycl/binbcast.cpp +++ b/ggml/src/ggml-sycl/binbcast.cpp @@ -303,6 +303,10 @@ inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor *dst) ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst); } +inline void ggml_sycl_op_count_equal(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst); +} + inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst); @@ -328,6 +332,11 @@ void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_op_sub(ctx, dst); } +void ggml_sycl_count_equal(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + ggml_sycl_op_count_equal(ctx, dst); +} + void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); ggml_sycl_op_mul(ctx, dst); diff --git a/ggml/src/ggml-sycl/binbcast.hpp b/ggml/src/ggml-sycl/binbcast.hpp index 9cce0f053..34c4064f5 100644 --- a/ggml/src/ggml-sycl/binbcast.hpp +++ b/ggml/src/ggml-sycl/binbcast.hpp @@ -16,6 +16,12 @@ static __dpct_inline__ float 
op_sub(const float a, const float b) { return a - b; } +static __dpct_inline__ float op_count_equal(const float a, const float b) { + return (a == b) ? 1.0f : 0.0f; +} + +void ggml_sycl_count_equal(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + static __dpct_inline__ float op_mul(const float a, const float b) { return a * b; } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index e06ec613f..9404e3ff4 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3577,6 +3577,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_OP_SUB: ggml_sycl_sub(ctx, dst); break; + case GGML_OP_COUNT_EQUAL: + ggml_sycl_count_equal(ctx, dst); + break; case GGML_OP_ACC: ggml_sycl_acc(ctx, dst); break; @@ -4356,6 +4359,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_ADD: case GGML_OP_ADD1: case GGML_OP_SUB: + case GGML_OP_COUNT_EQUAL: case GGML_OP_MUL: case GGML_OP_DIV: case GGML_OP_REPEAT: From 10d197409bd9537ff302ad09966fe406882fef9d Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Mon, 15 Sep 2025 14:38:42 -0700 Subject: [PATCH 16/35] releases : switch to rocWMMA develop branch, add gfx1151 (#15992) * releases : switch to rocWMMA develop branch, add gfx1151 * remove unused variable ROCM_VERSION --- .github/workflows/release.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e959209ab..f461456ed 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -530,15 +530,13 @@ jobs: runs-on: windows-2022 env: - # The ROCm version must correspond to the version used in the HIP SDK. - ROCM_VERSION: "6.4.2" HIPSDK_INSTALLER_VERSION: "25.Q3" strategy: matrix: include: - name: "radeon" - gpu_targets: "gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" + gpu_targets: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" steps: - name: Clone @@ -548,7 +546,7 @@ jobs: - name: Clone rocWMMA repository id: clone_rocwmma run: | - git clone https://github.com/rocm/rocwmma --branch rocm-${{ env.ROCM_VERSION }} --depth 1 + git clone https://github.com/rocm/rocwmma --branch develop --depth 1 - name: Cache ROCm Installation id: cache-rocm From dc381aa9a6dc45f00673471d34b8bddd30e77570 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Mon, 15 Sep 2025 14:38:52 -0700 Subject: [PATCH 17/35] docker : enable rocWMMA in ROCm images, add gfx1151 (#15997) --- .devops/rocm.Dockerfile | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index 221077197..106c62b4d 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -17,14 +17,11 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build # gfx906 is deprecated #check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html -ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201' -#ARG ROCM_DOCKER_ARCH=gfx1100 +ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151' +#ARG ROCM_DOCKER_ARCH='gfx1151' -# Set ROCm architectured +# Set ROCm architectures ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} -# Enable ROCm -# ENV CC=/opt/rocm/llvm/bin/clang -# ENV CXX=/opt/rocm/llvm/bin/clang++ RUN apt-get update \ && 
apt-get install -y \ @@ -39,8 +36,16 @@ WORKDIR /app COPY . . +RUN git clone https://github.com/rocm/rocwmma --branch develop --depth 1 + RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ - cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \ + cmake -S . -B build \ + -DGGML_HIP=ON \ + -DGGML_HIP_ROCWMMA_FATTN=ON \ + -DCMAKE_HIP_FLAGS="-I$(pwd)/rocwmma/library/include/" \ + -DAMDGPU_TARGETS="$ROCM_DOCKER_ARCH" \ + -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON \ + -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \ && cmake --build build --config Release -j$(nproc) RUN mkdir -p /app/lib \ From 3d4053f77f0f78ee2b791088c02af653ebee42dd Mon Sep 17 00:00:00 2001 From: Jake Karnes Date: Mon, 15 Sep 2025 16:28:31 -0600 Subject: [PATCH 18/35] CUDA: fix im2col_3d to respect non-contiguous inputs (views) (#15956) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix im2col_3d to respect non-contiguous inputs (views) The CUDA 3D im2col kernel computed source addresses assuming compact layout (products of dims), ignoring nb[] strides. This patch switches im2col_3d source indexing to use true strides derived from src1->nb[] (in elements), mirroring the approach used in the 2D CUDA im2col path. Destination indexing is unchanged. * use ggml_element_size() for src strides Co-authored-by: Johannes Gäßler --------- Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/im2col.cu | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu index 7737d6a5d..56dc05457 100644 --- a/ggml/src/ggml-cuda/im2col.cu +++ b/ggml/src/ggml-cuda/im2col.cu @@ -122,11 +122,14 @@ static __global__ void im2col_3d_kernel( int64_t OH_OW, int64_t KD_KH_KW, int64_t ID_IH_IW, int64_t KH_KW, int64_t IH_IW, int64_t IC_ID_IH_IW, int64_t IC_KD_KH_KW, int64_t OW_KD_KH_KW, int64_t OD_OH_OW_IC_KD_KH_KW, int64_t OH_OW_IC_KD_KH_KW, int64_t OW_IC_KD_KH_KW, int64_t N_OD_OH, int64_t OD_OH, + int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x, int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2) { const int64_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= IC_KD_KH_KW) { return; } + GGML_UNUSED(N); GGML_UNUSED(OC); GGML_UNUSED(OH_OW); GGML_UNUSED(OD); GGML_UNUSED(OW); GGML_UNUSED(KD); GGML_UNUSED(KH); + GGML_UNUSED(ID_IH_IW); GGML_UNUSED(IH_IW); GGML_UNUSED(IC_ID_IH_IW); GGML_UNUSED(OW_KD_KH_KW); const int64_t iic = i / KD_KH_KW; const int64_t ikd = (i - iic * KD_KH_KW) / KH_KW; @@ -148,7 +151,7 @@ static __global__ void im2col_3d_kernel( if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) { dst[offset_dst] = 0.0f; } else { - const int64_t offset_src = in*IC_ID_IH_IW + iic*ID_IH_IW + iid*IH_IW + iih*IW + iiw; + const int64_t offset_src = ((in * IC + iic) * stride_q) + (iid * stride_z) + (iih * stride_y) + (iiw * stride_x); dst[offset_dst] = src[offset_src]; } } @@ -159,6 +162,7 @@ template static void im2col_3d_cuda(const float * src, T* dst, int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC, int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW, + int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x, int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) { const int64_t OH_OW = OH*OW; const int64_t KD_KH_KW = 
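    // Worked example of the stride fix (ggml convention: nb[0] is the element size
    // in bytes, nb[i] = nb[i-1]*ne[i-1] when contiguous): for a contiguous F32 src1
    // with ne = [IW, IH, ID, N*IC],
    //   stride_x = 1, stride_y = IW, stride_z = IW*IH, stride_q = IW*IH*ID
    // which reproduces the old products-of-dims indexing; for a non-contiguous view
    // the nb[]-derived strides differ, and that is what the old code got wrong.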
KD*KH*KW; @@ -179,23 +183,30 @@ static void im2col_3d_cuda(const float * src, T* dst, OH_OW, KD_KH_KW, ID_IH_IW, KH_KW, IH_IW, IC_ID_IH_IW, IC_KD_KH_KW, OW_KD_KH_KW, OD_OH_OW_IC_KD_KH_KW, OH_OW_IC_KD_KH_KW, OW_IC_KD_KH_KW, N_OD_OH, OD_OH, + stride_q, stride_z, stride_y, stride_x, s0, s1, s2, p0, p1, p2, d0, d1, d2); } static void im2col_3d_cuda_f16(const float * src, half * dst, int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC, int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW, + int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x, int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) { - im2col_3d_cuda(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); + im2col_3d_cuda(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, + stride_q, stride_z, stride_y, stride_x, + s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); } static void im2col_3d_cuda_f32(const float * src, float * dst, int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC, int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW, + int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x, int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) { - im2col_3d_cuda(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); + im2col_3d_cuda(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, + stride_q, stride_z, stride_y, stride_x, + s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); } void ggml_cuda_op_im2col_3d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -235,9 +246,19 @@ void ggml_cuda_op_im2col_3d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) const int64_t OH = ne2; const int64_t OW = ne1; + const size_t es = ggml_element_size(src1); + const int64_t stride_x = src1->nb[0] / es; + const int64_t stride_y = src1->nb[1] / es; + const int64_t stride_z = src1->nb[2] / es; + const int64_t stride_q = src1->nb[3] / es; + if(dst->type == GGML_TYPE_F16) { - im2col_3d_cuda_f16(src1_d, (half *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); + im2col_3d_cuda_f16(src1_d, (half *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, + stride_q, stride_z, stride_y, stride_x, + s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); } else { - im2col_3d_cuda_f32(src1_d, (float *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); + im2col_3d_cuda_f32(src1_d, (float *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, + stride_q, stride_z, stride_y, stride_x, + s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); } } From 6d758839ff741d4966ca92b7f801b7a8b5b96364 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 16 Sep 2025 10:38:28 +0800 Subject: [PATCH 19/35] Add LLaDA-7b-MoE diffusion model (#16003) --- common/arg.cpp | 2 +- convert_hf_to_gguf.py | 73 +++++++++++ convert_hf_to_gguf_update.py | 1 + examples/diffusion/diffusion-cli.cpp | 24 ++-- gguf-py/gguf/constants.py | 19 +++ src/llama-arch.cpp | 22 ++++ src/llama-arch.h | 1 + src/llama-model.cpp | 179 +++++++++++++++++++++++++++ src/llama-vocab.cpp | 3 +- 9 files changed, 315 insertions(+), 9 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index c15008fe7..19189d8f9 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1704,7 +1704,7 @@ common_params_context common_params_parser_init(common_params & params, 
llama_ex [](common_params & params, const std::string & value) { params.system_prompt = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION})); add_opt(common_arg( {"--no-perf"}, string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 855789f1b..5a21ba211 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -888,6 +888,9 @@ class TextModel(ModelBase): if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756": # ref: https://huggingface.co/JetBrains/Mellum-4b-base res = "mellum" + if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206": + # ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base + res = "llada-moe" if res is None: logger.warning("\n") @@ -8239,6 +8242,76 @@ class HunYuanMoEModel(TextModel): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM") +class LLaDAMoEModel(TextModel): + model_arch = gguf.MODEL_ARCH.LLADA_MOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (n_experts := self.hparams.get("num_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + + if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size) + + # number of experts used per token (top-k) + if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + + self.gguf_writer.add_mask_token_id(156895) + self.gguf_writer.add_causal_attention(False) + self.gguf_writer.add_diffusion_shift_logits(False) + + _experts: list[dict[str, Tensor]] | None = None + + # Copied from: Qwen2MoeModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + # Copied from: Qwen2MoeModel + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + @ModelBase.register("HunYuanDenseV1ForCausalLM") class HunYuanModel(TextModel): model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index eb8fdfa7e..21bb4a9f3 100755 --- a/convert_hf_to_gguf_update.py +++ 
b/convert_hf_to_gguf_update.py @@ -139,6 +139,7 @@ models = [ {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"}, {"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", }, {"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", }, + {"name": "llada-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", }, ] # some models are known to be broken upstream, so we will skip them as exceptions diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index abf7fb357..273942a16 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -510,19 +510,27 @@ static void diffusion_generate(llama_context * ctx, n_generated = params.max_length; } -static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) { +static std::string format_input_text(const std::string & prompt, const std::string & system_prompt, bool use_chat_template, llama_model * model) { if (!use_chat_template) { return prompt; } auto chat_templates = common_chat_templates_init(model, ""); - common_chat_templates_inputs inputs; - common_chat_msg user_msg; - user_msg.role = "user"; - user_msg.content = prompt; - inputs.add_generation_prompt = true; + common_chat_msg system_msg; + + if (!system_prompt.empty()) { + system_msg.role = "system"; + system_msg.content = system_prompt; + inputs.messages.push_back(system_msg); + } + + common_chat_msg user_msg; + user_msg.role = "user"; + user_msg.content = prompt; + inputs.messages.push_back(user_msg); + inputs.add_generation_prompt = true; auto result = common_chat_templates_apply(chat_templates.get(), inputs); @@ -579,7 +587,8 @@ int main(int argc, char ** argv) { llama_set_n_threads(ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads); const llama_vocab * vocab = llama_model_get_vocab(model); - std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model); + + std::string formatted_prompt = format_input_text(params.prompt, params.system_prompt, params.enable_chat_template, model); std::vector input_tokens = common_tokenize(vocab, formatted_prompt, @@ -596,6 +605,7 @@ int main(int argc, char ** argv) { } llama_token mask_token_id = llama_vocab_mask(vocab); + GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL); bool visual_mode = params.diffusion.visual_mode; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c7edef919..7e16cbcbd 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -399,6 +399,7 @@ class MODEL_ARCH(IntEnum): DREAM = auto() SMALLTHINKER = auto() LLADA = auto() + LLADA_MOE = auto() SEED_OSS = auto() @@ -735,6 +736,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.DREAM: "dream", MODEL_ARCH.SMALLTHINKER: "smallthinker", MODEL_ARCH.LLADA: "llada", + MODEL_ARCH.LLADA_MOE: "llada-moe", MODEL_ARCH.SEED_OSS: "seed_oss", } @@ -2693,6 +2695,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.LLADA_MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.FFN_NORM, + 
MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + ], # TODO } diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 3122331d8..a4d2973ad 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -96,6 +96,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_DREAM, "dream" }, { LLM_ARCH_SMALLTHINKER, "smallthinker" }, { LLM_ARCH_LLADA, "llada" }, + { LLM_ARCH_LLADA_MOE, "llada-moe" }, { LLM_ARCH_SEED_OSS, "seed_oss" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -2147,6 +2148,26 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_LLADA_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_SEED_OSS, { @@ -2427,6 +2448,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) { switch (arch) { case LLM_ARCH_DREAM: case LLM_ARCH_LLADA: + case LLM_ARCH_LLADA_MOE: return true; default: return false; diff --git a/src/llama-arch.h b/src/llama-arch.h index a4ac28b52..d181ce678 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -100,6 +100,7 @@ enum llm_arch { LLM_ARCH_DREAM, LLM_ARCH_SMALLTHINKER, LLM_ARCH_LLADA, + LLM_ARCH_LLADA_MOE, LLM_ARCH_SEED_OSS, LLM_ARCH_UNKNOWN, }; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 4864ed8e7..731e87383 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -936,6 +936,18 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.causal_attn = false; } break; + case LLM_ARCH_LLADA_MOE: + { + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); + + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // diffusion language model uses non-causal attention + hparams.causal_attn = false; + switch (hparams.n_layer) { + case 16: type = LLM_TYPE_A1_7B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_QWEN2MOE: { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); @@ -2387,6 +2399,40 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } break; + case LLM_ARCH_LLADA_MOE: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe"); + GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe"); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wv = 
create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + + const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; + + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); + } + } break; case LLM_ARCH_LLAMA4: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -12444,6 +12490,132 @@ struct llm_build_olmoe : public llm_graph_context { } }; +struct llm_build_llada_moe : public llm_graph_context { + llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_no_cache(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self_attention + { + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // MoE 
branch + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + struct llm_build_openelm : public llm_graph_context { llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -18636,6 +18808,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA] case LLM_ARCH_DREAM: case LLM_ARCH_LLADA: + case LLM_ARCH_LLADA_MOE: { res = nullptr; } break; @@ -18841,6 +19014,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_LLADA_MOE: + { + llm = std::make_unique(*this, params); + } + break; case LLM_ARCH_QWEN2VL: { llm = std::make_unique(*this, params); @@ -19307,6 +19485,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_QWEN2MOE: case LLM_ARCH_QWEN3: case LLM_ARCH_QWEN3MOE: + case LLM_ARCH_LLADA_MOE: case LLM_ARCH_OLMO2: case LLM_ARCH_OLMOE: case LLM_ARCH_PHI2: diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index b551253af..8cb36661a 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1962,7 +1962,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION; clean_spaces = false; } else if ( - tokenizer_pre == "bailingmoe") { + tokenizer_pre == "bailingmoe" || + tokenizer_pre == "llada-moe") { pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE; clean_spaces = false; } else if ( From 07808ebb07e2b1aa19032705e332679ddf967614 Mon Sep 17 00:00:00 2001 From: Yuri Khrustalev Date: Mon, 15 Sep 2025 22:54:44 -0400 Subject: [PATCH 20/35] cmake : Do not install tools on iOS targets (#15903) --- CMakeLists.txt | 7 +++++++ tools/batched-bench/CMakeLists.txt | 5 ++++- tools/cvector-generator/CMakeLists.txt | 5 ++++- tools/export-lora/CMakeLists.txt | 5 ++++- tools/gguf-split/CMakeLists.txt | 5 ++++- tools/imatrix/CMakeLists.txt | 5 ++++- tools/llama-bench/CMakeLists.txt | 5 ++++- tools/main/CMakeLists.txt | 5 ++++- tools/mtmd/CMakeLists.txt | 2 +- tools/perplexity/CMakeLists.txt | 5 ++++- tools/quantize/CMakeLists.txt | 5 ++++- tools/run/CMakeLists.txt | 4 +++- tools/tokenize/CMakeLists.txt | 4 +++- tools/tts/CMakeLists.txt | 5 ++++- 14 files changed, 54 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 36a2078e4..4720e1f1a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,6 +58,12 @@ if (MSVC) add_compile_options("$<$:/bigobj>") endif() +if (CMAKE_SYSTEM_NAME STREQUAL "iOS") + set(LLAMA_TOOLS_INSTALL_DEFAULT OFF) +else() + 
set(LLAMA_TOOLS_INSTALL_DEFAULT ${LLAMA_STANDALONE}) +endif() + # # option list # @@ -82,6 +88,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) +option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT}) # 3rd party libs option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) diff --git a/tools/batched-bench/CMakeLists.txt b/tools/batched-bench/CMakeLists.txt index 68ad707f3..4a46b57a5 100644 --- a/tools/batched-bench/CMakeLists.txt +++ b/tools/batched-bench/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-batched-bench) add_executable(${TARGET} batched-bench.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/cvector-generator/CMakeLists.txt b/tools/cvector-generator/CMakeLists.txt index 49ad9561c..baeb4d00c 100644 --- a/tools/cvector-generator/CMakeLists.txt +++ b/tools/cvector-generator/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-cvector-generator) add_executable(${TARGET} cvector-generator.cpp pca.hpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/export-lora/CMakeLists.txt b/tools/export-lora/CMakeLists.txt index 310455787..cddfa77f0 100644 --- a/tools/export-lora/CMakeLists.txt +++ b/tools/export-lora/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-export-lora) add_executable(${TARGET} export-lora.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/gguf-split/CMakeLists.txt b/tools/gguf-split/CMakeLists.txt index c407e2f0a..9b2125087 100644 --- a/tools/gguf-split/CMakeLists.txt +++ b/tools/gguf-split/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-gguf-split) add_executable(${TARGET} gguf-split.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/imatrix/CMakeLists.txt b/tools/imatrix/CMakeLists.txt index 412696c47..22f2fe5fd 100644 --- a/tools/imatrix/CMakeLists.txt +++ b/tools/imatrix/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-imatrix) add_executable(${TARGET} imatrix.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/llama-bench/CMakeLists.txt b/tools/llama-bench/CMakeLists.txt index 17e3b9b87..b8543a969 100644 --- a/tools/llama-bench/CMakeLists.txt +++ b/tools/llama-bench/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-bench) add_executable(${TARGET} llama-bench.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE 
common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/main/CMakeLists.txt b/tools/main/CMakeLists.txt index af3d9150f..8f8e9d444 100644 --- a/tools/main/CMakeLists.txt +++ b/tools/main/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-cli) add_executable(${TARGET} main.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 097948856..2381012a0 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -55,7 +55,7 @@ add_executable(llama-qwen2vl-cli deprecation-warning.cpp) set(TARGET llama-mtmd-cli) add_executable (${TARGET} mtmd-cli.cpp) set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli) -if(NOT CMAKE_SYSTEM_NAME STREQUAL "iOS") +if(LLAMA_TOOLS_INSTALL) install(TARGETS ${TARGET} RUNTIME) endif() target_link_libraries (${TARGET} PRIVATE common mtmd Threads::Threads) diff --git a/tools/perplexity/CMakeLists.txt b/tools/perplexity/CMakeLists.txt index 3e6864093..12b28b2be 100644 --- a/tools/perplexity/CMakeLists.txt +++ b/tools/perplexity/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-perplexity) add_executable(${TARGET} perplexity.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt index 47e5cbe30..bd9ddbd67 100644 --- a/tools/quantize/CMakeLists.txt +++ b/tools/quantize/CMakeLists.txt @@ -1,6 +1,9 @@ set(TARGET llama-quantize) add_executable(${TARGET} quantize.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/run/CMakeLists.txt b/tools/run/CMakeLists.txt index d01895969..e52294ccc 100644 --- a/tools/run/CMakeLists.txt +++ b/tools/run/CMakeLists.txt @@ -10,6 +10,8 @@ if (LLAMA_CURL) set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARIES}) endif () -install(TARGETS ${TARGET} RUNTIME) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/tokenize/CMakeLists.txt b/tools/tokenize/CMakeLists.txt index 1690b53e5..feed9a106 100644 --- a/tools/tokenize/CMakeLists.txt +++ b/tools/tokenize/CMakeLists.txt @@ -1,5 +1,7 @@ set(TARGET llama-tokenize) add_executable(${TARGET} tokenize.cpp) -install(TARGETS ${TARGET} RUNTIME) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/tts/CMakeLists.txt b/tools/tts/CMakeLists.txt index c72bd814c..76320d4c2 100644 --- a/tools/tts/CMakeLists.txt +++ b/tools/tts/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-tts) 
add_executable(${TARGET} tts.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() From 51abc96bdc52ba8cd6ad78dcf12ed9a041d7b442 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 16 Sep 2025 05:57:16 +0200 Subject: [PATCH 21/35] ci : update macos-latest* jobs to use macos-latest (#15938) * ci : update macos-latest* jobs to use macos-latest This commit updates the jobs that are named macos-latest* to use the macos-latest label instead of explicit versions. The motivation for this is that there is currently a mixture of versions in this workflow and there are jobs that are failing because they require a newer version. Refs: https://github.com/ggml-org/llama.cpp/actions/runs/17644792595/job/50140010907#step:5:1759 * ci : add xcodebuild -downloadPlatform iOS command --- .github/workflows/build.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1bd35627d..9a668bf8d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -56,7 +56,7 @@ env: jobs: macOS-latest-cmake-arm64: - runs-on: macos-14 + runs-on: macos-latest steps: - name: Clone @@ -97,7 +97,7 @@ jobs: ctest -L 'main|curl' --verbose --timeout 900 macOS-latest-cmake-x64: - runs-on: macos-13 + runs-on: macos-latest steps: - name: Clone @@ -138,7 +138,7 @@ jobs: ctest -L main --verbose --timeout 900 macOS-latest-cmake-arm64-webgpu: - runs-on: macos-14 + runs-on: latest steps: - name: Clone @@ -1171,7 +1171,9 @@ jobs: ./build-xcframework.sh - name: Build Xcode project - run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build + run: | + xcodebuild -downloadPlatform iOS + xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build android-build: runs-on: ubuntu-latest From f1fbffb5c0b34b2a68febb7da3fd0f8333f1ed4c Mon Sep 17 00:00:00 2001 From: Bowen Han Date: Mon, 15 Sep 2025 23:59:19 -0700 Subject: [PATCH 22/35] fix: apply clang-format to CUDA macros (#16017) clang-format previously broke long CUDA macros (e.g. __launch_bounds__) into unreadable line breaks inside template declarations, such as: template <...> __launch_bounds__(nwarps*ggml_cuda_get_physical_warp_size(), 1) This change adjusts formatting rules so that CUDA macros remain consistent and aligned with the surrounding template syntax.
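For illustration, with __launch_bounds__ registered as an attribute macro, a declaration of the shape quoted above is kept on coherent lines rather than being broken inside the attribute (hypothetical kernel signature; the original template parameter list is elided above):

template <int nwarps>
__launch_bounds__(nwarps*ggml_cuda_get_physical_warp_size(), 1)
static __global__ void example_kernel(const float * src, float * dst);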
--- .clang-format | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.clang-format b/.clang-format index 117e6986f..742723fc8 100644 --- a/.clang-format +++ b/.clang-format @@ -22,6 +22,13 @@ AllowShortIfStatementsOnASingleLine: Never AllowShortLambdasOnASingleLine: Inline AllowShortLoopsOnASingleLine: false AlwaysBreakBeforeMultilineStrings: true +# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them +AttributeMacros: + - __host__ + - __device__ + - __global__ + - __forceinline__ + - __launch_bounds__ BinPackArguments: true BinPackParameters: false # OnePerLine BitFieldColonSpacing: Both From 76888d202ed2b835ae19ea9f9db6baf39e419297 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 16 Sep 2025 13:41:38 +0200 Subject: [PATCH 23/35] ci : upload xcframework artifact from ios-xcode-build job (#16010) This commit updates the GitHub workflows build.yml file to include steps for uploading and downloading the xcframework artifact. The macos-latest-swift job now depends on the ios-xcode-build job and downloads the xcframework artifact produced by it. The motivation for this change is that it takes a long time to build the xcframework and we are currently doing this twice in the workflow. With this change, we only build it once and reuse the artifact. --- .github/workflows/build.yml | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9a668bf8d..7f2dccde0 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -711,6 +711,7 @@ jobs: macOS-latest-swift: runs-on: macos-latest + needs: ios-xcode-build strategy: matrix: @@ -727,6 +728,12 @@ jobs: key: macOS-latest-swift evict-old-files: 1d + - name: Download xcframework artifact + uses: actions/download-artifact@v4 + with: + name: llama-xcframework + path: build-apple/llama.xcframework/ + - name: Dependencies id: depends continue-on-error: true @@ -748,11 +755,6 @@ jobs: -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - name: xcodebuild for swift package - id: xcodebuild - run: | - ./build-xcframework.sh - windows-msys2: runs-on: windows-2025 @@ -1170,6 +1172,13 @@ jobs: run: | ./build-xcframework.sh + - name: Upload xcframework artifact + uses: actions/upload-artifact@v4 + with: + name: llama-xcframework + path: build-apple/llama.xcframework/ + retention-days: 1 + - name: Build Xcode project run: | xcodebuild -downloadPlatform iOS From 3913f8730ec6d6245480affc30ae3049107956f4 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 16 Sep 2025 15:25:57 +0200 Subject: [PATCH 24/35] ggml : fix padding in timestep embedding kernels (#15932) * ggml : remove adding extra dim timestep embedding This commit updates the ggml_timestep_embedding function to no longer add an extra dimension when the specified dimension is odd. The motivation for this change is that this introduces an unnecessary dimension when the dimension is odd, which caused an issue in the kernels which were not expecting this extra dimension and it resulted in uninitialized memory for the second to last dimension. * ggml-cuda : fix padding in timestep embedding kernel This commit removes the zeroing out of the last dimension now that we are not adding the extra padding dimension.
* ggml-metal : fix padding in timestep embedding kernel This commit fixes the zero padding for odd dimensions in the timestep embedding kernel * ggml-opencl : fix padding in timestep embedding kernel This commit fixes the zero padding for odd dimensions in the timestep embedding kernel. * ggml-sycl : fix padding in timestep embedding kernel This commit fixes the zero padding for odd dimensions in the timestep embedding kernel. * ggml-vulkan : fix padding in timestep embedding kernel This commit fixes the zero padding for odd dimensions in the timestep embedding kernel. * ggml-cpu : fix padding in timestep embedding function This commit removes the zeroing out of the last dimension now that we are not adding the extra padding dimension. --- ggml/src/ggml-cpu/ops.cpp | 1 - ggml/src/ggml-cuda/tsembd.cu | 6 +++--- ggml/src/ggml-metal/ggml-metal.metal | 2 +- ggml/src/ggml-opencl/kernels/tsembd.cl | 4 ++-- ggml/src/ggml-sycl/tsembd.cpp | 7 ++++--- .../src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp | 7 ++++--- ggml/src/ggml.c | 6 +----- 7 files changed, 15 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 212e52ef6..c4824d145 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -8599,7 +8599,6 @@ static void ggml_compute_forward_timestep_embedding_f32( } if (dim % 2 != 0 && ith == 0) { embed_data[2 * half] = 0.f; - embed_data[dim] = 0.f; } } } diff --git a/ggml/src/ggml-cuda/tsembd.cu b/ggml/src/ggml-cuda/tsembd.cu index 153ddbcda..b91a26fc8 100644 --- a/ggml/src/ggml-cuda/tsembd.cu +++ b/ggml/src/ggml-cuda/tsembd.cu @@ -7,11 +7,11 @@ static __global__ void timestep_embedding_f32(const float * timesteps, float * d int j = threadIdx.x + blockIdx.x * blockDim.x; float * embed_data = (float *)((char *)dst + i*nb1); - if (dim % 2 != 0 && j == ((dim + 1) / 2)) { - embed_data[dim] = 0.f; + int half = dim / 2; + if (dim % 2 != 0 && j == half) { + embed_data[2 * half] = 0.f; } - int half = dim / 2; if (j >= half) { return; } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 4314c9cc9..5057e264f 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4167,7 +4167,7 @@ kernel void kernel_timestep_embedding_f32( } if (args.dim % 2 != 0 && tpitg.x == 0) { - embed_data[args.dim] = 0.f; + embed_data[2 * half_] = 0.f; } } diff --git a/ggml/src/ggml-opencl/kernels/tsembd.cl b/ggml/src/ggml-opencl/kernels/tsembd.cl index 4b1107f70..21444bd95 100644 --- a/ggml/src/ggml-opencl/kernels/tsembd.cl +++ b/ggml/src/ggml-opencl/kernels/tsembd.cl @@ -26,8 +26,8 @@ kernel void kernel_timestep_embedding( local_half_dim = logical_dim / 2; local_embed_data_ptr = (global float *)((global char *)local_dst_output_base_ptr + local_i * dst_nb1_bytes); - if (logical_dim % 2 != 0 && local_j == ((logical_dim + 1) / 2)) { - local_embed_data_ptr[logical_dim] = 0.0f; + if (logical_dim % 2 != 0 && local_j == local_half_dim) { + local_embed_data_ptr[2 * local_half_dim] = 0.0f; } if (local_j >= local_half_dim) { diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp index f6ca626ea..f2003794d 100644 --- a/ggml/src/ggml-sycl/tsembd.cpp +++ b/ggml/src/ggml-sycl/tsembd.cpp @@ -21,11 +21,12 @@ static void timestep_embedding_f32( int j = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2); float * embed_data = (float *)((char *)dst + i*nb1); - if (dim % 2 != 0 && j == ((dim + 1) / 2)) { - embed_data[dim] = 0.f; + int half = dim / 2; 
+ + if (dim % 2 != 0 && j == half) { + embed_data[2 * half] = 0.f; } - int half = dim / 2; if (j >= half) { return; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp b/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp index 79e065a93..ce8e09442 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp @@ -24,11 +24,12 @@ void main() { const uint j = gl_GlobalInvocationID.x; const uint d_offset = i * p.nb1; - if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) { - data_d[d_offset + p.dim] = 0.f; + const uint half_dim = p.dim / 2; + + if (p.dim % 2 != 0 && j == half_dim) { + data_d[d_offset + 2 * half_dim] = 0.f; } - const uint half_dim = p.dim / 2; if (j >= half_dim) { return; } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 50dc1aa24..3584827dc 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4923,12 +4923,8 @@ struct ggml_tensor * ggml_timestep_embedding( struct ggml_tensor * timesteps, int dim, int max_period) { - int actual_dim = dim; - if (dim % 2 != 0) { - actual_dim = dim + 1; - } - struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]); + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, timesteps->ne[0]); ggml_set_op_params_i32(result, 0, dim); ggml_set_op_params_i32(result, 1, max_period); From 77475530b8bbea3bf578632507e1284cdfe2c8c0 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 16 Sep 2025 15:27:52 +0200 Subject: [PATCH 25/35] ci : use macos-latest for arm64 webgpu build (#16029) This commit updates the runs-on field for the macOS arm64 webgpu build job to use macos-latest instead of just latest. The motivation for this is that this job can wait for a runner to pick up the job for a very long time, sometimes over 7 hours. This is an attempt to see if this change can help reduce the wait time. Refs: https://github.com/ggml-org/llama.cpp/actions/runs/17754163447/job/50454257570?pr=16004 --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7f2dccde0..0a00c85b7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -138,7 +138,7 @@ jobs: ctest -L main --verbose --timeout 900 macOS-latest-cmake-arm64-webgpu: - runs-on: latest + runs-on: macos-latest steps: - name: Clone From 8ff206097c2bf3ca1c7aa95f9d6db779fc7bdd68 Mon Sep 17 00:00:00 2001 From: jacekpoplawski <67507230+jacekpoplawski@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:17:08 +0200 Subject: [PATCH 26/35] llama-bench: add --n-cpu-moe support (#15952) * llama-bench: add --n-cpu-moe support Support --n-cpu-moe in llama-bench the same way it is supported by llama-server. 
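A usage sketch (hypothetical model path; -ncmoe accepts the same comma/range syntax as the other numeric options, via parse_int_range in the diff below):

llama-bench -m model.gguf -ngl 99 -ncmoe 0,8,16

Each value produces its own benchmark run, keeping the expert tensors of that many leading layers on the CPU through tensor buffer overrides.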
--- common/arg.cpp | 8 +-- common/common.h | 14 +++++ tools/llama-bench/llama-bench.cpp | 87 +++++++++++++++++++++++++------ 3 files changed, 90 insertions(+), 19 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 19189d8f9..9fd8858e8 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2548,7 +2548,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--cpu-moe", "-cmoe"}, "keep all Mixture of Experts (MoE) weights in the CPU", [](common_params & params) { - params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()}); + params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override()); } ).set_env("LLAMA_ARG_CPU_MOE")); add_opt(common_arg( @@ -2561,7 +2561,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex for (int i = 0; i < value; ++i) { // keep strings alive and avoid leaking memory by storing them in a static vector static std::list buft_overrides; - buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i)); + buft_overrides.push_back(llm_ffn_exps_block_regex(i)); params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()}); } } @@ -2570,7 +2570,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--cpu-moe-draft", "-cmoed"}, "keep all Mixture of Experts (MoE) weights in the CPU for the draft model", [](common_params & params) { - params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()}); + params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override()); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT")); add_opt(common_arg( @@ -2582,7 +2582,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } for (int i = 0; i < value; ++i) { static std::list buft_overrides_draft; - buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i)); + buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i)); params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()}); } } diff --git a/common/common.h b/common/common.h index 5063d73f9..83d44dbaa 100644 --- a/common/common.h +++ b/common/common.h @@ -734,6 +734,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; } +// +// MoE utils +// + +const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps"; + +static std::string llm_ffn_exps_block_regex(int idx) { + return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX); +} + +static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() { + return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() }; +} + // // training utils // diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 95f662a29..ad47bf144 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -250,6 +250,7 @@ struct cmd_params { std::vector cpu_strict; std::vector poll; std::vector n_gpu_layers; + std::vector n_cpu_moe; std::vector rpc_servers; std::vector split_mode; std::vector main_gpu; @@ -286,6 +287,7 @@ static const cmd_params cmd_params_defaults = { /* cpu_strict */ { false }, /* poll */ { 50 }, /* n_gpu_layers */ { 99 }, + /* n_cpu_moe */ { 0 }, /* rpc_servers */ { "" }, /* split_mode */ { LLAMA_SPLIT_MODE_LAYER }, /* main_gpu */ { 0 }, @@ -353,6 +355,8 @@ 
static void print_usage(int /* argc */, char ** argv) { printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -ncmoe, --n-cpu-moe (default: %s)\n", + join(cmd_params_defaults.n_cpu_moe, ",").c_str()); if (llama_supports_rpc()) { printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); @@ -564,6 +568,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = parse_int_range(argv[i]); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); + } else if (arg == "-ncmoe" || arg == "--n-cpu-moe") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_cpu_moe.insert(params.n_cpu_moe.end(), p.begin(), p.end()); } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) { if (++i >= argc) { invalid_param = true; @@ -841,6 +852,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } + if (params.n_cpu_moe.empty()) { + params.n_cpu_moe = cmd_params_defaults.n_cpu_moe; + } if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; } @@ -901,6 +915,7 @@ struct cmd_params_instance { bool cpu_strict; int poll; int n_gpu_layers; + int n_cpu_moe; std::string rpc_servers_str; llama_split_mode split_mode; int main_gpu; @@ -973,20 +988,50 @@ struct cmd_params_instance { mparams.tensor_split = tensor_split.data(); mparams.use_mmap = use_mmap; - if (tensor_buft_overrides.empty()) { - mparams.tensor_buft_overrides = nullptr; + if (n_cpu_moe <= 0) { + if (tensor_buft_overrides.empty()) { + mparams.tensor_buft_overrides = nullptr; + } else { + GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && + "Tensor buffer overrides not terminated with empty pattern"); + mparams.tensor_buft_overrides = tensor_buft_overrides.data(); + } } else { - GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern"); - mparams.tensor_buft_overrides = tensor_buft_overrides.data(); + static std::vector merged; + static std::vector patterns; + + merged.clear(); + patterns.clear(); + + auto first = tensor_buft_overrides.begin(); + auto last = tensor_buft_overrides.end(); + if (first != last && (last - 1)->pattern == nullptr) { + --last; + } + merged.insert(merged.end(), first, last); + + patterns.reserve((size_t) n_cpu_moe); + merged.reserve(merged.size() + (size_t) n_cpu_moe + 1); + + for (int i = 0; i < n_cpu_moe; ++i) { + patterns.push_back(llm_ffn_exps_block_regex(i)); + merged.push_back({ patterns.back().c_str(), + ggml_backend_cpu_buffer_type() }); + } + + merged.push_back({ nullptr, nullptr }); + + mparams.tensor_buft_overrides = merged.data(); } return mparams; } bool equal_mparams(const cmd_params_instance & other) const { - return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str && - split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap && - tensor_split == other.tensor_split && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides); + return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe && + rpc_servers_str == other.rpc_servers_str && split_mode == other.split_mode && + main_gpu == 
other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split && + vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides); } llama_context_params to_llama_cparams() const { @@ -1014,6 +1059,7 @@ static std::vector get_cmd_params_instances(const cmd_param // clang-format off for (const auto & m : params.model) for (const auto & nl : params.n_gpu_layers) + for (const auto & ncmoe : params.n_cpu_moe) for (const auto & rpc : params.rpc_servers) for (const auto & sm : params.split_mode) for (const auto & mg : params.main_gpu) @@ -1051,6 +1097,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .cpu_strict = */ cs, /* .poll = */ pl, /* .n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, /* .main_gpu = */ mg, @@ -1083,6 +1130,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .cpu_strict = */ cs, /* .poll = */ pl, /* .n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, /* .main_gpu = */ mg, @@ -1115,6 +1163,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .cpu_strict = */ cs, /* .poll = */ pl, /* .n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, /* .main_gpu = */ mg, @@ -1152,6 +1201,7 @@ struct test { ggml_type type_k; ggml_type type_v; int n_gpu_layers; + int n_cpu_moe; llama_split_mode split_mode; int main_gpu; bool no_kv_offload; @@ -1186,6 +1236,7 @@ struct test { type_k = inst.type_k; type_v = inst.type_v; n_gpu_layers = inst.n_gpu_layers; + n_cpu_moe = inst.n_cpu_moe; split_mode = inst.split_mode; main_gpu = inst.main_gpu; no_kv_offload = inst.no_kv_offload; @@ -1236,12 +1287,14 @@ struct test { static const std::vector & get_fields() { static const std::vector fields = { - "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", - "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", - "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", - "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", - "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", - "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", + "build_commit", "build_number", "cpu_info", "gpu_info", "backends", + "model_filename", "model_type", "model_size", "model_n_params", "n_batch", + "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", + "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode", + "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", + "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", + "n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts", + "stddev_ts" }; return fields; } @@ -1251,8 +1304,8 @@ struct test { static field_type get_field_type(const std::string & field) { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || - field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || - field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") { + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" || + field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe") { return INT; } if (field == "f16_kv" || field == 
"no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || @@ -1320,6 +1373,7 @@ struct test { ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_gpu_layers), + std::to_string(n_cpu_moe), split_mode_str(split_mode), std::to_string(main_gpu), std::to_string(no_kv_offload), @@ -1568,6 +1622,9 @@ struct markdown_printer : public printer { if (!is_cpu_backend) { fields.emplace_back("n_gpu_layers"); } + if (params.n_cpu_moe.size() > 1) { + fields.emplace_back("n_cpu_moe"); + } if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { fields.emplace_back("n_threads"); } From d5fabe3682de515fd09d6c981f7a0d1b75614455 Mon Sep 17 00:00:00 2001 From: Chenguang Li <757486878@qq.com> Date: Wed, 17 Sep 2025 14:33:08 +0800 Subject: [PATCH 27/35] CANN: Optimize ggml_cann_set_device (#15935) * CANN: Fix ggml_cann_set_device to avoid redundant device switches - Added a check to skip aclrtSetDevice if the current device is already set. - Prevents unnecessary context switches while keeping thread/device consistency. * CANN: add device default id --- ggml/src/ggml-cann/common.h | 5 ++++- ggml/src/ggml-cann/ggml-cann.cpp | 12 ++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index c5fce8dc9..b707b8435 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -526,7 +526,10 @@ struct ggml_backend_cann_context { */ aclrtStream stream(int stream) { if (streams[stream] == nullptr) { - ggml_cann_set_device(device); + // If the device is not set here, destroying the stream later may cause a mismatch + // between the thread contexts where the stream was created and destroyed. + // However, I printed the device_id, thread_id, and stream, and they are all consistent. + ACL_CHECK(aclrtSetDevice(device)); ACL_CHECK(aclrtCreateStream(&streams[stream])); } return streams[stream]; diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 19a18a281..56d82b4af 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -75,13 +75,12 @@ * @param device The device ID to set. */ void ggml_cann_set_device(const int32_t device) { - // TODO: uncomment these lines after empty context has fixed. 
- // int current_device; - // ACL_CHECK(aclrtGetDevice(&current_device)); + int current_device = -1; + aclrtGetDevice(&current_device); - // if (device == current_device) { - // return; - // } + if (device == current_device) { + return; + } ACL_CHECK(aclrtSetDevice(device)); } @@ -1729,6 +1728,6 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, ggml_cann_get_rows(ctx, dst); break; case GGML_OP_SET_ROWS: ggml_cann_set_rows(ctx, dst); break; case GGML_OP_DUP: From 85286f354813056f6c835046c0acfa3bf6ba9432 Mon Sep 17 00:00:00 2001 From: Shane A Date: Wed, 17 Sep 2025 00:01:58 -0700 Subject: [PATCH 28/35] model : add OLMo3 support (#16015) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add HF to gguf conversion logic for Olmo3 * Add Olmo3 implementation * Update rope comment * Fix indentation Co-authored-by: Sigbjørn Skjæret * Apply suggestion from @CISC Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 25 +++++++++++++++++++++++ src/llama-model.cpp | 47 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 68 insertions(+), 4 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 5a21ba211..ce83f2469 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6009,9 +6009,34 @@ class SeedOssModel(TextModel): @ModelBase.register("Olmo2ForCausalLM") +@ModelBase.register("Olmo3ForCausalLM") class Olmo2Model(TextModel): model_arch = gguf.MODEL_ARCH.OLMO2 + def set_gguf_parameters(self): + super().set_gguf_parameters() + + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) + + if "sliding_window" in self.hparams: + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + + sliding_window_pattern = [] + if "layer_types" in self.hparams: + sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]] + else: + # Olmo2 does not use sliding window attention. + # Olmo3 defaults to using sliding window for all layers except every 4th.
+ for i in range(self.hparams["num_hidden_layers"]): + sliding_window_pattern.append((i + 1) % 4 != 0) + + self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) + @ModelBase.register("OlmoeForCausalLM") class OlmoeModel(TextModel): diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 731e87383..2be807a6a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1350,6 +1350,14 @@ void llama_model::load_hparams(llama_model_loader & ml) { { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); + if (found_swa && hparams.n_swa > 0) { + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + hparams.set_swa_pattern(4); + } else { + hparams.swa_type = LLAMA_SWA_TYPE_NONE; + } + switch (hparams.n_layer) { case 16: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_7B; break; @@ -12233,6 +12241,7 @@ struct llm_build_olmo : public llm_graph_context { } }; +template <bool iswa> struct llm_build_olmo2 : public llm_graph_context { llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -12248,7 +12257,14 @@ struct llm_build_olmo2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv(); + using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>; + inp_attn_type * inp_attn = nullptr; + + if constexpr (iswa) { + inp_attn = build_attn_inp_kv_iswa(); + } else { + inp_attn = build_attn_inp_kv(); + } ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -12281,17 +12297,36 @@ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = ggml_rope_ext( + const bool is_swa = hparams.is_swa(il); + + if (is_swa) { + // For sliding window layers, Olmo3 uses regular rope with no yarn rope scaling. + // This is achieved here by setting freq_scale and attn_factor to 1. + // We also set ext_factor to 0 to avoid a few unnecessary computations.
+ Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, 1.0, + 0.0, 1.0, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, 1.0, + 0.0, 1.0, beta_fast, beta_slow + ); + } else { + Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); - Kcur = ggml_rope_ext( + Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); + } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -19131,7 +19166,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { } break; case LLM_ARCH_OLMO2: { - llm = std::make_unique<llm_build_olmo2>(*this, params); + if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) { + llm = std::make_unique<llm_build_olmo2<true>>(*this, params); + } else { + llm = std::make_unique<llm_build_olmo2<false>>(*this, params); + } } break; case LLM_ARCH_OLMOE: { From 1cbd80f8cf80a817715b1ccc5680fe2a3c5172c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Wed, 17 Sep 2025 15:29:00 +0800 Subject: [PATCH 29/35] examples : support encoder-decoder models in the simple example (#16002) Signed-off-by: Jie Fu --- examples/simple/simple.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 633b87e58..d09771d10 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -145,6 +145,20 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); + if (llama_model_has_encoder(model)) { + if (llama_encode(ctx, batch)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + + llama_token decoder_start_token_id = llama_model_decoder_start_token(model); + if (decoder_start_token_id == LLAMA_TOKEN_NULL) { + decoder_start_token_id = llama_vocab_bos(vocab); + } + + batch = llama_batch_get_one(&decoder_start_token_id, 1); + } + // main loop const auto t_main_start = ggml_time_us(); From 745cbcf2fe1eb88f8db615ac622f0b944d924ad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Wed, 17 Sep 2025 15:30:55 +0800 Subject: [PATCH 30/35] llama-quant : fix the verification of attention layers for encoder-decoder models (#16023) Signed-off-by: Jie Fu --- src/llama-quant.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c93e8065a..97228b2a6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -725,7 +725,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // attention layers have a non-zero number of kv heads int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0); if (llama_model_has_encoder(&model)) { - n_attn_layer *= 3; + // now n_attn_layer is the number of attention layers in the encoder + // for each decoder block, there are 2 attention layers + n_attn_layer += 2 * model.hparams.dec_n_layer; } GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected"); } From a91d035b901e8a9edf810f63d130ee49adf27be2 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 17 Sep 2025 09:34:09 +0200 Subject: [PATCH 31/35] ci : revert back to macos-13 for macOS-latest-cmake-x64 (#16040) This commit reverts
the change of the runs-on parameter for the macOS-latest-cmake-x64 job back
to macos-13 that was made in
Commit 51abc96bdc52ba8cd6ad78dcf12ed9a041d7b442 ("ci : update
macos-latest* jobs to use macos-latest (#15938)").

The motivation for this is that using macos-latest will cause an ARM based
runner to be used, and not an x64 based runner.

Refs: https://github.com/ggml-org/llama.cpp/pull/15938#issuecomment-3300805127
---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0a00c85b7..ff42b19f1 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -97,7 +97,7 @@ jobs:
           ctest -L 'main|curl' --verbose --timeout 900

   macOS-latest-cmake-x64:
-    runs-on: macos-latest
+    runs-on: macos-13

     steps:
       - name: Clone

From cb5bb6cc05119c24e7711ca2956cd0e6d409d396 Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Wed, 17 Sep 2025 07:35:37 +0000
Subject: [PATCH 32/35] vulkan: automatically remove unsupported devices
 (#15976)

* remove unsupported vulkan devices

* make this happen during selection instead

* pass by reference
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 60a99dc78..1f1136382 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -4423,8 +4423,8 @@ static void ggml_vk_print_gpu_info(size_t idx) {
 static bool ggml_vk_instance_validation_ext_available();
 static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
-
 static bool ggml_vk_instance_debug_utils_ext_available(const std::vector<vk::ExtensionProperties> & instance_extensions);
+static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev);

 static void ggml_vk_instance_init() {
     if (vk_instance_initialized) {
@@ -4540,7 +4540,7 @@ static void ggml_vk_instance_init() {
             new_driver.pNext = &new_id;
             devices[i].getProperties2(&new_props);

-            if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) {
+            if ((new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) && ggml_vk_device_is_supported(devices[i])) {
                 // Check if there are two physical devices corresponding to the same GPU
                 auto old_device = std::find_if(
                     vk_instance.device_indices.begin(),
@@ -12738,6 +12738,20 @@ static bool ggml_vk_instance_debug_utils_ext_available(
     UNUSED(instance_extensions);
 }

+static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev) {
+    VkPhysicalDeviceFeatures2 device_features2;
+    device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+
+    VkPhysicalDeviceVulkan11Features vk11_features;
+    vk11_features.pNext = nullptr;
+    vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
+    device_features2.pNext = &vk11_features;
+
+    vkGetPhysicalDeviceFeatures2(vkdev, &device_features2);
+
+    return vk11_features.storageBuffer16BitAccess;
+}
+
 static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
     switch (props.vendorID) {
     case VK_VENDOR_ID_INTEL:

From cd08fc3ecc0264b4414b68af3874a6c689ed60c1 Mon Sep 17 00:00:00 2001
From: David Ribeiro Alves
Date: Wed, 17 Sep 2025 01:08:02 -0700
Subject: [PATCH 33/35] common : Fix corrupted memory error on json grammar
 initialization (#16038)

Initializing RESERVED_NAMES in is_reserved_name() is not thread-safe and
leads to corrupted memory when used from multiple threads, as can be seen
in the ASan trace below. This fixes the initialization to make it
thread-safe.

    #0 0x000100abd018 in std::__1::__hash_table<std::__1::basic_string<...>, ...>::__emplace_unique_key_args<...>(std::__1::basic_string<...> const&, std::__1::basic_string<...> const&) __hash_table:1565
    #1 0x000100ab0320 in SchemaConverter::visit(nlohmann::json_abi_v3_12_0::basic_json<...> const&, std::__1::basic_string<...> const&) json-schema-to-grammar.cpp:802
    #2 0x000100aafc48 in std::__1::__function::__func<...>::operator()(std::__1::basic_string<...> const&, nlohmann::json_abi_v3_12_0::basic_json<...> const&) function.h:319
    #3 0x000100a2c938 in std::__1::__function::__func<...>::operator()(nlohmann::json_abi_v3_12_0::basic_json<...> const&) function.h:319
    #4 0x000100a139f8 in foreach_function(nlohmann::json_abi_v3_12_0::basic_json<...> const&, std::__1::function<...> const&) chat.cpp:762
    #5 0x000100a2a7f4 in std::__1::__function::__func<..., void (common_grammar_builder const&)>::operator()(common_grammar_builder const&) function.h:319
    #6 0x000100aa98f4 in build_grammar(std::__1::function<void (common_grammar_builder const&)> const&, common_grammar_options const&) json-schema-to-grammar.cpp:982
    #7 0x0001009c9314 in common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool) chat.cpp:1110
    #8 0x0001009b8afc in common_chat_templates_apply_jinja(common_chat_templates const*, common_chat_templates_inputs const&) chat.cpp:1992
    #9 0x0001009b533c in common_chat_templates_apply(common_chat_templates const*, common_chat_templates_inputs const&) chat.cpp:2074
    #10 0x000100810120 in llamacpp_apply_chat_template+0x724 (predict_oai-98384e17fb94e863:arm64+0x100090120)
    ...

==45482==Register values:
 x[0] = 0x00006020004147f8   x[1] = 0x00006080000013c8   x[2] = 0x0000000000000000   x[3] = 0x0000604006289738
 x[4] = 0x0000000000000002   x[5] = 0x0000000000000001   x[6] = 0x04034000004b4000   x[7] = 0x0000000000000001
 x[8] = 0xbebebebebebebebe   x[9] = 0x17d7d7d7d7d7d7d7  x[10] = 0x00000c04000828ff  x[11] = 0x0000000000000001
x[12] = 0x000000002018d383  x[13] = 0x0000000000000000  x[14] = 0xfa0000000000fafa  x[15] = 0x000010700001ffff
x[16] = 0x000000019dc012c0  x[17] = 0x00000001021284f8  x[18] = 0x0000000000000000  x[19] = 0x00000001700acdc0
x[20] = 0x0000000000000002  x[21] = 0x000000002018d384  x[22] = 0x16dd16fd2e731151  x[23] = 0x0000007000020000
x[24] = 0x0000000100c69c08  x[25] = 0x0000000100c69c20  x[26] = 0x00006080000013c7  x[27] = 0x0000000100c69c00
x[28] = 0x00000001700acd60    fp = 0x00000001700aceb0    lr = 0x0000000100abce30    sp = 0x00000001700acd60
AddressSanitizer can not provide additional info.
SUMMARY: AddressSanitizer: SEGV __hash_table:1565 in std::__1::__hash_table<...>::__emplace_unique_key_args<...>(std::__1::basic_string<...> const&, std::__1::basic_string<...> const&)
Thread T5 created by T0 here:
    #0 0x0001020b99d4 in pthread_create+0x5c (libclang_rt.asan_osx_dynamic.dylib:arm64e+0x359d4)
    #1 0x000100873910 in std::sys::pal::unix::thread::Thread::new::h77254fdd87a28e05+0x118 (predict_oai-98384e17fb94e863:arm64+0x1000f3910)
    #2 0x0001007c7a1c in test::run_test::haeb3c2bcd5ed6cf6+0x76c (predict_oai-98384e17fb94e863:arm64+0x100047a1c)
    #3 0x0001007aedb0 in test::console::run_tests_console::he9d142d704f3a986+0x149c (predict_oai-98384e17fb94e863:arm64+0x10002edb0)
    #4 0x0001007c5758 in test::test_main::hf86a5e20735245b9+0x118 (predict_oai-98384e17fb94e863:arm64+0x100045758)
    #5 0x0001007c5da0 in test::test_main_static::h61ee9c8fd30abca0+0x54 (predict_oai-98384e17fb94e863:arm64+0x100045da0)
    ...
==45482==ABORTING
---
 common/json-schema-to-grammar.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 182c78754..db1f0b23d 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
 };

 static bool is_reserved_name(const std::string & name) {
-    static std::unordered_set<std::string> RESERVED_NAMES;
-    if (RESERVED_NAMES.empty()) {
-        RESERVED_NAMES.insert("root");
-        for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
-        for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
-    }
+    static const std::unordered_set<std::string> RESERVED_NAMES = [] {
+        std::unordered_set<std::string> s;
+        s.insert("root");
+        for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
+        for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
+        return s;
+    }();
     return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }

From c959b676be29e93f8dbc3bd6056ceba812a9eb72 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Wed, 17 Sep 2025 15:32:42 +0200
Subject: [PATCH 34/35] CUDA: fix FA occupancy, optimize tile kernel (#15982)

---
 ggml/src/ggml-cuda/common.cuh       |  16 +
 ggml/src/ggml-cuda/fattn-common.cuh |  13 +-
 ggml/src/ggml-cuda/fattn-tile.cu    | 551 ++++++++++++++++------------
 ggml/src/ggml-cuda/vendors/hip.h    |  34 +-
 4 files changed, 361 insertions(+), 253 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index b0feea362..045c6d300 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -75,6 +75,8 @@
 #define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
 #define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
 #define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
+#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
 #define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)

 // Moore Threads
@@ -325,6 +327,20 @@ static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
 #endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
 }

+// Maximum number of bytes that can be copied in a single instruction.
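+// (Reading aid, not part of the upstream patch: on NVIDIA this is the width of a
+// 128-bit vectorized access, e.g. one float4/int4 per thread, while 8 bytes is kept
+// as a conservative value for pre-Volta devices. The tile FA kernel below derives
+// cpy_ne = cpy_nb/4 from it, i.e. the number of 32-bit elements moved per copy.)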
+static constexpr __device__ int ggml_cuda_get_max_cpy_bytes() {
+#ifdef GGML_USE_HIP
+    return 16;
+#else
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+    return 16;
+#else
+    return 8;
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+#endif // GGML_USE_HIP
+}
+
+
 [[noreturn]]
 static __device__ void no_device_code(
     const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index b69f57d65..142a3a88d 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -647,9 +647,7 @@ static __global__ void flash_attn_stream_k_fixup(
 }

 template<int D> // D == head size
-#if !defined(GGML_USE_HIP)
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIP)
 static __global__ void flash_attn_combine_results(
         const float  * __restrict__ VKQ_parts,
         const float2 * __restrict__ VKQ_meta,
@@ -692,10 +690,7 @@ static __global__ void flash_attn_combine_results(
     float VKQ_numerator   = 0.0f;
     float VKQ_denominator = 0.0f;
     for (int l = 0; l < parallel_blocks; ++l) {
-        const float diff = meta[l].x - kqmax;
-        float KQ_max_scale = expf(diff);
-        const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
-        *((uint32_t *) &KQ_max_scale) &= ftz_mask;
+        const float KQ_max_scale = expf(meta[l].x - kqmax);

         VKQ_numerator   += KQ_max_scale * VKQ_parts[l*D + tid];
         VKQ_denominator += KQ_max_scale * meta[l].y;
@@ -836,11 +831,10 @@ void launch_fattn(
         CUDA_CHECK(cudaGetLastError());
     }

-    int parallel_blocks = 1;
-
     const dim3 block_dim(warp_size, nwarps, 1);
     int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
     CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
+    int parallel_blocks = max_blocks_per_sm;

     dim3 blocks_num;
     if (stream_k) {
@@ -862,9 +856,6 @@ void launch_fattn(
         GGML_ASSERT(K->ne[1] % KQ_row_granularity == 0);
         const int ntiles_KQ = K->ne[1] / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.

-        // parallel_blocks should be at least large enough to achieve max. occupancy for a single wave:
-        parallel_blocks = std::max((nsm * max_blocks_per_sm) / ntiles_total, 1);
-
         // parallel_blocks must not be larger than what the tensor size allows:
         parallel_blocks = std::min(parallel_blocks, ntiles_KQ);

diff --git a/ggml/src/ggml-cuda/fattn-tile.cu b/ggml/src/ggml-cuda/fattn-tile.cu
index c6a399ce5..a2d9951ea 100644
--- a/ggml/src/ggml-cuda/fattn-tile.cu
+++ b/ggml/src/ggml-cuda/fattn-tile.cu
@@ -2,20 +2,30 @@
 #include "fattn-common.cuh"
 #include "fattn-tile.cuh"

-#define FATTN_TILE_NTHREADS 256
+// kq_stride == number of KQ rows to process per iteration
+// kq_nbatch == number of K columns to load in parallel for KQ calculation

 static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int cc, const int warp_size) {
     if (GGML_CUDA_CC_IS_AMD(cc)) {
+        if (GGML_CUDA_CC_IS_RDNA(cc)) {
+            switch (D) {
+                case 64:
+                    return 128;
+                case 128:
+                case 256:
+                    return ncols <= 16 ? 128 : 64;
+                default:
+                    GGML_ABORT("fatal error");
+                    return -1;
+            }
+        }
         switch (D) {
             case 64:
-                return 64;
+                return ncols == 32 ? 128 : 64;
             case 128:
+                return ncols == 32 ? 64 : 32;
             case 256:
-                if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
-                    return ncols <= 16 ? 64 : 32;
-                } else {
-                    return 64;
-                }
+                return 32;
             default:
                 GGML_ABORT("fatal error");
                 return -1;
@@ -49,24 +59,28 @@ static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int

 static constexpr __device__ int fattn_tile_get_kq_stride_device(int D, int ncols, int warp_size) {
 #ifdef GGML_USE_HIP
+#ifdef RDNA
     switch (D) {
         case 64:
-            return 64;
+            return 128;
         case 128:
-#if defined(GCN) || defined(CDNA)
-            return ncols <= 16 ? 64 : 32;
-#else
-            return 64;
-#endif // defined(GCN) || defined(CDNA)
         case 256:
-#if defined(GCN) || defined(CDNA)
-            return ncols <= 16 ? 64 : 32;
-#else
-            return 64;
-#endif // defined(GCN) || defined(CDNA)
+            return ncols <= 16 ? 128 : 64;
         default:
             return -1;
     }
+#else
+    switch (D) {
+        case 64:
+            return ncols == 32 ? 128 : 64;
+        case 128:
+            return ncols == 32 ? 64 : 32;
+        case 256:
+            return 32;
+        default:
+            return -1;
+    }
+#endif // RDNA
 #else
 #ifdef FAST_FP16_AVAILABLE
     switch (D) {
@@ -100,17 +114,8 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols
         case 64:
             return 64;
         case 128:
-#if defined(GCN) || defined(CDNA)
-            return ncols <= 16 ? 64 : 128;
-#else
-            return 64;
-#endif // defined(GCN) || defined(CDNA)
         case 256:
-#if defined(GCN) || defined(CDNA)
-            return ncols <= 16 ? 64 : 128;
-#else
-            return ncols <= 16 ? 64 : 256;
-#endif // defined(GCN) || defined(CDNA)
+            return 128;
         default:
             return -1;
     }
@@ -120,9 +125,8 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols
         case 64:
             return 64;
         case 128:
-            return ncols <= 16 ? 128 : 64;
         case 256:
-            return ncols <= 16 ? 64 : 128;
+            return 128;
         default:
             return -1;
     }
@@ -142,12 +146,27 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols
     GGML_UNUSED_VARS(ncols, warp_size);
 }

-template<int D, int ncols, bool use_logit_softcap> // D == head size
-#ifdef GGML_USE_HIP
-__launch_bounds__(FATTN_TILE_NTHREADS, 1)
+static int fattn_tile_get_nthreads_host(const int cc, const int ncols) {
+    return 256;
+    GGML_UNUSED_VARS(cc, ncols);
+}
+
+static constexpr __device__ int fattn_tile_get_nthreads_device(int ncols) {
+    return 256;
+    GGML_UNUSED(ncols);
+}
+
+static constexpr __device__ int fattn_tile_get_occupancy_device(int ncols) {
+#ifdef RDNA
+    return 3;
 #else
-__launch_bounds__(FATTN_TILE_NTHREADS, 2)
-#endif // GGML_USE_HIP
+    return ncols <= 16 ? 3 : 2;
+#endif // RDNA
+    GGML_UNUSED(ncols);
+}
+
+template<int D, int ncols, bool use_logit_softcap> // D == head size
+__launch_bounds__(fattn_tile_get_nthreads_device(ncols), fattn_tile_get_occupancy_device(ncols))
 static __global__ void flash_attn_tile(
         const char * __restrict__ Q,
         const char * __restrict__ K,
@@ -193,7 +212,7 @@ static __global__ void flash_attn_tile(
     }

     constexpr int warp_size = 32;
-    constexpr int nwarps    = FATTN_TILE_NTHREADS / warp_size;
+    constexpr int nwarps    = fattn_tile_get_nthreads_device(ncols) / warp_size;
     constexpr int kq_stride = fattn_tile_get_kq_stride_device(D, ncols, warp_size);
     static_assert(kq_stride % warp_size == 0, "kq_stride not divisable by warp_size.");
     constexpr int kq_nbatch = fattn_tile_get_kq_nbatch_device(D, ncols, warp_size);
@@ -206,90 +225,126 @@ static __global__ void flash_attn_tile(
     const int sequence = blockIdx.z / ne02;
     const int head = blockIdx.z - sequence*ne02;
     const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
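+    // Worked example (illustrative note, not part of the upstream patch): with
+    // ne02 = 32 Q heads and ne12 = 8 KV heads, gqa_ratio = 4, so Q heads 0..3 all
+    // read K/V head 0 through the head / gqa_ratio indexing in the pointer setup below.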
-    const float2 * Q_f2   = (const float2 *) (Q + nb03* sequence         + nb02* head              + nb01*ic0);
-    const half2  * K_h2   = (const half2  *) (K + nb13* sequence         + nb12*(head / gqa_ratio));
-    const half2  * V_h2   = (const half2  *) (V + nb13* sequence         + nb12*(head / gqa_ratio)); // K and V have same shape
-    const half   * maskh  = (const half   *) (mask + nb33*(sequence % ne33) + nb31*ic0);
-    const float  * sinksf = (const float  *) (sinks);
+    const float  * Q_f    = (const float  *) (Q + nb03* sequence         + nb02* head              + nb01*ic0);
+    const half2  * K_h2   = (const half2  *) (K + nb13* sequence         + nb12*(head / gqa_ratio));
+    const half2  * V_h2   = (const half2  *) (V + nb13* sequence         + nb12*(head / gqa_ratio)); // K and V have same shape
+    const half   * maskh  = (const half   *) (mask + nb33*(sequence % ne33) + nb31*ic0);
+    const float  * sinksf = (const float  *) (sinks);

     const int stride_KV2 = nb11 / sizeof(half2);

     const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);

-#if defined(GGML_USE_HIP)
-    constexpr int cpy_nb = 16;
-#else
-    constexpr int cpy_nb = 8;
-#endif // defined(GGML_USE_HIP) && defined(GCN)
+    constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
     constexpr int cpy_ne = cpy_nb / 4;

-    __shared__ float KQ[ncols][kq_stride];
+    constexpr int cpw = ncols/nwarps; // cols per warp
+
+    // softmax_iter_j == number of KQ columns for which to calculate softmax in parallel.
+    // KQ is originally 2D but uses a Z-shaped memory pattern for larger reads/writes.
 #ifdef FAST_FP16_AVAILABLE
+    constexpr int softmax_iter_j = cpw < 2*cpy_ne ? cpw : 2*cpy_ne;
+
+    __shared__ half  KQ[ncols/softmax_iter_j][kq_stride][softmax_iter_j];
     __shared__ half2 Q_tmp[ncols][D/2];
-    __shared__ half2 KV_tmp_h2[kq_stride * (kq_nbatch/2 + cpy_ne)]; // Padded to avoid memory bank conflicts.
-    half2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
+    __shared__ half2 KV_tmp[kq_stride * (kq_nbatch/2 + cpy_ne)]; // Padded to avoid memory bank conflicts.
+    half2 VKQ[cpw][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
 #else
+    constexpr int softmax_iter_j = cpw < 1*cpy_ne ? cpw : 1*cpy_ne;
+
+    __shared__ float KQ[ncols/softmax_iter_j][kq_stride][softmax_iter_j];
     __shared__ float Q_tmp[ncols][D];
-    __shared__ float KV_tmp_f[kq_stride * (kq_nbatch + cpy_ne)]; // Padded to avoid memory bank conflicts.
-    float2 * KV_tmp_f2 = (float2 *) KV_tmp_f;
-    float2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
+    __shared__ float KV_tmp[kq_stride * (kq_nbatch + cpy_ne)]; // Padded to avoid memory bank conflicts.
+    float2 VKQ[cpw][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
 #endif // FAST_FP16_AVAILABLE
+    static_assert(cpw % softmax_iter_j == 0, "bad softmax_iter_j");

-
-    float kqmax[ncols/nwarps];
+    float KQ_max[cpw];
 #pragma unroll
     for (int j0 = 0; j0 < ncols; j0 += nwarps) {
-        kqmax[j0/nwarps] = -FLT_MAX/2.0f;
+        KQ_max[j0/nwarps] = -FLT_MAX/2.0f;
     }
-    float kqsum[ncols/nwarps] = {0.0f};
+    float KQ_sum[cpw] = {0.0f};
+
+    // Load Q data, convert to FP16 if fast.
+#pragma unroll
+    for (int j0 = 0; j0 < cpw; ++j0) {
+        const int j = j0 + threadIdx.y*cpw;
+
+        constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size;
 #pragma unroll
-    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
-        const int j = j0 + threadIdx.y;
+        for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) {
+            float tmp_f[cpy_ne_D] = {0.0f};
+            if (ic0 + j < ne01) {
+                ggml_cuda_memcpy_1<cpy_ne_D*4>(tmp_f, &Q_f[j*(nb01/sizeof(float)) + i0 + threadIdx.x*cpy_ne_D]);
+            }

 #pragma unroll
-        for (int i0 = 0; i0 < D/2; i0 += warp_size) {
-            const float2 tmp = ic0 + j < ne01 ?
Q_f2[j*(nb01/sizeof(float2)) + i0 + threadIdx.x] : make_float2(0.0f, 0.0f); + for (int i1 = 0; i1 < cpy_ne_D; ++i1) { + tmp_f[i1] *= scale; + } + #ifdef FAST_FP16_AVAILABLE - Q_tmp[j][i0 + threadIdx.x] = make_half2(tmp.x * scale, tmp.y * scale); + half2 tmp_h2[cpy_ne_D/2]; +#pragma unroll + for (int i1 = 0; i1 < cpy_ne_D; i1 += 2) { + tmp_h2[i1/2] = make_half2(tmp_f[i1 + 0], tmp_f[i1 + 1]); + } + ggml_cuda_memcpy_1(&Q_tmp[j][i0/2 + threadIdx.x*(cpy_ne_D/2)], tmp_h2); #else - Q_tmp[j][2*i0 + threadIdx.x] = tmp.x * scale; - Q_tmp[j][2*i0 + warp_size + threadIdx.x] = tmp.y * scale; + ggml_cuda_memcpy_1 (&Q_tmp[j][i0 + threadIdx.x* cpy_ne_D], tmp_f); #endif // FAST_FP16_AVAILABLE } } __syncthreads(); + // Main loop over KV cache: const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11; for (int k_VKQ_0 = blockIdx.y*kq_stride; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*kq_stride) { // Calculate KQ tile and keep track of new maximum KQ values: - float kqmax_new[ncols/nwarps]; + float KQ_max_new[cpw]; #pragma unroll - for (int j = 0; j < ncols/nwarps; ++j) { - kqmax_new[j] = kqmax[j]; + for (int j = 0; j < cpw; ++j) { + KQ_max_new[j] = KQ_max[j]; } - float sum[kq_stride/warp_size][ncols/nwarps] = {{0.0f}}; + float KQ_acc[kq_stride/warp_size][cpw] = {{0.0f}}; // Accumulators for KQ matrix multiplication. + // KQ = K @ Q matrix multiplication: #pragma unroll for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += kq_nbatch) { #pragma unroll for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += nwarps) { const int i_KQ = i_KQ_0 + threadIdx.y; -#pragma unroll - for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += warp_size) { - const half2 tmp_h2 = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1 + threadIdx.x]; #ifdef FAST_FP16_AVAILABLE - KV_tmp_h2[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1 + threadIdx.x] = tmp_h2; -#else - const float2 tmp_f2 = __half22float2(tmp_h2); - KV_tmp_f[i_KQ*(kq_nbatch + cpy_ne) + 2*k_KQ_1 + threadIdx.x] = tmp_f2.x; - KV_tmp_f[i_KQ*(kq_nbatch + cpy_ne) + 2*k_KQ_1 + warp_size + threadIdx.x] = tmp_f2.y; -#endif // FAST_FP16_AVAILABLE + constexpr int cpy_ne_kqnb = cpy_ne < kq_nbatch/(2*warp_size) ? cpy_ne : kq_nbatch/(2*warp_size); +#pragma unroll + for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += warp_size*cpy_ne_kqnb) { + ggml_cuda_memcpy_1( + &KV_tmp[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1 + threadIdx.x*cpy_ne_kqnb], + &K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1 + threadIdx.x*cpy_ne_kqnb]); } +#else + constexpr int cpy_ne_kqnb = cpy_ne < kq_nbatch/warp_size ? 
cpy_ne : kq_nbatch/warp_size; +#pragma unroll + for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; k_KQ_1 += warp_size*cpy_ne_kqnb) { + half2 tmp_h2[cpy_ne_kqnb/2]; + ggml_cuda_memcpy_1( + tmp_h2, &K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1/2 + threadIdx.x*(cpy_ne_kqnb/2)]); + + float2 tmp_f2[cpy_ne_kqnb/2]; +#pragma unroll + for (int k_KQ_2 = 0; k_KQ_2 < cpy_ne_kqnb/2; ++k_KQ_2) { + tmp_f2[k_KQ_2] = __half22float2(tmp_h2[k_KQ_2]); + } + ggml_cuda_memcpy_1( + &KV_tmp[i_KQ*(kq_nbatch + cpy_ne) + k_KQ_1 + threadIdx.x*cpy_ne_kqnb], tmp_f2); + } +#endif // FAST_FP16_AVAILABLE } __syncthreads(); @@ -298,12 +353,12 @@ static __global__ void flash_attn_tile( #pragma unroll for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += cpy_ne) { half2 K_k[kq_stride/warp_size][cpy_ne]; - half2 Q_k[ncols/nwarps][cpy_ne]; + half2 Q_k[cpw][cpy_ne]; #else #pragma unroll for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; k_KQ_1 += cpy_ne) { float K_k[kq_stride/warp_size][cpy_ne]; - float Q_k[ncols/nwarps][cpy_ne]; + float Q_k[cpw][cpy_ne]; #endif // FAST_FP16_AVAILABLE #pragma unroll @@ -311,29 +366,29 @@ static __global__ void flash_attn_tile( const int i_KQ = i_KQ_0 + threadIdx.x; #ifdef FAST_FP16_AVAILABLE - ggml_cuda_memcpy_1(&K_k[i_KQ_0/warp_size], &KV_tmp_h2[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1]); + ggml_cuda_memcpy_1(&K_k[i_KQ_0/warp_size], &KV_tmp[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1]); #else - ggml_cuda_memcpy_1(&K_k[i_KQ_0/warp_size], &KV_tmp_f [i_KQ*(kq_nbatch + cpy_ne) + k_KQ_1]); + ggml_cuda_memcpy_1(&K_k[i_KQ_0/warp_size], &KV_tmp[i_KQ*(kq_nbatch + cpy_ne) + k_KQ_1]); #endif // FAST_FP16_AVAILABLE } #pragma unroll - for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { - const int j_KQ = j_KQ_0 + threadIdx.y; + for (int j_KQ_0 = 0; j_KQ_0 < cpw; ++j_KQ_0) { + const int j_KQ = j_KQ_0 + threadIdx.y*cpw; #ifdef FAST_FP16_AVAILABLE - ggml_cuda_memcpy_1(&Q_k[j_KQ_0/nwarps], &Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1]); + ggml_cuda_memcpy_1(&Q_k[j_KQ_0], &Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1]); #else - ggml_cuda_memcpy_1(&Q_k[j_KQ_0/nwarps], &Q_tmp[j_KQ][k_KQ_0 + k_KQ_1]); + ggml_cuda_memcpy_1(&Q_k[j_KQ_0], &Q_tmp[j_KQ][k_KQ_0 + k_KQ_1]); #endif // FAST_FP16_AVAILABLE } #pragma unroll for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) { #pragma unroll - for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { + for (int j_KQ_0 = 0; j_KQ_0 < cpw; ++j_KQ_0) { #pragma unroll for (int k = 0; k < cpy_ne; ++k) { - ggml_cuda_mad(sum[i_KQ_0/warp_size][j_KQ_0/nwarps], K_k[i_KQ_0/warp_size][k], Q_k[j_KQ_0/nwarps][k]); + ggml_cuda_mad(KQ_acc[i_KQ_0/warp_size][j_KQ_0], K_k[i_KQ_0/warp_size][k], Q_k[j_KQ_0][k]); } } } @@ -344,104 +399,77 @@ static __global__ void flash_attn_tile( } } + // Apply logit softcap, mask, update KQ_max: #pragma unroll for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) { const int i_KQ = i_KQ_0 + threadIdx.x; #pragma unroll - for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { - const int j_KQ = j_KQ_0 + threadIdx.y; + for (int j_KQ_0 = 0; j_KQ_0 < cpw; ++j_KQ_0) { + const int j_KQ = j_KQ_0 + threadIdx.y*cpw; if (use_logit_softcap) { - sum[i_KQ_0/warp_size][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/warp_size][j_KQ_0/nwarps]); + KQ_acc[i_KQ_0/warp_size][j_KQ_0] = logit_softcap * tanhf(KQ_acc[i_KQ_0/warp_size][j_KQ_0]); } - sum[i_KQ_0/warp_size][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f; + KQ_acc[i_KQ_0/warp_size][j_KQ_0] += mask ? 
slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f; - kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/warp_size][j_KQ_0/nwarps]); - - KQ[j_KQ][i_KQ] = sum[i_KQ_0/warp_size][j_KQ_0/nwarps]; + KQ_max_new[j_KQ_0] = fmaxf(KQ_max_new[j_KQ_0], KQ_acc[i_KQ_0/warp_size][j_KQ_0]); } } __syncthreads(); + // Calculate KQ softmax, write to shared KQ buffer, re-scale VKQ accumulators: #pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { - const int j = j0 + threadIdx.y; - - kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]); - const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]); - kqmax[j0/nwarps] = kqmax_new[j0/nwarps]; - - float kqsum_add = 0.0f; - if (kq_stride % (4*warp_size) == 0 && cpy_ne % 4 == 0) { -#pragma unroll - for (int i0 = 0; i0 < kq_stride; i0 += 4*warp_size) { - const int i = i0 + 4*threadIdx.x; - - float4 val = *(const float4 *) &KQ[j][i]; - val.x = expf(val.x - kqmax[j0/nwarps]); - val.y = expf(val.y - kqmax[j0/nwarps]); - val.z = expf(val.z - kqmax[j0/nwarps]); - val.w = expf(val.w - kqmax[j0/nwarps]); - kqsum_add += val.x + val.y + val.z + val.w; - + for (int j0 = 0; j0 < cpw; j0 += softmax_iter_j) { #ifdef FAST_FP16_AVAILABLE - const half2 tmp[2] = {make_half2(val.x, val.y), make_half2(val.z, val.w)}; - ggml_cuda_memcpy_1(&KQ[j][i/2], &tmp); + half tmp[kq_stride/warp_size][softmax_iter_j]; #else - ggml_cuda_memcpy_1(&KQ[j][i], &val); + float tmp[kq_stride/warp_size][softmax_iter_j]; #endif // FAST_FP16_AVAILABLE - } - } else if (kq_stride % (2*warp_size) == 0 && cpy_ne % 2 == 0) { -#pragma unroll - for (int i0 = 0; i0 < kq_stride; i0 += 2*warp_size) { - const int i = i0 + 2*threadIdx.x; - float2 val = *(const float2 *) &KQ[j][i]; - val.x = expf(val.x - kqmax[j0/nwarps]); - val.y = expf(val.y - kqmax[j0/nwarps]); - kqsum_add += val.x + val.y; -#ifdef FAST_FP16_AVAILABLE - const half2 tmp = make_half2(val.x, val.y); - ggml_cuda_memcpy_1(&KQ[j][i/2], &tmp); -#else - ggml_cuda_memcpy_1(&KQ[j][i], &val); -#endif // FAST_FP16_AVAILABLE - } - } else { +#pragma unroll + for (int j1 = 0; j1 < softmax_iter_j; ++j1) { + KQ_max_new[j0+j1] = warp_reduce_max(KQ_max_new[j0+j1]); + const float KQ_max_scale = expf(KQ_max[j0+j1] - KQ_max_new[j0+j1]); + KQ_max[j0+j1] = KQ_max_new[j0+j1]; + + float KQ_sum_add = 0.0f; +#pragma unroll for (int i0 = 0; i0 < kq_stride; i0 += warp_size) { - const int i = i0 + threadIdx.x; - - const float diff = KQ[j][i] - kqmax[j0/nwarps]; - const float val = expf(diff); - kqsum_add += val; -#ifdef FAST_FP16_AVAILABLE - ((half *) KQ[j])[i] = val; -#else - KQ[j][i] = val; -#endif // FAST_FP16_AVAILABLE + const float val = expf(KQ_acc[i0/warp_size][j0+j1] - KQ_max[j0+j1]); + KQ_sum_add += val; + tmp[i0/warp_size][j1] = val; } - } - kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add; + KQ_sum[j0+j1] = KQ_sum[j0+j1]*KQ_max_scale + KQ_sum_add; #ifdef FAST_FP16_AVAILABLE - const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale); + const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale); #pragma unroll - for (int i0 = 0; i0 < D/2; i0 += warp_size) { - VKQ[j0/nwarps][i0/warp_size] *= KQ_max_scale_h2; - } + for (int i0 = 0; i0 < D/2; i0 += warp_size) { + VKQ[j0+j1][i0/warp_size] *= KQ_max_scale_h2; + } #else #pragma unroll - for (int i0 = 0; i0 < D/2; i0 += warp_size) { - VKQ[j0/nwarps][i0/warp_size].x *= KQ_max_scale; - VKQ[j0/nwarps][i0/warp_size].y *= KQ_max_scale; - } + for (int i0 = 0; i0 < D/2; i0 += warp_size) { + VKQ[j0+j1][i0/warp_size].x *= KQ_max_scale; + 
VKQ[j0+j1][i0/warp_size].y *= KQ_max_scale; + } #endif // FAST_FP16_AVAILABLE + } + +#pragma unroll + for (int i0 = 0; i0 < kq_stride; i0 += warp_size) { + const int i = i0 + threadIdx.x; + + ggml_cuda_memcpy_1( + KQ[j0/softmax_iter_j + threadIdx.y*(cpw/softmax_iter_j)][i], tmp[i0/warp_size]); + } } - constexpr int V_cols_per_iter = kq_stride*kq_nbatch / D; + // VKQ = V @ KQ matrix multiplication: + constexpr int V_cols_per_iter = kq_stride*kq_nbatch / D; // Number of V columns that fit in SRAM for K. static_assert(kq_stride % V_cols_per_iter == 0, "bad V_cols_per_iter"); #pragma unroll for (int k0 = 0; k0 < kq_stride; k0 += V_cols_per_iter) { @@ -449,65 +477,96 @@ static __global__ void flash_attn_tile( for (int k1 = 0; k1 < V_cols_per_iter; k1 += nwarps) { const int k_tile = k1 + threadIdx.y; -#pragma unroll - for (int i0 = 0; i0 < D/2; i0 += warp_size) { - const int i = i0 + threadIdx.x; - - const half2 tmp = V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i]; #ifdef FAST_FP16_AVAILABLE - KV_tmp_h2[k_tile*(D/2) + i] = tmp; -#else - KV_tmp_f2[k_tile*(D/2) + i] = __half22float2(tmp); -#endif // FAST_FP16_AVAILABLE + constexpr int cpy_ne_D = cpy_ne < D/(2*warp_size) ? cpy_ne : D/(2*warp_size); +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += warp_size*cpy_ne_D) { + ggml_cuda_memcpy_1( + &KV_tmp[k_tile*(D/2) + i0 + threadIdx.x*cpy_ne_D], + &V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i0 + threadIdx.x*cpy_ne_D]); } +#else + constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size; +#pragma unroll + for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) { + half2 tmp_h2[cpy_ne_D/2]; + ggml_cuda_memcpy_1( + tmp_h2, &V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i0/2 + threadIdx.x*(cpy_ne_D/2)]); + + float2 tmp_f2[cpy_ne_D/2]; +#pragma unroll + for (int i1 = 0; i1 < cpy_ne_D/2; ++i1) { + tmp_f2[i1] = __half22float2(tmp_h2[i1]); + } + ggml_cuda_memcpy_1( + &KV_tmp[k_tile*D + i0 + threadIdx.x*cpy_ne_D], tmp_f2); + } +#endif // FAST_FP16_AVAILABLE } __syncthreads(); +#ifdef FAST_FP16_AVAILABLE #pragma unroll for (int k1 = 0; k1 < V_cols_per_iter; ++k1) { -#ifdef FAST_FP16_AVAILABLE half2 V_k[(D/2)/warp_size]; - half2 KQ_k[ncols/nwarps]; -#else - float2 V_k[(D/2)/warp_size]; - float KQ_k[ncols/nwarps]; -#endif // FAST_FP16_AVAILABLE + half2 KQ_k[cpw]; + constexpr int cpy_ne_D = cpy_ne/2 < (D/2)/warp_size ? 
cpy_ne/2 : (D/2)/warp_size; #pragma unroll - for (int i0 = 0; i0 < D/2; i0 += warp_size) { - const int i = i0 + threadIdx.x; - -#ifdef FAST_FP16_AVAILABLE - V_k[i0/warp_size] = KV_tmp_h2[k1*(D/2) + i]; -#else - V_k[i0/warp_size] = KV_tmp_f2[k1*(D/2) + i]; -#endif // FAST_FP16_AVAILABLE + for (int i0 = 0; i0 < D/2; i0 += warp_size*cpy_ne_D) { + ggml_cuda_memcpy_1(&V_k[i0/warp_size], &KV_tmp[k1*(D/2) + i0 + threadIdx.x*cpy_ne_D]); } #pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { - const int j = j0 + threadIdx.y; + for (int j0 = 0; j0 < cpw; j0 += softmax_iter_j) { + const int j = j0/softmax_iter_j + threadIdx.y*(cpw/softmax_iter_j); -#ifdef FAST_FP16_AVAILABLE - KQ_k[j0/nwarps] = __half2half2(((const half *)KQ[j])[k0 + k1]); -#else - KQ_k[j0/nwarps] = KQ[j][k0 + k1]; -#endif // FAST_FP16_AVAILABLE + half tmp[softmax_iter_j]; + ggml_cuda_memcpy_1( + &tmp, KQ[j][k0 + k1]); +#pragma unroll + for (int j1 = 0; j1 < softmax_iter_j; ++j1) { + KQ_k[j0+j1] = __half2half2(tmp[j1]); + } } #pragma unroll for (int i0 = 0; i0 < D/2; i0 += warp_size) { #pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { -#ifdef FAST_FP16_AVAILABLE - VKQ[j0/nwarps][i0/warp_size] += V_k[i0/warp_size] *KQ_k[j0/nwarps]; -#else - VKQ[j0/nwarps][i0/warp_size].x += V_k[i0/warp_size].x*KQ_k[j0/nwarps]; - VKQ[j0/nwarps][i0/warp_size].y += V_k[i0/warp_size].y*KQ_k[j0/nwarps]; -#endif // FAST_FP16_AVAILABLE + for (int j0 = 0; j0 < cpw; ++j0) { + VKQ[j0][i0/warp_size] += V_k[i0/warp_size]*KQ_k[j0]; } } } +#else +#pragma unroll + for (int k1 = 0; k1 < V_cols_per_iter; ++k1) { + float2 V_k[(D/2)/warp_size]; + float KQ_k[cpw]; + + constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size; +#pragma unroll + for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) { + ggml_cuda_memcpy_1(&V_k[i0/(2*warp_size)], &KV_tmp[k1*D + i0 + threadIdx.x*cpy_ne_D]); + } +#pragma unroll + for (int j0 = 0; j0 < cpw; j0 += softmax_iter_j) { + const int j = j0/softmax_iter_j + threadIdx.y*(cpw/softmax_iter_j); + + ggml_cuda_memcpy_1( + &KQ_k[j0], KQ[j][k0 + k1]); + } + +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += warp_size) { +#pragma unroll + for (int j0 = 0; j0 < cpw; ++j0) { + VKQ[j0][i0/warp_size].x += V_k[i0/warp_size].x*KQ_k[j0]; + VKQ[j0][i0/warp_size].y += V_k[i0/warp_size].y*KQ_k[j0]; + } + } + } +#endif // FAST_FP16_AVAILABLE __syncthreads(); } @@ -519,69 +578,92 @@ static __global__ void flash_attn_tile( const float sink = sinksf[head]; #pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { - float kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink); - kqmax_new_j = warp_reduce_max(kqmax_new_j); + for (int j0 = 0; j0 < cpw; ++j0) { + float KQ_max_new_j = fmaxf(KQ_max[j0], sink); + KQ_max_new_j = warp_reduce_max(KQ_max_new_j); - const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new_j); - kqmax[j0/nwarps] = kqmax_new_j; + const float KQ_max_scale = expf(KQ_max[j0] - KQ_max_new_j); + KQ_max[j0] = KQ_max_new_j; - const float val = expf(sink - kqmax[j0/nwarps]); - kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale; + const float val = expf(sink - KQ_max[j0]); + KQ_sum[j0] = KQ_sum[j0] * KQ_max_scale; if (threadIdx.x == 0) { - kqsum[j0/nwarps] += val; + KQ_sum[j0] += val; } #ifdef FAST_FP16_AVAILABLE const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale); #pragma unroll for (int i0 = 0; i0 < D/2; i0 += warp_size) { - VKQ[j0/nwarps][i0/warp_size] *= KQ_max_scale_h2; + VKQ[j0][i0/warp_size] *= KQ_max_scale_h2; } #else #pragma unroll for (int i0 = 0; i0 < D/2; i0 += warp_size) { - 
VKQ[j0/nwarps][i0/warp_size].x *= KQ_max_scale; - VKQ[j0/nwarps][i0/warp_size].y *= KQ_max_scale; + VKQ[j0][i0/warp_size].x *= KQ_max_scale; + VKQ[j0][i0/warp_size].y *= KQ_max_scale; } #endif // FAST_FP16_AVAILABLE } } - float2 * dst2 = (float2 *) dst; - #pragma unroll - for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) { - const int j_VKQ = j_VKQ_0 + threadIdx.y; + for (int j_VKQ_0 = 0; j_VKQ_0 < cpw; ++j_VKQ_0) { + KQ_sum[j_VKQ_0] = warp_reduce_sum(KQ_sum[j_VKQ_0]); + } + if (gridDim.y == 1) { +#pragma unroll + for (int j_VKQ_0 = 0; j_VKQ_0 < cpw; ++j_VKQ_0) { +#ifdef FAST_FP16_AVAILABLE + const half2 KQ_sum_j_inv = make_half2(1.0f/KQ_sum[j_VKQ_0], 1.0f/KQ_sum[j_VKQ_0]); +#pragma unroll + for (int i = 0; i < (D/2)/warp_size; ++i) { + VKQ[j_VKQ_0][i] *= KQ_sum_j_inv; + } +#else + const float KQ_sum_j_inv = 1.0f/KQ_sum[j_VKQ_0]; +#pragma unroll + for (int i = 0; i < (D/2)/warp_size; ++i) { + VKQ[j_VKQ_0][i].x *= KQ_sum_j_inv; + VKQ[j_VKQ_0][i].y *= KQ_sum_j_inv; + } +#endif // FAST_FP16_AVAILABLE + } + } + + // Write back results: +#pragma unroll + for (int j_VKQ_0 = 0; j_VKQ_0 < cpw; ++j_VKQ_0) { + const int j_VKQ = j_VKQ_0 + threadIdx.y*cpw; if (ic0 + j_VKQ >= ne01) { return; } - float kqsum_j = kqsum[j_VKQ_0/nwarps]; - kqsum_j = warp_reduce_sum(kqsum_j); - const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y; -#pragma unroll - for (int i00 = 0; i00 < D/2; i00 += warp_size) { - const int i0 = i00 + threadIdx.x; - #ifdef FAST_FP16_AVAILABLE - float2 dst_val = __half22float2(VKQ[j_VKQ_0/nwarps][i0/warp_size]); + constexpr int cpy_ne_D = cpy_ne/2 < (D/2)/warp_size ? cpy_ne/2 : (D/2)/warp_size; +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += warp_size*cpy_ne_D) { + float2 tmp[cpy_ne_D]; +#pragma unroll + for (int i1 = 0; i1 < cpy_ne_D; ++i1) { + tmp[i1] = __half22float2(VKQ[j_VKQ_0][i0/warp_size + i1]); + } + ggml_cuda_memcpy_1(&dst[j_dst_unrolled*D + 2*i0 + threadIdx.x*(2*cpy_ne_D)], tmp); + } #else - float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/warp_size]; + constexpr int cpy_ne_D = cpy_ne < D/warp_size ? 
cpy_ne : D/warp_size; +#pragma unroll + for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) { + ggml_cuda_memcpy_1( + &dst[j_dst_unrolled*D + i0 + threadIdx.x*cpy_ne_D], &VKQ[j_VKQ_0][i0/(2*warp_size)]); + } #endif // FAST_FP16_AVAILABLE - if (gridDim.y == 1) { - dst_val.x /= kqsum_j; - dst_val.y /= kqsum_j; - } - dst2[j_dst_unrolled*(D/2) + i0] = dst_val; - } - if (gridDim.y != 1 && threadIdx.x == 0) { - dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j); + dst_meta[j_dst_unrolled] = make_float2(KQ_max[j_VKQ_0], KQ_sum[j_VKQ_0]); } } #else @@ -602,15 +684,29 @@ template static void launch_fattn_tile_switch_ncols(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * Q = dst->src[0]; - const int id = ggml_cuda_get_device(); - const int cc = ggml_cuda_info().devices[id].cc; - const int warp_size = 32; - const int nwarps = FATTN_TILE_NTHREADS / warp_size; + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; + const int warp_size = 32; constexpr size_t nbytes_shared = 0; +#ifdef GGML_USE_HIP + if constexpr (D <= 128) { + if (Q->ne[1] > 32) { + constexpr int cols_per_block = 64; + const int nwarps = fattn_tile_get_nthreads_host(cc, cols_per_block) / warp_size; + fattn_kernel_t fattn_kernel = flash_attn_tile; + const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size); + launch_fattn + (ctx, dst, fattn_kernel, nwarps, nbytes_shared, kq_stride, true, true, false, warp_size); + return; + } + } +#endif // GGML_USE_HIP + if (Q->ne[1] > 16) { constexpr int cols_per_block = 32; + const int nwarps = fattn_tile_get_nthreads_host(cc, cols_per_block) / warp_size; fattn_kernel_t fattn_kernel = flash_attn_tile; const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size); launch_fattn @@ -619,6 +715,7 @@ static void launch_fattn_tile_switch_ncols(ggml_backend_cuda_context & ctx, ggml } constexpr int cols_per_block = 16; + const int nwarps = fattn_tile_get_nthreads_host(cc, cols_per_block) / warp_size; fattn_kernel_t fattn_kernel = flash_attn_tile; const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size); launch_fattn diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 12bbee455..37386afcd 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -158,41 +158,41 @@ #define __CUDA_ARCH__ 1300 -#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) -#define GCN -#endif - #if defined(__gfx900__) || defined(__gfx906__) #define GCN5 -#endif +#endif // defined(__gfx900__) || defined(__gfx906__) #if defined(__gfx803__) #define GCN4 -#endif +#endif // defined(__gfx803__) -#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__) -#define CDNA // For the entire family -#endif +#if defined(GCN5) || defined(GCN4) +#define GCN +#endif // defined(GCN5) || defined(GCN4) #if defined(__gfx942__) #define CDNA3 -#endif +#endif // defined(__gfx942__) #if defined(__gfx90a__) #define CDNA2 -#endif +#endif // defined(__gfx90a__) #if defined(__gfx908__) #define CDNA1 -#endif +#endif // defined(__gfx908__) + +#if defined(CDNA3) || defined(CDNA2) || defined(CDNA1) +#define CDNA // For the entire family +#endif // defined(CDNA3) || defined(CDNA2) || defined(CDNA1) #if defined(__GFX12__) #define RDNA4 -#endif +#endif // defined(__GFX12__) #if defined(__GFX11__) #define RDNA3 -#endif +#endif // defined(__GFX11__) #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || 
defined(__gfx1033__) || \ defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) @@ -201,7 +201,11 @@ #if defined(__gfx1010__) || defined(__gfx1012__) #define RDNA1 -#endif +#endif // defined(__gfx1010__) || defined(__gfx1012__) + +#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(RDNA1) +#define RDNA // For the entire family +#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(RDNA1) #ifndef __has_builtin #define __has_builtin(x) 0 From 8f8f2274ee3601fecf6e2d57b52f701c81bede21 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Thu, 18 Sep 2025 00:18:21 +0700 Subject: [PATCH 35/35] convert : add Llama4ForCausalLM (#16042) * convert : add Llama4ForCausalLM * handle swa * half working version * fix use_kq_norm * fix use_kq_norm --- convert_hf_to_gguf.py | 9 +++++++- src/llama-hparams.h | 2 +- src/llama-model.cpp | 48 ++++++++++++++++++++++++++++++++++--------- src/llama-model.h | 3 +++ 4 files changed, 50 insertions(+), 12 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ce83f2469..7ddec48ad 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2393,7 +2393,10 @@ class SmolVLMModel(MmprojModel): return [] # skip other tensors -@ModelBase.register("Llama4ForConditionalGeneration") +@ModelBase.register( + "Llama4ForConditionalGeneration", + "Llama4ForCausalLM", +) class Llama4Model(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA4 undo_permute = False @@ -2411,6 +2414,10 @@ class Llama4Model(LlamaModel): super().set_gguf_parameters() self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"]) self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"]) + if "layer_types" in self.hparams: + if all(lt == "full_attention" for lt in self.hparams["layer_types"]): + # all layers are full attention (for MobileLLM), disable swa + self.gguf_writer.add_sliding_window(0) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): if name.startswith("language_model."): diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 116d728e8..202cbbd1b 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -149,7 +149,7 @@ struct llama_hparams { bool causal_attn = true; bool use_alibi = false; bool attn_soft_cap = false; - bool use_kq_norm = true; + bool use_kq_norm = false; // for Classifiers uint32_t n_cls_out = 1; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 2be807a6a..981e57083 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -36,6 +36,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_80M: return "80M"; case LLM_TYPE_109M: return "109M"; case LLM_TYPE_137M: return "137M"; + case LLM_TYPE_140M: return "140M"; case LLM_TYPE_160M: return "160M"; case LLM_TYPE_190M: return "190M"; case LLM_TYPE_220M: return "220M"; @@ -44,6 +45,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_270M: return "270M"; case LLM_TYPE_335M: return "335M"; case LLM_TYPE_350M: return "350M"; + case LLM_TYPE_360M: return "360M"; case LLM_TYPE_410M: return "410M"; case LLM_TYPE_450M: return "450M"; case LLM_TYPE_475M: return "475M"; @@ -51,6 +53,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_700M: return "700M"; case LLM_TYPE_770M: return "770M"; case LLM_TYPE_780M: return "780M"; + case LLM_TYPE_950M: return "950M"; case LLM_TYPE_0_3B: return "0.3B"; case LLM_TYPE_0_5B: return "0.5B"; case LLM_TYPE_0_6B: return "0.6B"; @@ -622,19 +625,32 @@ void 
llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                 ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,  hparams.n_moe_layer_step);

-                hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
-                hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
-                hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa == 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                    hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
+                    hparams.n_swa = 8192;
+                    hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+                }

                 switch (hparams.n_expert) {
+                    case 0: {
+                        // MobileLLM (no MoE)
+                        switch (hparams.n_embd) {
+                            case 2048: type = LLM_TYPE_140M; break;
+                            case 4096: type = LLM_TYPE_360M; break;
+                            case 6144: type = LLM_TYPE_950M; break;
+                            default:   type = LLM_TYPE_UNKNOWN;
+                        }
+                    } break;
                     case 16:  type = LLM_TYPE_17B_16E;  break;
                     case 128: type = LLM_TYPE_17B_128E; break;
                     default:  type = LLM_TYPE_UNKNOWN;
                 }

-                if (type == LLM_TYPE_17B_128E) {
-                    hparams.use_kq_norm = false;
-                }
+                hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
             } break;
         case LLM_ARCH_ARCEE:
             {
@@ -2454,9 +2470,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                 }

-                GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
                 for (int i = 0; i < n_layer; ++i) {
-                    bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
+                    bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;

                     auto & layer = layers[i];

@@ -6328,6 +6343,14 @@ struct llm_build_llama : public llm_graph_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);

+                if (hparams.use_kq_norm) {
+                    // Llama4TextL2Norm
+                    Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
+                    Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
+                    cb(Qcur, "Qcur_normed", il);
+                    cb(Kcur, "Kcur_normed", il);
+                }
+
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
                         Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
@@ -6435,7 +6458,8 @@ struct llm_build_llama_iswa : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

-            const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
+            const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+                                  (il + 1) % hparams.n_no_rope_layer_step != 0;

             // norm
             cur = build_norm(inpL,
@@ -18981,7 +19005,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_LLAMA4:
             {
-                llm = std::make_unique<llm_build_llama_iswa>(*this, params);
+                if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_llama>(*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_llama_iswa>(*this, params);
+                }
             } break;
         case LLM_ARCH_DECI:
             {
diff --git a/src/llama-model.h b/src/llama-model.h
index 10b1767f2..b1981978e 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -28,6 +28,7 @@ enum llm_type {
     LLM_TYPE_80M,
     LLM_TYPE_109M,
     LLM_TYPE_137M,
+    LLM_TYPE_140M,
     LLM_TYPE_160M,
     LLM_TYPE_190M,
     LLM_TYPE_220M,
@@ -36,6 +37,7 @@ enum llm_type {
     LLM_TYPE_270M,
     LLM_TYPE_335M,
     LLM_TYPE_350M,
+    LLM_TYPE_360M,
     LLM_TYPE_410M,
     LLM_TYPE_450M,
     LLM_TYPE_475M,
@@ -43,6 +45,7 @@ enum llm_type {
     LLM_TYPE_700M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,
+    LLM_TYPE_950M,
     LLM_TYPE_0_3B,
     LLM_TYPE_0_5B,
LLM_TYPE_0_6B,