From cad2d3884c75e3b2e4736654295d1f1a5c29aa48 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Mon, 30 Mar 2026 17:05:11 +0300 Subject: [PATCH 1/5] rpc : fix misleading error log (#21184) When RPC is running with a remote backend which doesn't have init_tensor function (like CPU and Metal), the server log gets full with error messages saying that init_tensor is being called with null buffer which is incorrect. This patch fixes this. --- ggml/src/ggml-rpc/ggml-rpc.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 16f6abdff..1378ba9f5 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -1340,7 +1340,9 @@ bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) { if (buffer && buffer->iface.init_tensor) { buffer->iface.init_tensor(buffer, tensor); } else { - GGML_LOG_ERROR("Null buffer for tensor passed to init_tensor function\n"); + if (!buffer) { + GGML_LOG_ERROR("Tensor with null buffer passed to init_tensor function\n"); + } } if (tensor->extra != nullptr) { From 64ac9ab66afa57f7012c83841022827dafb4aebe Mon Sep 17 00:00:00 2001 From: Oliver Simons Date: Mon, 30 Mar 2026 16:20:00 +0200 Subject: [PATCH 2/5] CUDA : Fix CUB's argsort when nrows % block_size == 0 CCCL < 3.1 (#21181) * CUDA: Fix CUB's argsort when nrows % block_size == 0 CCCL < 3.1 We wrongly calculated offset_grid as `ceildiv(nrows, block_size)`, while it must be `ceildiv(nrows + 1, block_size)`. As a consequence, we had uninitialized values in `offset_iterator[nrows]` for the case when `nrows % block_size == 0`. Fixes #21162 * Reduce nrows in test case to 256, don't need 768 --- ggml/src/ggml-cuda/argsort.cu | 6 ++++-- tests/test-backend-ops.cpp | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu index 4896669c3..38fdf3678 100644 --- a/ggml/src/ggml-cuda/argsort.cu +++ b/ggml/src/ggml-cuda/argsort.cu @@ -47,9 +47,11 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool, #ifdef STRIDED_ITERATOR_AVAILABLE auto offset_iterator = cuda::make_strided_iterator(cuda::make_counting_iterator(0), ncols); #else - ggml_cuda_pool_alloc offsets_alloc(pool, nrows + 1); + // offset_iterator needs to populate nrows + 1 elements, so we also have to ceildiv nrows + 1 by block_size + const int nrows_offset = nrows + 1; + ggml_cuda_pool_alloc offsets_alloc(pool, nrows_offset); int * offset_iterator = offsets_alloc.get(); - const dim3 offset_grid((nrows + block_size - 1) / block_size); + const dim3 offset_grid((nrows_offset + block_size - 1) / block_size); init_offsets<<>>(offset_iterator, ncols, nrows); #endif CUDA_CHECK(cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream)); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 6a4f9b634..781c621d9 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -8424,6 +8424,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1023, 2, 1, 3}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1024, 2, 1, 3}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1025, 2, 1, 3}, order)); + test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1025, 256, 1, 1}, order)); // test ceildiv in CUDA's CUB's DeviceSegmentedSort test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2047, 2, 1, 3}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2048, 2, 1, 3}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2049, 2, 1, 3}, order)); From ead417f01ce9769249e26fd1aaf700de83003a5c Mon Sep 17 00:00:00 2001 From: "Zhihao \"Zephyr\" Yao" Date: Mon, 30 Mar 2026 14:08:46 -0400 Subject: [PATCH 3/5] jinja : handle empty expressions correctly (#20913) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Reject empty computed member expressions before returning slices[0] from parse_member_expression_arguments(). * Treat empty computed member expressions with Jinja2 undefined semantics Treat empty computed member expressions like `a[]` as undefined instead of raising a parser error, to match Jinja2 behavior. - return a noop expression for empty computed member arguments - return undefined when a computed member key evaluates to undefined - add Jinja tests covering `a[]|default('fallback')` and `a[] is undefined` * Handle undefined computed member properties Move undefined-property handling to the common member access path, and add a test covering `a[undefined] is undefined`. * Use default undefined value in member access Initialize val and then return it when property is undefined. Co-authored-by: Sigbjørn Skjæret * empty statement parses to blank_expression instead of noop_statement --------- Co-authored-by: Sigbjørn Skjæret --- common/jinja/parser.cpp | 3 +++ common/jinja/runtime.cpp | 9 +++++++-- common/jinja/runtime.h | 8 ++++++++ tests/test-jinja.cpp | 18 ++++++++++++++++++ 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/common/jinja/parser.cpp b/common/jinja/parser.cpp index 4ae447744..2b25654a7 100644 --- a/common/jinja/parser.cpp +++ b/common/jinja/parser.cpp @@ -539,6 +539,9 @@ private: statement_ptr step = slices.size() > 2 ? std::move(slices[2]) : nullptr; return mk_stmt(start_pos, std::move(start), std::move(stop), std::move(step)); } + if (slices.empty()) { + return mk_stmt(start_pos); + } return std::move(slices[0]); } diff --git a/common/jinja/runtime.cpp b/common/jinja/runtime.cpp index dce5bbae3..2232790c3 100644 --- a/common/jinja/runtime.cpp +++ b/common/jinja/runtime.cpp @@ -771,10 +771,15 @@ value member_expression::execute_impl(context & ctx) { } JJ_DEBUG("Member expression on object type %s, property type %s", object->type().c_str(), property->type().c_str()); - ensure_key_type_allowed(property); - value val = mk_val("object_property"); + if (property->is_undefined()) { + JJ_DEBUG("%s", "Member expression property is undefined, returning undefined"); + return val; + } + + ensure_key_type_allowed(property); + if (is_val(object)) { JJ_DEBUG("%s", "Accessing property on undefined object, returning undefined"); return val; diff --git a/common/jinja/runtime.h b/common/jinja/runtime.h index 17a6dff5a..3ca5f1754 100644 --- a/common/jinja/runtime.h +++ b/common/jinja/runtime.h @@ -263,6 +263,14 @@ struct comment_statement : public statement { // Expressions +// Represents an omitted expression in a computed member, e.g. `a[]`. +struct blank_expression : public expression { + std::string type() const override { return "BlankExpression"; } + value execute_impl(context &) override { + return mk_val(); + } +}; + struct member_expression : public expression { statement_ptr object; statement_ptr property; diff --git a/tests/test-jinja.cpp b/tests/test-jinja.cpp index 2cac38f02..5d4b2806a 100644 --- a/tests/test-jinja.cpp +++ b/tests/test-jinja.cpp @@ -387,6 +387,24 @@ static void test_expressions(testing & t) { "Bob" ); + test_template(t, "empty computed member defaults to undefined", + "{{ a[]|default('fallback') }}", + {{"a", {{"name", "Bob"}}}}, + "fallback" + ); + + test_template(t, "empty computed member is undefined", + "{{ a[] is undefined }}", + {{"a", {{"name", "Bob"}}}}, + "True" + ); + + test_template(t, "undefined computed member is undefined", + "{{ a[undefined] is undefined }}", + {{"a", {{"name", "Bob"}}}}, + "True" + ); + test_template(t, "array access", "{{ items[1] }}", {{"items", json::array({"a", "b", "c"})}}, From 84ae8434d0d00b52a8bab7f584b2963e129ccbfc Mon Sep 17 00:00:00 2001 From: Seungmin Kim <8457324+ehfd@users.noreply.github.com> Date: Tue, 31 Mar 2026 03:24:37 +0900 Subject: [PATCH 4/5] CI : Enable CUDA and Vulkan ARM64 runners and fix CI/CD (#21122) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * CI: Enable CUDA and Vulkan ARM64 runners and fix CI/CD Co-authored-by: Ts-sound <44093942+Ts-sound@users.noreply.github.com> * Obtain source tag name from git tag Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Ts-sound <44093942+Ts-sound@users.noreply.github.com> Co-authored-by: Sigbjørn Skjæret --- .devops/cpu.Dockerfile | 2 +- .devops/cuda-new.Dockerfile | 8 +- .devops/cuda.Dockerfile | 13 +- .devops/intel.Dockerfile | 2 +- .devops/musa.Dockerfile | 2 +- .devops/openvino.Dockerfile | 2 +- .devops/rocm.Dockerfile | 4 +- .devops/vulkan.Dockerfile | 17 +- .github/workflows/docker.yml | 565 ++++++++++++++++++++++++----------- docs/docker.md | 26 +- 10 files changed, 436 insertions(+), 205 deletions(-) diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile index e5a21eeeb..d6579ecf1 100644 --- a/.devops/cpu.Dockerfile +++ b/.devops/cpu.Dockerfile @@ -36,7 +36,7 @@ RUN mkdir -p /app/full \ FROM ubuntu:$UBUNTU_VERSION AS base RUN apt-get update \ - && apt-get install -y libgomp1 curl\ + && apt-get install -y libgomp1 curl \ && apt autoremove -y \ && apt clean -y \ && rm -rf /tmp/* /var/tmp/* \ diff --git a/.devops/cuda-new.Dockerfile b/.devops/cuda-new.Dockerfile index 98dc147d7..890230cd2 100644 --- a/.devops/cuda-new.Dockerfile +++ b/.devops/cuda-new.Dockerfile @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=24.04 # This needs to generally match the container host's environment. -ARG CUDA_VERSION=13.1.0 +ARG CUDA_VERSION=13.1.1 # Target the CUDA build image ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} @@ -12,7 +12,9 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build ARG CUDA_DOCKER_ARCH=default RUN apt-get update && \ - apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1 + apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1 + +ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14 WORKDIR /app @@ -39,7 +41,7 @@ RUN mkdir -p /app/full \ FROM ${BASE_CUDA_RUN_CONTAINER} AS base RUN apt-get update \ - && apt-get install -y libgomp1 curl\ + && apt-get install -y libgomp1 curl \ && apt autoremove -y \ && apt clean -y \ && rm -rf /tmp/* /var/tmp/* \ diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile index 52f103bc3..b3f6ccfc9 100644 --- a/.devops/cuda.Dockerfile +++ b/.devops/cuda.Dockerfile @@ -1,6 +1,6 @@ -ARG UBUNTU_VERSION=22.04 +ARG UBUNTU_VERSION=24.04 # This needs to generally match the container host's environment. -ARG CUDA_VERSION=12.4.0 +ARG CUDA_VERSION=12.8.1 # Target the CUDA build image ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} @@ -12,7 +12,9 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build ARG CUDA_DOCKER_ARCH=default RUN apt-get update && \ - apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1 + apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1 + +ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14 WORKDIR /app @@ -39,7 +41,7 @@ RUN mkdir -p /app/full \ FROM ${BASE_CUDA_RUN_CONTAINER} AS base RUN apt-get update \ - && apt-get install -y libgomp1 curl\ + && apt-get install -y libgomp1 curl \ && apt autoremove -y \ && apt clean -y \ && rm -rf /tmp/* /var/tmp/* \ @@ -60,7 +62,8 @@ RUN apt-get update \ git \ python3 \ python3-pip \ - && pip install --upgrade pip setuptools wheel \ + python3-wheel \ + && pip install --break-system-packages --upgrade setuptools \ && pip install --break-system-packages -r requirements.txt \ && apt autoremove -y \ && apt clean -y \ diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index 4568cc08a..955a2962f 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -51,7 +51,7 @@ RUN mkdir /tmp/neo/ && cd /tmp/neo/ \ && dpkg --install *.deb RUN apt-get update \ - && apt-get install -y libgomp1 curl\ + && apt-get install -y libgomp1 curl \ && apt autoremove -y \ && apt clean -y \ && rm -rf /tmp/* /var/tmp/* \ diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile index 9eb498520..665a76f58 100644 --- a/.devops/musa.Dockerfile +++ b/.devops/musa.Dockerfile @@ -46,7 +46,7 @@ RUN mkdir -p /app/full \ FROM ${BASE_MUSA_RUN_CONTAINER} AS base RUN apt-get update \ - && apt-get install -y libgomp1 curl\ + && apt-get install -y libgomp1 curl \ && apt autoremove -y \ && apt clean -y \ && rm -rf /tmp/* /var/tmp/* \ diff --git a/.devops/openvino.Dockerfile b/.devops/openvino.Dockerfile index e22ef16c7..3ee4dd201 100644 --- a/.devops/openvino.Dockerfile +++ b/.devops/openvino.Dockerfile @@ -78,7 +78,7 @@ ARG http_proxy ARG https_proxy RUN apt-get update \ - && apt-get install -y libgomp1 libtbb12 curl\ + && apt-get install -y libgomp1 libtbb12 curl \ && apt autoremove -y \ && apt clean -y \ && rm -rf /tmp/* /var/tmp/* \ diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index 830fe19e3..96aa9dfd4 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -58,7 +58,7 @@ RUN mkdir -p /app/full \ FROM ${BASE_ROCM_DEV_CONTAINER} AS base RUN apt-get update \ - && apt-get install -y libgomp1 curl\ + && apt-get install -y libgomp1 curl \ && apt autoremove -y \ && apt clean -y \ && rm -rf /tmp/* /var/tmp/* \ @@ -79,7 +79,7 @@ RUN apt-get update \ git \ python3-pip \ python3 \ - python3-wheel\ + python3-wheel \ && pip install --break-system-packages --upgrade setuptools \ && pip install --break-system-packages -r requirements.txt \ && apt autoremove -y \ diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index 3112ec85e..98036c5fd 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -49,17 +49,20 @@ COPY --from=build /app/full /app WORKDIR /app +ENV PATH="/root/.venv/bin:/root/.local/bin:${PATH}" + +# Flag for compatibility with pip +ARG UV_INDEX_STRATEGY="unsafe-best-match" RUN apt-get update \ && apt-get install -y \ build-essential \ + curl \ git \ - python3.13 \ - python3.13-dev \ - python3-pip \ - python3-wheel \ - && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.13 100 \ - && pip install --break-system-packages --upgrade setuptools \ - && pip install --break-system-packages -r requirements.txt \ + ca-certificates \ + && curl -LsSf https://astral.sh/uv/install.sh | sh \ + && uv python install 3.13 \ + && uv venv --python 3.13 /root/.venv \ + && uv pip install --python /root/.venv/bin/python -r requirements.txt \ && apt autoremove -y \ && apt clean -y \ && rm -rf /tmp/* /var/tmp/* \ diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 75df76a4e..569039c42 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -25,184 +25,13 @@ permissions: packages: write jobs: - push_to_registry: - name: Push Docker image to Docker Hub - - runs-on: ${{ matrix.config.runs_on }} - env: - COMMIT_SHA: ${{ github.sha }} - strategy: - fail-fast: false - matrix: - config: - # Multi-stage build - - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/arm64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-24.04" } - - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-24.04" } - - { tag: "cuda cuda12", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-24.04", cuda_version: "12.4.0", ubuntu_version: "22.04" } - - { tag: "cuda13", dockerfile: ".devops/cuda-new.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-24.04", cuda_version: "13.1.0", ubuntu_version: "24.04" } - - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-24.04" } - - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-24.04" } - - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-24.04" } - - { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-24.04-s390x" } - - { tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-24.04" } - - { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-24.04" } - steps: - - name: Check out the repo - uses: actions/checkout@v6 - with: - fetch-depth: 0 # preserve git history, so we can determine the build number - - - name: Set up QEMU - if: ${{ matrix.config.tag != 's390x' }} - uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3 - with: - image: tonistiigi/binfmt:qemu-v10.2.1 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3 - - - name: Log in to Docker Hub - uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Determine source tag name - id: srctag - uses: ./.github/actions/get-tag-name - env: - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} - - - name: Determine image tag name - id: tag - shell: bash - run: | - REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case - REPO_NAME="${{ github.event.repository.name }}" - PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:" - - # list all tags possible - tags="${{ matrix.config.tag }}" - for tag in $tags; do - if [[ "$tag" == "cpu" ]]; then - TYPE="" - else - TYPE="-$tag" - fi - CACHETAGS="${PREFIX}buildcache${TYPE}" - FULLTAGS="${FULLTAGS:+$FULLTAGS,}${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}" - LIGHTTAGS="${LIGHTTAGS:+$LIGHTTAGS,}${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}" - SERVERTAGS="${SERVERTAGS:+$SERVERTAGS,}${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}" - done - echo "cache_output_tags=$CACHETAGS" >> $GITHUB_OUTPUT - echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT - echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT - echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT - echo "cache_output_tags=$CACHETAGS" # print out for debugging - echo "full_output_tags=$FULLTAGS" # print out for debugging - echo "light_output_tags=$LIGHTTAGS" # print out for debugging - echo "server_output_tags=$SERVERTAGS" # print out for debugging - env: - GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' - - - name: Free Disk Space (Ubuntu) - if: ${{ matrix.config.free_disk_space == true }} - uses: ggml-org/free-disk-space@v1.3.1 - with: - # this might remove tools that are actually needed, - # if set to "true" but frees about 6 GB - tool-cache: false - - # all of these default to true, but feel free to set to - # "false" if necessary for your workflow - android: true - dotnet: true - haskell: true - large-packages: true - docker-images: true - swap-storage: true - - - name: Build and push Full Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }} - uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6 - with: - context: . - push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.full_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: full - provenance: false - build-args: | - ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} - ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} - # using github experimental cache - #cache-from: type=gha - #cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache - # using registry cache (no storage limit) - cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }} - cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max - - - name: Build and push Light Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }} - uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6 - with: - context: . - push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.light_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: light - provenance: false - build-args: | - ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} - ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} - # using github experimental cache - #cache-from: type=gha - #cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache - # using registry cache (no storage limit) - cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }} - cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max - - - name: Build and push Server Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }} - uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6 - with: - context: . - push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.server_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: server - provenance: false - build-args: | - ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} - ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} - # using github experimental cache - #cache-from: type=gha - #cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache - # using registry cache (no storage limit) - cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }} - cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max - create_tag: name: Create and push git tag - runs-on: ubuntu-22.04 + runs-on: ubuntu-slim permissions: contents: write + outputs: + source_tag: ${{ steps.srctag.outputs.name }} steps: - name: Clone @@ -223,3 +52,391 @@ jobs: run: | git tag ${{ steps.srctag.outputs.name }} || exit 0 git push origin ${{ steps.srctag.outputs.name }} || exit 0 + + prepare_matrices: + name: Prepare Docker matrices + runs-on: ubuntu-24.04 + outputs: + build_matrix: ${{ steps.matrices.outputs.build_matrix }} + merge_matrix: ${{ steps.matrices.outputs.merge_matrix }} + + steps: + - name: Generate build and merge matrices + id: matrices + shell: bash + run: | + set -euo pipefail + + # Keep all build targets in one place and derive merge targets from it. + cat > build-matrix.json <<'JSON' + [ + { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" }, + { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" }, + { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" }, + { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, + { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" }, + { "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, + { "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" }, + { "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, + { "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, + { "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" }, + { "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" }, + { "tag": "rocm", "dockerfile": ".devops/rocm.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, + { "tag": "openvino", "dockerfile": ".devops/openvino.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" } + ] + JSON + + BUILD_MATRIX="$(jq -c . build-matrix.json)" + MERGE_MATRIX="$(jq -c ' + reduce .[] as $entry ({}; .[$entry.tag] |= ( + . // { + tag: $entry.tag, + arches: [], + full: false, + light: false, + server: false + } + | .full = (.full or ($entry.full // false)) + | .light = (.light or ($entry.light // false)) + | .server = (.server or ($entry.server // false)) + | .arches += [($entry.platforms | sub("^linux/"; ""))] + )) + # Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform. + | if (has("cpu") and (((.cpu.arches // []) | index("s390x")) != null)) then + . + { + s390x: { + tag: "s390x", + arches: ["s390x"], + full: .cpu.full, + light: .cpu.light, + server: .cpu.server + } + } + else + . + end + | [.[] | .arches = (.arches | unique | sort | join(" "))] + ' build-matrix.json)" + + echo "build_matrix=$BUILD_MATRIX" >> "$GITHUB_OUTPUT" + echo "merge_matrix=$MERGE_MATRIX" >> "$GITHUB_OUTPUT" + + push_to_registry: + name: Push Docker image to Docker Registry + needs: [prepare_matrices, create_tag] + + runs-on: ${{ matrix.config.runs_on }} + strategy: + fail-fast: false + matrix: + config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }} + steps: + - name: Check out the repo + uses: actions/checkout@v6 + with: + fetch-depth: 0 + ref: ${{ needs.create_tag.outputs.source_tag }} + + - name: Set up QEMU + if: ${{ contains(matrix.config.platforms, 'linux/amd64') }} + uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4 + with: + image: tonistiigi/binfmt:qemu-v10.2.1 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4 + + - name: Log in to Docker Registry + uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Determine image metadata + id: meta + shell: bash + run: | + set -euo pipefail + + REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case + REPO_NAME="${{ github.event.repository.name }}" + IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}" + PREFIX="${IMAGE_REPO}:" + PLATFORM="${{ matrix.config.platforms }}" + ARCH_SUFFIX="${PLATFORM#linux/}" + + # list all tags possible + tags="${{ matrix.config.tag }}" + for tag in $tags; do + if [[ "$tag" == "cpu" ]]; then + TYPE="" + else + TYPE="-$tag" + fi + CACHETAG="${PREFIX}buildcache${TYPE}-${ARCH_SUFFIX}" + done + + SAFE_TAGS="$(echo "$tags" | tr ' ' '_')" + + echo "image_repo=$IMAGE_REPO" >> $GITHUB_OUTPUT + echo "arch_suffix=$ARCH_SUFFIX" >> $GITHUB_OUTPUT + echo "cache_output_tag=$CACHETAG" >> $GITHUB_OUTPUT + echo "digest_artifact_suffix=${SAFE_TAGS}-${ARCH_SUFFIX}" >> $GITHUB_OUTPUT + echo "cache_output_tag=$CACHETAG" # print out for debugging + env: + GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' + + - name: Free Disk Space (Ubuntu) + if: ${{ matrix.config.free_disk_space == true }} + uses: ggml-org/free-disk-space@v1.3.1 + with: + # this might remove tools that are actually needed, + # if set to "true" but frees about 6 GB + tool-cache: false + + # all of these default to true, but feel free to set to + # "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: true + + - name: Build and push Full Docker image by digest + id: build_full + if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }} + uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7 + with: + context: . + platforms: ${{ matrix.config.platforms }} + outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true + file: ${{ matrix.config.dockerfile }} + target: full + provenance: false + build-args: | + ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} + ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} + # using github experimental cache + #cache-from: type=gha + #cache-to: type=gha,mode=max + # return to this if the experimental github cache is having issues + #cache-to: type=local,dest=/tmp/.buildx-cache + #cache-from: type=local,src=/tmp/.buildx-cache + # using registry cache (no storage limit) + cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }} + cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max + + - name: Build and push Light Docker image by digest + id: build_light + if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }} + uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7 + with: + context: . + platforms: ${{ matrix.config.platforms }} + outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true + file: ${{ matrix.config.dockerfile }} + target: light + provenance: false + build-args: | + ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} + ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} + # using github experimental cache + #cache-from: type=gha + #cache-to: type=gha,mode=max + # return to this if the experimental github cache is having issues + #cache-to: type=local,dest=/tmp/.buildx-cache + #cache-from: type=local,src=/tmp/.buildx-cache + # using registry cache (no storage limit) + cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }} + cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max + + - name: Build and push Server Docker image by digest + id: build_server + if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }} + uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7 + with: + context: . + platforms: ${{ matrix.config.platforms }} + outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true + file: ${{ matrix.config.dockerfile }} + target: server + provenance: false + build-args: | + ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} + ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} + # using github experimental cache + #cache-from: type=gha + #cache-to: type=gha,mode=max + # return to this if the experimental github cache is having issues + #cache-to: type=local,dest=/tmp/.buildx-cache + #cache-from: type=local,src=/tmp/.buildx-cache + # using registry cache (no storage limit) + cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }} + cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max + + - name: Export digest metadata + shell: bash + run: | + set -euo pipefail + + TAGS="${{ matrix.config.tag }}" + ARCH_SUFFIX="${{ steps.meta.outputs.arch_suffix }}" + DIGEST_FILE="/tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv" + mkdir -p /tmp/digests + + add_digest_rows() { + local image_type="$1" + local digest="$2" + + if [[ -z "$digest" ]]; then + echo "Missing digest for image_type=${image_type}" >&2 + exit 1 + fi + + for tag in $TAGS; do + printf '%s\t%s\t%s\t%s\n' "$tag" "$ARCH_SUFFIX" "$image_type" "$digest" >> "$DIGEST_FILE" + done + } + + if [[ "${{ matrix.config.full }}" == "true" ]]; then + add_digest_rows "full" "${{ steps.build_full.outputs.digest }}" + fi + + if [[ "${{ matrix.config.light }}" == "true" ]]; then + add_digest_rows "light" "${{ steps.build_light.outputs.digest }}" + fi + + if [[ "${{ matrix.config.server }}" == "true" ]]; then + add_digest_rows "server" "${{ steps.build_server.outputs.digest }}" + fi + + - name: Upload digest metadata + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + with: + name: digests-${{ steps.meta.outputs.digest_artifact_suffix }} + path: /tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv + if-no-files-found: error + + merge_arch_tags: + name: Create shared tags from digests + needs: [prepare_matrices, push_to_registry, create_tag] + runs-on: ubuntu-24.04 + strategy: + fail-fast: false + matrix: + config: ${{ fromJSON(needs.prepare_matrices.outputs.merge_matrix) }} + + steps: + - name: Check out the repo + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Download digest metadata + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8 + with: + pattern: digests-* + path: /tmp/digests + merge-multiple: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4 + + - name: Log in to Docker Registry + uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Create tags from digests + shell: bash + run: | + set -euo pipefail + + REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case + REPO_NAME="${{ github.event.repository.name }}" + IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}" + PREFIX="${IMAGE_REPO}:" + SRC_TAG="${{ needs.create_tag.outputs.source_tag }}" + TAGS="${{ matrix.config.tag }}" + ARCHES="${{ matrix.config.arches }}" + DIGEST_GLOB="/tmp/digests/*.tsv" + + if ! ls ${DIGEST_GLOB} >/dev/null 2>&1; then + echo "No digest metadata found in /tmp/digests" >&2 + exit 1 + fi + + if [[ -z "$SRC_TAG" ]]; then + echo "Missing source tag from create_tag" >&2 + exit 1 + fi + + find_digest() { + local tag_name="$1" + local arch="$2" + local image_type="$3" + local digest + + digest="$(awk -F '\t' -v t="$tag_name" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})" + + # Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform. + if [[ -z "$digest" && "$tag_name" == "s390x" && "$arch" == "s390x" ]]; then + digest="$(awk -F '\t' -v t="cpu" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})" + fi + + if [[ -z "$digest" ]]; then + echo "Missing digest for tag=${tag_name} arch=${arch} image_type=${image_type}" >&2 + exit 1 + fi + + echo "$digest" + } + + create_manifest_tags() { + local image_type="$1" + local tag_name="$2" + local suffix="$3" + + local merged_tag="${PREFIX}${image_type}${suffix}" + local merged_versioned_tag="${merged_tag}-${SRC_TAG}" + + local refs=() + + for arch in $ARCHES; do + local digest + digest="$(find_digest "$tag_name" "$arch" "$image_type")" + refs+=("${IMAGE_REPO}@${digest}") + done + + echo "Creating ${merged_tag} from ${refs[*]}" + docker buildx imagetools create --tag "${merged_tag}" "${refs[@]}" + + echo "Creating ${merged_versioned_tag} from ${refs[*]}" + docker buildx imagetools create --tag "${merged_versioned_tag}" "${refs[@]}" + } + + for tag in $TAGS; do + if [[ "$tag" == "cpu" ]]; then + TYPE="" + else + TYPE="-$tag" + fi + + if [[ "${{ matrix.config.full }}" == "true" ]]; then + create_manifest_tags "full" "$tag" "$TYPE" + fi + + if [[ "${{ matrix.config.light }}" == "true" ]]; then + create_manifest_tags "light" "$tag" "$TYPE" + fi + + if [[ "${{ matrix.config.server }}" == "true" ]]; then + create_manifest_tags "server" "$tag" "$TYPE" + fi + done + env: + GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' diff --git a/docs/docker.md b/docs/docker.md index 9fb5e65ea..7f99bfaad 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -13,24 +13,30 @@ We have three Docker images available for this project: Additionally, there the following images, similar to the above: -- `ghcr.io/ggml-org/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`) -- `ghcr.io/ggml-org/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`) -- `ghcr.io/ggml-org/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`) -- `ghcr.io/ggml-org/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) -- `ghcr.io/ggml-org/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) -- `ghcr.io/ggml-org/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggml-org/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA 12 support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggml-org/llama.cpp:full-cuda13`: Same as `full` but compiled with CUDA 13 support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggml-org/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA 12 support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggml-org/llama.cpp:light-cuda13`: Same as `light` but compiled with CUDA 13 support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggml-org/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA 12 support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggml-org/llama.cpp:server-cuda13`: Same as `server` but compiled with CUDA 13 support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggml-org/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`) -- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`) -- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`) -- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggml-org/llama.cpp:full-openvino`: Same as `full` but compiled with OpenVino support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:light-openvino`: Same as `light` but compiled with OpenVino support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:server-openvino`: Same as `server` but compiled with OpenVino support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:full-s390x`: Identical to `full`, an alias for the `s390x` platform. (platforms: `linux/s390x`) +- `ghcr.io/ggml-org/llama.cpp:light-s390x`: Identical to `light`, an alias for the `s390x` platform. (platforms: `linux/s390x`) +- `ghcr.io/ggml-org/llama.cpp:server-s390x`: Identical to `server`, an alias for the `s390x` platform. (platforms: `linux/s390x`) The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now). @@ -82,7 +88,7 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment The defaults are: -- `CUDA_VERSION` set to `12.4.0` +- `CUDA_VERSION` set to `12.8.1` - `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures The resulting images, are essentially the same as the non-CUDA images: From 08f21453aec846867b39878500d725a05bd32683 Mon Sep 17 00:00:00 2001 From: shaofeiqi Date: Mon, 30 Mar 2026 12:19:16 -0700 Subject: [PATCH 5/5] opencl: add q4_K gemm and gemv kernels for Adreno (#20919) * opencl: add q4_K gemm and gemv kernels for Adreno * opencl: fix whitespace * opencl: add workarounds for compiler bugs on older devices * opencl: handle fp16 denorm on X Elite * opencl: fix kernel build error * opencl: fix whitespace * opencl: make q4_K cvt kernels signature consistent --------- Co-authored-by: Li He --- ggml/src/ggml-opencl/CMakeLists.txt | 2 + ggml/src/ggml-opencl/ggml-opencl.cpp | 312 +++++++++++++++++ ggml/src/ggml-opencl/kernels/cvt.cl | 75 ++++- .../kernels/gemm_noshuffle_q4_k_f32.cl | 172 ++++++++++ .../kernels/gemv_noshuffle_q4_k_f32.cl | 318 ++++++++++++++++++ 5 files changed, 877 insertions(+), 2 deletions(-) create mode 100644 ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl create mode 100644 ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index af29f3b8f..540942b19 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -114,6 +114,8 @@ set(GGML_OPENCL_KERNELS gemv_noshuffle_q4_1_f32 gemm_noshuffle_q4_1_f32 gemv_noshuffle_general_q8_0_f32 + gemv_noshuffle_q4_k_f32 + gemm_noshuffle_q4_k_f32 gemv_noshuffle_q6_k_f32 gemm_noshuffle_q6_k_f32 mul diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index c40e1f2d3..0f6628c37 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -538,6 +538,8 @@ struct ggml_backend_opencl_context { cl_kernel kernel_restore_block_q4_0_noshuffle; cl_kernel kernel_convert_block_q4_1_noshuffle; cl_kernel kernel_restore_block_q4_1_noshuffle; + cl_kernel kernel_convert_block_q4_K_noshuffle; + cl_kernel kernel_restore_block_q4_K_noshuffle; cl_kernel kernel_convert_block_q4_K, kernel_restore_block_q4_K; cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K; cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat; @@ -720,6 +722,8 @@ struct ggml_backend_opencl_context { cl_kernel kernel_gemm_noshuffle_q4_1_f32; cl_kernel kernel_mul_mm_q8_0_f32_8x4; cl_kernel CL_mul_mat_vec_q8_0_f32; + cl_kernel kernel_gemv_noshuffle_q4_k_f32; + cl_kernel kernel_gemm_noshuffle_q4_k_f32; cl_kernel kernel_gemv_noshuffle_q6_K_f32; cl_kernel kernel_gemm_noshuffle_q6_K_f32; #endif // GGML_OPENCL_USE_ADRENO_KERNELS @@ -932,6 +936,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_K", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_K", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q4_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_K_noshuffle", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q4_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_K_noshuffle", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q6_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K_noshuffle", &err), err)); @@ -2619,6 +2625,45 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // gemm_noshuffle_q4_k_f32 + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemm_noshuffle_q4_k_f32.cl.h" + }; +#else + const std::string kernel_src = read_file("gemm_noshuffle_q4_k_f32.cl"); +#endif + cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q4_k_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q4_k_f32", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemv_noshuffle_q4_k_f32 + { + std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std + + " -cl-mad-enable "; + if (backend_ctx->has_vector_subgroup_broadcast) { + CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST "; + } + +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemv_noshuffle_q4_k_f32.cl.h" + }; +#else + const std::string kernel_src = read_file("gemv_noshuffle_q4_k_f32.cl"); +#endif + + cl_program prog = build_program_from_source( + backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q4_k_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q4_k_f32", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std + " -cl-mad-enable " " -cl-fast-relaxed-math"; @@ -5060,12 +5105,25 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); CL_CHECK(err); + #ifdef GGML_OPENCL_USE_ADRENO_KERNELS cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K; + if (use_adreno_kernels(backend_ctx, tensor)) { + kernel = backend_ctx->kernel_convert_block_q4_K_noshuffle; + } + #else + cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K; + #endif + + cl_uchar mask_0F = 0x0F; + cl_uchar mask_F0 = 0xF0; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->dm)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0)); size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; size_t local_work_size[] = {64, 1, 1}; @@ -5076,6 +5134,20 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, CL_CHECK(clReleaseMemObject(data_device)); tensor->extra = extra; +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_kernels(backend_ctx, tensor)) { + + int M = tensor->ne[1]; + int K = tensor->ne[0]; + + GGML_ASSERT(K % 32 == 0); + + // Transpose q, d, dm as ushort + transpose_2d_as_16b(backend_ctx, extra->q, extra->q, size_q, K/4, M); + transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/256, M); + transpose_2d_as_16b(backend_ctx, extra->dm, extra->dm, size_dm, K/256, M); + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS return; } if (tensor->type == GGML_TYPE_Q6_K) { @@ -5516,12 +5588,60 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, ggml_nbytes(tensor), NULL, &err); CL_CHECK(err); + cl_uchar mask_0F = 0x0F; + cl_uchar mask_F0 = 0xF0; + +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_kernels(backend_ctx, tensor)) { + int M = tensor->ne[1]; + int K = tensor->ne[0]; + + size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2; + size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t); + size_t size_dm = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t); + + static ggml_cl_buffer buf_trans_q; + static ggml_cl_buffer buf_trans_d; + static ggml_cl_buffer buf_trans_dm; + + buf_trans_q.allocate(backend_ctx->context, size_q); + buf_trans_d.allocate(backend_ctx->context, size_d); + buf_trans_dm.allocate(backend_ctx->context, size_dm); + + // Transpose q, d, dm back + transpose_2d_as_16b(backend_ctx, extra->q, buf_trans_q.buffer, size_q, M, K/4); + transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/256); + transpose_2d_as_16b(backend_ctx, extra->dm, buf_trans_dm.buffer, size_dm, M, K/256); + + cl_kernel kernel = backend_ctx->kernel_restore_block_q4_K_noshuffle; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q.buffer)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_trans_d.buffer)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_trans_dm.buffer)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; + size_t local_work_size[] = {1, 1, 1}; + + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueReadBuffer(queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + return; + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + cl_kernel kernel = backend_ctx->kernel_restore_block_q4_K; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->s)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->dm)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0)); size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; size_t local_work_size[] = {1, 1, 1}; @@ -9688,6 +9808,192 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t #endif } +static void ggml_cl_mul_mat_q4_k_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + ggml_tensor_extra_cl_q4_K * extra0_q4_k = (ggml_tensor_extra_cl_q4_K *)src0->extra; + + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + + const int ne1 = dst->ne[1]; + + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + + cl_context context = backend_ctx->context; + cl_kernel kernel; + + cl_int err; + cl_image_format img_fmt; + cl_image_desc img_desc; + cl_buffer_region region; + + int M = ne01; + int N = ne1; + int K = ne00; + + cl_uchar mask_d6 = 0x3F; + cl_uchar mask_d4 = 0x0F; + cl_uchar mask_hi2 = 0xC0; + + if (ne1 == 1) { + cl_mem q_img = nullptr; + cl_mem b_sub_buf = nullptr; + cl_mem b_img = nullptr; + + // image for q + img_fmt = { CL_R, CL_UNSIGNED_INT32}; + memset(&img_desc, 0, sizeof(img_desc)); + img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc.image_width = M * K / 2 / 4; + img_desc.buffer = extra0_q4_k->q; + CL_CHECK((q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err)); + + // subbuffer for activations + region.origin = offset1; + region.size = K * N * sizeof(float); + CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + + // image for activations + img_fmt = {CL_RGBA, CL_FLOAT}; + memset(&img_desc, 0, sizeof(img_desc)); + img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc.image_width = K * N / 4; + img_desc.buffer = b_sub_buf; + CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err)); + + kernel = backend_ctx->kernel_gemv_noshuffle_q4_k_f32; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_img)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_k->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_k->dm)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q4_k->s)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_img)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_uchar), &mask_d6)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_uchar), &mask_d4)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_uchar), &mask_hi2)); + + size_t local_work_size[3] = {64, 4, 1}; + size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + + CL_CHECK(clReleaseMemObject(q_img)); + CL_CHECK(clReleaseMemObject(b_sub_buf)); + CL_CHECK(clReleaseMemObject(b_img)); + } else { + + cl_mem b_sub_buf = nullptr; + cl_mem b_sub_buf_trans = nullptr; + cl_mem b_img = nullptr; + cl_mem b_img_trans = nullptr; + + // subbuffer for activations + region.origin = offset1; + region.size = K * N * sizeof(float); + CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + + // image for activations + img_fmt = {CL_RGBA, CL_FLOAT}; + memset(&img_desc, 0, sizeof(img_desc)); + img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc.image_width = K * N / 4; + img_desc.buffer = b_sub_buf; + CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err)); + + // pad N to multiple of 8 + int extra_elements = N % 8; + int padding = 0; + if (extra_elements > 0){ + padding = 8 - extra_elements; + } + + // subbuffer for transposed activations + region.origin = 0; + region.size = K * (N + padding) * sizeof(float)/2; + backend_ctx->prealloc_act_trans.allocate(context, region.size); + CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + + // image for transposed activations + img_fmt = {CL_RGBA, CL_HALF_FLOAT}; + memset(&img_desc, 0, sizeof(img_desc)); + img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc.image_width = K * (N + padding) / 4; + img_desc.buffer = b_sub_buf_trans; + CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err)); + + // transpose activations + int height_B = N/4; + if (height_B == 0) { + height_B = 1; + } + int width_B = K/4; + int padded_height_B = (N + padding)/4; + + kernel = backend_ctx->kernel_transpose_32_16; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B)); + + size_t local_work_size_t[2] = { 1, 16 }; + size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B }; + backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); + + // gemm + kernel = backend_ctx->kernel_gemm_noshuffle_q4_k_f32; + int padded_N = N + padding; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_k->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_k->s)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_k->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q4_k->dm)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_img_trans)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &padded_N)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_uchar), &mask_d6)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_uchar), &mask_d4)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_uchar), &mask_hi2)); + + size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1}; + size_t local_work_size[3] = {1, 128, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + CL_CHECK(clReleaseMemObject(b_sub_buf)); + CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); + CL_CHECK(clReleaseMemObject(b_img)); + CL_CHECK(clReleaseMemObject(b_img_trans)); + } +#else + GGML_UNUSED(backend); + GGML_UNUSED(src0); + GGML_UNUSED(src1); + GGML_UNUSED(dst); +#endif +} + static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS GGML_ASSERT(src0); @@ -10014,6 +10320,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co return; } + // q4_k x fp32 + if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32) { + ggml_cl_mul_mat_q4_k_f32_adreno(backend, src0, src1, dst); + return; + } + // q6_K x fp32 if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32) { ggml_cl_mul_mat_q6_K_f32_adreno(backend, src0, src1, dst); diff --git a/ggml/src/ggml-opencl/kernels/cvt.cl b/ggml/src/ggml-opencl/kernels/cvt.cl index 34930dfbe..81fe17fa1 100644 --- a/ggml/src/ggml-opencl/kernels/cvt.cl +++ b/ggml/src/ggml-opencl/kernels/cvt.cl @@ -424,13 +424,17 @@ kernel void kernel_restore_block_q8_0_trans( // Convert the block_q4_K format to 4 separate arrays (AOS -> SOA). // This kernel does not deshuffle the bits. // Each thread processes a super block. +// Mask args are just to keep the signature consistent with the no-shuffle +// version and they are not used in this kernel. //------------------------------------------------------------------------------ kernel void kernel_convert_block_q4_K( global struct block_q4_K * src0, global uchar * dst_q, global uchar * dst_s, global half * dst_d, - global half * dst_dm + global half * dst_dm, + uchar mask_0F, + uchar mask_F0 ) { global struct block_q4_K * b = (global struct block_q4_K *) src0 + get_global_id(0); global uchar * q = (global uchar *) dst_q + QK_K/2*get_global_id(0); @@ -451,12 +455,15 @@ kernel void kernel_convert_block_q4_K( // Restore block_q4_K from flattened arrays. // Each thread processes a super block. +// Mask args are just to keep the signature consistent with the no-shuffle ones. kernel void kernel_restore_block_q4_K( global uchar * src_q, global uchar * src_s, global half * src_d, global half * src_dm, - global struct block_q4_K * dst + global struct block_q4_K * dst, + uchar mask_0F, + uchar mask_F0 ) { global struct block_q4_K * b = (global struct block_q4_K *) dst + get_global_id(0); global uchar * q = (global uchar *) src_q + QK_K/2*get_global_id(0); @@ -475,6 +482,70 @@ kernel void kernel_restore_block_q4_K( } } +kernel void kernel_convert_block_q4_K_noshuffle( + global struct block_q4_K * src0, + global uchar * dst_q, + global uchar * dst_s, + global half * dst_d, + global half * dst_dm, + uchar mask_0F, + uchar mask_F0 +) { + global struct block_q4_K * b = (global struct block_q4_K *) src0 + get_global_id(0); + global uchar * q = (global uchar *) dst_q + QK_K/2 * get_global_id(0); + global uchar * s = (global uchar *) dst_s + K_SCALE_SIZE * get_global_id(0); + global half * d = (global half *) dst_d + get_global_id(0); + global half * dm = (global half *) dst_dm + get_global_id(0); + + *d = b->d; + *dm = b->dm; + + for (int i = 0; i < QK_K / 64; ++i) { + for (int j = 0; j < 16; ++j) { + uchar x0 = b->q[i*32 + 2*j]; + uchar x1 = b->q[i*32 + 2*j + 1]; + q[i*32 + j] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4); + q[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0); + } + } + + for (int i = 0; i < K_SCALE_SIZE; ++i) { + s[i] = b->s[i]; + } +} + +kernel void kernel_restore_block_q4_K_noshuffle( + global uchar * src_q, + global uchar * src_s, + global half * src_d, + global half * src_dm, + global struct block_q4_K * dst, + uchar mask_0F, + uchar mask_F0 +) { + global struct block_q4_K * b = (global struct block_q4_K *) dst + get_global_id(0); + global uchar * q = (global uchar *) src_q + QK_K/2 * get_global_id(0); + global uchar * s = (global uchar *) src_s + K_SCALE_SIZE * get_global_id(0); + global half * d = (global half *) src_d + get_global_id(0); + global half * dm = (global half *) src_dm + get_global_id(0); + + b->d = *d; + b->dm = *dm; + + for (int i = 0; i < QK_K / 64; ++i) { + for (int j = 0; j < 16; ++j) { + uchar lo = q[i*32 + j]; + uchar hi = q[i*32 + j + 16]; + b->q[i*32 + 2*j] = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4)); + b->q[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0)); + } + } + + for (int i = 0; i < K_SCALE_SIZE; ++i) { + b->s[i] = s[i]; + } +} + //------------------------------------------------------------------------------ // kernel_convert_block_q6_K // Convert the block_q6_K format to 3 separate arrays (AOS -> SOA). diff --git a/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl b/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl new file mode 100644 index 000000000..99fd1fd7b --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl @@ -0,0 +1,172 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#ifdef cl_qcom_reqd_sub_group_size +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif +#define QK_K 256 +#define K_SCALE_SIZE 12 + +inline void get_scale_min_k4( + int j, + global const uchar * q, + uchar * d, + uchar * m, + uchar mask_d6, + uchar mask_d4, + uchar mask_hi2 +) { + if (j < 4) { + *d = q[j] & mask_d6; + *m = q[j+4] & mask_d6; + } else { + *d = (q[j+4] & mask_d4) | ((q[j-4] & mask_hi2) >> 2); + *m = ((q[j+4] >> 4) & mask_d4) | ((q[j] & mask_hi2) >> 2); + } +} + +#ifdef ADRENO_GPU +REQD_SUBGROUP_SIZE_128 +#endif +kernel void kernel_gemm_noshuffle_q4_k_f32( + global const ushort * src0_q, + global const uchar * src0_s, + global const half * src0_d, + global const half * src0_dm, + read_only image1d_buffer_t src1, + global float * dst, + ulong offsetd, + int m, + int n, + int k, + int n_no_padding, + uchar mask_d6, + uchar mask_d4, + uchar mask_hi2 +) { + dst = (global float *)((global char *)dst + offsetd); + int n_4 = n >> 2; + int gy = get_global_id(0); + int gx = get_global_id(1); + int gx_2 = gx << 2; + + half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; + half8 B; + half4 dequantized_weights; + + int num_blocks_K = k / QK_K; + + global const ushort * weight_ptr = src0_q + gx_2; + global const half * d_ptr = src0_d + gx_2; + global const half * dm_ptr = src0_dm + gx_2; + + for (int i = 0; i < k; i += 32) { + int sb_idx = i / QK_K; + int sub_idx = (i / 32) % 8; + + half4 d = vload4(0, d_ptr + sb_idx * m); + half4 dm = vload4(0, dm_ptr + sb_idx * m); + + global const uchar * sc0 = src0_s + (gx_2+0) * num_blocks_K * K_SCALE_SIZE + sb_idx * K_SCALE_SIZE; + global const uchar * sc1 = src0_s + (gx_2+1) * num_blocks_K * K_SCALE_SIZE + sb_idx * K_SCALE_SIZE; + global const uchar * sc2 = src0_s + (gx_2+2) * num_blocks_K * K_SCALE_SIZE + sb_idx * K_SCALE_SIZE; + global const uchar * sc3 = src0_s + (gx_2+3) * num_blocks_K * K_SCALE_SIZE + sb_idx * K_SCALE_SIZE; + + uchar sv0, mn0, sv1, mn1, sv2, mn2, sv3, mn3; + get_scale_min_k4(sub_idx, sc0, &sv0, &mn0, mask_d6, mask_d4, mask_hi2); + get_scale_min_k4(sub_idx, sc1, &sv1, &mn1, mask_d6, mask_d4, mask_hi2); + get_scale_min_k4(sub_idx, sc2, &sv2, &mn2, mask_d6, mask_d4, mask_hi2); + get_scale_min_k4(sub_idx, sc3, &sv3, &mn3, mask_d6, mask_d4, mask_hi2); + + half4 scale = convert_half4(convert_float4(d) * convert_float4((uchar4)(sv0, sv1, sv2, sv3))); + half4 mval = convert_half4(convert_float4(dm) * convert_float4((uchar4)(mn0, mn1, mn2, mn3))); + + for (int l = 0; l < 32; l += 4) { + int ki = i + l; + ushort4 bits4 = vload4(0, weight_ptr + (ki/4) * m); + + // j=0 + B.s0123 = read_imageh(src1, gy*2 + (ki+0) * n_4); + B.s4567 = read_imageh(src1, gy*2+1 + (ki+0) * n_4); + dequantized_weights.s0 = (bits4.s0 & 0x000F) * scale.s0 - mval.s0; + dequantized_weights.s1 = (bits4.s1 & 0x000F) * scale.s1 - mval.s1; + dequantized_weights.s2 = (bits4.s2 & 0x000F) * scale.s2 - mval.s2; + dequantized_weights.s3 = (bits4.s3 & 0x000F) * scale.s3 - mval.s3; + c0 += B * dequantized_weights.s0; + c1 += B * dequantized_weights.s1; + c2 += B * dequantized_weights.s2; + c3 += B * dequantized_weights.s3; + + // j=1 + B.s0123 = read_imageh(src1, gy*2 + (ki+1) * n_4); + B.s4567 = read_imageh(src1, gy*2+1 + (ki+1) * n_4); + dequantized_weights.s0 = ((bits4.s0 & 0x00F0) >> 4) * scale.s0 - mval.s0; + dequantized_weights.s1 = ((bits4.s1 & 0x00F0) >> 4) * scale.s1 - mval.s1; + dequantized_weights.s2 = ((bits4.s2 & 0x00F0) >> 4) * scale.s2 - mval.s2; + dequantized_weights.s3 = ((bits4.s3 & 0x00F0) >> 4) * scale.s3 - mval.s3; + c0 += B * dequantized_weights.s0; + c1 += B * dequantized_weights.s1; + c2 += B * dequantized_weights.s2; + c3 += B * dequantized_weights.s3; + + // j=2 + B.s0123 = read_imageh(src1, gy*2 + (ki+2) * n_4); + B.s4567 = read_imageh(src1, gy*2+1 + (ki+2) * n_4); + dequantized_weights.s0 = ((bits4.s0 & 0x0F00) >> 8) * scale.s0 - mval.s0; + dequantized_weights.s1 = ((bits4.s1 & 0x0F00) >> 8) * scale.s1 - mval.s1; + dequantized_weights.s2 = ((bits4.s2 & 0x0F00) >> 8) * scale.s2 - mval.s2; + dequantized_weights.s3 = ((bits4.s3 & 0x0F00) >> 8) * scale.s3 - mval.s3; + c0 += B * dequantized_weights.s0; + c1 += B * dequantized_weights.s1; + c2 += B * dequantized_weights.s2; + c3 += B * dequantized_weights.s3; + + // j=3 + B.s0123 = read_imageh(src1, gy*2 + (ki+3) * n_4); + B.s4567 = read_imageh(src1, gy*2+1 + (ki+3) * n_4); + dequantized_weights.s0 = ((bits4.s0 & 0xF000) >> 12) * scale.s0 - mval.s0; + dequantized_weights.s1 = ((bits4.s1 & 0xF000) >> 12) * scale.s1 - mval.s1; + dequantized_weights.s2 = ((bits4.s2 & 0xF000) >> 12) * scale.s2 - mval.s2; + dequantized_weights.s3 = ((bits4.s3 & 0xF000) >> 12) * scale.s3 - mval.s3; + c0 += B * dequantized_weights.s0; + c1 += B * dequantized_weights.s1; + c2 += B * dequantized_weights.s2; + c3 += B * dequantized_weights.s3; + } + } + + int idx = (gy<<3)*m + (gx<<2); + + if (idx+3 < m*n_no_padding) { + vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx); + idx += m; + } + if (idx+3 < m*n_no_padding) { + vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx); + idx += m; + } + if (idx+3 < m*n_no_padding) { + vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx); + idx += m; + } + if (idx+3 < m*n_no_padding) { + vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx); + idx += m; + } + if (idx+3 < m*n_no_padding) { + vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx); + idx += m; + } + if (idx+3 < m*n_no_padding) { + vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx); + idx += m; + } + if (idx+3 < m*n_no_padding) { + vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx); + idx += m; + } + if (idx+3 < m*n_no_padding) { + vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx); + } +} diff --git a/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl new file mode 100644 index 000000000..dd1e2b55c --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl @@ -0,0 +1,318 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable + +#ifdef cl_qcom_reqd_sub_group_size +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#endif + +#define QK_K 256 +#define NSUBGROUPS 4 +#define SUBGROUP_SIZE 64 + +inline void get_scale_min_k4( + int j, + global const uchar * q, + uchar * d, + uchar * m, + uchar mask_d6, + uchar mask_d4, + uchar mask_hi2 +) { + if (j < 4) { + *d = q[j] & mask_d6; + *m = q[j+4] & mask_d6; + } else { + *d = (q[j+4] & mask_d4) | ((q[j-4] & mask_hi2) >> 2); + *m = ((q[j+4] >> 4) & mask_d4) | ((q[j] & mask_hi2) >> 2); + } +} + +#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, minv, y) \ + float shared_y; \ + shared_y = sub_group_broadcast(y.s0, 0); \ + total_sums.s0 += ((bits4.s0 & 0x000F) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += ((bits4.s1 & 0x000F) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 0); \ + total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 0); \ + total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 0); \ + total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s4, 0); \ + total_sums.s0 += ((bits4.s2 & 0x000F) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += ((bits4.s3 & 0x000F) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 0); \ + total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 0); \ + total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 0); \ + total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s0, 1); \ + total_sums.s0 += ((bits4.s4 & 0x000F) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += ((bits4.s5 & 0x000F) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 1); \ + total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 1); \ + total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 1); \ + total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s4, 1); \ + total_sums.s0 += ((bits4.s6 & 0x000F) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += ((bits4.s7 & 0x000F) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 1); \ + total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 1); \ + total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 1); \ + total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y; \ + + +#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, minv, y) \ + shared_y = sub_group_broadcast(y.s0, 2); \ + total_sums.s0 += ((bits4.s0 & 0x000F) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += ((bits4.s1 & 0x000F) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 2); \ + total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 2); \ + total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 2); \ + total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s4, 2); \ + total_sums.s0 += ((bits4.s2 & 0x000F) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += ((bits4.s3 & 0x000F) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 2); \ + total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 2); \ + total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 2); \ + total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s0, 3); \ + total_sums.s0 += ((bits4.s4 & 0x000F) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += ((bits4.s5 & 0x000F) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 3); \ + total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 3); \ + total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 3); \ + total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s4, 3); \ + total_sums.s0 += ((bits4.s6 & 0x000F) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += ((bits4.s7 & 0x000F) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 3); \ + total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 3); \ + total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 3); \ + total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y; \ + + +#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, minv, y) \ + float8 shared_y; \ + shared_y = sub_group_broadcast(y, 0); \ + total_sums.s0 += ((bits4.s0 & 0x000F) * scale.s0 - minv.s0) * shared_y.s0; \ + total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y.s1; \ + total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y.s2; \ + total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y.s3; \ + total_sums.s0 += ((bits4.s2 & 0x000F) * scale.s0 - minv.s0) * shared_y.s4; \ + total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y.s5; \ + total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y.s6; \ + total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y.s7; \ + total_sums.s1 += ((bits4.s1 & 0x000F) * scale.s1 - minv.s1) * shared_y.s0; \ + total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y.s1; \ + total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y.s2; \ + total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y.s3; \ + total_sums.s1 += ((bits4.s3 & 0x000F) * scale.s1 - minv.s1) * shared_y.s4; \ + total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y.s5; \ + total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y.s6; \ + total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y.s7; \ + shared_y = sub_group_broadcast(y, 1); \ + total_sums.s0 += ((bits4.s4 & 0x000F) * scale.s0 - minv.s0) * shared_y.s0; \ + total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y.s1; \ + total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y.s2; \ + total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y.s3; \ + total_sums.s0 += ((bits4.s6 & 0x000F) * scale.s0 - minv.s0) * shared_y.s4; \ + total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y.s5; \ + total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y.s6; \ + total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y.s7; \ + total_sums.s1 += ((bits4.s5 & 0x000F) * scale.s1 - minv.s1) * shared_y.s0; \ + total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y.s1; \ + total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y.s2; \ + total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y.s3; \ + total_sums.s1 += ((bits4.s7 & 0x000F) * scale.s1 - minv.s1) * shared_y.s4; \ + total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y.s5; \ + total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y.s6; \ + total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y.s7; \ + + +#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, minv, y) \ + shared_y = sub_group_broadcast(y, 2); \ + total_sums.s0 += ((bits4.s0 & 0x000F) * scale.s0 - minv.s0) * shared_y.s0; \ + total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y.s1; \ + total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y.s2; \ + total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y.s3; \ + total_sums.s0 += ((bits4.s2 & 0x000F) * scale.s0 - minv.s0) * shared_y.s4; \ + total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y.s5; \ + total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y.s6; \ + total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y.s7; \ + total_sums.s1 += ((bits4.s1 & 0x000F) * scale.s1 - minv.s1) * shared_y.s0; \ + total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y.s1; \ + total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y.s2; \ + total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y.s3; \ + total_sums.s1 += ((bits4.s3 & 0x000F) * scale.s1 - minv.s1) * shared_y.s4; \ + total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y.s5; \ + total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y.s6; \ + total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y.s7; \ + shared_y = sub_group_broadcast(y, 3); \ + total_sums.s0 += ((bits4.s4 & 0x000F) * scale.s0 - minv.s0) * shared_y.s0; \ + total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y.s1; \ + total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y.s2; \ + total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y.s3; \ + total_sums.s0 += ((bits4.s6 & 0x000F) * scale.s0 - minv.s0) * shared_y.s4; \ + total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y.s5; \ + total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y.s6; \ + total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y.s7; \ + total_sums.s1 += ((bits4.s5 & 0x000F) * scale.s1 - minv.s1) * shared_y.s0; \ + total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y.s1; \ + total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y.s2; \ + total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y.s3; \ + total_sums.s1 += ((bits4.s7 & 0x000F) * scale.s1 - minv.s1) * shared_y.s4; \ + total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y.s5; \ + total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y.s6; \ + total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y.s7; \ + +#ifdef ADRENO_GPU +REQD_SUBGROUP_SIZE_64 +#endif +kernel void kernel_gemv_noshuffle_q4_k_f32( + read_only image1d_buffer_t src0_q, + global half2 * src0_d, + global half2 * src0_m, + global uchar * src0_s, + read_only image1d_buffer_t src1, + global float * dst, + ulong offsetd, + int ne00, + int ne01, + uchar mask_d6, + uchar mask_d4, + uchar mask_hi2) +{ + uint groupId = get_local_id(1); + uint gid = get_global_id(0); + ushort slid = get_sub_group_local_id(); + + uint K = ne00; + uint M = ne01; + + uint LINE_STRIDE_A = M / 2; + uint BLOCK_STRIDE_A = NSUBGROUPS * M; + uint scales_per_row = (K / QK_K) * 12; + + private uint4 regA; + private half2 regS; + private half2 regM; + private float8 regB; + + private float2 totalSum = (float2)(0.0f); + + for (uint k = groupId; k < (K / 32); k += NSUBGROUPS) { + uint sb = k / 8; + uint j = k % 8; + + half2 d = src0_d[gid + sb * LINE_STRIDE_A]; + half2 dm = src0_m[gid + sb * LINE_STRIDE_A]; + + global const uchar * sc0 = src0_s + 2 * gid * scales_per_row + sb * 12; + global const uchar * sc1 = src0_s + (2 * gid + 1) * scales_per_row + sb * 12; + + uchar sv0, mn0, sv1, mn1; + get_scale_min_k4(j, sc0, &sv0, &mn0, mask_d6, mask_d4, mask_hi2); + get_scale_min_k4(j, sc1, &sv1, &mn1, mask_d6, mask_d4, mask_hi2); + + regS = convert_half2(convert_float2(d) * convert_float2((uchar2)(sv0, sv1))); + regM = convert_half2(convert_float2(dm) * convert_float2((uchar2)(mn0, mn1))); + + if (slid < 4) { + regB.s0123 = read_imagef(src1, (slid * 2 + k * 8)); + regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8)); + } + + // load half weights for two blocks in consecutive rows + regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x; + regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x; + regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x; + regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x; +#ifdef VECTOR_SUB_GROUP_BROADCAST + dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regM, regB); +#else + dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regM, regB); +#endif // VECTOR_SUB_GROUP_BROADCAST + + regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x; + regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x; + regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x; + regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x; +#ifdef VECTOR_SUB_GROUP_BROADCAST + dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regM, regB); +#else + dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regM, regB); +#endif // VECTOR_SUB_GROUP_BROADCAST + } + + // reduction in local memory, assumes #wave=4 + local float2 reduceLM[SUBGROUP_SIZE * 3]; + if (groupId == 1) { + reduceLM[SUBGROUP_SIZE * 0 + slid] = totalSum; + } + if (groupId == 2) { + reduceLM[SUBGROUP_SIZE * 1 + slid] = totalSum; + } + if (groupId == 3) { + reduceLM[SUBGROUP_SIZE * 2 + slid] = totalSum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (groupId == 0) { + totalSum += reduceLM[SUBGROUP_SIZE * 0 + slid]; + } + if (groupId == 0) { + totalSum += reduceLM[SUBGROUP_SIZE * 1 + slid]; + } + if (groupId == 0) { + totalSum += reduceLM[SUBGROUP_SIZE * 2 + slid]; + } + + // 2 outputs per fiber in wave 0 + if (groupId == 0) { + dst = (global float*)((global char*)dst + offsetd); + vstore2(totalSum, 0, &(dst[gid * 2])); + } + +}