cleanup unwanted stuff

2026-04-28 03:30:20 +00:00 · 2025-08-23 11:41:10 +08:00 · 2025-08-23 11:41:10 +08:00 · eca39d9823
commit eca39d9823
parent 8b8396c30c
80 changed files with 0 additions and 33791 deletions
--- a/docs/multimodal/minicpmo4.0.md
+++ b/docs/multimodal/minicpmo4.0.md
@ -1,47 +0,0 @@
-## MiniCPM-o 4
-
-### Prepare models and code
-
-Download [MiniCPM-o-4](https://huggingface.co/openbmb/MiniCPM-o-4) PyTorch model from huggingface to "MiniCPM-o-4" folder.
-
-
-### Build llama.cpp
-Readme modification time: 20250206
-
-If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
-
-Clone llama.cpp:
-```bash
-git clone https://github.com/ggerganov/llama.cpp
-cd llama.cpp
-```
-
-Build llama.cpp using `CMake`:
-```bash
-cmake -B build
-cmake --build build --config Release
-```
-
-
-### Usage of MiniCPM-o 4
-
-Convert the PyTorch model to gguf files (you can also download our pre-converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-4-gguf) files)
-
-```bash
-python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-o-4
-python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-4 --minicpmv-projector ../MiniCPM-o-4/minicpmv.projector --output-dir ../MiniCPM-o-4/ --minicpmv_version 6
-python ./convert_hf_to_gguf.py ../MiniCPM-o-4/model
-
-# quantize int4 version
-./build/bin/llama-quantize ../MiniCPM-o-4/model/ggml-model-f16.gguf ../MiniCPM-o-4/model/ggml-model-Q4_K_M.gguf Q4_K_M
-```
-
-
-Inference on Linux or Mac
-```bash
-# run in single-turn mode
-./build/bin/llama-mtmd-cli -m ../MiniCPM-o-4/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-4/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
-
-# run in conversation mode
-./build/bin/llama-mtmd-cli -m ../MiniCPM-o-4/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-4/mmproj-model-f16.gguf
-```
--- a/docs/multimodal/minicpmv4.0.md
+++ b/docs/multimodal/minicpmv4.0.md
@ -1,47 +0,0 @@
-## MiniCPM-V 4
-
-### Prepare models and code
-
-Download [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model from huggingface to "MiniCPM-V-4" folder.
-
-
-### Build llama.cpp
-Readme modification time: 20250206
-
-If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
-
-Clone llama.cpp:
-```bash
-git clone https://github.com/ggerganov/llama.cpp
-cd llama.cpp
-```
-
-Build llama.cpp using `CMake`:
-```bash
-cmake -B build
-cmake --build build --config Release
-```
-
-
-### Usage of MiniCPM-V 4
-
-Convert the PyTorch model to gguf files (you can also download our pre-converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-4-gguf) files)
-
-```bash
-python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-4
-python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-4 --minicpmv-projector ../MiniCPM-V-4/minicpmv.projector --output-dir ../MiniCPM-V-4/ --minicpmv_version 5
-python ./convert_hf_to_gguf.py ../MiniCPM-V-4/model
-
-# quantize int4 version
-./build/bin/llama-quantize ../MiniCPM-V-4/model/ggml-model-f16.gguf ../MiniCPM-V-4/model/ggml-model-Q4_K_M.gguf Q4_K_M
-```
-
-
-Inference on Linux or Mac
-```bash
-# run in single-turn mode
-./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
-
-# run in conversation mode
-./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4/mmproj-model-f16.gguf
-```
--- a/docs/ops/CANN.csv
+++ b/docs/ops/CANN.csv
--- a/docs/ops/OpenCL.csv
+++ b/docs/ops/OpenCL.csv
--- a/docs/ops/zDNN.csv
+++ b/docs/ops/zDNN.csv
--- a/examples/model-conversion/.gitignore
+++ b/examples/model-conversion/.gitignore
@ -1,3 +0,0 @@
-.model_name
-data
-ppl
--- a/examples/model-conversion/CMakeLists.txt
+++ b/examples/model-conversion/CMakeLists.txt
@ -1,5 +0,0 @@
-set(TARGET llama-logits)
-add_executable(${TARGET} logits.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/model-conversion/Makefile
+++ b/examples/model-conversion/Makefile
@ -1,163 +0,0 @@
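-# Validation functions: invoke via call with the target name as first argument; each prints usage help naming that target and exits 1 when its variable is empty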
-define validate_model_path
-	@if [ -z "$(MODEL_PATH)" ]; then \
-		echo "Error: MODEL_PATH must be provided either as:"; \
-		echo "  1. Environment variable: export MODEL_PATH=/path/to/model"; \
-		echo "  2. Command line argument: make $(1) MODEL_PATH=/path/to/model"; \
-		exit 1; \
-	fi
-endef
-
-define validate_embedding_model_path
-	@if [ -z "$(EMBEDDING_MODEL_PATH)" ]; then \
-		echo "Error: EMBEDDING_MODEL_PATH must be provided either as:"; \
-		echo "  1. Environment variable: export EMBEDDING_MODEL_PATH=/path/to/model"; \
-		echo "  2. Command line argument: make $(1) EMBEDDING_MODEL_PATH=/path/to/model"; \
-		exit 1; \
-	fi
-endef
-
-###
-### Causal Model targets/recipes
-###
-causal-convert-model-bf16: OUTTYPE=bf16
-causal-convert-model-bf16: causal-convert-model
-
-causal-convert-model:
-	$(call validate_model_path,causal-convert-model)
-	@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
-	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
-	./scripts/causal/convert-model.sh
-
-causal-run-original-model:
-	$(call validate_model_path,causal-run-original-model)
-	@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/run-org-model.py
-
-causal-run-converted-model:
-	@CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/causal/run-converted-model.sh
-
-causal-verify-logits: causal-run-original-model causal-run-converted-model
-	@./scripts/causal/compare-logits.py
-	@MODEL_PATH="$(MODEL_PATH)" ./scripts/utils/check-nmse.py -m ${MODEL_PATH}
-
-causal-run-original-embeddings:
-	@./scripts/causal/run-casual-gen-embeddings-org.sh
-
-causal-run-converted-embeddings:
-	@./scripts/causal/run-converted-model-embeddings-logits.sh
-
-causal-verify-embeddings: causal-run-original-embeddings causal-run-converted-embeddings
-	@./scripts/causal/compare-embeddings-logits.sh
-
-causal-inspect-original-model:
-	@./scripts/utils/inspect-org-model.py
-
-causal-inspect-converted-model:
-	@./scripts/utils/inspect-converted-model.sh
-
-causal-start-embedding-server:
-	@./scripts/utils/run-embedding-server.sh ${CONVERTED_MODEL}
-
-causal-curl-embedding-endpoint: causal-run-original-embeddings
-	@./scripts/utils/curl-embedding-server.sh | ./scripts/causal/compare-embeddings-logits.sh
-
-causal-quantize-Q8_0: QUANTIZED_TYPE = Q8_0
-causal-quantize-Q8_0: causal-quantize-model
-
-causal-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
-causal-quantize-Q4_0: causal-quantize-model
-
-causal-quantize-model:
-	@CONVERTED_MODEL="$(CONVERTED_MODEL)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" ./scripts/utils/quantize.sh ${CONVERTED_MODEL} ${QUANTIZED_TYPE}
-	@echo "Export the quantized model path to QUANTIZED_MODEL variable in your environment"
-
-causal-run-quantized-model:
-	@QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/causal/run-converted-model.sh ${QUANTIZED_MODEL}
-
-
-###
-### Embedding Model targets/recipes
-###
-
-embedding-convert-model-bf16: OUTTYPE=bf16
-embedding-convert-model-bf16: embedding-convert-model
-
-embedding-convert-model:
-	$(call validate_embedding_model_path,embedding-convert-model)
-	@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
-	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
-	./scripts/embedding/convert-model.sh
-
-embedding-run-original-model:
-	$(call validate_embedding_model_path,embedding-run-original-model)
-	@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/embedding/run-original-model.py
-
-embedding-run-converted-model:
-	@CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/embedding/run-converted-model.sh ${CONVERTED_EMBEDDING_MODEL}
-
-embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
-	@./scripts/embedding/compare-embeddings-logits.sh
-
-embedding-inspect-original-model:
-	$(call validate_embedding_model_path,embedding-inspect-original-model)
-	@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/utils/inspect-org-model.py -m ${EMBEDDING_MODEL_PATH}
-
-embedding-inspect-converted-model:
-	@CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/utils/inspect-converted-model.sh ${CONVERTED_EMBEDDING_MODEL}
-
-embedding-start-embedding-server:
-	@./scripts/utils/run-embedding-server.sh ${CONVERTED_EMBEDDING_MODEL}
-
-embedding-curl-embedding-endpoint:
-	@./scripts/utils/curl-embedding-server.sh | ./scripts/embedding/compare-embeddings-logits.sh
-
-embedding-quantize-Q8_0: QUANTIZED_TYPE = Q8_0
-embedding-quantize-Q8_0: embedding-quantize-model
-
-embedding-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
-embedding-quantize-Q4_0: embedding-quantize-model
-
-embedding-quantize-model:
-	@./scripts/utils/quantize.sh ${CONVERTED_EMBEDDING_MODEL} ${QUANTIZED_TYPE}
-	@echo "Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment"
-
-embedding-run-quantized-model:
-	@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
-
-###
-### Perplexity targets/recipes
-###
-perplexity-data-gen:
-	CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/utils/perplexity-gen.sh
-
-# Full perplexity run: hands QUANTIZED_MODEL and LOGITS_FILE (names as used in the README invocation) to the script via its environment.
-perplexity-run-full:
-	QUANTIZED_MODEL="$(QUANTIZED_MODEL)" LOGITS_FILE="$(LOGITS_FILE)" ./scripts/utils/perplexity-run.sh
-
-perplexity-run:
-	QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/utils/perplexity-run-simple.sh
-
-###
-### HuggingFace targets/recipes
-###
-
-hf-create-model:
-	@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}"
-
-hf-create-model-private:
-	@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -p
-
-hf-upload-gguf-to-model:
-	@./scripts/utils/hf-upload-gguf-model.py -m "${MODEL_PATH}" -r "${REPO_ID}" -o "${NAME_IN_REPO}"
-
-hf-create-collection:
-	@./scripts/utils/hf-create-collection.py -n "${NAME}" -d "${DESCRIPTION}" -ns "${NAMESPACE}"
-
-hf-add-model-to-collection:
-	@./scripts/utils/hf-add-model-to-collection.py -c "${COLLECTION}" -m "${MODEL}"
-
-
-.PHONY: clean
-clean:
-	@${RM} -rf data .converted_embedding_model.txt .converted_model.txt .embedding_model_name.txt .model_name.txt
-
--- a/examples/model-conversion/README.md
+++ b/examples/model-conversion/README.md
@ -1,335 +0,0 @@
-# Model Conversion Example
-This directory contains scripts and code to help in the process of converting
-HuggingFace PyTorch models to GGUF format.
-
-The motivation for having this is that the conversion process can often be an
-iterative process, where the original model is inspected, converted, updates
-made to llama.cpp, converted again, etc. Once the model has been converted it
-needs to be verified against the original model, and then optionally quantized,
-and in some cases the perplexity of the quantized model is checked. Finally, the
-model/models need to be uploaded to ggml-org on Hugging Face. This tool/example tries to
-help with this process.
-
-### Overview
-The idea is that the makefile targets and scripts here can be used in the
-development/conversion process assisting with things like:
-
-* inspect/run the original model to figure out how it works
-* convert the original model to GGUF format
-* inspect/run the converted model
-* verify the logits produced by the original model and the converted model
-* quantize the model to GGUF format
-* run perplexity evaluation to verify that the quantized model is performing
-  as expected
-* upload the model to HuggingFace to make it available for others
-
-## Setup
-Create virtual python environment
-```console
-$ python3.11 -m venv venv
-$ source venv/bin/activate
-(venv) $ pip install -r requirements.txt
-```
-
-## Causal Language Model Conversion
-This section describes the steps to convert a causal language model to GGUF and
-to verify that the conversion was successful.
-
-### Download the original model
-First, clone the original model to some local directory:
-```console
-$ mkdir models && cd models
-$ git clone https://huggingface.co/user/model_name
-$ cd model_name
-$ git lfs install
-$ git lfs pull
-```
-
-### Set the MODEL_PATH
-The path to the downloaded model can be provided in two ways:
-
-**Option 1: Environment variable (recommended for iterative development)**
-```console
-export MODEL_PATH=~/work/ai/models/some_model
-```
-
-**Option 2: Command line argument (for one-off tasks)**
-```console
-make causal-convert-model MODEL_PATH=~/work/ai/models/some_model
-```
-
-Command line arguments take precedence over environment variables when both are provided.
-
-In cases where the transformer implementation for the model has not been released
-yet it is possible to set the environment variable `UNRELEASED_MODEL_NAME` which
-will then cause the transformer implementation to be loaded explicitly and not
-use AutoModelForCausalLM:
-```
-export UNRELEASED_MODEL_NAME=SomeNewModel
-```
-
-### Inspecting the original tensors
-```console
-# Using environment variable
-(venv) $ make causal-inspect-original-model
-
-# Or using command line argument
-(venv) $ make causal-inspect-original-model MODEL_PATH=~/work/ai/models/some_model
-```
-
-### Running the original model
-This is mainly to verify that the original model works, and to compare the output
-from the converted model.
-```console
-# Using environment variable
-(venv) $ make causal-run-original-model
-
-# Or using command line argument
-(venv) $ make causal-run-original-model MODEL_PATH=~/work/ai/models/some_model
-```
-This command will save two files to the `data` directory, one is a binary file
-containing logits which will be used for comparison with the converted model
-later, and the other is a text file which allows for manual visual inspection.
-
-### Model conversion
-After updates have been made to [gguf-py](../../gguf-py) to add support for the
-new model, the model can be converted to GGUF format using the following command:
-```console
-# Using environment variable
-(venv) $ make causal-convert-model
-
-# Or using command line argument
-(venv) $ make causal-convert-model MODEL_PATH=~/work/ai/models/some_model
-```
-
-### Inspecting the converted model
-The converted model can be inspected using the following command:
-```console
-(venv) $ make causal-inspect-converted-model
-```
-
-### Running the converted model
-```console
-(venv) $ make causal-run-converted-model
-```
-
-### Model logits verification
-The following target will run the original model and the converted model and
-compare the logits:
-```console
-(venv) $ make causal-verify-logits
-```
-
-### Quantizing the model
-The causal model can be quantized to GGUF format using the following command:
-```console
-(venv) $ make causal-quantize-Q8_0
-Quantized model saved to: /path/to/quantized/model-Q8_0.gguf
-Export the quantized model path to QUANTIZED_MODEL variable in your environment
-```
-This will show the path to the quantized model in the terminal, which can then
-be used to set the `QUANTIZED_MODEL` environment variable:
-```console
-export QUANTIZED_MODEL=/path/to/quantized/model-Q8_0.gguf
-```
-Then the quantized model can be run using the following command:
-```console
-(venv) $ make causal-run-quantized-model
-```
-
-
-## Embedding Language Model Conversion
-
-### Download the original model
-```console
-$ mkdir models && cd models
-$ git clone https://huggingface.co/user/model_name
-$ cd model_name
-$ git lfs install
-$ git lfs pull
-```
-
-The path to the embedding model can be provided in two ways:
-
-**Option 1: Environment variable (recommended for iterative development)**
-```console
-export EMBEDDING_MODEL_PATH=~/path/to/embedding_model
-```
-
-**Option 2: Command line argument (for one-off tasks)**
-```console
-make embedding-convert-model EMBEDDING_MODEL_PATH=~/path/to/embedding_model
-```
-
-Command line arguments take precedence over environment variables when both are provided.
-
-### Running the original model
-This is mainly to verify that the original model works and to compare the output
-with the output from the converted model.
-```console
-# Using environment variable
-(venv) $ make embedding-run-original-model
-
-# Or using command line argument
-(venv) $ make embedding-run-original-model EMBEDDING_MODEL_PATH=~/path/to/embedding_model
-```
-This command will save two files to the `data` directory, one is a binary
-file containing logits which will be used for comparison with the converted
-model, and the other is a text file which allows for manual visual inspection.
-
-### Model conversion
-After updates have been made to [gguf-py](../../gguf-py) to add support for the
-new model the model can be converted to GGUF format using the following command:
-```console
-(venv) $ make embedding-convert-model
-```
-
-### Run the converted model
-```console
-(venv) $ make embedding-run-converted-model
-```
-
-### Model logits verification
-The following target will run the original model and the converted model (which
-was done manually in the previous steps) and compare the logits:
-```console
-(venv) $ make embedding-verify-logits
-```
-
-### llama-server verification
-To verify that the converted model works with llama-server, the following
-command can be used:
-```console
-(venv) $ make embedding-start-embedding-server
-```
-Then open another terminal and set the `EMBEDDING_MODEL_PATH` environment
-variable as this will not be inherited by the new terminal:
-```console
-(venv) $ make embedding-curl-embedding-endpoint
-```
-This will call the `embedding` endpoint and the output will be piped into
-the same verification script as used by the target `embedding-verify-logits`.
-
-The causal model can also be used to produce embeddings and this can be verified
-using the following commands:
-```console
-(venv) $ make causal-start-embedding-server
-```
-Then open another terminal and set the `MODEL_PATH` environment
-variable as this will not be inherited by the new terminal:
-```console
-(venv) $ make causal-curl-embedding-endpoint
-```
-
-### Quantizing the model
-The embedding model can be quantized to GGUF format using the following command:
-```console
-(venv) $ make embedding-quantize-Q8_0
-Quantized model saved to: /path/to/quantized/model-Q8_0.gguf
-Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment
-```
-This will show the path to the quantized model in the terminal, which can then
-be used to set the `QUANTIZED_EMBEDDING_MODEL` environment variable:
-```console
-export QUANTIZED_EMBEDDING_MODEL=/path/to/quantized/model-Q8_0.gguf
-```
-Then the quantized model can be run using the following command:
-```console
-(venv) $ make embedding-run-quantized-model
-```
-
-## Perplexity Evaluation
-
-### Simple perplexity evaluation
-This makes it possible to run the perplexity evaluation without having to generate a
-token/logits file:
-```console
-(venv) $ make perplexity-run QUANTIZED_MODEL=~/path/to/quantized/model.gguf
-```
-This will use the wikitext dataset to run the perplexity evaluation and
-output the perplexity score to the terminal. This value can then be compared
-with the perplexity score of the unquantized model.
-
-### Full perplexity evaluation
-First use the converted, non-quantized, model to generate the perplexity evaluation
-dataset using the following command:
-```console
-$ make perplexity-data-gen CONVERTED_MODEL=~/path/to/converted/model.gguf
-```
-This will generate a file in the `data` directory named after the model and with
-a `.kld` suffix which contains the tokens and the logits for the wikitext dataset.
-
-After the dataset has been generated, the perplexity evaluation can be run using
-the quantized model:
-```console
-$ make perplexity-run-full QUANTIZED_MODEL=~/path/to/quantized/model-Qxx.gguf LOGITS_FILE=data/model.gguf.ppl
-```
-
-> 📝 **Note:** The `LOGITS_FILE` is the file generated by the previous command; it
-> can be very large, so make sure you have enough disk space available.
-
-## HuggingFace utilities
-The following targets are useful for creating collections and model repositories
-on Hugging Face in the ggml-org. These can be used when preparing a release
-to script the process for new model releases.
-
-For the following targets a `HF_TOKEN` environment variable is required.
-
-> 📝 **Note:** Don't forget to logout from Hugging Face after running these
-> commands, otherwise you might have issues pulling/cloning repositories as
-> the token will still be in use:
-> $ huggingface-cli logout
-> $ unset HF_TOKEN
-
-### Create a new Hugging Face Model (model repository)
-This will create a new model repository on Hugging Face with the specified
-model name.
-```console
-(venv) $ make hf-create-model MODEL_NAME='TestModel' NAMESPACE="danbev"
-Repository ID:  danbev/TestModel-GGUF
-Repository created: https://huggingface.co/danbev/TestModel-GGUF
-```
-Note that we append a `-GGUF` suffix to the model name to ensure a consistent
-naming convention for GGUF models.
-
-### Upload a GGUF model to model repository
-The following target uploads a model to an existing Hugging Face model repository.
-```console
-(venv) $ make hf-upload-gguf-to-model MODEL_PATH=dummy-model1.gguf REPO_ID=danbev/TestModel-GGUF
-📤 Uploading dummy-model1.gguf to danbev/TestModel-GGUF/dummy-model1.gguf
-✅ Upload successful!
-🔗 File available at: https://huggingface.co/danbev/TestModel-GGUF/blob/main/dummy-model1.gguf
-```
-This command can also be used to update an existing model file in a repository.
-
-### Create a new Collection
-```console
-(venv) $ make hf-create-collection NAME=TestCollection DESCRIPTION="Collection for testing scripts" NAMESPACE=danbev
-🚀 Creating Hugging Face Collection
-Title: TestCollection
-Description: Collection for testing scripts
-Namespace: danbev
-Private: False
-✅ Authenticated as: danbev
-📚 Creating collection: 'TestCollection'...
-✅ Collection created successfully!
-📋 Collection slug: danbev/testcollection-68930fcf73eb3fc200b9956d
-🔗 Collection URL: https://huggingface.co/collections/danbev/testcollection-68930fcf73eb3fc200b9956d
-
-🎉 Collection created successfully!
-Use this slug to add models: danbev/testcollection-68930fcf73eb3fc200b9956d
-```
-
-### Add model to a Collection
-```console
-(venv) $ make hf-add-model-to-collection COLLECTION=danbev/testcollection-68930fcf73eb3fc200b9956d MODEL=danbev/TestModel-GGUF
-✅ Authenticated as: danbev
-🔍 Checking if model exists: danbev/TestModel-GGUF
-✅ Model found: danbev/TestModel-GGUF
-📚 Adding model to collection...
-✅ Model added to collection successfully!
-🔗 Collection URL: https://huggingface.co/collections/danbev/testcollection-68930fcf73eb3fc200b9956d
-
-🎉 Model added successfully!
-
-```
--- a/examples/model-conversion/logits.cpp
+++ b/examples/model-conversion/logits.cpp
@ -1,209 +0,0 @@
-#include "llama.h"
-#include <cstdio>
-#include <cstring>
-#include <string>
-#include <vector>
-#include <ctype.h>
-#include <filesystem>
-
-// Print an example command line for this tool; the unnamed int parameter is unused.
-static void print_usage(int, char ** argv) {
-    const char * prog = argv[0];
-    printf("\nexample usage:\n\n    %s -m model.gguf [-ngl n_gpu_layers] -embd-mode [prompt]\n\n", prog);
-}
-
-// Decodes the prompt once and dumps the resulting logits (or, with -embd-mode,
-// the embeddings) to data/llamacpp-<model>[-embeddings].bin and .txt.
-// Arguments: -m <model.gguf> (required), -ngl <n> (optional), -embd-mode
-// (optional flag, takes no value); all remaining arguments form the prompt.
-// Returns 0 on success and 1 on any error.
-int main(int argc, char ** argv) {
-    std::string model_path;
-    std::string prompt = "Hello, my name is";
-    int ngl = 0;
-    bool embedding_mode = false;
-
-    {
-        int i = 1;
-        for (; i < argc; i++) {
-            if (strcmp(argv[i], "-m") == 0) {
-                if (i + 1 < argc) {
-                    model_path = argv[++i];
-                } else {
-                    print_usage(argc, argv);
-                    return 1;
-                }
-            } else if (strcmp(argv[i], "-ngl") == 0) {
-                if (i + 1 < argc) {
-                    try {
-                        ngl = std::stoi(argv[++i]);
-                    } catch (...) {
-                        print_usage(argc, argv);
-                        return 1;
-                    }
-                } else {
-                    print_usage(argc, argv);
-                    return 1;
-                }
-            } else if (strcmp(argv[i], "-embd-mode") == 0) {
-                // boolean flag: consumes no value, so it is also valid as the last argument
-                embedding_mode = true;
-            } else {
-                // prompt starts here
-                break;
-            }
-        }
-
-        if (model_path.empty()) {
-            print_usage(argc, argv);
-            return 1;
-        }
-
-        if (i < argc) {
-            prompt = argv[i++];
-            for (; i < argc; i++) {
-                prompt += " ";
-                prompt += argv[i];
-            }
-        }
-    }
-
-    ggml_backend_load_all();
-    llama_model_params model_params = llama_model_default_params();
-    model_params.n_gpu_layers = ngl;
-
-    llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
-
-    if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
-        return 1;
-    }
-
-    // Extract basename from model_path
-    const char * basename = strrchr(model_path.c_str(), '/');
-    basename = (basename == NULL) ? model_path.c_str() : basename + 1;
-
-    char model_name[256];
-    strncpy(model_name, basename, 255);
-    model_name[255] = '\0';
-
-    char * dot = strrchr(model_name, '.');
-    if (dot != NULL && strcmp(dot, ".gguf") == 0) {
-        *dot = '\0';
-    }
-    printf("Model name: %s\n", model_name);
-
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-    const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true); // sizing call: the negated result is used as the token count
-
-    std::vector<llama_token> prompt_tokens(n_prompt);
-    if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
-        fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
-        return 1;
-    }
-
-    llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.n_ctx = n_prompt;
-    ctx_params.n_batch = n_prompt;
-    ctx_params.no_perf = false;
-    if (embedding_mode) {
-        ctx_params.embeddings = true;
-        ctx_params.n_ubatch = ctx_params.n_batch;
-    }
-
-    llama_context * ctx = llama_init_from_model(model, ctx_params);
-    if (ctx == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
-        return 1;
-    }
-
-    printf("Input prompt: \"%s\"\n", prompt.c_str());
-    printf("Tokenized prompt (%d tokens): ", n_prompt);
-    for (auto id : prompt_tokens) {
-        char buf[128];
-        int n = llama_token_to_piece(vocab, id, buf, sizeof(buf), 0, true);
-        if (n < 0) {
-            fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
-            return 1;
-        }
-        std::string s(buf, n);
-        printf("%s", s.c_str());
-    }
-    printf("\n");
-
-    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
-
-    if (llama_decode(ctx, batch)) {
-        fprintf(stderr, "%s : failed to eval\n", __func__);
-        return 1;
-    }
-
-    float * logits;
-    int n_logits;
-    const char * type;
-
-    if (embedding_mode) {
-        logits = llama_get_embeddings(ctx);
-        n_logits = llama_model_n_embd(model) * batch.n_tokens;
-        type = "-embeddings";
-        printf("Embeddings size: %d\n", n_logits);
-    } else {
-        logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
-        n_logits = llama_vocab_n_tokens(vocab);
-        type = "";
-        printf("Vocab size: %d\n", n_logits);
-    }
-    if (logits == NULL) {
-        fprintf(stderr, "%s: error: failed to get %s\n", __func__, embedding_mode ? "embeddings" : "logits");
-        return 1;
-    }
-
-    std::filesystem::create_directory("data");
-
-    // Save logits to binary file
-    char bin_filename[512];
-    snprintf(bin_filename, sizeof(bin_filename), "data/llamacpp-%s%s.bin", model_name, type);
-    printf("Saving logits to %s\n", bin_filename);
-
-    FILE * f = fopen(bin_filename, "wb");
-    if (f == NULL) {
-        fprintf(stderr, "%s: error: failed to open binary output file\n", __func__);
-        return 1;
-    }
-    fwrite(logits, sizeof(float), n_logits, f);
-    fclose(f);
-
-    // Also save as text for debugging
-    char txt_filename[512];
-    snprintf(txt_filename, sizeof(txt_filename), "data/llamacpp-%s%s.txt", model_name, type);
-    f = fopen(txt_filename, "w");
-    if (f == NULL) {
-        fprintf(stderr, "%s: error: failed to open text output file\n", __func__);
-        return 1;
-    }
-    for (int i = 0; i < n_logits; i++) {
-        fprintf(f, "%d: %.6f\n", i, logits[i]);  // one "index: value" pair per line
-    }
-    fclose(f);
-
-    // Print first and last 10 logits for quick verification
-    printf("First 10 logits: ");
-    for (int i = 0; i < 10 && i < n_logits; i++) {
-        printf("%.6f ", logits[i]);
-    }
-    printf("\n");
-
-    printf("Last 10 logits: ");
-    for (int i = n_logits - 10; i < n_logits; i++) {
-        if (i >= 0) printf("%.6f ", logits[i]);
-    }
-    printf("\n\n");
-
-    printf("Logits saved to %s\n", bin_filename);
-    printf("Logits saved to %s\n", txt_filename);
-
-    llama_free(ctx);
-    llama_model_free(model);
-
-    return 0;
-}
--- a/examples/model-conversion/requirements.txt
+++ b/examples/model-conversion/requirements.txt
@ -1,5 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cpu
-torch~=2.6.0
-torchvision~=0.21.0
-transformers~=4.55.0
-huggingface-hub~=0.34.0
--- a/examples/model-conversion/scripts/causal/compare-embeddings-logits.sh
+++ b/examples/model-conversion/scripts/causal/compare-embeddings-logits.sh
@ -1,43 +0,0 @@
-#!/bin/bash
-# Compare PyTorch and llama.cpp embeddings of a causal model via scripts/utils/semantic_check.py.
-set -e
-
-MODEL_PATH="${1:-"$MODEL_PATH"}"
-MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
-
-if [ -t 0 ]; then  # stdin is a terminal: read the embeddings file from data/
-    CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
-else
-    # Process piped JSON data and convert to binary (matching logits.cpp format)
-    TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX)  # X's kept trailing so every mktemp implementation randomizes them
-    trap 'rm -f "$TEMP_FILE"' EXIT  # registered before the conversion so a failure does not leak the file
-    python3 -c "
-import json
-import sys
-import struct
-
-data = json.load(sys.stdin)
-
-# Flatten all embeddings completely
-flattened = []
-for item in data:
-    embedding = item['embedding']
-    for token_embedding in embedding:
-        flattened.extend(token_embedding)
-
-print(f'Total embedding values: {len(flattened)}', file=sys.stderr)
-
-# Write as binary floats - matches logits.cpp fwrite format
-with open('$TEMP_FILE', 'wb') as f:
-    for value in flattened:
-        f.write(struct.pack('f', value))
-"
-    CPP_EMBEDDINGS="$TEMP_FILE"
-fi
-
-python scripts/utils/semantic_check.py --model-path "$MODEL_PATH" \
-    --python-embeddings "data/pytorch-${MODEL_NAME}-embeddings.bin" \
-    --cpp-embeddings "$CPP_EMBEDDINGS" \
-    --prompt "Hello world today" \
-    --causal
-
--- a/examples/model-conversion/scripts/causal/compare-logits.py
+++ b/examples/model-conversion/scripts/causal/compare-logits.py
@ -1,88 +0,0 @@
-#!/usr/bin/env python3
-
-import numpy as np
-import sys
-import os
-from pathlib import Path
-
-def quick_logits_check(pytorch_file, llamacpp_file):
-    """Lightweight sanity check before NMSE"""
-
-    try:
-        pytorch_logits = np.fromfile(pytorch_file, dtype=np.float32)
-        llamacpp_logits = np.fromfile(llamacpp_file, dtype=np.float32)
-    except Exception as e:
-        print(f"❌ NOK: Failed to load files - {e}")
-        return False
-
-    # Check shapes match
-    if pytorch_logits.shape != llamacpp_logits.shape:
-        print(f"❌ NOK: Shape mismatch - PyTorch: {pytorch_logits.shape}, llama.cpp: {llamacpp_logits.shape}")
-        return False
-
-    # Calculate key metrics
-    diff = pytorch_logits - llamacpp_logits
-    abs_diff = np.abs(diff)
-    max_diff = np.max(abs_diff)
-
-    # Get top 10 predictions from both models
-    pytorch_top10 = np.argsort(pytorch_logits)[-10:][::-1]
-    llamacpp_top10 = np.argsort(llamacpp_logits)[-10:][::-1]
-    print(f"Top 10 PyTorch logits: {pytorch_logits[pytorch_top10]}")
-    print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}")
-    print(f"Max absolute difference: {max_diff:.4f}")
-
-    if max_diff > 1.0:
-        print(f"❌ NOK: Large differences detected - max diff: {max_diff:.4f}")
-        return False
-
-    return True
-
-def main():
-    model_path = os.getenv('MODEL_PATH')
-    if not model_path:
-        print("Error: MODEL_PATH environment variable not set")
-        sys.exit(1)
-
-    if not os.path.exists(model_path):
-        print(f"Error: Model file not found: {model_path}")
-        sys.exit(1)
-
-    model_name = os.path.splitext(os.path.basename(model_path))[0]
-    data_dir = Path("data")
-
-    pytorch_file = data_dir / f"pytorch-{model_name}.bin"
-    llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"
-
-    if not pytorch_file.exists():
-        print(f"Error: PyTorch logits file not found: {pytorch_file}")
-        print("Please run scripts/run-org-model.sh first to generate this file.")
-        sys.exit(1)
-
-    if not llamacpp_file.exists():
-        print(f"Error: llama.cpp logits file not found: {llamacpp_file}")
-        print("Please run scripts/run-converted-model.sh first to generate this file.")
-        sys.exit(1)
-
-    print("Checked all required files were found. Proceeding...\n")
-
-
-    print("🔍 GGML Model Validation for model ", model_name)
-    print("=" * 40)
-    print(f"PyTorch logits  : {pytorch_file}")
-    print(f"llama.cpp logits: {llamacpp_file}")
-    print()
-
-    success = quick_logits_check(pytorch_file, llamacpp_file)
-
-    # Exit with appropriate code
-    if success:
-        print("✅ OK: Lightweight model check successful!")
-        print("       Ok to proceed with NMSE check...")
-        sys.exit(0)
-    else:
-        print(f"❌ NOK: Top 10 predictions don't match - generation will differ")
-        sys.exit(1)
-
-if __name__ == "__main__":
-    main()
--- a/examples/model-conversion/scripts/causal/convert-model.sh
+++ b/examples/model-conversion/scripts/causal/convert-model.sh
@ -1,22 +0,0 @@
-#!/bin/bash
-
-MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
-OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
-TYPE="${OUTTYPE:-f16}"
-METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
-CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
-
-echo "Model path: ${MODEL_PATH}"
-echo "Model name: ${MODEL_NAME}"
-echo "Data  type: ${TYPE}"
-echo "Converted model path:: ${CONVERTED_MODEL}"
-echo "Metadata override: ${METADATA_OVERRIDE}"
-python ../../convert_hf_to_gguf.py --verbose \
-    ${MODEL_PATH} \
-    --outfile ${CONVERTED_MODEL} \
-    --outtype ${TYPE} \
-    --metadata "${METADATA_OVERRIDE}"
-
-echo ""
-echo "The environment variable CONVERTED_MODEL can be set to this path using:"
-echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})"
--- a/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.sh
+++ b/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.sh
@ -1,113 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import os
-import importlib
-import sys
-import torch
-import numpy as np
-
-from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForCausalLM
-from pathlib import Path
-
-unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
-
-parser = argparse.ArgumentParser(description='Process model with specified path')
-parser.add_argument('--model-path', '-m', help='Path to the model')
-args = parser.parse_args()
-
-model_path = os.environ.get('MODEL_PATH', args.model_path)
-if model_path is None:
-    parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
-
-config = AutoConfig.from_pretrained(model_path)
-
-print("Model type:       ", config.model_type)
-print("Vocab size:       ", config.vocab_size)
-print("Hidden size:      ", config.hidden_size)
-print("Number of layers: ", config.num_hidden_layers)
-print("BOS token id:     ", config.bos_token_id)
-print("EOS token id:     ", config.eos_token_id)
-
-print("Loading model and tokenizer using AutoTokenizer:", model_path)
-tokenizer = AutoTokenizer.from_pretrained(model_path)
-
-if unreleased_model_name:
-    model_name_lower = unreleased_model_name.lower()
-    unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
-    class_name = f"{unreleased_model_name}ForCausalLM"
-    print(f"Importing unreleased model module: {unreleased_module_path}")
-
-    try:
-        model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
-        model = model_class.from_pretrained(model_path)
-    except (ImportError, AttributeError) as e:
-        print(f"Failed to import or load model: {e}")
-else:
-    model = AutoModelForCausalLM.from_pretrained(model_path)
-print(f"Model class: {type(model)}")
-#print(f"Model file: {type(model).__module__}")
-
-model_name = os.path.basename(model_path)
-print(f"Model name: {model_name}")
-
-prompt = "Hello world today"
-input_ids = tokenizer(prompt, return_tensors="pt").input_ids
-print(f"Input tokens: {input_ids}")
-print(f"Input text: {repr(prompt)}")
-print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
-
-with torch.no_grad():
-    outputs = model(input_ids, output_hidden_states=True)
-
-    # Extract hidden states from the last layer
-    # outputs.hidden_states is a tuple of (num_layers + 1) tensors
-    # Index -1 gets the last layer, shape: [batch_size, seq_len, hidden_size]
-    last_hidden_states = outputs.hidden_states[-1]
-
-    # Get embeddings for all tokens
-    token_embeddings = last_hidden_states[0].cpu().numpy()  # Remove batch dimension
-
-    print(f"Hidden states shape: {last_hidden_states.shape}")
-    print(f"Token embeddings shape: {token_embeddings.shape}")
-    print(f"Hidden dimension: {token_embeddings.shape[-1]}")
-    print(f"Number of tokens: {token_embeddings.shape[0]}")
-
-    # Save raw token embeddings
-    data_dir = Path("data")
-    data_dir.mkdir(exist_ok=True)
-    bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
-    txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
-
-    # Save all token embeddings as binary
-    print(token_embeddings)
-    token_embeddings.astype(np.float32).tofile(bin_filename)
-
-    # Save as text for inspection
-    with open(txt_filename, "w") as f:
-        for i, embedding in enumerate(token_embeddings):
-            for j, val in enumerate(embedding):
-                f.write(f"{i} {j} {val:.6f}\n")
-
-    # Print embeddings per token in the requested format
-    print("\nToken embeddings:")
-    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
-    for i, embedding in enumerate(token_embeddings):
-        # Format: show first few values, ..., then last few values
-        if len(embedding) > 10:
-            # Show first 3 and last 3 values with ... in between
-            first_vals = " ".join(f"{val:8.6f}" for val in embedding[:3])
-            last_vals = " ".join(f"{val:8.6f}" for val in embedding[-3:])
-            print(f"embedding {i}: {first_vals}  ... {last_vals}")
-        else:
-            # If embedding is short, show all values
-            vals = " ".join(f"{val:8.6f}" for val in embedding)
-            print(f"embedding {i}: {vals}")
-
-    # Also show token info for reference
-    print(f"\nToken reference:")
-    for i, token in enumerate(tokens):
-        print(f"  Token {i}: {repr(token)}")
-
-    print(f"Saved bin logits to: {bin_filename}")
-    print(f"Saved txt logist to: {txt_filename}")
--- a/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh
+++ b/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh
@ -1,18 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# First try command line argument, then environment variable, then file
-CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
-
-# Final check if we have a model path
-if [ -z "$CONVERTED_MODEL" ]; then
-    echo "Error: Model path must be provided either as:" >&2
-    echo "  1. Command line argument" >&2
-    echo "  2. CONVERTED_MODEL environment variable" >&2
-    exit 1
-fi
-
-cmake --build ../../build --target llama-logits -j8
-
-../../build/bin/llama-logits -m $CONVERTED_MODEL -embd-mode "Hello world today"
--- a/examples/model-conversion/scripts/causal/run-converted-model.sh
+++ b/examples/model-conversion/scripts/causal/run-converted-model.sh
@ -1,20 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# First try command line argument, then environment variable, then file
-CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
-
-# Final check if we have a model path
-if [ -z "$CONVERTED_MODEL" ]; then
-    echo "Error: Model path must be provided either as:" >&2
-    echo "  1. Command line argument" >&2
-    echo "  2. CONVERTED_MODEL environment variable" >&2
-    exit 1
-fi
-
-echo $CONVERTED_MODEL
-
-cmake --build ../../build --target llama-logits -j8
-
-../../build/bin/llama-logits -m "$CONVERTED_MODEL" "Hello, my name is"
--- a/examples/model-conversion/scripts/causal/run-org-model.py
+++ b/examples/model-conversion/scripts/causal/run-org-model.py
@ -1,100 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import os
-import importlib
-from pathlib import Path
-
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
-import torch
-import numpy as np
-
-unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
-
-parser = argparse.ArgumentParser(description='Process model with specified path')
-parser.add_argument('--model-path', '-m', help='Path to the model')
-args = parser.parse_args()
-
-model_path = os.environ.get('MODEL_PATH', args.model_path)
-if model_path is None:
-    parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
-
-config = AutoConfig.from_pretrained(model_path)
-
-print("Model type:       ", config.model_type)
-print("Vocab size:       ", config.vocab_size)
-print("Hidden size:      ", config.hidden_size)
-print("Number of layers: ", config.num_hidden_layers)
-print("BOS token id:     ", config.bos_token_id)
-print("EOS token id:     ", config.eos_token_id)
-
-print("Loading model and tokenizer using AutoTokenizer:", model_path)
-tokenizer = AutoTokenizer.from_pretrained(model_path)
-config = AutoConfig.from_pretrained(model_path)
-
-if unreleased_model_name:
-    model_name_lower = unreleased_model_name.lower()
-    unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
-    class_name = f"{unreleased_model_name}ForCausalLM"
-    print(f"Importing unreleased model module: {unreleased_module_path}")
-
-    try:
-        model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
-        model = model_class.from_pretrained(model_path)  # Note: from_pretrained, not fromPretrained
-    except (ImportError, AttributeError) as e:
-        print(f"Failed to import or load model: {e}")
-        exit(1)
-else:
-    model = AutoModelForCausalLM.from_pretrained(model_path)
-
-model_name = os.path.basename(model_path)
-# Printing the Model class to allow for easier debugging. This can be useful
-# when working with models that have not been publicly released yet and this
-# migth require that the concrete class is imported and used directly instead
-# of using AutoModelForCausalLM.
-print(f"Model class: {model.__class__.__name__}")
-
-prompt = "Hello, my name is"
-input_ids = tokenizer(prompt, return_tensors="pt").input_ids
-
-print(f"Input tokens: {input_ids}")
-print(f"Input text: {repr(prompt)}")
-print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
-
-with torch.no_grad():
-    outputs = model(input_ids)
-    logits = outputs.logits
-
-    # Extract logits for the last token (next token prediction)
-    last_logits = logits[0, -1, :].cpu().numpy()
-
-    print(f"Logits shape: {logits.shape}")
-    print(f"Last token logits shape: {last_logits.shape}")
-    print(f"Vocab size: {len(last_logits)}")
-
-    data_dir = Path("data")
-    data_dir.mkdir(exist_ok=True)
-    bin_filename = data_dir / f"pytorch-{model_name}.bin"
-    txt_filename = data_dir / f"pytorch-{model_name}.txt"
-
-    # Save to file for comparison
-    last_logits.astype(np.float32).tofile(bin_filename)
-
-    # Also save as text file for easy inspection
-    with open(txt_filename, "w") as f:
-        for i, logit in enumerate(last_logits):
-            f.write(f"{i}: {logit:.6f}\n")
-
-    # Print some sample logits for quick verification
-    print(f"First 10 logits: {last_logits[:10]}")
-    print(f"Last 10 logits: {last_logits[-10:]}")
-
-    # Show top 5 predicted tokens
-    top_indices = np.argsort(last_logits)[-5:][::-1]
-    print("Top 5 predictions:")
-    for idx in top_indices:
-        token = tokenizer.decode([idx])
-        print(f"  Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
-
-    print(f"Saved bin logits to: {bin_filename}")
-    print(f"Saved txt logist to: {txt_filename}")
--- a/examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh
+++ b/examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh
@ -1,42 +0,0 @@
-#/bin/bash
-
-set -e
-
-MODEL_PATH="${1:-"$EMBEDDING_MODEL_PATH"}"
-MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
-
-if [ -t 0 ]; then
-    CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
-else
-    # Process piped JSON data and convert to binary (matching logits.cpp format)
-    TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)
-    python3 -c "
-import json
-import sys
-import struct
-
-data = json.load(sys.stdin)
-
-# Flatten all embeddings completely
-flattened = []
-for item in data:
-    embedding = item['embedding']
-    for token_embedding in embedding:
-        flattened.extend(token_embedding)
-
-print(f'Total embedding values: {len(flattened)}', file=sys.stderr)
-
-# Write as binary floats - matches logitc.cpp fwrite format
-with open('$TEMP_FILE', 'wb') as f:
-    for value in flattened:
-        f.write(struct.pack('f', value))
-"
-    CPP_EMBEDDINGS="$TEMP_FILE"
-    trap "rm -f $TEMP_FILE" EXIT
-fi
-
-python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
-    --python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \
-    --cpp-embeddings $CPP_EMBEDDINGS \
-    --prompt "Hello world today"
-
--- a/examples/model-conversion/scripts/embedding/convert-model.sh
+++ b/examples/model-conversion/scripts/embedding/convert-model.sh
@ -1,22 +0,0 @@
-#!/bin/bash
-
-set -e
-
-MODEL_NAME="${MODEL_NAME:-$(basename "$EMBEDDING_MODEL_PATH")}"
-OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
-TYPE="${OUTTYPE:-f16}"
-METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
-CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
-
-echo "Model path: ${EMBEDDING_MODEL_PATH}"
-echo "Model name: ${MODEL_NAME}"
-echo "Data  type: ${TYPE}"
-echo "Converted model path:: ${CONVERTED_MODEL}"
-python ../../convert_hf_to_gguf.py --verbose \
-    ${EMBEDDING_MODEL_PATH} \
-    --outfile ${CONVERTED_MODEL} \
-    --outtype ${TYPE}
-
-echo ""
-echo "The environment variable CONVERTED_EMBEDDING MODEL can be set to this path using:"
-echo "export CONVERTED_EMBEDDING_MODEL=$(realpath ${CONVERTED_MODEL})"
--- a/examples/model-conversion/scripts/embedding/run-converted-model.sh
+++ b/examples/model-conversion/scripts/embedding/run-converted-model.sh
@ -1,20 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# First try command line argument, then environment variable, then file
-CONVERTED_MODEL="${1:-"$CONVERTED_EMBEDDING_MODEL"}"
-
-# Final check if we have a model path
-if [ -z "$CONVERTED_MODEL" ]; then
-    echo "Error: Model path must be provided either as:" >&2
-    echo "  1. Command line argument" >&2
-    echo "  2. CONVERTED_EMBEDDING_MODEL environment variable" >&2
-    exit 1
-fi
-
-echo $CONVERTED_MODEL
-
-cmake --build ../../build --target llama-logits -j8
-
-../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "Hello world today"
--- a/examples/model-conversion/scripts/embedding/run-original-model.py
+++ b/examples/model-conversion/scripts/embedding/run-original-model.py
@ -1,116 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import os
-import numpy as np
-import importlib
-from pathlib import Path
-
-from transformers import AutoTokenizer, AutoConfig, AutoModel
-import torch
-
-unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
-
-parser = argparse.ArgumentParser(description='Process model with specified path')
-parser.add_argument('--model-path', '-m', help='Path to the model')
-args = parser.parse_args()
-
-model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
-if model_path is None:
-    parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")
-
-tokenizer = AutoTokenizer.from_pretrained(model_path)
-
-if unreleased_model_name:
-    model_name_lower = unreleased_model_name.lower()
-    unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
-    class_name = f"{unreleased_model_name}Model"
-    print(f"Importing unreleased model module: {unreleased_module_path}")
-
-    try:
-        model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
-        model = model_class.from_pretrained(model_path)  # Note: from_pretrained, not fromPretrained
-    except (ImportError, AttributeError) as e:
-        print(f"Failed to import or load model: {e}")
-        exit(1)
-else:
-    model = AutoModel.from_pretrained(model_path)
-print(f"Model class: {type(model)}")
-#print(f"Model file: {type(model).__module__}")
-config = AutoConfig.from_pretrained(model_path)
-
-model_name = os.path.basename(model_path)
-
-texts = [ "Hello world today" ]
-
-encoded = tokenizer(
-    texts,
-    padding=True,
-    truncation=True,
-    return_tensors="pt"
-)
-
-tokens = encoded['input_ids'][0]
-token_strings = tokenizer.convert_ids_to_tokens(tokens)
-for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
-    print(f"{token_id:6d} -> '{token_str}'")
-
-with torch.no_grad():
-    outputs = model(**encoded)
-    hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_size]
-
-    # Extract embeddings for each token (matching LLAMA_POOLING_TYPE_NONE behavior)
-    all_embeddings = hidden_states[0].cpu().numpy()  # Shape: [seq_len, hidden_size]
-
-    print(f"Hidden states shape: {hidden_states.shape}")
-    print(f"All embeddings shape: {all_embeddings.shape}")
-    print(f"Embedding dimension: {all_embeddings.shape[1]}")
-
-    # Print embeddings exactly like embedding.cpp does for LLAMA_POOLING_TYPE_NONE
-    n_embd = all_embeddings.shape[1]
-    n_embd_count = all_embeddings.shape[0]
-
-    print()  # Empty line to match C++ output
-
-    for j in range(n_embd_count):
-        embedding = all_embeddings[j]
-        print(f"embedding {j}: ", end="")
-
-        # Print first 3 values
-        for i in range(min(3, n_embd)):
-            print(f"{embedding[i]:9.6f} ", end="")
-
-        print(" ... ", end="")
-
-        # Print last 3 values
-        for i in range(n_embd - 3, n_embd):
-            print(f"{embedding[i]:9.6f} ", end="")
-
-        print()  # New line
-
-    print()  # Final empty line to match C++ output
-
-    data_dir = Path("data")
-    data_dir.mkdir(exist_ok=True)
-    bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
-    txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
-
-    # Save all embeddings flattened (matching what embedding.cpp would save if it did)
-    flattened_embeddings = all_embeddings.flatten()
-    flattened_embeddings.astype(np.float32).tofile(bin_filename)
-
-    with open(txt_filename, "w") as f:
-        f.write(f"# Model class: {model_name}\n")
-        f.write(f"# Tokens: {token_strings}\n")
-        f.write(f"# Shape: {all_embeddings.shape}\n")
-        f.write(f"# n_embd_count: {n_embd_count}, n_embd: {n_embd}\n\n")
-
-        for j in range(n_embd_count):
-            f.write(f"# Token {j} ({token_strings[j]}):\n")
-            for i, value in enumerate(all_embeddings[j]):
-                f.write(f"{j}_{i}: {value:.6f}\n")
-            f.write("\n")
-    print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} tokens × {n_embd} dimensions)")
-    print("")
-    print(f"Saved bin embeddings to: {bin_filename}")
-    print(f"Saved txt embeddings to: {txt_filename}")
--- a/examples/model-conversion/scripts/readme.md.template
+++ b/examples/model-conversion/scripts/readme.md.template
@ -1,13 +0,0 @@
---
-base_model:
- {base_model}
---
-# {model_name} GGUF
-
-Recommended way to run this model:
-
-```sh
-llama-server -hf {namespace}/{model_name}-GGUF -c 0 -fa
-```
-
-Then, access http://localhost:8080
--- a/examples/model-conversion/scripts/utils/check-nmse.py
+++ b/examples/model-conversion/scripts/utils/check-nmse.py
@ -1,174 +0,0 @@
-#!/usr/bin/env python3
-
-import numpy as np
-import sys
-import os
-import argparse
-from pathlib import Path
-
-def calculate_nmse(reference, test):
-    mse = np.mean((test - reference) ** 2)
-    ref_var = np.var(reference)
-    if ref_var == 0:
-        nmse = float('inf') if mse > 0 else 0.0
-        return mse, mse, ref_var
-
-    nmse = mse / ref_var
-
-    return nmse, mse, ref_var
-
-def load_logits(file_path):
-    if not os.path.exists(file_path):
-        raise FileNotFoundError(f"File not found: {file_path}")
-
-    if file_path.suffix == '.npy':
-        return np.load(file_path)
-    elif file_path.suffix == '.bin':
-        return np.fromfile(file_path, dtype=np.float32)
-    else:
-        # Try to load as text file
-        try:
-            # If it has index format "0: value", extract just values
-            data = []
-            with open(file_path, 'r') as f:
-                for line in f:
-                    if ':' in line:
-                        # Format: "index: value"
-                        value = float(line.split(':')[1].strip())
-                    else:
-                        # Just the value
-                        value = float(line.strip())
-                    data.append(value)
-            return np.array(data, dtype=np.float32)
-        except:
-            return np.loadtxt(file_path, dtype=np.float32)
-
-def interpret_nmse(nmse):
-    """Provide interpretation of NMSE value"""
-    if nmse == 0:
-        return "Perfect match", "🎉"
-    elif nmse < 1e-6:
-        return "Essentially identical", "✅"
-    elif nmse < 1e-4:
-        return "Excellent match", "✅"
-    elif nmse < 1e-3:
-        return "Very good match", "👍"
-    elif nmse < 1e-2:
-        return "Good match", "👍"
-    elif nmse < 0.1:
-        return "Acceptable match", "⚠️"
-    elif nmse < 1.0:
-        return "Poor match", "❌"
-    else:
-        return "Very poor match (worse than noise)", "❌"
-
-def main():
-    parser = argparse.ArgumentParser(description='Validate model logits')
-    parser.add_argument('-m', '--model-path', required=True,  help='Path to the model directory')
-    args = parser.parse_args()
-
-    model_name = os.path.splitext(os.path.basename(args.model_path))[0]
-    data_dir = Path("data")
-
-    pytorch_file = data_dir / f"pytorch-{model_name}.bin"
-    llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"
-
-    print(f"Model name: {model_name}")
-    print(f"PyTorch logits file: {pytorch_file}")
-    print(f"llama.cpp logits file: {llamacpp_file}")
-
-    reference_file = pytorch_file
-    test_file = llamacpp_file
-
-    print("📊 NMSE Check for Model Comparison")
-    print("=" * 50)
-    print(f"Reference (ground truth): {reference_file}")
-    print(f"Test (to evaluate):       {test_file}")
-    print()
-
-    try:
-        print("Loading reference logits...")
-        reference = load_logits(reference_file)
-        print(f"  Shape: {reference.shape}, Type: {reference.dtype}")
-
-        print("Loading test logits...")
-        test = load_logits(test_file)
-        print(f"  Shape: {test.shape}, Type: {test.dtype}")
-
-        # Check shapes match
-        if reference.shape != test.shape:
-            print(f"\n❌ Error: Shape mismatch!")
-            print(f"  Reference: {reference.shape}")
-            print(f"  Test: {test.shape}")
-            sys.exit(1)
-
-        print(f"\n✅ Shapes match: {reference.shape}")
-
-        nmse, mse, ref_var = calculate_nmse(reference, test)
-
-        # Additional metrics
-        max_abs_error = np.max(np.abs(test - reference))
-        mean_abs_error = np.mean(np.abs(test - reference))
-
-        # Results
-        print(f"\n📈 METRICS")
-        print("=" * 30)
-        print(f"MSE (Mean Squared Error):     {mse:.6e}")
-        print(f"Reference Variance:           {ref_var:.6e}")
-        print(f"NMSE:                         {nmse:.6e}")
-        print(f"Max Absolute Error:           {max_abs_error:.6f}")
-        print(f"Mean Absolute Error:          {mean_abs_error:.6f}")
-
-        # NMSE in dB (common in signal processing)
-        if nmse > 0:
-            nmse_db = 10 * np.log10(nmse)
-            print(f"NMSE (dB):                    {nmse_db:.2f} dB")
-
-        # Interpretation
-        interpretation, emoji = interpret_nmse(nmse)
-        print(f"\n🎯 INTERPRETATION")
-        print("=" * 30)
-        print(f"{emoji} {interpretation}")
-
-        # Detailed guidance
-        print(f"\n📋 GUIDANCE")
-        print("=" * 30)
-        if nmse < 1e-3:
-            print("✅ EXCELLENT: Your GGML conversion is working very well!")
-            print("   The differences are negligible for practical use.")
-        elif nmse < 1e-2:
-            print("👍 GOOD: Your GGML conversion is working well.")
-            print("   Small differences are likely due to precision/quantization.")
-        elif nmse < 0.1:
-            print("⚠️  ACCEPTABLE: Conversion is working but with some differences.")
-            print("   Check if you're using quantization (Q4, Q8, etc.)")
-            print("   Test generation quality to see if it's acceptable.")
-        else:
-            print("❌ PROBLEMATIC: Large differences detected.")
-            print("   Check your conversion process for potential issues.")
-            print("   Verify you're using the same model weights.")
-
-        # NMSE benchmarks
-        print(f"\n📚 NMSE BENCHMARKS")
-        print("=" * 30)
-        print("< 1e-6:  Essentially identical")
-        print("< 1e-4:  Excellent (typical for good conversions)")
-        print("< 1e-3:  Very good")
-        print("< 1e-2:  Good (acceptable for most use cases)")
-        print("< 0.1:   Acceptable (may need verification)")
-        print("> 1.0:   Poor (worse than random)")
-
-        # Exit code based on NMSE
-        if nmse < 1e-2:
-            print(f"\n✅ RESULT: PASS (NMSE = {nmse:.2e})")
-            sys.exit(0)
-        else:
-            print(f"\n❌ RESULT: NEEDS REVIEW (NMSE = {nmse:.2e})")
-            sys.exit(1)
-
-    except Exception as e:
-        print(f"❌ Error: {e}")
-        sys.exit(1)
-
-if __name__ == "__main__":
-    main()
--- a/examples/model-conversion/scripts/utils/create-collection-add-model.sh
+++ b/examples/model-conversion/scripts/utils/create-collection-add-model.sh
@ -1,6 +0,0 @@
-
-COLLECTION_SLUG=$(python ./create_collection.py --return-slug)
-echo "Created collection: $COLLECTION_SLUG"
-
-# Use it in the next command
-python add_model_to_collection.py "$COLLECTION_SLUG" "username/my-model"
--- a/examples/model-conversion/scripts/utils/hf-add-model-to-collection.py
+++ b/examples/model-conversion/scripts/utils/hf-add-model-to-collection.py
@ -1,80 +0,0 @@
-#!/usr/bin/env python3
-
-from huggingface_hub import HfApi
-import argparse
-import sys
-
-def add_model_to_collection(collection_slug, model_id, note=""):
-    """
-    Add a model to an existing collection
-
-    Args:
-        collection_slug: The slug of the collection (e.g., "username/collection-name-12345")
-        model_id: The model repository ID (e.g., "username/model-name")
-        note: Optional note about the model
-
-    Returns:
-        True if successful, False if failed
-    """
-
-    # Initialize API
-    api = HfApi()
-
-    try:
-        user_info = api.whoami()
-        print(f"✅ Authenticated as: {user_info['name']}")
-
-        # Verify the model exists
-        print(f"🔍 Checking if model exists: {model_id}")
-        try:
-            model_info = api.model_info(model_id)
-        except Exception as e:
-            print(f"❌ Model not found or not accessible: {model_id}")
-            print(f"Error: {e}")
-            return False
-
-        print(f"📚 Adding model to collection...")
-        api.add_collection_item(
-            collection_slug=collection_slug,
-            item_id=model_id,
-            item_type="model",
-            note=note
-        )
-
-        print(f"✅ Model added to collection successfully!")
-        print(f"🔗 Collection URL: https://huggingface.co/collections/{collection_slug}")
-
-        return True
-
-    except Exception as e:
-        print(f"❌ Error adding model to collection: {e}")
-        return False
-
-def main():
-    # This script requires that the environment variable HF_TOKEN is set with your
-    # Hugging Face API token.
-    api = HfApi()
-
-    parser = argparse.ArgumentParser(description='Add model to a Huggingface Collection')
-    parser.add_argument('--collection', '-c', help='The collection slug username/collection-hash', required=True)
-    parser.add_argument('--model', '-m', help='The model to add to the Collection', required=True)
-    parser.add_argument('--note', '-n', help='An optional note/description', required=False)
-    args = parser.parse_args()
-
-    collection = args.collection
-    model = args.model
-    note = args.note
-
-    success = add_model_to_collection(
-        collection_slug=collection,
-        model_id=model,
-        note=note
-    )
-
-    if success:
-        print("\n🎉 Model added successfully!")
-    else:
-        print("\n❌ Failed to add model to collection")
-        sys.exit(1)
-if __name__ == "__main__":
-    main()
--- a/examples/model-conversion/scripts/utils/hf-create-collection.py
+++ b/examples/model-conversion/scripts/utils/hf-create-collection.py
@ -1,106 +0,0 @@
-#!/usr/bin/env python3
-
-from huggingface_hub import HfApi
-import argparse
-import os
-import sys
-
-
-def create_collection(title, description, private=False, namespace=None, return_slug=False):
-    """
-    Create a new collection on Hugging Face
-
-    Args:
-        title: Collection title
-        description: Collection description
-        private: Whether the collection should be private (default: False)
-        namespace: Optional namespace (defaults to your username)
-
-    Returns:
-        Collection object if successful, None if failed
-    """
-
-    # Check if HF_TOKEN is available
-    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
-    if not token:
-        print("❌ No HF_TOKEN or HUGGINGFACE_HUB_TOKEN found in environment variables")
-        print("Please set your Hugging Face token as an environment variable")
-        return None
-
-    # Initialize API
-    api = HfApi()
-
-    try:
-        # Test authentication first
-        user_info = api.whoami()
-        if not return_slug:
-            print(f"✅ Authenticated as: {user_info['name']}")
-
-        # Create the collection
-        if not return_slug:
-            print(f"📚 Creating collection: '{title}'...")
-        collection = api.create_collection(
-            title=title,
-            description=description,
-            private=private,
-            namespace=namespace
-        )
-
-        if not return_slug:
-            print(f"✅ Collection created successfully!")
-            print(f"📋 Collection slug: {collection.slug}")
-            print(f"🔗 Collection URL: https://huggingface.co/collections/{collection.slug}")
-
-        return collection
-
-    except Exception as e:
-        print(f"❌ Error creating collection: {e}")
-        return None
-
-def main():
-    # This script requires that the environment variable HF_TOKEN is set with your
-    # Hugging Face API token.
-    api = HfApi()
-
-    parser = argparse.ArgumentParser(description='Create a Huggingface Collection')
-    parser.add_argument('--name', '-n', help='The name/title of the Collection', required=True)
-    parser.add_argument('--description', '-d', help='The description for the Collection', required=True)
-    parser.add_argument('--namespace', '-ns', help='The namespace to add the Collection to', required=True)
-    parser.add_argument('--private', '-p', help='Create a private Collection', action='store_true')  # Fixed
-    parser.add_argument('--return-slug', '-s', help='Only output the collection slug', action='store_true')  # Fixed
-
-    args = parser.parse_args()
-
-    name = args.name
-    description = args.description
-    private = args.private
-    namespace = args.namespace
-    return_slug = args.return_slug
-
-    if not return_slug:
-        print("🚀 Creating Hugging Face Collection")
-        print(f"Title: {name}")
-        print(f"Description: {description}")
-        print(f"Namespace: {namespace}")
-        print(f"Private: {private}")
-
-    collection = create_collection(
-        title=name,
-        description=description,
-        private=private,
-        namespace=namespace,
-        return_slug=return_slug
-    )
-
-    if collection:
-        if return_slug:
-            print(collection.slug)
-        else:
-            print("\n🎉 Collection created successfully!")
-            print(f"Use this slug to add models: {collection.slug}")
-    else:
-        print("\n❌ Failed to create collection")
-        sys.exit(1)
-
-if __name__ == "__main__":
-    main()
--- a/examples/model-conversion/scripts/utils/hf-create-model.py
+++ b/examples/model-conversion/scripts/utils/hf-create-model.py
@ -1,63 +0,0 @@
-#!/usr/bin/env python3
-
-from huggingface_hub import HfApi
-import argparse
-
-# This script requires that the environment variable HF_TOKEN is set with your
-# Hugging Face API token.
-api = HfApi()
-
-def load_template_and_substitute(template_path, **kwargs):
-    try:
-        with open(template_path, 'r', encoding='utf-8') as f:
-            template_content = f.read()
-
-        return template_content.format(**kwargs)
-    except FileNotFoundError:
-        print(f"Template file '{template_path}' not found!")
-        return None
-    except KeyError as e:
-        print(f"Missing template variable: {e}")
-        return None
-
-# Command-line interface: model name and namespace are mandatory, the rest
-# are optional switches.
-parser = argparse.ArgumentParser(description='Create a new Hugging Face model repository')
-parser.add_argument('--model-name', '-m', help='Name for the model', required=True)
-parser.add_argument('--namespace', '-ns', help='Namespace to add the model to', required=True)
-parser.add_argument('--org-base-model', '-b', help='Original Base model name', default="")
-parser.add_argument('--no-card', action='store_true', help='Skip creating model card')
-parser.add_argument('--private', '-p', action='store_true', help='Create private model')
-
-args = parser.parse_args()
-
-# The repository name always gets a "-GGUF" suffix appended.
-repo_id = f"{args.namespace}/{args.model_name}-GGUF"
-print("Repository ID: ", repo_id)
-
-# NOTE(review): exist_ok=False presumably makes this call fail when the
-# repository already exists -- confirm against the huggingface_hub docs.
-repo_url = api.create_repo(
-    repo_id=repo_id,
-    repo_type="model",
-    private=args.private,
-    exist_ok=False
-)
-
-if not args.no_card:
-    # The template path is relative to the current working directory.
-    template_path = "scripts/readme.md.template"
-    model_card_content = load_template_and_substitute(
-        template_path,
-        model_name=args.model_name,
-        namespace=args.namespace,
-        base_model=args.org_base_model,
-    )
-
-    # load_template_and_substitute() returns None on failure; the repository
-    # is kept and only the card upload is skipped.
-    if model_card_content:
-        api.upload_file(
-            path_or_fileobj=model_card_content.encode('utf-8'),
-            path_in_repo="README.md",
-            repo_id=repo_id
-        )
-        print("Model card created successfully.")
-    else:
-        print("Failed to create model card.")
-
-print(f"Repository created: {repo_url}")
-
-
--- a/examples/model-conversion/scripts/utils/hf-upload-gguf-model.py
+++ b/examples/model-conversion/scripts/utils/hf-upload-gguf-model.py
@ -1,58 +0,0 @@
-#!/usr/bin/env python3
-
-from huggingface_hub import HfApi
-import argparse
-import os
-
-def upload_gguf_file(local_file_path, repo_id, filename_in_repo=None):
-    """
-    Upload a GGUF file to a Hugging Face model repository
-
-    Args:
-        local_file_path: Path to your local GGUF file
-        repo_id: Your repository ID (e.g., "username/model-name")
-        filename_in_repo: Optional custom name for the file in the repo
-    """
-
-    if not os.path.exists(local_file_path):
-        print(f"❌ File not found: {local_file_path}")
-        return False
-
-    if filename_in_repo is None:
-        filename_in_repo = os.path.basename(local_file_path)
-
-    if filename_in_repo is None or filename_in_repo == "":
-        filename_in_repo = os.path.basename(local_file_path)
-
-    print(f"📤 Uploading {local_file_path} to {repo_id}/{filename_in_repo}")
-
-    api = HfApi()
-
-    try:
-        api.upload_file(
-            path_or_fileobj=local_file_path,
-            path_in_repo=filename_in_repo,
-            repo_id=repo_id,
-            repo_type="model",
-            commit_message=f"Upload {filename_in_repo}"
-        )
-
-        print("✅ Upload successful!")
-        print(f"🔗 File available at: https://huggingface.co/{repo_id}/blob/main/{filename_in_repo}")
-        return True
-
-    except Exception as e:
-        print(f"❌ Upload failed: {e}")
-        return False
-
-# This script requires that the environment variable HF_TOKEN is set with your
-# Hugging Face API token.
-api = HfApi()
-
-parser = argparse.ArgumentParser(description='Upload a GGUF model to a Huggingface model repository')
-parser.add_argument('--gguf-model-path', '-m', help='The GGUF model file to upload', required=True)
-parser.add_argument('--repo-id', '-r', help='The repository to upload to', required=True)
-parser.add_argument('--name', '-o', help='The name in the model repository', required=False)
-args = parser.parse_args()
-
-upload_gguf_file(args.gguf_model_path, args.repo_id, args.name)
--- a/examples/model-conversion/scripts/utils/inspect-converted-model.sh
+++ b/examples/model-conversion/scripts/utils/inspect-converted-model.sh
@ -1,14 +0,0 @@
-#!/bin/bash
-
-# Model path: first command line argument, falling back to the
-# CONVERTED_MODEL environment variable.
-CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
-
-# Final check if we have a model path
-if [ -z "$CONVERTED_MODEL" ]; then
-    echo "Error: Model path must be provided either as:" >&2
-    echo "  1. Command line argument" >&2
-    echo "  2. CONVERTED_MODEL environment variable" >&2
-    exit 1
-fi
-
-# Quoted so that a model path containing spaces is passed as one argument.
-../../gguf-py/gguf/scripts/gguf_dump.py "$CONVERTED_MODEL"
--- a/examples/model-conversion/scripts/utils/inspect-org-model.py
+++ b/examples/model-conversion/scripts/utils/inspect-org-model.py
@ -1,67 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import os
-import json
-from safetensors import safe_open
-from collections import defaultdict
-
-parser = argparse.ArgumentParser(description='Process model with specified path')
-parser.add_argument('--model-path', '-m', help='Path to the model')
-args = parser.parse_args()
-
-model_path = os.environ.get('MODEL_PATH', args.model_path)
-if model_path is None:
-    parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
-
-# Print the name, shape and dtype of every tensor in a safetensors model
-# directory, handling both sharded (index file) and single-file layouts.
-
-# Check if there's an index file (multi-file model)
-index_path = os.path.join(model_path, "model.safetensors.index.json")
-single_file_path = os.path.join(model_path, "model.safetensors")
-
-if os.path.exists(index_path):
-    # Multi-file model
-    print("Multi-file model detected")
-
-    with open(index_path, 'r') as f:
-        index_data = json.load(f)
-
-    # Get the weight map (tensor_name -> file_name)
-    weight_map = index_data.get("weight_map", {})
-
-    # Group tensors by file for efficient processing
-    file_tensors = defaultdict(list)
-    for tensor_name, file_name in weight_map.items():
-        file_tensors[file_name].append(tensor_name)
-
-    print("Tensors in model:")
-
-    # Process each shard file
-    for file_name, tensor_names in file_tensors.items():
-        file_path = os.path.join(model_path, file_name)
-        print(f"\n--- From {file_name} ---")
-
-        with safe_open(file_path, framework="pt") as f:
-            for tensor_name in sorted(tensor_names):
-                # NOTE(review): get_tensor presumably loads the full tensor
-                # just to read its shape and dtype -- confirm whether a
-                # lighter safetensors accessor would do.
-                tensor = f.get_tensor(tensor_name)
-                print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")
-
-elif os.path.exists(single_file_path):
-    # Single file model (original behavior)
-    print("Single-file model detected")
-
-    with safe_open(single_file_path, framework="pt") as f:
-        keys = f.keys()
-        print("Tensors in model:")
-        for key in sorted(keys):
-            tensor = f.get_tensor(key)
-            print(f"- {key} : shape = {tensor.shape}, dtype = {tensor.dtype}")
-
-else:
-    # Neither layout was found: list what is in the directory to help the
-    # user spot the problem, then exit with status 1.
-    print(f"Error: Neither 'model.safetensors.index.json' nor 'model.safetensors' found in {model_path}")
-    print("Available files:")
-    if os.path.exists(model_path):
-        for item in sorted(os.listdir(model_path)):
-            print(f"  {item}")
-    else:
-        print(f"  Directory {model_path} does not exist")
-    exit(1)
--- a/examples/model-conversion/scripts/utils/perplexity-gen.sh
+++ b/examples/model-conversion/scripts/utils/perplexity-gen.sh
@ -1,35 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# Model path: first command line argument, falling back to the
-# CONVERTED_MODEL environment variable.
-CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
-
-# Final check if we have a model path
-if [ -z "$CONVERTED_MODEL" ]; then
-    echo "Error: Model path must be provided either as:" >&2
-    echo "  1. Command line argument" >&2
-    echo "  2. CONVERTED_MODEL environment variable" >&2
-    exit 1
-fi
-
-# Check if ppl/wikitext-2-raw directory exists
-if [ ! -d "ppl/wikitext-2-raw" ]; then
-    echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2
-    mkdir -p ppl
-    pushd ppl
-    ./../../../scripts/get-wikitext-2.sh
-    popd
-fi
-
-mkdir -p ppl
-# Expansions are quoted so that model paths containing spaces survive word
-# splitting in basename and in the llama-perplexity invocation.
-OUTPUTFILE="ppl/$(basename "$CONVERTED_MODEL").kld"
-echo "Model: $CONVERTED_MODEL"
-
-cmake --build ../../build --target llama-perplexity -j8
-
-../.././build/bin/llama-perplexity -m "$CONVERTED_MODEL" \
-    -f ppl/wikitext-2-raw/wiki.test.raw \
-    --kl-divergence-base "$OUTPUTFILE"
-
-echo "Generated logits in $OUTPUTFILE"
-
--- a/examples/model-conversion/scripts/utils/perplexity-run-simple.sh
+++ b/examples/model-conversion/scripts/utils/perplexity-run-simple.sh
@ -1,27 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# Model path: first command line argument, falling back to the
-# QUANTIZED_MODEL environment variable.
-QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}"
-
-if [ -z "$QUANTIZED_MODEL" ]; then
-    echo "Error: Model path must be provided either as:" >&2
-    echo "  1. Command line argument" >&2
-    echo "  2. QUANTIZED_MODEL environment variable" >&2
-    exit 1
-fi
-
-# Check if ppl/wikitext-2-raw directory exists
-if [ ! -d "ppl/wikitext-2-raw" ]; then
-    echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2
-    mkdir -p ppl
-    pushd ppl
-    ./../../../scripts/get-wikitext-2.sh
-    popd
-fi
-
-cmake --build ../../build --target llama-perplexity -j8
-
-# Quoted so that a model path containing spaces is passed as one argument.
-../.././build/bin/llama-perplexity -m "$QUANTIZED_MODEL" -f ppl/wikitext-2-raw/wiki.test.raw
-
-
--- a/examples/model-conversion/scripts/utils/perplexity-run.sh
+++ b/examples/model-conversion/scripts/utils/perplexity-run.sh
@ -1,28 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# Positional arguments take precedence over the environment:
-#   $1 / QUANTIZED_MODEL - quantized model to evaluate
-#   $2 / LOGITS_FILE     - base logits file passed to --kl-divergence-base
-QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}"
-LOGITS_FILE="${2:-"$LOGITS_FILE"}"
-
-if [ -z "$QUANTIZED_MODEL" ]; then
-    echo "Error: Model path must be provided either as:" >&2
-    echo "  1. Command line argument" >&2
-    echo "  2. QUANTIZED_MODEL environment variable" >&2
-    exit 1
-fi
-
-# The expansion is quoted so that an empty LOGITS_FILE is reported as
-# missing instead of collapsing the test to `[ ! -f ]`, which succeeds.
-if [ ! -f "${LOGITS_FILE}" ]; then
-    echo "Error: logits file '${LOGITS_FILE}' was not found" >&2
-    echo "Did you run the perplexity-gen.sh script?" >&2
-    exit 1
-fi
-
-echo "Model: $QUANTIZED_MODEL"
-echo "Data file: $LOGITS_FILE"
-
-cmake --build ../../build --target llama-perplexity -j8
-
-../.././build/bin/llama-perplexity -m "$QUANTIZED_MODEL" \
-    --kl-divergence-base "$LOGITS_FILE" \
-    --kl-divergence
--- a/examples/model-conversion/scripts/utils/quantize.sh
+++ b/examples/model-conversion/scripts/utils/quantize.sh
@ -1,34 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# Positional arguments take precedence over the environment:
-#   $1 / CONVERTED_MODEL - input .gguf model
-#   $2 / QUANTIZED_TYPE  - quantization type handed to llama-quantize
-CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
-QUANTIZED_TYPE="${2:-"$QUANTIZED_TYPE"}"
-QUANTIZED_MODEL="$CONVERTED_MODEL"
-
-# Final check if we have a model path
-if [ -z "$CONVERTED_MODEL" ]; then
-    echo "Error: Model path must be provided either as:" >&2
-    echo "  1. Command line argument" >&2
-    echo "  2. CONVERTED_MODEL environment variable" >&2
-    exit 1
-fi
-
-# Without a type the output would be named "<model>-.gguf" and
-# llama-quantize would be called with a missing argument.
-if [ -z "$QUANTIZED_TYPE" ]; then
-    echo "Error: Quantization type must be provided either as:" >&2
-    echo "  1. Second command line argument" >&2
-    echo "  2. QUANTIZED_TYPE environment variable" >&2
-    exit 1
-fi
-
-echo "$CONVERTED_MODEL"
-
-# Process the quantized model filename
-if [[ "$QUANTIZED_MODEL" == *.gguf ]]; then
-    # Remove .gguf suffix, add quantized type, then add .gguf back
-    BASE_NAME="${QUANTIZED_MODEL%.gguf}"
-    QUANTIZED_MODEL="${BASE_NAME}-${QUANTIZED_TYPE}.gguf"
-else
-    echo "Error: QUANTIZED_MODEL must end with .gguf extension" >&2
-    exit 1
-fi
-
-
-cmake --build ../../build --target llama-quantize -j8
-
-# Quoted so that paths containing spaces are passed as single arguments.
-../../build/bin/llama-quantize "$CONVERTED_MODEL" "$QUANTIZED_MODEL" "$QUANTIZED_TYPE"
-
-echo "Quantized model saved to: $QUANTIZED_MODEL"
--- a/examples/model-conversion/scripts/utils/run-embedding-server.sh
+++ b/examples/model-conversion/scripts/utils/run-embedding-server.sh
@ -1,22 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# Model path: first command line argument, falling back to the
-# CONVERTED_MODEL environment variable.
-CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
-
-# Final check if we have a model path
-if [ -z "$CONVERTED_MODEL" ]; then
-    echo "Error: Model path must be provided either as:" >&2
-    echo "  1. Command line argument" >&2
-    echo "  2. CONVERTED_MODEL environment variable" >&2
-    exit 1
-fi
-
-echo "$CONVERTED_MODEL"
-
-cmake --build ../../build --target llama-server
-
-# Quoted so that a model path containing spaces is passed as one argument.
-../../build/bin/llama-server -m "$CONVERTED_MODEL" \
-    --embedding \
-    --pooling none
--- a/examples/model-conversion/scripts/utils/semantic_check.py
+++ b/examples/model-conversion/scripts/utils/semantic_check.py
@ -1,179 +0,0 @@
-#!/usr/bin/env python3
-
-import numpy as np
-import argparse
-import os
-import importlib
-
-from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
-
-unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
-
-def cosine_similarity(a, b=None):
-    a = np.asarray(a)
-    if b is None:
-        b = a
-    else:
-        b = np.asarray(b)
-
-    if a.ndim == 1:
-        a = a.reshape(1, -1)
-    if b.ndim == 1:
-        b = b.reshape(1, -1)
-
-    a_norms = np.linalg.norm(a, axis=1, keepdims=True)
-    b_norms = np.linalg.norm(b, axis=1, keepdims=True)
-
-    a_norms = np.where(a_norms == 0, 1e-8, a_norms)
-    b_norms = np.where(b_norms == 0, 1e-8, b_norms)
-
-    a_normalized = a / a_norms
-    b_normalized = b / b_norms
-
-    # Compute cosine similarity
-    return np.dot(a_normalized, b_normalized.T)
-
-def load_embeddings_from_file(filename, n_tokens, n_embd):
-    embeddings = np.fromfile(filename, dtype=np.float32)
-    return embeddings.reshape(n_tokens, n_embd)
-
-def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
-    np.set_printoptions(suppress=True, precision=6)
-    print("pytorch embeddings:");
-    print(python_emb)
-    print("llama.cpp embeddings:");
-    print(cpp_emb)
-    print(f"\n=== Prompt: '{prompt}' ===")
-    print(f"Tokens: {tokens}")
-    print(f"Embeddings shape: Python {python_emb.shape}, llama.cpp {cpp_emb.shape}")
-
-    n_tokens = len(tokens)
-
-    # 1. Direct embedding comparison
-    print(f"\n1. Raw Embedding Magnitude Comparison:")
-    # Check if the distance of each token embedding from the origin and compare
-    # if the vectors are on the same "sphere". This does not tell us about
-    # direction (meaning of the token embedding), just magnitude.
-    for i in range(n_tokens):
-        py_mag = np.linalg.norm(python_emb[i]) # calculate standard euclidean norm for Python embeddings
-        cpp_mag = np.linalg.norm(cpp_emb[i])   # calculate standard euclidean norm for llama.cpp embeddings
-        ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
-        print(f"   Token {i} ({tokens[i]}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")
-
-    # 2. Cosine similarity between tokens within each model
-    # Here we check the direction of token embeddings to see if the have the
-    # same meaning (similarity). This is done by calculating cosine similarity
-    # of a pair of token embeddings within each model.
-    print(f"\n2. Within-Model Token Similarities:")
-    print("   Python model:")
-    for i in range(n_tokens):
-        for j in range(i+1, n_tokens):
-            sim = cosine_similarity([python_emb[i]], [python_emb[j]])[0][0]
-            print(f"     {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")
-
-    print("   llama.cpp model:")
-    for i in range(n_tokens):
-        for j in range(i+1, n_tokens):
-            sim = cosine_similarity([cpp_emb[i]], [cpp_emb[j]])[0][0]
-            print(f"     {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")
-
-    # 3. Cross-model similarity (same token position)
-    print(f"\n3. Cross-Model Same-Token Similarities:")
-    for i in range(n_tokens):
-        sim = cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0]
-        print(f"   Token {i} ({tokens[i]}): {sim:.4f}")
-
-    # 4. Similarity matrix comparison
-    print(f"\n4. Similarity Matrix Differences:")
-    py_sim_matrix = cosine_similarity(python_emb)
-    cpp_sim_matrix = cosine_similarity(cpp_emb)
-    diff_matrix = np.abs(py_sim_matrix - cpp_sim_matrix)
-
-    print(f"   Max difference: {np.max(diff_matrix):.4f}")
-    print(f"   Mean difference: {np.mean(diff_matrix):.4f}")
-    print(f"   RMS difference: {np.sqrt(np.mean(diff_matrix**2)):.4f}")
-
-    return {
-        'cross_model_similarities': [cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] for i in range(n_tokens)],
-        'similarity_matrix_diff': diff_matrix,
-        'max_diff': np.max(diff_matrix),
-        'mean_diff': np.mean(diff_matrix),
-        'rms_diff': np.sqrt(np.mean(diff_matrix**2))
-    }
-
-def main():
-    parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings')
-    parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model')
-    parser.add_argument('--python-embeddings', '-pe', help='Path to pytorch embeddings "logits" binary file')
-    parser.add_argument('--cpp-embeddings', '-ce', help='Path to llama.cpp embeddings "logits" binary file')
-    parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true')
-    parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt')
-
-    args = parser.parse_args()
-
-    print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
-    print("=" * 70)
-
-    # Single prompt detailed comparison
-    print(f"\nTesting with prompt: '{args.prompt}'")
-
-    # Load the python model to get configuration information and also to load the tokenizer.
-    print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
-    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
-    config = AutoConfig.from_pretrained(args.model_path)
-
-    if unreleased_model_name:
-        model_name_lower = unreleased_model_name.lower()
-        unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
-        if args.causal:
-            class_name = f"{unreleased_model_name}ForCausalLM"
-        else:
-            class_name = f"{unreleased_model_name}Model"
-        print(f"Model class: {class_name}")
-        print(f"Importing unreleased model module: {unreleased_module_path}")
-
-        try:
-            model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
-            model = model_class.from_pretrained(args.model_path)
-        except (ImportError, AttributeError) as e:
-            print(f"Failed to import or load model: {e}")
-            exit(1)
-    else:
-        if args.causal:
-            model = AutoModelForCausalLM.from_pretrained(args.model_path)
-        else:
-            model = AutoModel.from_pretrained(args.model_path)
-
-    encoded = tokenizer(args.prompt, return_tensors="pt")
-    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
-    n_tokens = len(tokens)
-    print(f"n_tokens: {n_tokens}");
-    print(f"hidden_size: {model.config.hidden_size}")
-
-    # Load binary embeddings from data directory.
-    llamacpp_embeddings = load_embeddings_from_file(args.cpp_embeddings, n_tokens, model.config.hidden_size)
-    python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size)
-
-    # Run comparison
-    results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, args.prompt)
-
-    # Summary
-    print(f"\n=== SUMMARY ===")
-    avg_cross_sim = np.mean(results['cross_model_similarities'])
-    print(f"Average cross-model similarity: {avg_cross_sim:.4f}")
-    print(f"Similarity matrix RMS difference: {results['rms_diff']:.4f}")
-
-    # Quality assessment
-    if avg_cross_sim > 0.95:
-        print("✅ EXCELLENT: Models are highly similar")
-    elif avg_cross_sim > 0.90:
-        print("✅ VERY GOOD: Models are very similar")
-    elif avg_cross_sim > 0.80:
-        print("⚠️  GOOD: Models are reasonably similar")
-    elif avg_cross_sim > 0.70:
-        print("⚠️  FAIR: Models have some differences")
-    else:
-        print("❌ POOR: Models are significantly different")
-
-if __name__ == "__main__":
-    main()
--- a/ggml/include/ggml-cann.h
+++ b/ggml/include/ggml-cann.h
@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#pragma once
-
-#include "ggml-backend.h"
-#include "ggml.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * @brief Maximum number of CANN devices supported.
- */
-#define GGML_CANN_MAX_DEVICES 16
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
-
-/**
- * @brief Initializes the CANN backend for a specified device.
- *
- * This function initializes the CANN backend for the given device.
- * It verifies the device index, allocates a context, and creates a backend
- * instance.
- *
- * @param device The index of the device to initialize.
- * @return A pointer to the initialized backend instance, or nullptr on failure.
- */
-GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
-
-/**
- * @brief Checks if a given backend is a CANN backend.
- *
- * This function verifies if the provided backend is a CANN backend by comparing
- * its GUID with the CANN backend's GUID.
- *
- * @param backend The backend instance to check.
- * @return True if the backend is a CANN backend, false otherwise.
- */
-GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
-
-/**
- * @brief Retrieves the CANN buffer type for a specified device.
- *
- * This function initializes and returns the buffer type interface associated
- * with the given device. It ensures thread-safe access using a mutex.
- *
- * @param device The device index for which to retrieve the buffer type.
- * @return A pointer to the buffer type interface for the specified device, or
- * nullptr if the device index is out of range.
- */
-GGML_BACKEND_API ggml_backend_buffer_type_t
-ggml_backend_cann_buffer_type(int32_t device);
-
-/**
- * @brief Retrieves the number of CANN devices available.
- *
- * This function returns the number of CANN devices available based on
- * information obtained from `ggml_cann_info()`.
- *
- * @return The number of CANN devices available.
- */
-GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);
-
-/**
- * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
- *
- * @return A pointer to the host buffer type interface.
- */
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
-
-/**
- * @brief Retrieves the description of a specific CANN device.
- *
- * This function sets the specified device, retrieves the SoC name,
- * and writes it into the provided description buffer.
- *
- * @param device The device index to retrieve the description for.
- * @param description Pointer to a buffer where the description will be written.
- * @param description_size Size of the description buffer.
- */
-GGML_BACKEND_API void ggml_backend_cann_get_device_description(
-    int32_t device, char* description, size_t description_size);
-
-/**
- * @brief Retrieves the memory information of a specific CANN device.
- *
- * This function sets the specified device, retrieves the free and total
- * memory information of the specified type (ACL_HBM_MEM), and stores them
- * in the provided pointers.
- *
- * @param device The device index to retrieve memory information for.
- * @param free Pointer to a variable where the free memory size will be stored.
- * @param total Pointer to a variable where the total memory size will be
- * stored.
- */
-GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
-                                                  size_t* free,
-                                                  size_t* total);
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml/include/ggml-sycl.h
+++ b/ggml/include/ggml-sycl.h
@ -1,49 +0,0 @@
-//
-//  MIT license
-//  Copyright (C) 2024 Intel Corporation
-//  SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#define GGML_SYCL_NAME "SYCL"
-#define GGML_SYCL_MAX_DEVICES 48
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-// backend API
-GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device);
-
-GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend);
-
-// device buffer
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
-
-// split tensor buffer that splits matrices by rows across multiple devices
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
-
-// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
-
-GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void);
-GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
-GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device,
-                                                       char *description,
-                                                       size_t description_size);
-// (void) rather than (): in C an empty parameter list declares a function
-// with unspecified arguments, and this header is also consumed as C.
-GGML_BACKEND_API int  ggml_backend_sycl_get_device_count(void);
-GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
-
-// SYCL doesn't support registering host memory, keep here for reference
-// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
-// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/ggml/include/ggml-webgpu.h
+++ b/ggml/include/ggml-webgpu.h
@ -1,19 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#define GGML_WEBGPU_NAME "WebGPU"
-
-// Needed for examples in ggml
-GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/ggml/include/ggml-zdnn.h
+++ b/ggml/include/ggml-zdnn.h
@ -1,16 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml/src/ggml-musa/mudnn.cu
+++ b/ggml/src/ggml-musa/mudnn.cu
@ -1,112 +0,0 @@
-#include <mutex>
-#include <mudnn.h>
-
-#include "mudnn.cuh"
-
-namespace mudnn = musa::dnn;
-
-// Returns a human-readable error string for mudnn::Status
-const char* mudnnGetErrorString(mudnn::Status err) {
-    switch (err) {
-        case mudnn::Status::SUCCESS:
-            return "Success";
-        case mudnn::Status::INVALID_PARAMETER:
-            return "Invalid parameter";
-        case mudnn::Status::NOT_INITIALIZED:
-            return "Not initialized";
-        case mudnn::Status::ALLOC_FAILED:
-            return "Allocation failed";
-        case mudnn::Status::NOT_SUPPORTED:
-            return "Not supported";
-        case mudnn::Status::INTERNAL_ERROR:
-            return "Internal error";
-        case mudnn::Status::ARCH_MISMATCH:
-            return "Architecture mismatch";
-        case mudnn::Status::EXECUTION_FAILED:
-            return "Execution failed";
-        default:
-            return "Unknown mudnn status";
-    }
-}
-
-// Error checking macro for MUDNN calls
-#define MUDNN_CHECK(err) CUDA_CHECK_GEN(err, mudnn::Status::SUCCESS, mudnnGetErrorString)
-
-namespace {
-    // Thread-safe cache for mudnn::Handle objects per device
-    std::unordered_map<int, std::unique_ptr<mudnn::Handle>> handle_cache;
-    std::mutex handle_cache_mutex;
-
-    mudnn::Handle* get_cached_handle(int device_id) {
-        std::lock_guard<std::mutex> lock(handle_cache_mutex);
-        auto it = handle_cache.find(device_id);
-        if (it != handle_cache.end()) {
-            return it->second.get();
-        }
-        auto handle = std::make_unique<mudnn::Handle>(device_id);
-        mudnn::Handle* handle_ptr = handle.get();
-        handle_cache[device_id] = std::move(handle);
-        return handle_ptr;
-    }
-}
-
-// Extracts dimensions and strides from a ggml_tensor
-int get_ggml_dims_and_strides(const ggml_tensor* tensor,
-                              std::vector<int64_t>& dims,
-                              std::vector<int64_t>& strides) {
-    const int ndims = ggml_n_dims(tensor);
-    const size_t element_size = ggml_element_size(tensor);
-
-    dims.resize(ndims);
-    strides.resize(ndims);
-
-    for (int i = 0; i < ndims; ++i) {
-        dims[i] = tensor->ne[i];
-        strides[i] = tensor->nb[i] / static_cast<int64_t>(element_size);
-    }
-    return ndims;
-}
-
-// Converts ggml_type to mudnn::Tensor::Type
-mudnn::Tensor::Type ggml_type_to_mudnn_type(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_F32:
-            return mudnn::Tensor::Type::FLOAT;
-        case GGML_TYPE_F16:
-            return mudnn::Tensor::Type::HALF;
-
-        // TODO: Add support for other types
-
-        default:
-            MUDNN_CHECK(mudnn::Status::NOT_SUPPORTED);
-    }
-
-    return mudnn::Tensor::Type::FLOAT; // Default fallback
-}
-
-// Asynchronous memory copy using mudnn::Unary::IDENTITY
-musaError_t mudnnMemcpyAsync(ggml_backend_cuda_context& ctx, const ggml_tensor* dst, const ggml_tensor* src) {
-    mudnn::Tensor tensor_dst, tensor_src;
-
-    MUDNN_CHECK(tensor_dst.SetType(ggml_type_to_mudnn_type(dst->type)));
-    MUDNN_CHECK(tensor_src.SetType(ggml_type_to_mudnn_type(src->type)));
-
-    std::vector<int64_t> dims, strides;
-    const int ndims = get_ggml_dims_and_strides(src, dims, strides);
-
-    MUDNN_CHECK(tensor_dst.SetNdInfo(ndims, dims.data(), strides.data()));
-    MUDNN_CHECK(tensor_src.SetNdInfo(ndims, dims.data(), strides.data()));
-    MUDNN_CHECK(tensor_dst.SetAddr(dst->data));
-    MUDNN_CHECK(tensor_src.SetAddr(src->data));
-
-    mudnn::Unary op;
-    MUDNN_CHECK(op.SetMode(mudnn::Unary::Mode::IDENTITY));
-    MUDNN_CHECK(op.SetAlpha(0.0f));
-    MUDNN_CHECK(op.SetBeta(0.0f));
-
-    mudnn::Handle* handle = get_cached_handle(ctx.device);
-    MUDNN_CHECK(handle->SetStream(ctx.stream()));
-    MUDNN_CHECK(op.Run(*handle, tensor_dst, tensor_src));
-
-    return musaSuccess;
-}
--- a/ggml/src/ggml-musa/mudnn.cuh
+++ b/ggml/src/ggml-musa/mudnn.cuh
@ -1,12 +0,0 @@
-#pragma once
-
-#include "ggml-cuda/common.cuh"
-#include "ggml.h"
-
-// Asynchronously copies data from src tensor to dst tensor using the provided context.
-// Returns a musaError_t indicating success or failure.
-musaError_t mudnnMemcpyAsync(
-    ggml_backend_cuda_context &ctx,
-    const ggml_tensor *dst,
-    const ggml_tensor *src
-);
--- a/ggml/src/ggml-opencl/kernels/add_id.cl
+++ b/ggml/src/ggml-opencl/kernels/add_id.cl
@ -1,42 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-//------------------------------------------------------------------------------
-// add_id
-//------------------------------------------------------------------------------
-kernel void kernel_add_id(
-    global char * src0,
-    ulong         offset0,
-    global char * src1,
-    ulong         offset1,
-    global char * src2,
-    ulong         offset2,
-    global char * dst,
-    ulong         offsetd,
-    ulong         nb01,
-    ulong         nb02,
-    ulong         nb11,
-    ulong         nb21,
-    int           ne0,
-    int           ne1
-) {
-    src0 = (global char*)((global char*)src0 + offset0);
-    src1 = (global char*)((global char*)src1 + offset1);
-    src2 = (global char*)((global char*)src2 + offset2);
-    dst  = (global char*)((global char*)dst  + offsetd);
-
-    int i1 = get_group_id(0);
-    int i2 = get_group_id(1);
-
-    const int i11 = *((global const int *) (src2 + i1*sizeof(int) + i2*nb21));
-
-    const size_t nb1 = ne0 * sizeof(float);
-    const size_t nb2 = ne1 * nb1;
-
-    global float * dst_row  = (global float *)((global char *)dst  + i1*nb1 + i2*nb2);
-    global float * src0_row = (global float *)((global char *)src0 + i1*nb01 + i2*nb02);
-    global float * src1_row = (global float *)((global char *)src1 + i11*nb11);
-
-    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
-        dst_row[i0] = src0_row[i0] + src1_row[i0];
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/argsort.cl
+++ b/ggml/src/ggml-opencl/kernels/argsort.cl
@ -1,86 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#ifdef cl_intel_subgroups
-#pragma OPENCL EXTENSION cl_intel_subgroups : enable
-#else
-#pragma OPENCL EXTENSION cl_khr_subgroups : enable
-#endif
-
-#ifdef cl_intel_required_subgroup_size
-#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
-#define INTEL_GPU 1
-#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
-#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
-#elif defined(cl_qcom_reqd_sub_group_size)
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#endif
-
-#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; }
-
-enum ggml_sort_order {
-    GGML_SORT_ORDER_ASC,
-    GGML_SORT_ORDER_DESC,
-};
-
-kernel void kernel_argsort_f32_i32(
-    global float * src0,
-    ulong          offset0,
-    global int   * dst,
-    ulong          offsetd,
-    const int      ne00,
-    const int      ne00_pad,
-    const int      order,
-    local int    * dst_row
-) {
-    // bitonic sort
-    int col = get_local_id(0);
-    int row = get_group_id(1);
-
-    if (col >= ne00_pad) {
-        return;
-    }
-
-    src0 = (global char  *)((global char *)src0 + offset0);
-    dst  = (global float *)((global char *)dst  + offsetd);
-
-    global float * x_row = src0 + row * ne00;
-
-    // initialize indices
-    dst_row[col] = col;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for (int k = 2; k <= ne00_pad; k *= 2) {
-        for (int j = k / 2; j > 0; j /= 2) {
-            int ixj = col ^ j;
-            if (ixj > col) {
-                if ((col & k) == 0) {
-                    if (dst_row[col] >= ne00 ||
-                        (dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ?
-                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
-                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
-                    ) {
-                        SWAP(dst_row[col], dst_row[ixj], int);
-                    }
-                } else {
-                    if (dst_row[ixj] >= ne00 ||
-                        (dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ?
-                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
-                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
-                    ) {
-                        SWAP(dst_row[col], dst_row[ixj], int);
-                    }
-                }
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
-    }
-
-    // copy the result to dst without the padding
-    if (col < ne00) {
-        dst[row * ne00 + col] = dst_row[col];
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/concat.cl
+++ b/ggml/src/ggml-opencl/kernels/concat.cl
@ -1,109 +0,0 @@
-kernel void kernel_concat_f32_contiguous(
-    global const char * p_src0, ulong off_src0,
-    global const char * p_src1, ulong off_src1,
-    global char * p_dst, ulong off_dst,
-    int d_ne00, int d_ne01, int d_ne02, // src0->ne[0..2] for the slice
-    int d_ne10, int d_ne11, int d_ne12, // src1->ne[0..2] for the slice (d_ne1X must match d_ne0X on non-concat axes)
-    int d_ne0,  int d_ne1,  int d_ne2,  // dst->ne[0..2] for the slice
-    int dim
-) {
-    global const float * src0 = (global const float*)((global char*)p_src0 + off_src0);
-    global const float * src1 = (global const float*)((global char*)p_src1 + off_src1);
-    global float * dst        = (global float*)((global char*)p_dst + off_dst);
-
-    int i0 = get_global_id(0); // Index along dst's 0th dimension
-    int i1 = get_global_id(1); // Index along dst's 1st dimension
-    int i2 = get_global_id(2); // Index along dst's 2nd dimension
-
-    if (i0 >= d_ne0 || i1 >= d_ne1 || i2 >= d_ne2) {
-        return;
-    }
-
-    ulong dst_idx = (ulong)i2 * d_ne0 * d_ne1 + (ulong)i1 * d_ne0 + i0;
-    ulong src_idx;
-
-    if (dim == 0) {
-        if (i0 < d_ne00) { // Data from src0
-            src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
-            dst[dst_idx] = src0[src_idx];
-        } else { // Data from src1
-            src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + (i0 - d_ne00);
-            dst[dst_idx] = src1[src_idx];
-        }
-    } else if (dim == 1) {
-        if (i1 < d_ne01) { // Data from src0
-            src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
-            dst[dst_idx] = src0[src_idx];
-        } else { // Data from src1
-            src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)(i1 - d_ne01) * d_ne10 + i0;
-            dst[dst_idx] = src1[src_idx];
-        }
-    } else if (dim == 2) {
-        if (i2 < d_ne02) { // Data from src0
-            src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
-            dst[dst_idx] = src0[src_idx];
-        } else { // Data from src1
-
-            src_idx = (ulong)(i2 - d_ne02) * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + i0;
-            dst[dst_idx] = src1[src_idx];
-        }
-    }
-}
-
-kernel void kernel_concat_f32_non_contiguous(
-    global const char * p_src0, ulong off_src0,
-    global const char * p_src1, ulong off_src1,
-    global char * p_dst, ulong off_dst,
-
-    long ne00, long ne01, long ne02, long ne03,
-    ulong nb00, ulong nb01, ulong nb02, ulong nb03,
-
-    ulong nb10, ulong nb11, ulong nb12, ulong nb13, // Strides for src1
-
-    long d_ne0, long d_ne1, long d_ne2, long d_ne3,
-    ulong d_nb0, ulong d_nb1, ulong d_nb2, ulong d_nb3,
-    int dim
-) {
-    global const char * src0_base = p_src0 + off_src0;
-    global const char * src1_base = p_src1 + off_src1;
-    global char * dst_base        = p_dst + off_dst;
-
-    long current_i1 = get_global_id(0); // Index for dst_dim_1
-    long current_i2 = get_global_id(1); // Index for dst_dim_2
-    long current_i3 = get_global_id(2); // Index for dst_dim_3
-
-    if (current_i1 >= d_ne1 || current_i2 >= d_ne2 || current_i3 >= d_ne3) {
-        return;
-    }
-
-    global const float * x_val_ptr;
-    global float * y_val_ptr;
-
-    for (long current_i0 = 0; current_i0 < d_ne0; ++current_i0) {
-        bool use_src0;
-        long s_i0 = current_i0, s_i1 = current_i1, s_i2 = current_i2, s_i3 = current_i3;
-
-        if (dim == 0) {
-            use_src0 = (current_i0 < ne00);
-            if (!use_src0) { s_i0 = current_i0 - ne00; }
-        } else if (dim == 1) {
-            use_src0 = (current_i1 < ne01);
-            if (!use_src0) { s_i1 = current_i1 - ne01; }
-        } else if (dim == 2) {
-            use_src0 = (current_i2 < ne02);
-            if (!use_src0) { s_i2 = current_i2 - ne02; }
-        } else { // dim == 3
-            use_src0 = (current_i3 < ne03);
-            if (!use_src0) { s_i3 = current_i3 - ne03; }
-        }
-
-        if (use_src0) {
-            x_val_ptr = (global const float *)(src0_base + (ulong)s_i3*nb03 + (ulong)s_i2*nb02 + (ulong)s_i1*nb01 + (ulong)s_i0*nb00);
-        } else {
-            x_val_ptr = (global const float *)(src1_base + (ulong)s_i3*nb13 + (ulong)s_i2*nb12 + (ulong)s_i1*nb11 + (ulong)s_i0*nb10);
-        }
-
-        y_val_ptr = (global float *)(dst_base + (ulong)current_i3*d_nb3 + (ulong)current_i2*d_nb2 + (ulong)current_i1*d_nb1 + (ulong)current_i0*d_nb0);
-        *y_val_ptr = *x_val_ptr;
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/conv2d.cl
+++ b/ggml/src/ggml-opencl/kernels/conv2d.cl
@ -1,185 +0,0 @@
-#ifdef USE_FP16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#define T_FLOAT half
-#define T_FLOAT4 half4
-#define VSTORE_T_FLOAT4(data, offset, p) vstore_half4_rte(data, offset, p)
-#else
-#define T_FLOAT float
-#define T_FLOAT4 float4
-#define VSTORE_T_FLOAT4(data, offset, p) vstore4(data, offset, p)
-#endif
-
-#if defined(cl_qcom_reqd_sub_group_size)
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#else
-#define REQD_SUBGROUP_SIZE_128
-#endif
-
-#define T_ACCUM float4
-#define VEC_SIZE 4
-
-#define BS_K 64
-#define BS_NPQ 64
-#define BS_CRS 16
-
-#define TS_K 4
-#define TS_NPQ 8
-
-#define WG_K (BS_K / TS_K)
-#define WG_NPQ (BS_NPQ / TS_NPQ)
-
-#define BS_NPQ_VEC (BS_NPQ / VEC_SIZE)
-#define TS_NPQ_VEC (TS_NPQ / VEC_SIZE)
-
-static inline uint splitWork(uint work_size, uint block_size){
-    return (work_size + block_size - 1) / block_size;
-}
-
-REQD_SUBGROUP_SIZE_128
-kernel void kernel_conv_2d(
-    global void* p_knl,
-    ulong off_knl,
-    global void* p_src,
-    ulong off_src,
-    global void* p_dst,
-    ulong off_dst,
-    local void* shared,
-    uint Cout, uint Cin, uint N,
-    uint KW, uint KH, uint W, uint H, uint OW, uint OH,
-    uint s0, uint s1, uint p0, uint p1, uint d0, uint d1,
-    uint nb01, uint nb02, uint nb03,
-    uint nb11, uint nb12, uint nb13,
-    uint nb1, uint nb2, uint nb3
-) {
-    global T_FLOAT* knl_data = (global T_FLOAT*) ((global char*)p_knl + off_knl);
-    global T_FLOAT* src_data = (global T_FLOAT*) ((global char*)p_src + off_src);
-    global T_FLOAT* dst_data = (global T_FLOAT*) ((global char*)p_dst + off_dst);
-
-    const uint K = Cout;
-    const uint CRS = Cin*KH*KW;
-    const uint NPQ = N*OH*OW;
-
-    const uint lid_k = get_local_id(0);
-    const uint lid_npq = get_local_id(1);
-    const uint tid = lid_npq * WG_K + lid_k;
-
-    const uint B_idx_K = get_group_id(0);
-    const uint B_idx_NPQ = get_group_id(1);
-
-    const uint offset_k = B_idx_K * BS_K;
-    const uint offset_npq = B_idx_NPQ * BS_NPQ;
-
-    local T_FLOAT* Ash = (local T_FLOAT*)shared;
-    local T_FLOAT4* Bsh = (local T_FLOAT4*) &Ash[BS_K * BS_CRS];
-
-    T_ACCUM regC[TS_K][TS_NPQ_VEC];
-    for (int i = 0; i < TS_K; ++i) {
-        for (int j = 0; j < TS_NPQ_VEC; ++j) {
-            regC[i][j] = (T_ACCUM)(0.0f);
-        }
-    }
-
-    const uint NB_CRS = splitWork(CRS, BS_CRS);
-
-    for (uint B_idx_CRS = 0; B_idx_CRS < NB_CRS; ++B_idx_CRS) {
-        const uint offset_crs = B_idx_CRS * BS_CRS;
-
-        for (int i = tid; i < BS_K * BS_CRS; i += (WG_K * WG_NPQ)) {
-            const uint k_l = i / BS_CRS;
-            const uint crs_l = i % BS_CRS;
-            const uint k_g = offset_k + k_l;
-            const uint crs_g = offset_crs + crs_l;
-
-            if (k_g < K && crs_g < CRS) {
-                const uint Cin_idx = crs_g / (KW*KH);
-                const uint KH_idx = (crs_g - Cin_idx*KW*KH) / KW;
-                const uint KW_idx = crs_g - Cin_idx*KW*KH - KH_idx*KW;
-                const uint knl_idx = KW_idx + KH_idx*nb01 + Cin_idx*nb02 + k_g*nb03;
-                Ash[k_l * BS_CRS + crs_l] = knl_data[knl_idx];
-            } else {
-                Ash[k_l * BS_CRS + crs_l] = (T_FLOAT)0.0f;
-            }
-        }
-
-        for (int i = tid; i < BS_CRS * BS_NPQ_VEC; i += (WG_K * WG_NPQ)) {
-            const uint crs_l = i / BS_NPQ_VEC;
-            const uint npq_l_vec = i % BS_NPQ_VEC;
-            const uint crs_g = offset_crs + crs_l;
-
-            T_FLOAT4 val = (T_FLOAT4)(0.0f);
-            if (crs_g < CRS) {
-                const uint Cin_idx = crs_g / (KW * KH);
-                const uint KH_idx = (crs_g - Cin_idx * KW * KH) / KW;
-                const uint KW_idx = crs_g - Cin_idx * KW * KH - KH_idx * KW;
-                for (int v = 0; v < VEC_SIZE; ++v) {
-                    const uint npq_g = offset_npq + npq_l_vec * VEC_SIZE + v;
-                    if (npq_g < NPQ) {
-                        const uint N_idx = npq_g / (OH * OW);
-                        const uint pq_idx = npq_g % (OH * OW);
-                        const uint OH_idx = pq_idx / OW;
-                        const uint OW_idx = pq_idx % OW;
-                        const int H_idx = (int)(OH_idx * s1 + KH_idx * d1 - p1);
-                        const int W_idx = (int)(OW_idx * s0 + KW_idx * d0 - p0);
-
-                        if (H_idx >= 0 && H_idx < H && W_idx >= 0 && W_idx < W) {
-                            const uint src_idx = W_idx + H_idx * nb11 + Cin_idx * nb12 + N_idx * nb13;
-                            ((T_FLOAT*)&val)[v] = src_data[src_idx];
-                        }
-                    }
-                }
-            }
-            Bsh[crs_l * BS_NPQ_VEC + npq_l_vec] = val;
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        #pragma unroll
-        for (uint crs_l = 0; crs_l < BS_CRS; ++crs_l) {
-            T_FLOAT regA[TS_K];
-            for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
-                regA[k_l_reg] = Ash[(lid_k * TS_K + k_l_reg) * BS_CRS + crs_l];
-            }
-
-            for (uint npq_l_vec_reg = 0; npq_l_vec_reg < TS_NPQ_VEC; ++npq_l_vec_reg) {
-                T_FLOAT4 regB = Bsh[crs_l * BS_NPQ_VEC + lid_npq * TS_NPQ_VEC + npq_l_vec_reg];
-                for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
-                    regC[k_l_reg][npq_l_vec_reg] = mad(convert_float(regA[k_l_reg]), convert_float4(regB), regC[k_l_reg][npq_l_vec_reg]);
-                }
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
-        const uint k_g = offset_k + lid_k * TS_K + k_l_reg;
-        if (k_g >= K) continue;
-
-        for (uint npq_l_vec_reg = 0; npq_l_vec_reg < TS_NPQ_VEC; ++npq_l_vec_reg) {
-            const uint npq_g_base = offset_npq + (lid_npq * TS_NPQ_VEC + npq_l_vec_reg) * VEC_SIZE;
-
-            const uint N_idx = npq_g_base / (OH * OW);
-            const uint pq_idx = npq_g_base % (OH * OW);
-            const uint OH_idx = pq_idx / OW;
-            const uint OW_idx = pq_idx % OW;
-
-            if (nb1 == OW && OW_idx + VEC_SIZE <= OW && npq_g_base + VEC_SIZE <= NPQ) {
-                const uint dst_idx = OW_idx + OH_idx*nb1 + k_g*nb2 + N_idx*nb3;
-                VSTORE_T_FLOAT4(regC[k_l_reg][npq_l_vec_reg], 0, &dst_data[dst_idx]);
-            } else {
-                T_ACCUM res = regC[k_l_reg][npq_l_vec_reg];
-                for (int v = 0; v < VEC_SIZE; ++v) {
-                    const uint npq_g = npq_g_base + v;
-                    if (npq_g < NPQ) {
-                        const uint N_idx_s = npq_g / (OH*OW);
-                        const uint pq_idx_s = npq_g % (OH*OW);
-                        const uint OH_idx_s = pq_idx_s / OW;
-                        const uint OW_idx_s = pq_idx_s % OW;
-                        const uint dst_idx_s = OW_idx_s + OH_idx_s*nb1 + k_g*nb2 + N_idx_s*nb3;
-                        dst_data[dst_idx_s] = (T_FLOAT)(((float*)&res)[v]);
-                    }
-                }
-            }
-        }
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl
+++ b/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl
@ -1,176 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#if defined(cl_qcom_reqd_sub_group_size)
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#else
-#define REQD_SUBGROUP_SIZE_128
-#endif
-
-#define T_ACCUM float4
-#define VEC_SIZE 4
-
-#define BS_K 64
-#define BS_NPQ 64
-#define BS_CRS 16
-
-#define TS_K 4
-#define TS_NPQ 8
-
-#define WG_K (BS_K / TS_K)
-#define WG_NPQ (BS_NPQ / TS_NPQ)
-
-#define BS_NPQ_VEC (BS_NPQ / VEC_SIZE)
-#define TS_NPQ_VEC (TS_NPQ / VEC_SIZE)
-
-static inline uint splitWork(uint work_size, uint block_size){
-    return (work_size + block_size - 1) / block_size;
-}
-
-REQD_SUBGROUP_SIZE_128
-kernel void kernel_conv_2d(
-    global void* p_knl,
-    ulong off_knl,
-    global void* p_src,
-    ulong off_src,
-    global void* p_dst,
-    ulong off_dst,
-    local void* shared,
-    uint Cout, uint Cin, uint N,
-    uint KW, uint KH, uint W, uint H, uint OW, uint OH,
-    uint s0, uint s1, uint p0, uint p1, uint d0, uint d1,
-    uint nb01, uint nb02, uint nb03,
-    uint nb11, uint nb12, uint nb13,
-    uint nb1, uint nb2, uint nb3
-) {
-    global half* knl_data = (global half*) ((global char*)p_knl + off_knl);
-    global float* src_data = (global float*) ((global char*)p_src + off_src);
-    global float* dst_data = (global float*) ((global char*)p_dst + off_dst);
-
-    const uint K = Cout;
-    const uint CRS = Cin*KH*KW;
-    const uint NPQ = N*OH*OW;
-
-    const uint lid_k = get_local_id(0);
-    const uint lid_npq = get_local_id(1);
-    const uint tid = lid_npq * WG_K + lid_k;
-
-    const uint B_idx_K = get_group_id(0);
-    const uint B_idx_NPQ = get_group_id(1);
-
-    const uint offset_k = B_idx_K * BS_K;
-    const uint offset_npq = B_idx_NPQ * BS_NPQ;
-
-    local half* Ash = (local half*)shared;
-    local float4* Bsh = (local float4*) &Ash[BS_K * BS_CRS];
-
-    T_ACCUM regC[TS_K][TS_NPQ_VEC];
-    for (int i = 0; i < TS_K; ++i) {
-        for (int j = 0; j < TS_NPQ_VEC; ++j) {
-            regC[i][j] = (T_ACCUM)(0.0f);
-        }
-    }
-
-    const uint NB_CRS = splitWork(CRS, BS_CRS);
-
-    for (uint B_idx_CRS = 0; B_idx_CRS < NB_CRS; ++B_idx_CRS) {
-        const uint offset_crs = B_idx_CRS * BS_CRS;
-
-        for (int i = tid; i < BS_K * BS_CRS; i += (WG_K * WG_NPQ)) {
-            const uint k_l = i / BS_CRS;
-            const uint crs_l = i % BS_CRS;
-            const uint k_g = offset_k + k_l;
-            const uint crs_g = offset_crs + crs_l;
-
-            if (k_g < K && crs_g < CRS) {
-                const uint Cin_idx = crs_g / (KW*KH);
-                const uint KH_idx = (crs_g - Cin_idx*KW*KH) / KW;
-                const uint KW_idx = crs_g - Cin_idx*KW*KH - KH_idx*KW;
-                const uint knl_idx = KW_idx + KH_idx*nb01 + Cin_idx*nb02 + k_g*nb03;
-                Ash[k_l * BS_CRS + crs_l] = knl_data[knl_idx];
-            } else {
-                Ash[k_l * BS_CRS + crs_l] = (half)0.0f;
-            }
-        }
-
-        for (int i = tid; i < BS_CRS * BS_NPQ_VEC; i += (WG_K * WG_NPQ)) {
-            const uint crs_l = i / BS_NPQ_VEC;
-            const uint npq_l_vec = i % BS_NPQ_VEC;
-            const uint crs_g = offset_crs + crs_l;
-
-            float4 val = (float4)(0.0f);
-            if (crs_g < CRS) {
-                const uint Cin_idx = crs_g / (KW * KH);
-                const uint KH_idx = (crs_g - Cin_idx * KW * KH) / KW;
-                const uint KW_idx = crs_g - Cin_idx * KW * KH - KH_idx * KW;
-                for (int v = 0; v < VEC_SIZE; ++v) {
-                    const uint npq_g = offset_npq + npq_l_vec * VEC_SIZE + v;
-                    if (npq_g < NPQ) {
-                        const uint N_idx = npq_g / (OH * OW);
-                        const uint pq_idx = npq_g % (OH * OW);
-                        const uint OH_idx = pq_idx / OW;
-                        const uint OW_idx = pq_idx % OW;
-                        const int H_idx = (int)(OH_idx * s1 + KH_idx * d1 - p1);
-                        const int W_idx = (int)(OW_idx * s0 + KW_idx * d0 - p0);
-
-                        if (H_idx >= 0 && H_idx < H && W_idx >= 0 && W_idx < W) {
-                            const uint src_idx = W_idx + H_idx * nb11 + Cin_idx * nb12 + N_idx * nb13;
-                            ((float*)&val)[v] = src_data[src_idx];
-                        }
-                    }
-                }
-            }
-            Bsh[crs_l * BS_NPQ_VEC + npq_l_vec] = val;
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        #pragma unroll
-        for (uint crs_l = 0; crs_l < BS_CRS; ++crs_l) {
-            half regA[TS_K];
-            for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
-                regA[k_l_reg] = Ash[(lid_k * TS_K + k_l_reg) * BS_CRS + crs_l];
-            }
-
-            for (uint npq_l_vec_reg = 0; npq_l_vec_reg < TS_NPQ_VEC; ++npq_l_vec_reg) {
-                float4 regB = Bsh[crs_l * BS_NPQ_VEC + lid_npq * TS_NPQ_VEC + npq_l_vec_reg];
-                for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
-                    regC[k_l_reg][npq_l_vec_reg] = mad(convert_float(regA[k_l_reg]), regB, regC[k_l_reg][npq_l_vec_reg]);
-                }
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
-        const uint k_g = offset_k + lid_k * TS_K + k_l_reg;
-        if (k_g >= K) continue;
-
-        for (uint npq_l_vec_reg = 0; npq_l_vec_reg < TS_NPQ_VEC; ++npq_l_vec_reg) {
-            const uint npq_g_base = offset_npq + (lid_npq * TS_NPQ_VEC + npq_l_vec_reg) * VEC_SIZE;
-
-            const uint N_idx = npq_g_base / (OH * OW);
-            const uint pq_idx = npq_g_base % (OH * OW);
-            const uint OH_idx = pq_idx / OW;
-            const uint OW_idx = pq_idx % OW;
-
-            if (nb1 == OW && OW_idx + VEC_SIZE <= OW && npq_g_base + VEC_SIZE <= NPQ) {
-                const uint dst_idx = OW_idx + OH_idx*nb1 + k_g*nb2 + N_idx*nb3;
-                vstore4(regC[k_l_reg][npq_l_vec_reg], 0, &dst_data[dst_idx]);
-            } else {
-                T_ACCUM res = regC[k_l_reg][npq_l_vec_reg];
-                for (int v = 0; v < VEC_SIZE; ++v) {
-                    const uint npq_g = npq_g_base + v;
-                    if (npq_g < NPQ) {
-                        const uint N_idx_s = npq_g / (OH*OW);
-                        const uint pq_idx_s = npq_g % (OH*OW);
-                        const uint OH_idx_s = pq_idx_s / OW;
-                        const uint OW_idx_s = pq_idx_s % OW;
-                        const uint dst_idx_s = OW_idx_s + OH_idx_s*nb1 + k_g*nb2 + N_idx_s*nb3;
-                        dst_data[dst_idx_s] = ((float*)&res)[v];
-                    }
-                }
-            }
-        }
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/div.cl
+++ b/ggml/src/ggml-opencl/kernels/div.cl
@ -1,138 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-//------------------------------------------------------------------------------
-// div
-//------------------------------------------------------------------------------
-kernel void kernel_div(
-        global char * src0,
-        ulong offset0,
-        global char * src1,
-        ulong offset1,
-        global char * dst,
-        ulong offsetd,
-        ulong nb00,
-        ulong nb01,
-        ulong nb02,
-        ulong nb03,
-        int ne10,
-        int ne11,
-        int ne12,
-        int ne13,
-        ulong nb10,
-        ulong nb11,
-        ulong nb12,
-        ulong nb13,
-        int ne0,
-        ulong nb0,
-        ulong nb1,
-        ulong nb2,
-        ulong nb3
-) {
-    src0 = src0 + offset0;
-    src1 = src1 + offset1;
-    dst  = dst + offsetd;
-
-    int i03 = get_group_id(2);
-    int i02 = get_group_id(1);
-    int i01 = get_group_id(0);
-
-    int i13 = i03 % ne13;
-    int i12 = i02 % ne12;
-    int i11 = i01 % ne11;
-
-    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
-    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
-    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
-
-    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
-        const int i10 = i0 % ne10;
-        *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) / *((global float *)(src1_ptr + i10*nb10));
-    }
-}
-
-// assumption: src1 is a row
-// broadcast src1 into src0
-kernel void kernel_div_row(
-        global float4 * src0,
-        ulong offset0,
-        global float4 * src1,
-        ulong offset1,
-        global float4 * dst,
-        ulong offsetd,
-        int ne
-) {
-    src0 = (global float4*)((global char*)src0 + offset0);
-    src1 = (global float4*)((global char*)src1 + offset1);
-    dst = (global float4*)((global char*)dst + offsetd);
-
-    // This performs better than using %.
-    uint gid = get_global_id(0);
-    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
-    dst[gid] = src0[gid] / src1[idx1];
-}
-
-kernel void kernel_div_f16(
-        global char * src0,
-        ulong offset0,
-        global char * src1,
-        ulong offset1,
-        global char * dst,
-        ulong offsetd,
-        ulong nb00,
-        ulong nb01,
-        ulong nb02,
-        ulong nb03,
-        int ne10,
-        int ne11,
-        int ne12,
-        int ne13,
-        ulong nb10,
-        ulong nb11,
-        ulong nb12,
-        ulong nb13,
-        int ne0,
-        ulong nb0,
-        ulong nb1,
-        ulong nb2,
-        ulong nb3
-) {
-    src0 = src0 + offset0;
-    src1 = src1 + offset1;
-    dst  = dst + offsetd;
-
-    int i03 = get_group_id(2);
-    int i02 = get_group_id(1);
-    int i01 = get_group_id(0);
-
-    int i13 = i03 % ne13;
-    int i12 = i02 % ne12;
-    int i11 = i01 % ne11;
-
-    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
-    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
-    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
-
-    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
-        const int i10 = i0 % ne10;
-        *((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) / *((global half *)(src1_ptr + i10*nb10));
-    }
-}
-
-kernel void kernel_div_row_f16(
-        global half4 * src0,
-        ulong offset0,
-        global half4 * src1,
-        ulong offset1,
-        global half4 * dst,
-        ulong offsetd,
-        int ne
-) {
-    src0 = (global half4*)((global char*)src0 + offset0);
-    src1 = (global half4*)((global char*)src1 + offset1);
-    dst = (global half4*)((global char*)dst + offsetd);
-
-    // This performs better than using %.
-    uint gid = get_global_id(0);
-    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
-    dst[gid] = src0[gid] / src1[idx1];
-}
--- a/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl
+++ b/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl
@ -1,343 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#define ACC_TYPE float
-#define ACC_TYPE4 float4
-#define DATA_TYPE half
-#define DATA_TYPE4 half4
-#define CONVERT_ACC4(x) convert_float4(x)
-#define CONVERT_DATA4(x) convert_half4(x)
-
-#define DK_VEC (DK/4)
-#define DV_VEC (DV/4)
-#define WG_SIZE (BLOCK_M)
-#define Q1_WG_SIZE 64
-
-inline float get_alibi_slope(
-    const float max_bias, const uint h, const uint n_head_log2, const float m0, const float m1
-) {
-    if (max_bias <= 0.0f) {
-        return 1.0f;
-    }
-    const float base = h < n_head_log2 ? m0 : m1;
-    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
-
-    return pow(base, exph);
-}
-__kernel void flash_attn_f16(
-    const global void * q_void, ulong q_offset,
-    const global void * k_void, ulong k_offset,
-    const global void * v_void, ulong v_offset,
-    global void * o_void, ulong o_offset,
-    const float scale,
-    const int n_q,
-    const int n_kv,
-    const int is_causal,
-    const int n_head,
-    const ulong q_nb1, const ulong q_nb2, const ulong q_nb3,
-    const ulong k_nb1, const ulong k_nb2, const ulong k_nb3,
-    const ulong v_nb1, const ulong v_nb2, const ulong v_nb3,
-    const ulong o_nb1, const ulong o_nb2, const ulong o_nb3,
-    const float max_bias,
-    const float m0,
-    const float m1,
-    const int n_head_log2,
-    const float logit_softcap,
-    const int n_head_kv,
-    const global void* mask_void,
-    const ulong mask_offset,
-    const ulong mask_nb1,
-    const ulong mask_nb2,
-    const ulong mask_nb3,
-    const int mask_ne2,
-    const int mask_ne3
-) {
-    const int tid = get_local_id(0);
-    const int block_q_idx = get_group_id(0);
-    const int head_batch_idx = get_global_id(1);
-
-    const int my_query_row = block_q_idx * BLOCK_M + tid;
-
-    const int batch_idx = head_batch_idx / n_head;
-    const int head_idx = head_batch_idx % n_head;
-
-    const int gqa_ratio = n_head / n_head_kv;
-    const int head_kv_idx = head_idx / gqa_ratio;
-
-    const global char* q_base = (const global char*)q_void + q_offset;
-    const global char* k_base = (const global char*)k_void + k_offset;
-    const global char* v_base = (const global char*)v_void + v_offset;
-    global char* o_base = (global char*)o_void + o_offset;
-
-    const global char* mask_base = NULL;
-    if (mask_void != NULL) {
-        const int mask_head_idx = head_idx % mask_ne2;
-        const int mask_batch_idx = batch_idx % mask_ne3;
-        mask_base = (const global char*)mask_void + mask_offset + mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2;
-    }
-
-    ACC_TYPE4 q_priv[DK_VEC];
-    if (my_query_row < n_q) {
-        const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2 + my_query_row * q_nb1;
-        const global DATA_TYPE4* q_ptr = (const global DATA_TYPE4*)(q_base + q_row_offset);
-        #pragma unroll
-        for (int i = 0; i < DK_VEC; ++i) {
-            q_priv[i] = CONVERT_ACC4(q_ptr[i]);
-        }
-    }
-
-    ACC_TYPE4 o_acc[DV_VEC];
-    #pragma unroll
-    for (int i = 0; i < DV_VEC; ++i) {
-        o_acc[i] = (ACC_TYPE4)(0.0f);
-    }
-    ACC_TYPE m_i = -INFINITY;
-    ACC_TYPE l_i = 0.0f;
-
-    float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
-
-    __local DATA_TYPE4 l_k[BLOCK_N][DK_VEC];
-    __local DATA_TYPE4 l_v[BLOCK_N][DV_VEC];
-
-    for (int k_start = 0; k_start < n_kv; k_start += BLOCK_N) {
-        for (int i = tid; i < BLOCK_N * DK_VEC; i += WG_SIZE) {
-            const int row = i / DK_VEC;
-            const int col = i % DK_VEC;
-            const int k_row_idx = k_start + row;
-            if (k_row_idx < n_kv) {
-                const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_row_idx * k_nb1;
-                l_k[row][col] = ((__global DATA_TYPE4*)(k_base + k_row_offset))[col];
-            }
-        }
-        for (int i = tid; i < BLOCK_N * DV_VEC; i += WG_SIZE) {
-            const int row = i / DV_VEC;
-            const int col = i % DV_VEC;
-            const int v_row_idx = k_start + row;
-            if (v_row_idx < n_kv) {
-                const ulong v_row_offset = batch_idx * v_nb3 + head_kv_idx * v_nb2 + v_row_idx * v_nb1;
-                l_v[row][col] = ((__global DATA_TYPE4*)(v_base + v_row_offset))[col];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if (my_query_row >= n_q) {
-            continue;
-        }
-
-        for (int j = 0; j < BLOCK_N; j += 2) {
-            const int k_row0 = k_start + j;
-            const int k_row1 = k_start + j + 1;
-
-            ACC_TYPE4 dot_acc0 = (ACC_TYPE4)(0.0f);
-            ACC_TYPE4 dot_acc1 = (ACC_TYPE4)(0.0f);
-            #pragma unroll
-            for (int k = 0; k < DK_VEC; k++) {
-                dot_acc0 = mad(q_priv[k], CONVERT_ACC4(l_k[j][k]), dot_acc0);
-                dot_acc1 = mad(q_priv[k], CONVERT_ACC4(l_k[j+1][k]), dot_acc1);
-            }
-            ACC_TYPE score0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale;
-            ACC_TYPE score1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale;
-
-            if (is_causal) {
-                if (k_row0 > (n_kv - n_q + my_query_row)) score0 = -INFINITY;
-                if (k_row1 > (n_kv - n_q + my_query_row)) score1 = -INFINITY;
-            }
-
-            if (k_row0 >= n_kv) score0 = -INFINITY;
-            if (k_row1 >= n_kv) score1 = -INFINITY;
-
-            if (mask_base != NULL) {
-                const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
-                if (k_row0 < n_kv) score0 += slope * (ACC_TYPE)mask_ptr[k_row0];
-                if (k_row1 < n_kv) score1 += slope * (ACC_TYPE)mask_ptr[k_row1];
-            }
-
-            if (logit_softcap > 0.0f) {
-                score0 = logit_softcap * tanh(score0 / logit_softcap);
-                score1 = logit_softcap * tanh(score1 / logit_softcap);
-            }
-
-            const ACC_TYPE m_new = max(m_i, max(score0, score1));
-            const ACC_TYPE p0 = exp(score0 - m_new);
-            const ACC_TYPE p1 = exp(score1 - m_new);
-            const ACC_TYPE scale_prev = exp(m_i - m_new);
-
-            #pragma unroll
-            for (int i = 0; i < DV_VEC; ++i) {
-                o_acc[i] = o_acc[i] * scale_prev + p0 * CONVERT_ACC4(l_v[j][i]) + p1 * CONVERT_ACC4(l_v[j+1][i]);
-            }
-            l_i = l_i * scale_prev + p0 + p1;
-            m_i = m_new;
-        }
-    }
-
-    if (my_query_row < n_q) {
-        const ulong o_row_offset = batch_idx * o_nb3 + my_query_row * o_nb2 + head_idx * o_nb1;
-        global DATA_TYPE4 *o_row = (global DATA_TYPE4 *)(o_base + o_row_offset);
-        if (l_i > 0.0f) {
-            const ACC_TYPE l_inv = 1.0f / l_i;
-            #pragma unroll
-            for (int i = 0; i < DV_VEC; ++i) {
-                o_row[i] = CONVERT_DATA4(o_acc[i] * l_inv);
-            }
-        } else {
-            #pragma unroll
-            for (int i = 0; i < DV_VEC; ++i) {
-                o_row[i] = (DATA_TYPE4)(0.0f);
-            }
-        }
-    }
-}
-
-__kernel void flash_attn_f16_q1(
-    const global void * q_void, ulong q_offset,
-    const global void * k_void, ulong k_offset,
-    const global void * v_void, ulong v_offset,
-    global void * o_void, ulong o_offset,
-    const float scale,
-    const int n_q,
-    const int n_kv,
-    const int is_causal,
-    const int n_head,
-    const ulong q_nb1, const ulong q_nb2, const ulong q_nb3,
-    const ulong k_nb1, const ulong k_nb2, const ulong k_nb3,
-    const ulong v_nb1, const ulong v_nb2, const ulong v_nb3,
-    const ulong o_nb1, const ulong o_nb2, const ulong o_nb3,
-    const float max_bias,
-    const float m0,
-    const float m1,
-    const int n_head_log2,
-    const float logit_softcap,
-    const int n_head_kv,
-    const global void* mask_void,
-    const ulong mask_offset,
-    const ulong mask_nb1,
-    const ulong mask_nb2,
-    const ulong mask_nb3,
-    const int mask_ne2,
-    const int mask_ne3
-) {
-    const int tid = get_local_id(0);
-    const int head_batch_idx = get_global_id(1);
-
-    const int batch_idx = head_batch_idx / n_head;
-    const int head_idx = head_batch_idx % n_head;
-
-    const int gqa_ratio = n_head / n_head_kv;
-    const int head_kv_idx = head_idx / gqa_ratio;
-
-    const global char* q_base = (const global char*)q_void + q_offset;
-    const global char* k_base = (const global char*)k_void + k_offset;
-    const global char* v_base = (const global char*)v_void + v_offset;
-    global char* o_base = (global char*)o_void + o_offset;
-
-    const global char* mask_base = NULL;
-    if (mask_void != NULL) {
-        const int mask_head_idx = head_idx % mask_ne2;
-        const int mask_batch_idx = batch_idx % mask_ne3;
-        mask_base = (const global char*)mask_void + mask_offset + mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2;
-    }
-
-    ACC_TYPE4 q_priv[DK_VEC];
-    const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2;
-    const global DATA_TYPE4* q_ptr = (const global DATA_TYPE4*)(q_base + q_row_offset);
-    #pragma unroll
-    for (int i = 0; i < DK_VEC; ++i) {
-        q_priv[i] = CONVERT_ACC4(q_ptr[i]);
-    }
-
-    float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
-
-    ACC_TYPE m_i = -INFINITY;
-    for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) {
-        const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
-        const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset);
-        ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
-        #pragma unroll
-        for (int k = 0; k < DK_VEC; k++) {
-            dot_acc = mad(q_priv[k], CONVERT_ACC4(k_ptr[k]), dot_acc);
-        }
-        ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale;
-        if (mask_base != NULL) {
-            const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base);
-            score += slope * (ACC_TYPE)mask_ptr[k_idx];
-        }
-        if (logit_softcap > 0.0f) {
-            score = logit_softcap * tanh(score / logit_softcap);
-        }
-        m_i = max(m_i, score);
-    }
-
-    __local ACC_TYPE local_m[Q1_WG_SIZE];
-    local_m[tid] = m_i;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    #pragma unroll
-    for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
-        if (tid < s) local_m[tid] = max(local_m[tid], local_m[tid + s]);
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    const ACC_TYPE m_final = local_m[0];
-
-    ACC_TYPE4 o_acc[DV_VEC];
-    #pragma unroll
-    for (int i = 0; i < DV_VEC; ++i) o_acc[i] = (ACC_TYPE4)(0.0f);
-    ACC_TYPE l_i = 0.0f;
-
-    for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) {
-        const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
-        const ulong v_row_offset = batch_idx * v_nb3 + head_kv_idx * v_nb2 + k_idx * v_nb1;
-        const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset);
-        const global DATA_TYPE4* v_ptr = (const global DATA_TYPE4*)(v_base + v_row_offset);
-        ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
-        #pragma unroll
-        for (int k = 0; k < DK_VEC; k++) {
-            dot_acc = mad(q_priv[k], CONVERT_ACC4(k_ptr[k]), dot_acc);
-        }
-        ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale;
-        if (mask_base != NULL) {
-            const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base);
-            score += slope * (ACC_TYPE)mask_ptr[k_idx];
-        }
-        if (logit_softcap > 0.0f) {
-            score = logit_softcap * tanh(score / logit_softcap);
-        }
-        const ACC_TYPE p = exp(score - m_final);
-        l_i += p;
-        #pragma unroll
-        for (int i = 0; i < DV_VEC; i++) {
-            o_acc[i] = mad(p, CONVERT_ACC4(v_ptr[i]), o_acc[i]);
-        }
-    }
-
-    __local ACC_TYPE local_l[Q1_WG_SIZE];
-    __local ACC_TYPE4 local_o_comp[Q1_WG_SIZE];
-    local_l[tid] = l_i;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    #pragma unroll
-    for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
-        if (tid < s) local_l[tid] += local_l[tid + s];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    const ulong o_row_offset = batch_idx * o_nb3 + head_idx * o_nb1;
-    global DATA_TYPE4 *o_row = (global DATA_TYPE4 *)(o_base + o_row_offset);
-    const ACC_TYPE l_final = local_l[0];
-
-    if (l_final > 0.0f) {
-        const ACC_TYPE l_inv = 1.0f / l_final;
-        for (int i = 0; i < DV_VEC; i++) {
-            local_o_comp[tid] = o_acc[i];
-            barrier(CLK_LOCAL_MEM_FENCE);
-            #pragma unroll
-            for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
-                if (tid < s) local_o_comp[tid] += local_o_comp[tid + s];
-                barrier(CLK_LOCAL_MEM_FENCE);
-            }
-            if (tid == 0) {
-                o_row[i] = CONVERT_DATA4(local_o_comp[0] * l_inv);
-            }
-        }
-    } else if (tid == 0) {
-        #pragma unroll
-        for (int i = 0; i < DV_VEC; ++i) o_row[i] = (DATA_TYPE4)(0.0f);
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl
+++ b/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl
@ -1,343 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#define ACC_TYPE float
-#define ACC_TYPE4 float4
-#define DATA_TYPE float
-#define DATA_TYPE4 float4
-#define CONVERT_ACC4(x) (x)
-#define CONVERT_DATA4(x) (x)
-
-#define DK_VEC (DK/4)
-#define DV_VEC (DV/4)
-#define WG_SIZE (BLOCK_M)
-#define Q1_WG_SIZE 64
-
-inline float get_alibi_slope(
-    const float max_bias, const uint h, const uint n_head_log2, const float m0, const float m1
-) {
-    if (max_bias <= 0.0f) {
-        return 1.0f;
-    }
-    const float base = h < n_head_log2 ? m0 : m1;
-    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
-
-    return pow(base, exph);
-}
-__kernel void flash_attn_f32(
-    const global void * q_void, ulong q_offset,
-    const global void * k_void, ulong k_offset,
-    const global void * v_void, ulong v_offset,
-    global void * o_void, ulong o_offset,
-    const float scale,
-    const int n_q,
-    const int n_kv,
-    const int is_causal,
-    const int n_head,
-    const ulong q_nb1, const ulong q_nb2, const ulong q_nb3,
-    const ulong k_nb1, const ulong k_nb2, const ulong k_nb3,
-    const ulong v_nb1, const ulong v_nb2, const ulong v_nb3,
-    const ulong o_nb1, const ulong o_nb2, const ulong o_nb3,
-    const float max_bias,
-    const float m0,
-    const float m1,
-    const int n_head_log2,
-    const float logit_softcap,
-    const int n_head_kv,
-    const global void* mask_void,
-    const ulong mask_offset,
-    const ulong mask_nb1,
-    const ulong mask_nb2,
-    const ulong mask_nb3,
-    const int mask_ne2,
-    const int mask_ne3
-) {
-    const int tid = get_local_id(0);
-    const int block_q_idx = get_group_id(0);
-    const int head_batch_idx = get_global_id(1);
-
-    const int my_query_row = block_q_idx * BLOCK_M + tid;
-
-    const int batch_idx = head_batch_idx / n_head;
-    const int head_idx = head_batch_idx % n_head;
-
-    const int gqa_ratio = n_head / n_head_kv;
-    const int head_kv_idx = head_idx / gqa_ratio;
-
-    const global char* q_base = (const global char*)q_void + q_offset;
-    const global char* k_base = (const global char*)k_void + k_offset;
-    const global char* v_base = (const global char*)v_void + v_offset;
-    global char* o_base = (global char*)o_void + o_offset;
-
-    const global char* mask_base = NULL;
-    if (mask_void != NULL) {
-        const int mask_head_idx = head_idx % mask_ne2;
-        const int mask_batch_idx = batch_idx % mask_ne3;
-        mask_base = (const global char*)mask_void + mask_offset + mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2;
-    }
-
-    ACC_TYPE4 q_priv[DK_VEC];
-    if (my_query_row < n_q) {
-        const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2 + my_query_row * q_nb1;
-        const global DATA_TYPE4* q_ptr = (const global DATA_TYPE4*)(q_base + q_row_offset);
-        #pragma unroll
-        for (int i = 0; i < DK_VEC; ++i) {
-            q_priv[i] = CONVERT_ACC4(q_ptr[i]);
-        }
-    }
-
-    ACC_TYPE4 o_acc[DV_VEC];
-    #pragma unroll
-    for (int i = 0; i < DV_VEC; ++i) {
-        o_acc[i] = (ACC_TYPE4)(0.0f);
-    }
-    ACC_TYPE m_i = -INFINITY;
-    ACC_TYPE l_i = 0.0f;
-
-    float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
-
-    __local DATA_TYPE4 l_k[BLOCK_N][DK_VEC];
-    __local DATA_TYPE4 l_v[BLOCK_N][DV_VEC];
-
-    for (int k_start = 0; k_start < n_kv; k_start += BLOCK_N) {
-        for (int i = tid; i < BLOCK_N * DK_VEC; i += WG_SIZE) {
-            const int row = i / DK_VEC;
-            const int col = i % DK_VEC;
-            const int k_row_idx = k_start + row;
-            if (k_row_idx < n_kv) {
-                const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_row_idx * k_nb1;
-                l_k[row][col] = ((__global DATA_TYPE4*)(k_base + k_row_offset))[col];
-            }
-        }
-        for (int i = tid; i < BLOCK_N * DV_VEC; i += WG_SIZE) {
-            const int row = i / DV_VEC;
-            const int col = i % DV_VEC;
-            const int v_row_idx = k_start + row;
-            if (v_row_idx < n_kv) {
-                const ulong v_row_offset = batch_idx * v_nb3 + head_kv_idx * v_nb2 + v_row_idx * v_nb1;
-                l_v[row][col] = ((__global DATA_TYPE4*)(v_base + v_row_offset))[col];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if (my_query_row >= n_q) {
-            continue;
-        }
-
-        for (int j = 0; j < BLOCK_N; j += 2) {
-            const int k_row0 = k_start + j;
-            const int k_row1 = k_start + j + 1;
-
-            ACC_TYPE4 dot_acc0 = (ACC_TYPE4)(0.0f);
-            ACC_TYPE4 dot_acc1 = (ACC_TYPE4)(0.0f);
-            #pragma unroll
-            for (int k = 0; k < DK_VEC; k++) {
-                dot_acc0 = mad(q_priv[k], CONVERT_ACC4(l_k[j][k]), dot_acc0);
-                dot_acc1 = mad(q_priv[k], CONVERT_ACC4(l_k[j+1][k]), dot_acc1);
-            }
-            ACC_TYPE score0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale;
-            ACC_TYPE score1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale;
-
-            if (is_causal) {
-                if (k_row0 > (n_kv - n_q + my_query_row)) score0 = -INFINITY;
-                if (k_row1 > (n_kv - n_q + my_query_row)) score1 = -INFINITY;
-            }
-
-            if (k_row0 >= n_kv) score0 = -INFINITY;
-            if (k_row1 >= n_kv) score1 = -INFINITY;
-
-            if (mask_base != NULL) {
-                const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
-                if (k_row0 < n_kv) score0 += slope * (ACC_TYPE)mask_ptr[k_row0];
-                if (k_row1 < n_kv) score1 += slope * (ACC_TYPE)mask_ptr[k_row1];
-            }
-
-            if (logit_softcap > 0.0f) {
-                score0 = logit_softcap * tanh(score0 / logit_softcap);
-                score1 = logit_softcap * tanh(score1 / logit_softcap);
-            }
-
-            const ACC_TYPE m_new = max(m_i, max(score0, score1));
-            const ACC_TYPE p0 = exp(score0 - m_new);
-            const ACC_TYPE p1 = exp(score1 - m_new);
-            const ACC_TYPE scale_prev = exp(m_i - m_new);
-
-            #pragma unroll
-            for (int i = 0; i < DV_VEC; ++i) {
-                o_acc[i] = o_acc[i] * scale_prev + p0 * CONVERT_ACC4(l_v[j][i]) + p1 * CONVERT_ACC4(l_v[j+1][i]);
-            }
-            l_i = l_i * scale_prev + p0 + p1;
-            m_i = m_new;
-        }
-    }
-
-    if (my_query_row < n_q) {
-        const ulong o_row_offset = batch_idx * o_nb3 + my_query_row * o_nb2 + head_idx * o_nb1;
-        global DATA_TYPE4 *o_row = (global DATA_TYPE4 *)(o_base + o_row_offset);
-        if (l_i > 0.0f) {
-            const ACC_TYPE l_inv = 1.0f / l_i;
-            #pragma unroll
-            for (int i = 0; i < DV_VEC; ++i) {
-                o_row[i] = CONVERT_DATA4(o_acc[i] * l_inv);
-            }
-        } else {
-            #pragma unroll
-            for (int i = 0; i < DV_VEC; ++i) {
-                o_row[i] = (DATA_TYPE4)(0.0f);
-            }
-        }
-    }
-}
-
-__kernel void flash_attn_f32_q1(
-    const global void * q_void, ulong q_offset,
-    const global void * k_void, ulong k_offset,
-    const global void * v_void, ulong v_offset,
-    global void * o_void, ulong o_offset,
-    const float scale,
-    const int n_q,
-    const int n_kv,
-    const int is_causal,
-    const int n_head,
-    const ulong q_nb1, const ulong q_nb2, const ulong q_nb3,
-    const ulong k_nb1, const ulong k_nb2, const ulong k_nb3,
-    const ulong v_nb1, const ulong v_nb2, const ulong v_nb3,
-    const ulong o_nb1, const ulong o_nb2, const ulong o_nb3,
-    const float max_bias,
-    const float m0,
-    const float m1,
-    const int n_head_log2,
-    const float logit_softcap,
-    const int n_head_kv,
-    const global void* mask_void,
-    const ulong mask_offset,
-    const ulong mask_nb1,
-    const ulong mask_nb2,
-    const ulong mask_nb3,
-    const int mask_ne2,
-    const int mask_ne3
-) {
-    const int tid = get_local_id(0);
-    const int head_batch_idx = get_global_id(1);
-
-    const int batch_idx = head_batch_idx / n_head;
-    const int head_idx = head_batch_idx % n_head;
-
-    const int gqa_ratio = n_head / n_head_kv;
-    const int head_kv_idx = head_idx / gqa_ratio;
-
-    const global char* q_base = (const global char*)q_void + q_offset;
-    const global char* k_base = (const global char*)k_void + k_offset;
-    const global char* v_base = (const global char*)v_void + v_offset;
-    global char* o_base = (global char*)o_void + o_offset;
-
-    const global char* mask_base = NULL;
-    if (mask_void != NULL) {
-        const int mask_head_idx = head_idx % mask_ne2;
-        const int mask_batch_idx = batch_idx % mask_ne3;
-        mask_base = (const global char*)mask_void + mask_offset + mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2;
-    }
-
-    ACC_TYPE4 q_priv[DK_VEC];
-    const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2;
-    const global DATA_TYPE4* q_ptr = (const global DATA_TYPE4*)(q_base + q_row_offset);
-    #pragma unroll
-    for (int i = 0; i < DK_VEC; ++i) {
-        q_priv[i] = CONVERT_ACC4(q_ptr[i]);
-    }
-
-    float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
-
-    ACC_TYPE m_i = -INFINITY;
-    for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) {
-        const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
-        const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset);
-        ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
-        #pragma unroll
-        for (int k = 0; k < DK_VEC; k++) {
-            dot_acc = mad(q_priv[k], CONVERT_ACC4(k_ptr[k]), dot_acc);
-        }
-        ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale;
-        if (mask_base != NULL) {
-            const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base);
-            score += slope * (ACC_TYPE)mask_ptr[k_idx];
-        }
-        if (logit_softcap > 0.0f) {
-            score = logit_softcap * tanh(score / logit_softcap);
-        }
-        m_i = max(m_i, score);
-    }
-
-    __local ACC_TYPE local_m[Q1_WG_SIZE];
-    local_m[tid] = m_i;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    #pragma unroll
-    for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
-        if (tid < s) local_m[tid] = max(local_m[tid], local_m[tid + s]);
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    const ACC_TYPE m_final = local_m[0];
-
-    ACC_TYPE4 o_acc[DV_VEC];
-    #pragma unroll
-    for (int i = 0; i < DV_VEC; ++i) o_acc[i] = (ACC_TYPE4)(0.0f);
-    ACC_TYPE l_i = 0.0f;
-
-    for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) {
-        const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
-        const ulong v_row_offset = batch_idx * v_nb3 + head_kv_idx * v_nb2 + k_idx * v_nb1;
-        const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset);
-        const global DATA_TYPE4* v_ptr = (const global DATA_TYPE4*)(v_base + v_row_offset);
-        ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
-        #pragma unroll
-        for (int k = 0; k < DK_VEC; k++) {
-            dot_acc = mad(q_priv[k], CONVERT_ACC4(k_ptr[k]), dot_acc);
-        }
-        ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale;
-        if (mask_base != NULL) {
-            const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base);
-            score += slope * (ACC_TYPE)mask_ptr[k_idx];
-        }
-        if (logit_softcap > 0.0f) {
-            score = logit_softcap * tanh(score / logit_softcap);
-        }
-        const ACC_TYPE p = exp(score - m_final);
-        l_i += p;
-        #pragma unroll
-        for (int i = 0; i < DV_VEC; i++) {
-            o_acc[i] = mad(p, CONVERT_ACC4(v_ptr[i]), o_acc[i]);
-        }
-    }
-
-    __local ACC_TYPE local_l[Q1_WG_SIZE];
-    __local ACC_TYPE4 local_o_comp[Q1_WG_SIZE];
-    local_l[tid] = l_i;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    #pragma unroll
-    for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
-        if (tid < s) local_l[tid] += local_l[tid + s];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    const ulong o_row_offset = batch_idx * o_nb3 + head_idx * o_nb1;
-    global DATA_TYPE4 *o_row = (global DATA_TYPE4 *)(o_base + o_row_offset);
-    const ACC_TYPE l_final = local_l[0];
-
-    if (l_final > 0.0f) {
-        const ACC_TYPE l_inv = 1.0f / l_final;
-        for (int i = 0; i < DV_VEC; i++) {
-            local_o_comp[tid] = o_acc[i];
-            barrier(CLK_LOCAL_MEM_FENCE);
-            #pragma unroll
-            for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
-                if (tid < s) local_o_comp[tid] += local_o_comp[tid + s];
-                barrier(CLK_LOCAL_MEM_FENCE);
-            }
-            if (tid == 0) {
-                o_row[i] = CONVERT_DATA4(local_o_comp[0] * l_inv);
-            }
-        }
-    } else if (tid == 0) {
-        #pragma unroll
-        for (int i = 0; i < DV_VEC; ++i) o_row[i] = (DATA_TYPE4)(0.0f);
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl
+++ b/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl
@ -1,346 +0,0 @@
-// Enable half-precision types; K, V and the mask are read as half in this file.
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// Type configuration for this variant: Q and O are float4, K/V are half4,
-// the mask is half, and accumulation is done in float.
-#define ACC_TYPE float
-#define ACC_TYPE4 float4
-#define Q_DATA_TYPE4 float4
-#define KV_DATA_TYPE4 half4
-#define O_DATA_TYPE4 float4
-#define MASK_DATA_TYPE half
-// Conversions into the accumulator type (identity for Q, half4 -> float4 for K/V)
-// and from the accumulator type to the output type (identity).
-#define CONVERT_Q_ACC4(x) (x)
-#define CONVERT_KV_ACC4(x) convert_float4(x)
-#define CONVERT_O_DATA4(x) (x)
-
-// DK, DV, BLOCK_M and BLOCK_N are not defined in this file.
-// NOTE(review): presumably supplied as compile-time build options - confirm against the host code.
-// K/Q and V row lengths expressed in 4-element vectors.
-#define DK_VEC (DK/4)
-#define DV_VEC (DV/4)
-// Stride used by the tiled kernel's cooperative K/V loads (one work-item per query row of a BLOCK_M tile).
-#define WG_SIZE (BLOCK_M)
-// Size of the local reduction arrays in the single-query (_q1) kernel.
-#define Q1_WG_SIZE 64
-
-// Returns the ALiBi slope for head `h`.
-// When max_bias <= 0 the slope is 1. Otherwise heads below n_head_log2 use
-// m0^(h + 1) and the remaining heads use m1^(2*(h - n_head_log2) + 1).
-inline float get_alibi_slope(
-    const float max_bias, const uint h, const uint n_head_log2, const float m0, const float m1
-) {
-    if (max_bias <= 0.0f) {
-        return 1.0f;
-    }
-
-    float base;
-    int   exph;
-    if (h < n_head_log2) {
-        base = m0;
-        exph = h + 1;
-    } else {
-        base = m1;
-        exph = 2*(h - n_head_log2) + 1;
-    }
-
-    return pow(base, exph);
-}
-// Tiled flash-attention kernel: f32 Q, f16 K/V, f16 mask, f32 output.
-//
-// Each work-item owns one query row (group id 0 * BLOCK_M + local id 0) of the
-// (batch, head) pair selected by global id 1. The work-group streams over K/V in
-// tiles of BLOCK_N rows staged in local memory and keeps a running (online)
-// softmax: m_i is the running maximum, l_i the running denominator and o_acc the
-// un-normalised output row.
-//
-// Parameters:
-//   *_void, *_offset   base pointer plus byte offset of Q, K, V, O and the mask
-//   scale              multiplier applied to every Q.K dot product
-//   n_q, n_kv          number of query rows / key-value rows
-//   is_causal          non-zero: keys with index > n_kv - n_q + query_row are excluded
-//   n_head, n_head_kv  head counts; KV head = head / (n_head / n_head_kv)
-//   *_nb1..*_nb3       byte strides (they are added to char pointers)
-//   max_bias, m0, m1, n_head_log2   inputs of get_alibi_slope()
-//   logit_softcap      > 0 enables score = cap * tanh(score / cap)
-//   mask_ne2, mask_ne3 mask extents used to broadcast it over heads / batches (modulo)
-//
-// Scores are soft-capped BEFORE the mask and the -INFINITY exclusions are
-// applied, so excluded keys stay at -INFINITY instead of becoming -logit_softcap.
-// Assumes BLOCK_N is even (keys are processed two at a time).
-__kernel void flash_attn_f32_f16(
-    const global void * q_void, ulong q_offset,
-    const global void * k_void, ulong k_offset,
-    const global void * v_void, ulong v_offset,
-    global void * o_void, ulong o_offset,
-    const float scale,
-    const int n_q,
-    const int n_kv,
-    const int is_causal,
-    const int n_head,
-    const ulong q_nb1, const ulong q_nb2, const ulong q_nb3,
-    const ulong k_nb1, const ulong k_nb2, const ulong k_nb3,
-    const ulong v_nb1, const ulong v_nb2, const ulong v_nb3,
-    const ulong o_nb1, const ulong o_nb2, const ulong o_nb3,
-    const float max_bias,
-    const float m0,
-    const float m1,
-    const int n_head_log2,
-    const float logit_softcap,
-    const int n_head_kv,
-    const global void* mask_void,
-    const ulong mask_offset,
-    const ulong mask_nb1,
-    const ulong mask_nb2,
-    const ulong mask_nb3,
-    const int mask_ne2,
-    const int mask_ne3
-) {
-    const int tid = get_local_id(0);
-    const int block_q_idx = get_group_id(0);
-    const int head_batch_idx = get_global_id(1);
-
-    const int my_query_row = block_q_idx * BLOCK_M + tid;
-    // Work-items past the last query row still help load tiles and must reach
-    // every barrier; they only skip the per-row computation.
-    const int row_active = my_query_row < n_q;
-
-    const int batch_idx = head_batch_idx / n_head;
-    const int head_idx = head_batch_idx % n_head;
-
-    const int gqa_ratio = n_head / n_head_kv;
-    const int head_kv_idx = head_idx / gqa_ratio;
-
-    const global char* q_base = (const global char*)q_void + q_offset;
-    const global char* k_base = (const global char*)k_void + k_offset;
-    const global char* v_base = (const global char*)v_void + v_offset;
-    global char* o_base = (global char*)o_void + o_offset;
-
-    const global char* mask_base = NULL;
-    if (mask_void != NULL) {
-        const int mask_head_idx = head_idx % mask_ne2;
-        const int mask_batch_idx = batch_idx % mask_ne3;
-        mask_base = (const global char*)mask_void + mask_offset + mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2;
-    }
-
-    // Mask row of this work-item's query (loop invariant).
-    const global MASK_DATA_TYPE* mask_row = NULL;
-    if (mask_base != NULL && row_active) {
-        mask_row = (const global MASK_DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
-    }
-
-    ACC_TYPE4 q_priv[DK_VEC];
-    if (row_active) {
-        const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2 + my_query_row * q_nb1;
-        const global Q_DATA_TYPE4* q_ptr = (const global Q_DATA_TYPE4*)(q_base + q_row_offset);
-        #pragma unroll
-        for (int i = 0; i < DK_VEC; ++i) {
-            q_priv[i] = CONVERT_Q_ACC4(q_ptr[i]);
-        }
-    }
-
-    ACC_TYPE4 o_acc[DV_VEC];
-    #pragma unroll
-    for (int i = 0; i < DV_VEC; ++i) {
-        o_acc[i] = (ACC_TYPE4)(0.0f);
-    }
-    ACC_TYPE m_i = -INFINITY;
-    ACC_TYPE l_i = 0.0f;
-
-    float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
-
-    // Last key index visible to this query row under causal masking.
-    const int causal_limit = n_kv - n_q + my_query_row;
-
-    __local KV_DATA_TYPE4 l_k[BLOCK_N][DK_VEC];
-    __local KV_DATA_TYPE4 l_v[BLOCK_N][DV_VEC];
-
-    for (int k_start = 0; k_start < n_kv; k_start += BLOCK_N) {
-        // Cooperative load of the K tile. Rows past n_kv are zero-filled so the
-        // tile never holds uninitialised local memory (0 * garbage could be NaN).
-        for (int i = tid; i < BLOCK_N * DK_VEC; i += WG_SIZE) {
-            const int row = i / DK_VEC;
-            const int col = i % DK_VEC;
-            const int k_row_idx = k_start + row;
-            KV_DATA_TYPE4 k_val = (KV_DATA_TYPE4)(0.0f);
-            if (k_row_idx < n_kv) {
-                const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_row_idx * k_nb1;
-                k_val = ((const __global KV_DATA_TYPE4*)(k_base + k_row_offset))[col];
-            }
-            l_k[row][col] = k_val;
-        }
-        // Cooperative load of the V tile, same zero-fill rule.
-        for (int i = tid; i < BLOCK_N * DV_VEC; i += WG_SIZE) {
-            const int row = i / DV_VEC;
-            const int col = i % DV_VEC;
-            const int v_row_idx = k_start + row;
-            KV_DATA_TYPE4 v_val = (KV_DATA_TYPE4)(0.0f);
-            if (v_row_idx < n_kv) {
-                const ulong v_row_offset = batch_idx * v_nb3 + head_kv_idx * v_nb2 + v_row_idx * v_nb1;
-                v_val = ((const __global KV_DATA_TYPE4*)(v_base + v_row_offset))[col];
-            }
-            l_v[row][col] = v_val;
-        }
-        // Tile fully written before anyone reads it.
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if (row_active) {
-            for (int j = 0; j < BLOCK_N; j += 2) {
-                const int k_row0 = k_start + j;
-                const int k_row1 = k_row0 + 1;
-
-                ACC_TYPE4 dot_acc0 = (ACC_TYPE4)(0.0f);
-                ACC_TYPE4 dot_acc1 = (ACC_TYPE4)(0.0f);
-                #pragma unroll
-                for (int k = 0; k < DK_VEC; k++) {
-                    dot_acc0 = mad(q_priv[k], CONVERT_KV_ACC4(l_k[j][k]), dot_acc0);
-                    dot_acc1 = mad(q_priv[k], CONVERT_KV_ACC4(l_k[j+1][k]), dot_acc1);
-                }
-                ACC_TYPE score0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale;
-                ACC_TYPE score1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale;
-
-                // Soft-cap the raw logits first; tanh() would otherwise map an
-                // excluded (-INFINITY) score to -logit_softcap and re-admit it.
-                if (logit_softcap > 0.0f) {
-                    score0 = logit_softcap * tanh(score0 / logit_softcap);
-                    score1 = logit_softcap * tanh(score1 / logit_softcap);
-                }
-
-                if (mask_row != NULL) {
-                    if (k_row0 < n_kv) score0 += slope * (ACC_TYPE)mask_row[k_row0];
-                    if (k_row1 < n_kv) score1 += slope * (ACC_TYPE)mask_row[k_row1];
-                }
-
-                if (is_causal) {
-                    if (k_row0 > causal_limit) score0 = -INFINITY;
-                    if (k_row1 > causal_limit) score1 = -INFINITY;
-                }
-
-                if (k_row0 >= n_kv) score0 = -INFINITY;
-                if (k_row1 >= n_kv) score1 = -INFINITY;
-
-                const ACC_TYPE m_new = max(m_i, max(score0, score1));
-                // Nothing visible so far: skip, exp(-inf - -inf) would be NaN.
-                if (m_new == -INFINITY) {
-                    continue;
-                }
-                const ACC_TYPE p0 = exp(score0 - m_new);
-                const ACC_TYPE p1 = exp(score1 - m_new);
-                const ACC_TYPE scale_prev = exp(m_i - m_new);
-
-                #pragma unroll
-                for (int i = 0; i < DV_VEC; ++i) {
-                    o_acc[i] = o_acc[i] * scale_prev + p0 * CONVERT_KV_ACC4(l_v[j][i]) + p1 * CONVERT_KV_ACC4(l_v[j+1][i]);
-                }
-                l_i = l_i * scale_prev + p0 + p1;
-                m_i = m_new;
-            }
-        }
-
-        // All reads of this tile must finish before the next iteration
-        // overwrites l_k / l_v.
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if (row_active) {
-        const ulong o_row_offset = batch_idx * o_nb3 + my_query_row * o_nb2 + head_idx * o_nb1;
-        global O_DATA_TYPE4 *o_row = (global O_DATA_TYPE4 *)(o_base + o_row_offset);
-        if (l_i > 0.0f) {
-            const ACC_TYPE l_inv = 1.0f / l_i;
-            #pragma unroll
-            for (int i = 0; i < DV_VEC; ++i) {
-                o_row[i] = CONVERT_O_DATA4(o_acc[i] * l_inv);
-            }
-        } else {
-            // No key contributed: emit a zero row.
-            #pragma unroll
-            for (int i = 0; i < DV_VEC; ++i) {
-                o_row[i] = (O_DATA_TYPE4)(0.0f);
-            }
-        }
-    }
-}
-
-// Single-query flash-attention kernel: f32 Q, f16 K/V, f16 mask, f32 output.
-//
-// One work-group handles the (batch, head) pair selected by global id 1; its
-// work-items stride over the n_kv keys in steps of Q1_WG_SIZE. The softmax is
-// computed in two passes: pass 1 finds the global score maximum (local-memory
-// max reduction), pass 2 accumulates exp(score - max) and the weighted V rows,
-// which are then sum-reduced through local memory and written by work-item 0.
-//
-// The parameter list matches flash_attn_f32_f16. n_q and is_causal are not read
-// here, and only the first query row / first mask row is addressed. The local
-// arrays are sized Q1_WG_SIZE, so the kernel expects that local size in dim 0.
-//
-// Scores are soft-capped BEFORE the mask is added, so keys masked with
-// -INFINITY stay excluded instead of turning into -logit_softcap.
-__kernel void flash_attn_f32_f16_q1(
-    const global void * q_void, ulong q_offset,
-    const global void * k_void, ulong k_offset,
-    const global void * v_void, ulong v_offset,
-    global void * o_void, ulong o_offset,
-    const float scale,
-    const int n_q,
-    const int n_kv,
-    const int is_causal,
-    const int n_head,
-    const ulong q_nb1, const ulong q_nb2, const ulong q_nb3,
-    const ulong k_nb1, const ulong k_nb2, const ulong k_nb3,
-    const ulong v_nb1, const ulong v_nb2, const ulong v_nb3,
-    const ulong o_nb1, const ulong o_nb2, const ulong o_nb3,
-    const float max_bias,
-    const float m0,
-    const float m1,
-    const int n_head_log2,
-    const float logit_softcap,
-    const int n_head_kv,
-    const global void* mask_void,
-    const ulong mask_offset,
-    const ulong mask_nb1,
-    const ulong mask_nb2,
-    const ulong mask_nb3,
-    const int mask_ne2,
-    const int mask_ne3
-) {
-    const int tid = get_local_id(0);
-    const int head_batch_idx = get_global_id(1);
-
-    const int batch_idx = head_batch_idx / n_head;
-    const int head_idx = head_batch_idx % n_head;
-
-    const int gqa_ratio = n_head / n_head_kv;
-    const int head_kv_idx = head_idx / gqa_ratio;
-
-    const global char* q_base = (const global char*)q_void + q_offset;
-    const global char* k_base = (const global char*)k_void + k_offset;
-    const global char* v_base = (const global char*)v_void + v_offset;
-    global char* o_base = (global char*)o_void + o_offset;
-
-    const global char* mask_base = NULL;
-    if (mask_void != NULL) {
-        const int mask_head_idx = head_idx % mask_ne2;
-        const int mask_batch_idx = batch_idx % mask_ne3;
-        mask_base = (const global char*)mask_void + mask_offset + mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2;
-    }
-    const global MASK_DATA_TYPE* mask_ptr = (const global MASK_DATA_TYPE*)(mask_base);
-
-    ACC_TYPE4 q_priv[DK_VEC];
-    const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2;
-    const global Q_DATA_TYPE4* q_ptr = (const global Q_DATA_TYPE4*)(q_base + q_row_offset);
-    #pragma unroll
-    for (int i = 0; i < DK_VEC; ++i) {
-        q_priv[i] = CONVERT_Q_ACC4(q_ptr[i]);
-    }
-
-    float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
-
-    // Pass 1: per-work-item maximum of the final scores.
-    ACC_TYPE m_i = -INFINITY;
-    for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) {
-        const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
-        const global KV_DATA_TYPE4* k_ptr = (const global KV_DATA_TYPE4*)(k_base + k_row_offset);
-        ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
-        #pragma unroll
-        for (int k = 0; k < DK_VEC; k++) {
-            dot_acc = mad(q_priv[k], CONVERT_KV_ACC4(k_ptr[k]), dot_acc);
-        }
-        ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale;
-        // Soft-cap first, then mask (see header comment).
-        if (logit_softcap > 0.0f) {
-            score = logit_softcap * tanh(score / logit_softcap);
-        }
-        if (mask_ptr != NULL) {
-            score += slope * (ACC_TYPE)mask_ptr[k_idx];
-        }
-        m_i = max(m_i, score);
-    }
-
-    // Work-group max reduction.
-    __local ACC_TYPE local_m[Q1_WG_SIZE];
-    local_m[tid] = m_i;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    #pragma unroll
-    for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
-        if (tid < s) local_m[tid] = max(local_m[tid], local_m[tid + s]);
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    const ACC_TYPE m_final = local_m[0];
-
-    ACC_TYPE4 o_acc[DV_VEC];
-    #pragma unroll
-    for (int i = 0; i < DV_VEC; ++i) o_acc[i] = (ACC_TYPE4)(0.0f);
-    ACC_TYPE l_i = 0.0f;
-
-    // Pass 2: recompute the scores (same order of operations as pass 1) and
-    // accumulate the softmax numerator / denominator.
-    for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) {
-        const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
-        const ulong v_row_offset = batch_idx * v_nb3 + head_kv_idx * v_nb2 + k_idx * v_nb1;
-        const global KV_DATA_TYPE4* k_ptr = (const global KV_DATA_TYPE4*)(k_base + k_row_offset);
-        const global KV_DATA_TYPE4* v_ptr = (const global KV_DATA_TYPE4*)(v_base + v_row_offset);
-        ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
-        #pragma unroll
-        for (int k = 0; k < DK_VEC; k++) {
-            dot_acc = mad(q_priv[k], CONVERT_KV_ACC4(k_ptr[k]), dot_acc);
-        }
-        ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale;
-        if (logit_softcap > 0.0f) {
-            score = logit_softcap * tanh(score / logit_softcap);
-        }
-        if (mask_ptr != NULL) {
-            score += slope * (ACC_TYPE)mask_ptr[k_idx];
-        }
-        const ACC_TYPE p = exp(score - m_final);
-        l_i += p;
-        #pragma unroll
-        for (int i = 0; i < DV_VEC; i++) {
-            o_acc[i] = mad(p, CONVERT_KV_ACC4(v_ptr[i]), o_acc[i]);
-        }
-    }
-
-    // Work-group sum reduction of the denominator.
-    __local ACC_TYPE local_l[Q1_WG_SIZE];
-    __local ACC_TYPE4 local_o_comp[Q1_WG_SIZE];
-    local_l[tid] = l_i;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    #pragma unroll
-    for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
-        if (tid < s) local_l[tid] += local_l[tid + s];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    const ulong o_row_offset = batch_idx * o_nb3 + head_idx * o_nb1;
-    global O_DATA_TYPE4 *o_row = (global O_DATA_TYPE4 *)(o_base + o_row_offset);
-    const ACC_TYPE l_final = local_l[0];
-
-    // l_final is the same for every work-item, so the barriers inside this
-    // branch are reached uniformly.
-    if (l_final > 0.0f) {
-        const ACC_TYPE l_inv = 1.0f / l_final;
-        // Reduce the output one float4 component at a time through local memory.
-        for (int i = 0; i < DV_VEC; i++) {
-            local_o_comp[tid] = o_acc[i];
-            barrier(CLK_LOCAL_MEM_FENCE);
-            #pragma unroll
-            for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
-                if (tid < s) local_o_comp[tid] += local_o_comp[tid + s];
-                barrier(CLK_LOCAL_MEM_FENCE);
-            }
-            if (tid == 0) {
-                o_row[i] = CONVERT_O_DATA4(local_o_comp[0] * l_inv);
-            }
-        }
-    } else if (tid == 0) {
-        // No key contributed (or the sum is not a positive number): emit zeros.
-        #pragma unroll
-        for (int i = 0; i < DV_VEC; ++i) o_row[i] = (O_DATA_TYPE4)(0.0f);
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/glu.cl
+++ b/ggml/src/ggml-opencl/kernels/glu.cl
@ -1,378 +0,0 @@
-// Enable half-precision types for the *_f16 kernels in this file.
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// Cubic coefficient of the tanh-based GELU approximation.
-#define GELU_COEF_A     0.044715f
-// Exponent scale of the sigmoid-based "quick" GELU: x / (1 + exp(-1.702 * x)).
-#define GELU_QUICK_COEF -1.702f
-// sqrt(2 / pi)
-#define SQRT_2_OVER_PI  0.79788456080286535587989211986876f
-// 1 / sqrt(2)
-#define SQRT_2_INV      0.70710678118654752440084436210484f
-
-//------------------------------------------------------------------------------
-// geglu
-//------------------------------------------------------------------------------
-// GEGLU (tanh approximation), f32: dst[i] = gelu(src0[i]) * src1[i].
-// One work-group per row (group id 0); work-items stride over the ne0 elements.
-// offset0/offset1/offsetd are byte offsets, nb01/nb11/nb1 byte strides between
-// rows, ne00_off/ne10_off element offsets into the source rows.
-kernel void kernel_geglu(
-    global char * src0,
-    ulong  offset0,
-    global char * src1,
-    ulong  offset1,
-    global char * dst,
-    ulong  offsetd,
-    ulong nb01,
-    ulong nb11,
-    int ne0,
-    ulong nb1,
-    int ne00_off,
-    int ne10_off
-) {
-    const ulong row = get_group_id(0);
-
-    global float * a_row = (global float *) (src0 + offset0 + row*nb01) + ne00_off;
-    global float * b_row = (global float *) (src1 + offset1 + row*nb11) + ne10_off;
-    global float * out   = (global float *) (dst  + offsetd + row*nb1);
-
-    const int step = get_local_size(0);
-    for (int i = get_local_id(0); i < ne0; i += step) {
-        const float a = a_row[i];
-        const float b = b_row[i];
-
-        const float inner = SQRT_2_OVER_PI*a*(1.0f + GELU_COEF_A*a*a);
-        const float act   = 0.5f*a*(1.0f + tanh(inner));
-
-        out[i] = act*b;
-    }
-}
-
-// GEGLU (tanh approximation), f16 storage: dst[i] = gelu(src0[i]) * src1[i].
-// The activation is evaluated in float (the float constants promote the half
-// operand), rounded to half, then multiplied by src1 in half.
-// One work-group per row; work-items stride over the ne0 elements.
-kernel void kernel_geglu_f16(
-    global char * src0,
-    ulong  offset0,
-    global char * src1,
-    ulong  offset1,
-    global char * dst,
-    ulong  offsetd,
-    ulong nb01,
-    ulong nb11,
-    int ne0,
-    ulong nb1,
-    int ne00_off,
-    int ne10_off
-) {
-    const ulong row = get_group_id(0);
-
-    global half * a_row = (global half *) (src0 + offset0 + row*nb01) + ne00_off;
-    global half * b_row = (global half *) (src1 + offset1 + row*nb11) + ne10_off;
-    global half * out   = (global half *) (dst  + offsetd + row*nb1);
-
-    const int step = get_local_size(0);
-    for (int i = get_local_id(0); i < ne0; i += step) {
-        const half a = a_row[i];
-        const half b = b_row[i];
-
-        const float inner = SQRT_2_OVER_PI*a*(1.0f + GELU_COEF_A*a*a);
-        const half  act   = 0.5f*a*(1.0f + tanh(inner));
-
-        out[i] = act*b;
-    }
-}
-
-//------------------------------------------------------------------------------
-// reglu
-//------------------------------------------------------------------------------
-// ReGLU, f32: dst[i] = src0[i] * src1[i] * (src0[i] > 0).
-// One work-group per row (group id 0); work-items stride over the ne0 elements.
-// offset0/offset1/offsetd are byte offsets, nb01/nb11/nb1 byte strides between
-// rows, ne00_off/ne10_off element offsets into the source rows.
-kernel void kernel_reglu(
-    global char * src0,
-    ulong  offset0,
-    global char * src1,
-    ulong  offset1,
-    global char * dst,
-    ulong  offsetd,
-    ulong nb01,
-    ulong nb11,
-    int ne0,
-    ulong nb1,
-    int ne00_off,
-    int ne10_off
-) {
-    const ulong row = get_group_id(0);
-
-    global float * a_row = (global float *) (src0 + offset0 + row*nb01) + ne00_off;
-    global float * b_row = (global float *) (src1 + offset1 + row*nb11) + ne10_off;
-    global float * out   = (global float *) (dst  + offsetd + row*nb1);
-
-    const int step = get_local_size(0);
-    for (int i = get_local_id(0); i < ne0; i += step) {
-        const float a = a_row[i];
-        const float b = b_row[i];
-
-        // The comparison yields 1 or 0 and gates the product.
-        out[i] = a*b*(a > 0.0f);
-    }
-}
-
-// ReGLU, f16 storage: dst[i] = src0[i] * src1[i] * (src0[i] > 0).
-// One work-group per row (group id 0); work-items stride over the ne0 elements.
-kernel void kernel_reglu_f16(
-    global char * src0,
-    ulong  offset0,
-    global char * src1,
-    ulong  offset1,
-    global char * dst,
-    ulong  offsetd,
-    ulong nb01,
-    ulong nb11,
-    int ne0,
-    ulong nb1,
-    int ne00_off,
-    int ne10_off
-) {
-    const ulong row = get_group_id(0);
-
-    global half * a_row = (global half *) (src0 + offset0 + row*nb01) + ne00_off;
-    global half * b_row = (global half *) (src1 + offset1 + row*nb11) + ne10_off;
-    global half * out   = (global half *) (dst  + offsetd + row*nb1);
-
-    const int step = get_local_size(0);
-    for (int i = get_local_id(0); i < ne0; i += step) {
-        const half a = a_row[i];
-        const half b = b_row[i];
-
-        // The comparison yields 1 or 0 and gates the product.
-        out[i] = a*b*(a > 0.0f);
-    }
-}
-
-//------------------------------------------------------------------------------
-// swiglu
-//------------------------------------------------------------------------------
-// SwiGLU, f32: dst[i] = silu(src0[i]) * src1[i], with silu(x) = x / (1 + exp(-x)).
-// One work-group per row (group id 0); work-items stride over the ne0 elements.
-// offset0/offset1/offsetd are byte offsets, nb01/nb11/nb1 byte strides between
-// rows, ne00_off/ne10_off element offsets into the source rows.
-kernel void kernel_swiglu(
-    global char * src0,
-    ulong  offset0,
-    global char * src1,
-    ulong  offset1,
-    global char * dst,
-    ulong  offsetd,
-    ulong nb01,
-    ulong nb11,
-    int ne0,
-    ulong nb1,
-    int ne00_off,
-    int ne10_off
-) {
-    const ulong row = get_group_id(0);
-
-    global float * a_row = (global float *) (src0 + offset0 + row*nb01) + ne00_off;
-    global float * b_row = (global float *) (src1 + offset1 + row*nb11) + ne10_off;
-    global float * out   = (global float *) (dst  + offsetd + row*nb1);
-
-    const int step = get_local_size(0);
-    for (int i = get_local_id(0); i < ne0; i += step) {
-        const float a = a_row[i];
-        const float b = b_row[i];
-
-        const float act = a / (1.0f + exp(-a));
-
-        out[i] = act*b;
-    }
-}
-
-// SwiGLU, f16 storage: dst[i] = silu(src0[i]) * src1[i], silu(x) = x / (1 + exp(-x)).
-// exp() is evaluated on the half operand; the result of the division is rounded
-// to half before the multiplication by src1.
-// One work-group per row; work-items stride over the ne0 elements.
-kernel void kernel_swiglu_f16(
-    global char * src0,
-    ulong  offset0,
-    global char * src1,
-    ulong  offset1,
-    global char * dst,
-    ulong  offsetd,
-    ulong nb01,
-    ulong nb11,
-    int ne0,
-    ulong nb1,
-    int ne00_off,
-    int ne10_off
-) {
-    const ulong row = get_group_id(0);
-
-    global half * a_row = (global half *) (src0 + offset0 + row*nb01) + ne00_off;
-    global half * b_row = (global half *) (src1 + offset1 + row*nb11) + ne10_off;
-    global half * out   = (global half *) (dst  + offsetd + row*nb1);
-
-    const int step = get_local_size(0);
-    for (int i = get_local_id(0); i < ne0; i += step) {
-        const half a = a_row[i];
-        const half b = b_row[i];
-
-        const half act = a / (1.0f + exp(-a));
-
-        out[i] = act*b;
-    }
-}
-
-//------------------------------------------------------------------------------
-// swiglu_oai
-//------------------------------------------------------------------------------
-// SwiGLU "oai" variant, f32:
-//   g = min(src0[i], limit)
-//   u = max(min(src1[i], limit), -limit)
-//   dst[i] = g / (1 + exp(-g * alpha)) * (1 + u)
-// One work-group per row (group id 0); work-items stride over the ne0 elements.
-// offset0/offset1/offsetd are byte offsets, nb01/nb11/nb1 byte strides between
-// rows, ne00_off/ne10_off element offsets into the source rows.
-kernel void kernel_swiglu_oai(
-    global char * src0,
-    ulong         offset0,
-    global char * src1,
-    ulong         offset1,
-    global char * dst,
-    ulong         offsetd,
-    ulong         nb01,
-    ulong         nb11,
-    int           ne0,
-    ulong         nb1,
-    int           ne00_off,
-    int           ne10_off,
-    float         limit,
-    float         alpha
-) {
-    const ulong row = get_group_id(0);
-
-    global float * a_row = (global float *) (src0 + offset0 + row*nb01) + ne00_off;
-    global float * b_row = (global float *) (src1 + offset1 + row*nb11) + ne10_off;
-    global float * out   = (global float *) (dst  + offsetd + row*nb1);
-
-    const int step = get_local_size(0);
-    for (int i = get_local_id(0); i < ne0; i += step) {
-        // Gate is clamped from above only; the linear part on both sides.
-        const float gate = min(a_row[i], limit);
-        const float up   = max(min(b_row[i], limit), -limit);
-
-        const float act = gate / (1.0f + exp(-gate * alpha));
-
-        out[i] = act * (1.0f + up);
-    }
-}
-
-//------------------------------------------------------------------------------
-// geglu_erf
-//------------------------------------------------------------------------------
-// GEGLU (erf form), f32: dst[i] = 0.5 * x * (1 + erf(x / sqrt(2))) * src1[i], x = src0[i].
-// One work-group per row (group id 0); work-items stride over the ne0 elements.
-// offset0/offset1/offsetd are byte offsets, nb01/nb11/nb1 byte strides between
-// rows, ne00_off/ne10_off element offsets into the source rows.
-kernel void kernel_geglu_erf(
-    global char * src0,
-    ulong  offset0,
-    global char * src1,
-    ulong  offset1,
-    global char * dst,
-    ulong  offsetd,
-    ulong nb01,
-    ulong nb11,
-    int ne0,
-    ulong nb1,
-    int ne00_off,
-    int ne10_off
-) {
-    const ulong row = get_group_id(0);
-
-    global float * a_row = (global float *) (src0 + offset0 + row*nb01) + ne00_off;
-    global float * b_row = (global float *) (src1 + offset1 + row*nb11) + ne10_off;
-    global float * out   = (global float *) (dst  + offsetd + row*nb1);
-
-    const int step = get_local_size(0);
-    for (int i = get_local_id(0); i < ne0; i += step) {
-        const float a = a_row[i];
-        const float b = b_row[i];
-
-        const float act = 0.5f*a*(1.0f + erf(a*SQRT_2_INV));
-
-        out[i] = act*b;
-    }
-}
-
-// GEGLU (erf form), f16 storage: dst[i] = 0.5 * x * (1 + erf(x / sqrt(2))) * src1[i].
-// The activation is evaluated in float (the float constants promote the half
-// operand), rounded to half, then multiplied by src1 in half.
-// One work-group per row; work-items stride over the ne0 elements.
-kernel void kernel_geglu_erf_f16(
-    global char * src0,
-    ulong  offset0,
-    global char * src1,
-    ulong  offset1,
-    global char * dst,
-    ulong  offsetd,
-    ulong nb01,
-    ulong nb11,
-    int ne0,
-    ulong nb1,
-    int ne00_off,
-    int ne10_off
-) {
-    const ulong row = get_group_id(0);
-
-    global half * a_row = (global half *) (src0 + offset0 + row*nb01) + ne00_off;
-    global half * b_row = (global half *) (src1 + offset1 + row*nb11) + ne10_off;
-    global half * out   = (global half *) (dst  + offsetd + row*nb1);
-
-    const int step = get_local_size(0);
-    for (int i = get_local_id(0); i < ne0; i += step) {
-        const half a = a_row[i];
-        const half b = b_row[i];
-
-        const half act = 0.5f*a*(1.0f + erf(a*SQRT_2_INV));
-
-        out[i] = act*b;
-    }
-}
-
-//------------------------------------------------------------------------------
-// geglu_quick
-//------------------------------------------------------------------------------
-// GEGLU (quick/sigmoid form), f32:
-//   dst[i] = x * (1 / (1 + exp(GELU_QUICK_COEF * x))) * src1[i], x = src0[i].
-// One work-group per row (group id 0); work-items stride over the ne0 elements.
-// offset0/offset1/offsetd are byte offsets, nb01/nb11/nb1 byte strides between
-// rows, ne00_off/ne10_off element offsets into the source rows.
-kernel void kernel_geglu_quick(
-    global char * src0,
-    ulong  offset0,
-    global char * src1,
-    ulong  offset1,
-    global char * dst,
-    ulong  offsetd,
-    ulong nb01,
-    ulong nb11,
-    int ne0,
-    ulong nb1,
-    int ne00_off,
-    int ne10_off
-) {
-    const ulong row = get_group_id(0);
-
-    global float * a_row = (global float *) (src0 + offset0 + row*nb01) + ne00_off;
-    global float * b_row = (global float *) (src1 + offset1 + row*nb11) + ne10_off;
-    global float * out   = (global float *) (dst  + offsetd + row*nb1);
-
-    const int step = get_local_size(0);
-    for (int i = get_local_id(0); i < ne0; i += step) {
-        const float a = a_row[i];
-        const float b = b_row[i];
-
-        const float act = a*(1.0f/(1.0f + exp(GELU_QUICK_COEF*a)));
-
-        out[i] = act*b;
-    }
-}
-
-// GEGLU (quick/sigmoid form), f16 storage:
-//   dst[i] = x * (1 / (1 + exp(GELU_QUICK_COEF * x))) * src1[i], x = src0[i].
-// The activation is evaluated in float (the float constants promote the half
-// operand), rounded to half, then multiplied by src1 in half.
-// One work-group per row; work-items stride over the ne0 elements.
-kernel void kernel_geglu_quick_f16(
-    global char * src0,
-    ulong  offset0,
-    global char * src1,
-    ulong  offset1,
-    global char * dst,
-    ulong  offsetd,
-    ulong nb01,
-    ulong nb11,
-    int ne0,
-    ulong nb1,
-    int ne00_off,
-    int ne10_off
-) {
-    const ulong row = get_group_id(0);
-
-    global half * a_row = (global half *) (src0 + offset0 + row*nb01) + ne00_off;
-    global half * b_row = (global half *) (src1 + offset1 + row*nb11) + ne10_off;
-    global half * out   = (global half *) (dst  + offsetd + row*nb1);
-
-    const int step = get_local_size(0);
-    for (int i = get_local_id(0); i < ne0; i += step) {
-        const half a = a_row[i];
-        const half b = b_row[i];
-
-        const half act = a*(1.0f/(1.0f + exp(GELU_QUICK_COEF*a)));
-
-        out[i] = act*b;
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/group_norm.cl
+++ b/ggml/src/ggml-opencl/kernels/group_norm.cl
@ -1,72 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// Subgroup functions: use the Intel extension when it is advertised,
-// otherwise the Khronos one.
-#ifdef cl_intel_subgroups
-#pragma OPENCL EXTENSION cl_intel_subgroups : enable
-#else
-#pragma OPENCL EXTENSION cl_khr_subgroups : enable
-#endif
-
-// Vendor-specific "required subgroup size" kernel attributes. Exactly one of
-// INTEL_GPU / ADRENO_GPU is defined when the matching extension is available;
-// on other devices neither macro (nor any REQD_SUBGROUP_SIZE_*) is defined.
-#ifdef cl_intel_required_subgroup_size
-#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
-#define INTEL_GPU 1
-#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
-#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
-#elif defined(cl_qcom_reqd_sub_group_size)
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-// The Qualcomm attribute takes "half" / "full" rather than a number.
-#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#endif
-
-// Group normalisation over contiguous runs of `group_size` floats.
-// Work-group g handles elements [g*group_size, min((g+1)*group_size, ne)):
-// subtract the mean, then scale by 1/sqrt(variance + eps). Mean and variance
-// are both divided by group_size. Sums are combined with
-// sub_group_reduce_add, hence the requirement below.
-// offset0/offsetd are byte offsets applied to src0/dst.
-//
-// Workgroup must be a subgroup
-#ifdef INTEL_GPU
-REQD_SUBGROUP_SIZE_32
-#elif defined (ADRENO_GPU)
-REQD_SUBGROUP_SIZE_64
-#endif
-kernel void kernel_group_norm(
-        global float * src0,
-        ulong offset0,
-        global float * dst,
-        ulong offsetd,
-        int ne,
-        int group_size,
-        float eps
-) {
-    global float * in  = (global float *)((global char *)src0 + offset0);
-    global float * out = (global float *)((global char *)dst  + offsetd);
-
-    const int stride      = get_local_size(0);
-    const int group_begin = get_group_id(0) * group_size;
-    const int first       = group_begin + get_local_id(0);
-    // Clip the last group to the end of the tensor.
-    const int stop        = min(group_begin + group_size, ne);
-
-    // Pass 1: mean.
-    float sum = 0.0f;
-    for (int j = first; j < stop; j += stride) {
-        sum += in[j];
-    }
-    const float mean = sub_group_reduce_add(sum) / group_size;
-
-    // Pass 2: store centred values and accumulate their squares.
-    float sq_sum = 0.0f;
-    for (int j = first; j < stop; j += stride) {
-        const float centered = in[j] - mean;
-        out[j] = centered;
-        sq_sum += centered * centered;
-    }
-    const float variance = sub_group_reduce_add(sq_sum) / group_size;
-
-    // Pass 3: each work-item rescales the elements it wrote in pass 2.
-    const float scale = 1.0f/sqrt(variance + eps);
-    for (int j = first; j < stop; j += stride) {
-        out[j] *= scale;
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl
@ -1,130 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// On Qualcomm devices request the "full" subgroup size; elsewhere the
-// attribute macro expands to nothing.
-#if defined(cl_qcom_reqd_sub_group_size)
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#else
-#define REQD_SUBGROUP_SIZE_128
-#endif
-
-// Tiling parameters of mul_mat_f16_f32:
-//   OPWM x OPWN  output tile computed by one work-group
-//   CPWK         K-depth consumed per tile iteration
-//   OPTM x OPTN  outputs accumulated by each work-item
-#define OPWM 64
-#define OPWN 64
-#define CPWK 8
-#define OPTM 4
-#define OPTN 8
-
-// Derived values: work-items per tile along M (16) and N (8), and the number
-// of 4-wide vectors per tile row along K (2).
-#define WG_M (OPWM / OPTM)
-#define WG_N (OPWN / OPTN)
-#define VEC_K (CPWK / 4)
-
-// Tiled matrix multiply with half A and float B, float accumulation.
-//
-// A is read as M rows of K halfs (A[row*K + k]), B as N rows of K floats
-// (B[row*K + k]); the result is stored as C[col*M + row], where row indexes A's
-// rows and col indexes B's rows. *_offset are byte offsets applied to the base
-// pointers. Each work-group produces an OPWM x OPWN tile of C, staging
-// OPWM x CPWK of A and OPWN x CPWK of B in local memory per iteration; each
-// work-item accumulates OPTM x OPTN outputs.
-// NOTE(review): the index math (lid = lidn * WG_M + lidm, lid / OPWM < VEC_K)
-// presumably requires a local size of (WG_M, WG_N) = (16, 8) - confirm against the host code.
-REQD_SUBGROUP_SIZE_128
-__kernel void mul_mat_f16_f32(
-    const int M, const int N, const int K,
-    __global const void* A_void, ulong A_offset,
-    __global const void* B_void, ulong B_offset,
-    __global       void* C_void, ulong C_offset) {
-
-    __global const half*  A = (__global const half* )((__global const char*)A_void + A_offset);
-    __global const float* B = (__global const float*)((__global const char*)B_void + B_offset);
-    __global       float* C = (__global       float*)((__global       char*)C_void + C_offset);
-
-    // 2-D local id and its flattened form used for the cooperative loads.
-    const int lidm = get_local_id(0);
-    const int lidn = get_local_id(1);
-    const int lid = lidn * WG_M + lidm;
-
-    // Origin of this work-group's output tile.
-    const int offsetM = get_group_id(0) * OPWM;
-    const int offsetN = get_group_id(1) * OPWN;
-
-    __local half4  Alocal[OPWM][VEC_K];
-    __local float4 Blocal[OPWN][VEC_K];
-
-    float sum[OPTM][OPTN];
-
-    for (int wm = 0; wm < OPTM; wm++) {
-        for (int wn = 0; wn < OPTN; wn++) {
-            sum[wm][wn] = 0.0f;
-        }
-    }
-
-    // Number of K tiles, rounded up so a partial last tile is covered.
-    const int numTiles = (K + CPWK - 1) / CPWK;
-
-    // Each work-item loads one 4-wide vector of A and one of B per tile:
-    // (row within the tile, vector index along K).
-    const int load_row_a = lid % OPWM;
-    const int load_vec_k_a = lid / OPWM;
-    const int global_row_a = offsetM + load_row_a;
-
-    const int load_row_b = lid % OPWN;
-    const int load_vec_k_b = lid / OPWN;
-    const int global_row_b = offsetN + load_row_b;
-
-    for (int t = 0; t < numTiles; t++) {
-        const int k_start = t * CPWK;
-        const int k_vec_start_a = k_start + load_vec_k_a * 4;
-        const int k_vec_start_b = k_start + load_vec_k_b * 4;
-
-        // Stage A: full vector load when all 4 elements are in range, element-wise
-        // load with zero padding at the K edge, zeros when the row or K is out of range.
-        if (global_row_a < M && k_vec_start_a < K) {
-            if (k_vec_start_a + 3 < K) {
-                Alocal[load_row_a][load_vec_k_a] = vload4(0, A + global_row_a * K + k_vec_start_a);
-            } else {
-                half4 tempA = (half4)(0.0h);
-                if (k_vec_start_a < K) tempA.s0 = A[global_row_a * K + k_vec_start_a];
-                if (k_vec_start_a + 1 < K) tempA.s1 = A[global_row_a * K + k_vec_start_a + 1];
-                if (k_vec_start_a + 2 < K) tempA.s2 = A[global_row_a * K + k_vec_start_a + 2];
-                Alocal[load_row_a][load_vec_k_a] = tempA;
-            }
-        } else {
-            Alocal[load_row_a][load_vec_k_a] = (half4)(0.0h);
-        }
-
-        // Stage B with the same edge handling.
-        if (global_row_b < N && k_vec_start_b < K) {
-            if (k_vec_start_b + 3 < K) {
-                Blocal[load_row_b][load_vec_k_b] = vload4(0, B + global_row_b * K + k_vec_start_b);
-            } else {
-                float4 tempB = (float4)(0.0f);
-                if (k_vec_start_b < K) tempB.s0 = B[global_row_b * K + k_vec_start_b];
-                if (k_vec_start_b + 1 < K) tempB.s1 = B[global_row_b * K + k_vec_start_b + 1];
-                if (k_vec_start_b + 2 < K) tempB.s2 = B[global_row_b * K + k_vec_start_b + 2];
-                Blocal[load_row_b][load_vec_k_b] = tempB;
-            }
-        } else {
-            Blocal[load_row_b][load_vec_k_b] = (float4)(0.0f);
-        }
-
-        // Both tiles are complete before any work-item reads them.
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        #pragma unroll
-        for (int k_vec = 0; k_vec < VEC_K; k_vec++) {
-            // Rows owned by this work-item are interleaved: lidm, lidm + WG_M, ...
-            float4 a_fvecs[OPTM];
-            int current_row_a = lidm;
-            for (int wm = 0; wm < OPTM; wm++) {
-                a_fvecs[wm] = convert_float4(Alocal[current_row_a][k_vec]);
-                current_row_a += WG_M;
-            }
-
-            // Likewise for B: lidn, lidn + WG_N, ...
-            float4 b_fvecs[OPTN];
-            int current_row_b = lidn;
-            for (int wn = 0; wn < OPTN; wn++) {
-                b_fvecs[wn] = Blocal[current_row_b][k_vec];
-                current_row_b += WG_N;
-            }
-
-            for (int wm = 0; wm < OPTM; wm++) {
-                for (int wn = 0; wn < OPTN; wn++) {
-                    sum[wm][wn] += dot(a_fvecs[wm], b_fvecs[wn]);
-                }
-            }
-        }
-        // All reads are done before the next iteration overwrites the tiles.
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    // Write back the in-range part of the tile.
-    for (int wm = 0; wm < OPTM; wm++) {
-        int globalRow = offsetM + lidm + wm * WG_M;
-        if (globalRow < M) {
-            for (int wn = 0; wn < OPTN; wn++) {
-                int globalCol = offsetN + lidn + wn * WG_N;
-                if (globalCol < N) {
-                    C[globalCol * M + globalRow] = sum[wm][wn];
-                }
-            }
-        }
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl
@ -1,132 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#define LOAD_VEC_A 4
-#define LOAD_VEC_B 4
-
-#define BM 64
-#define BN 64
-#define BK 16
-#define TM 4
-#define TN 8
-
-kernel void kernel_mul_mm_f16_f32_l4_lm(
-    global half4 * src0,
-    ulong offset0,
-    global float4 * src1,
-    ulong offset1,
-    global float * dst,
-    ulong offsetd,
-
-    int ne00,
-    int ne01,
-    int ne02,
-    int ne11,
-    int ne12,
-
-    int stride_a,
-    int stride_b,
-    int stride_d,
-
-    int batch_stride_a,
-    int batch_stride_b,
-    int batch_stride_d,
-
-    int r2,
-    int r3
-) {
-    src0 = (global half4*)((global char*)src0 + offset0);
-    src1 = (global float4*)((global char*)src1 + offset1);
-    dst = (global float*)((global char*)dst + offsetd);
-
-    local half  buf_a[BM * BK];
-    local float buf_b[BN * BK];
-
-    const int batch_idx = get_global_id(2);
-
-    const int i13 = batch_idx / ne12;
-    const int i12 = batch_idx % ne12;
-
-    const int i03 = i13 / r3;
-    const int i02 = i12 / r2;
-
-    const int batch_idx_a = i03 * ne02 + i02;
-
-    const int ir = get_group_id(0);
-    const int ic = get_group_id(1);
-
-    const int tid = get_local_id(0);
-    const int th_r  = tid % (BM / TM);
-    const int th_c  = tid / (BM / TM);
-
-    const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
-    const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
-    const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
-    const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);
-
-    const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
-    const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;
-
-    int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
-    int pos_b = (batch_idx   * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;
-
-    float sums[TM * TN];
-    half  cache_a[TM];
-    float cache_b[TN];
-
-    for (int i = 0; i < TM * TN; i++) {
-        sums[i] = 0.0f;
-    }
-
-    for (int block = 0; block < ne00; block += BK) {
-        for (int l = 0; l < BM; l += loadstride_a) {
-            const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
-            buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = src0[idx].s0;
-            buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = src0[idx].s1;
-            buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = src0[idx].s2;
-            buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = src0[idx].s3;
-        }
-
-        for (int l = 0; l < BN; l += loadstride_b) {
-            const int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
-            buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
-            buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
-            buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
-            buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        pos_a += BK / LOAD_VEC_A;
-        pos_b += BK / LOAD_VEC_B;
-
-        for (int i = 0; i < BK; i++) {
-            for (int j = 0; j < TM; j++) {
-                cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
-            }
-            for (int j = 0; j < TN; j++) {
-                cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
-            }
-
-            for (int cc = 0; cc < TN; cc++) {
-                for (int cr = 0; cr < TM; cr++) {
-                    const int sums_idx = cc*TM + cr;
-                    sums[sums_idx] = mad(convert_float(cache_a[cr]), cache_b[cc], sums[sums_idx]);
-                }
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    const int dr = ir * BM + th_r * TM;
-    const int dc = ic * BN + th_c * TN;
-
-    const int offsets = batch_idx * batch_stride_d;
-
-    for (int cc = 0; cc < TN; cc++) {
-        for (int cr = 0; cr < TM; cr++) {
-            if (dr + cr < ne01 && dc + cc < ne11) {
-                dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
-            }
-        }
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl
@ -1,133 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#define LOAD_VEC_A 4
-#define LOAD_VEC_B 4
-
-#define BM 64
-#define BN 64
-#define BK 16
-#define TM 4
-#define TN 8
-
-kernel void kernel_mul_mm_f32_f32_l4_lm(
-    global float4 * src0,
-    ulong offset0,
-    global float4 * src1,
-    ulong offset1,
-    global float * dst,
-    ulong offsetd,
-
-    int ne00,
-    int ne01,
-    int ne02,
-    int ne11,
-    int ne12,
-
-    int stride_a,
-    int stride_b,
-    int stride_d,
-
-    int batch_stride_a,
-    int batch_stride_b,
-    int batch_stride_d,
-
-    int r2,
-    int r3
-) {
-    src0 = (global float4*)((global char*)src0 + offset0);
-    src1 = (global float4*)((global char*)src1 + offset1);
-    dst = (global float*)((global char*)dst + offsetd);
-
-    local float buf_a[BM * BK];
-    local float buf_b[BN * BK];
-
-    const int batch_idx = get_global_id(2);
-
-    const int i13 = batch_idx / ne12;
-    const int i12 = batch_idx % ne12;
-
-    const int i03 = i13 / r3;
-    const int i02 = i12 / r2;
-
-    const int batch_idx_a = i03 * ne02 + i02;
-
-    const int ir = get_group_id(0);
-    const int ic = get_group_id(1);
-
-    const int tid = get_local_id(0);
-    const int th_r  = tid % (BM / TM);
-    const int th_c  = tid / (BM / TM);
-
-    const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
-    const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
-    const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
-    const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);
-
-    const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
-    const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;
-
-    int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
-    int pos_b = (batch_idx   * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;
-
-    float sums[TM * TN];
-    float cache_a[TM];
-    float cache_b[TN];
-
-    for (int i = 0; i < TM * TN; i++) {
-        sums[i] = 0.0f;
-    }
-
-    for (int block = 0; block < ne00; block += BK) {
-        for (int l = 0; l < BM; l += loadstride_a) {
-            const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
-            buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = src0[idx].s0;
-            buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = src0[idx].s1;
-            buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = src0[idx].s2;
-            buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = src0[idx].s3;
-        }
-
-        for (int l = 0; l < BN; l += loadstride_b) {
-            const int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
-            buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
-            buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
-            buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
-            buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        pos_a += BK / LOAD_VEC_A;
-        pos_b += BK / LOAD_VEC_B;
-
-        for (int i = 0; i < BK; i++) {
-            for (int j = 0; j < TM; j++) {
-                cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
-            }
-
-            for (int j = 0; j < TN; j++) {
-                cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
-            }
-
-            for (int cc = 0; cc < TN; cc++) {
-                for (int cr = 0; cr < TM; cr++) {
-                    const int sums_idx = cc*TM + cr;
-                    sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]);
-                }
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    const int dr = ir * BM + th_r * TM;
-    const int dc = ic * BN + th_c * TN;
-
-    const int offsets = batch_idx * batch_stride_d;
-
-    for (int cc = 0; cc < TN; cc++) {
-        for (int cr = 0; cr < TM; cr++) {
-            if (dr + cr < ne01 && dc + cc < ne11) {
-                dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
-            }
-        }
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl
@ -1,189 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#ifdef cl_intel_subgroups
-#pragma OPENCL EXTENSION cl_intel_subgroups : enable
-#else
-#pragma OPENCL EXTENSION cl_khr_subgroups : enable
-#endif
-
-#ifdef cl_intel_required_subgroup_size
-#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
-#define INTEL_GPU 1
-#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
-#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
-#elif defined(cl_qcom_reqd_sub_group_size)
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#endif
-
-#define QK_MXFP4 32
-typedef struct {
-    uchar e; // E8M0
-    uchar qs[QK_MXFP4/2];
-} block_mxfp4;
-
-constant static float kvalues_mxfp4_f[16] = {
-    0, .5f, 1.f, 1.5f, 2.f, 3.f, 4.f, 6.f, -0, -.5f, -1.f, -1.5f, -2.f, -3.f, -4.f, -6.f
-};
-
-static inline float e8m0_to_fp32(uchar x) {
-    int bits;
-
-    if (x == 0) {
-        bits = 0x00400000;
-    } else {
-        bits = (uint) x << 23;
-    }
-
-    return as_float(bits);
-}
-
-#ifdef INTEL_GPU
-#define N_R0_MXFP4 2 // number of rows each subgroup works on
-#define N_SG_MXFP4 2 // number of subgroups in a work group
-#define N_SIMDWIDTH 16 // subgroup size
-#elif defined (ADRENO_GPU)
-#define N_R0_MXFP4 2
-#define N_SG_MXFP4 2
-#define N_SIMDWIDTH 64
-#endif
-
-inline void mul_mv_mxfp4_f32(
-    global char * src0,
-    global char * src1,
-    global char * dst,
-    int ne00,
-    ulong nb01,
-    ulong nb02,
-    ulong nb03,
-    int ne12,
-    ulong nb11,
-    ulong nb12,
-    ulong nb13,
-    int ne0,
-    int ne1,
-    int r2,
-    int r3,
-    local  char * shmem
-) {
-    local float * shmem_f32 = (local float *) shmem;
-    int nb = ne00/QK_MXFP4;
-
-    int r0 = get_group_id(0);
-    int r1 = get_group_id(1);
-    int im = 0;
-
-    int first_row = (r0 * N_SG_MXFP4 + get_sub_group_id()) * N_R0_MXFP4;
-
-    uint i12 = im%ne12;
-    uint i13 = im/ne12;
-
-    ulong offset_src0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    ulong offset_src1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
-
-    global block_mxfp4 * x = (global block_mxfp4 *) (src0 + offset_src0);
-    global float       * y = (global float       *) (src1 + offset_src1);
-
-    const short ix = get_sub_group_local_id()/2;  // 0...15
-    const short it = get_sub_group_local_id()%2;  // 0 or 1
-
-    shmem_f32[get_sub_group_local_id()] = kvalues_mxfp4_f[get_sub_group_local_id()%16];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    float4 yl[4];
-    float sumf[N_R0_MXFP4] = {0.f};
-
-    global float * yb = y + ix * QK_MXFP4 + it * 8;
-
-    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
-        global float4 * y4 = (global float4 *)yb;
-        yl[0] = y4[0];
-        yl[1] = y4[4];
-        yl[2] = y4[1];
-        yl[3] = y4[5];
-
-        for (short row = 0; row < N_R0_MXFP4; row++) {
-            global block_mxfp4 * xb = x + row*nb + ib;
-            global uchar       * q2 = (global uchar *)(xb->qs + 8*it);
-
-            float4 acc1 = yl[0]*(float4)(shmem_f32[q2[0] &  0x0F], shmem_f32[q2[1] &  0x0F], shmem_f32[q2[2] &  0x0F], shmem_f32[q2[3] &  0x0F]);
-            float4 acc2 = yl[1]*(float4)(shmem_f32[q2[0] >> 4   ], shmem_f32[q2[1] >> 4   ], shmem_f32[q2[2] >> 4   ], shmem_f32[q2[3] >> 4   ]);
-            float4 acc3 = yl[2]*(float4)(shmem_f32[q2[4] &  0x0F], shmem_f32[q2[5] &  0x0F], shmem_f32[q2[6] &  0x0F], shmem_f32[q2[7] &  0x0F]);
-            float4 acc4 = yl[3]*(float4)(shmem_f32[q2[4] >> 4   ], shmem_f32[q2[5] >> 4   ], shmem_f32[q2[6] >> 4   ], shmem_f32[q2[7] >> 4   ]);
-
-            acc1 = (acc1 + acc3) + (acc2 + acc4);
-
-            sumf[row] += e8m0_to_fp32(xb->e) * ((acc1.s0 + acc1.s1) + (acc1.s2 + acc1.s3));
-        }
-
-        yb += (N_SIMDWIDTH/2) * QK_MXFP4;
-    }
-
-    global float * dst_f32 = (global float *) dst + (ulong)im*ne0*ne1 + (ulong)r1*ne0;
-
-    for (int row = 0; row < N_R0_MXFP4 && first_row + row < ne0; ++row) {
-        float sum_all = sub_group_reduce_add(sumf[row]);
-        if (get_sub_group_local_id() == 0) {
-            dst_f32[first_row + row] = sum_all;
-        }
-    }
-}
-
-#ifdef INTEL_GPU
-REQD_SUBGROUP_SIZE_16
-#elif defined (ADRENO_GPU)
-REQD_SUBGROUP_SIZE_64
-#endif
-kernel void kernel_mul_mv_id_mxfp4_f32(
-    global char * src0,
-    ulong         offset0,
-    global char * src1,
-    ulong         offset1,
-    global char * src2,
-    ulong         offset2,
-    global char * dst,
-    ulong         offsetd,
-    int           ne00,
-    ulong         nb01,
-    ulong         nb02,
-    ulong         nb03,
-    int           ne11,
-    int           ne12,
-    ulong         nb11,
-    ulong         nb12,
-    ulong         nb13,
-    int           ne20,
-    int           ne21,
-    ulong         nb21,
-    int           ne0,
-    int           ne1,
-    int           r2,
-    int           r3,
-    local  char * shmem
-) {
-    src0 = (global char *)((global char *)src0 + offset0);
-    src1 = (global char *)((global char *)src1 + offset1);
-    src2 = (global char *)((global char *)src2 + offset2);
-    dst  = (global char *)((global char *)dst  + offsetd);
-
-    const int iid1 = get_group_id(2)/ne20;
-    const int idx  = get_group_id(2)%ne20;
-
-    int i02 = ((global int *) (src2 + iid1*nb21))[idx];
-
-    int i11 = idx % ne11;
-    int i12 = iid1;
-
-    int i1 = idx;
-    int i2 = i12;
-
-    global char * src0_cur = src0 + i02*nb02;
-    global char * src1_cur = src1 + i11*nb11 + i12*nb12;
-
-    global char * dst_cur = dst + (i1*ne0 + i2*ne1*ne0)*sizeof(float);
-
-    mul_mv_mxfp4_f32(src0_cur, src1_cur, dst_cur,
-        ne00, nb01, nb02, nb03, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shmem);
-}
--- a/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl
@ -1,283 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#ifdef cl_intel_subgroups
-#pragma OPENCL EXTENSION cl_intel_subgroups : enable
-#else
-#pragma OPENCL EXTENSION cl_khr_subgroups : enable
-#endif
-
-#ifdef cl_intel_required_subgroup_size
-#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
-#define INTEL_GPU 1
-#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
-#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
-#elif defined(cl_qcom_reqd_sub_group_size)
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#endif
-
-#define QK4_0                   32
-
-typedef char int8_t;
-typedef uchar uint8_t;
-typedef short int16_t;
-typedef ushort uint16_t;
-typedef int int32_t;
-typedef uint uint32_t;
-
-//------------------------------------------------------------------------------
-// block_q4_0
-//------------------------------------------------------------------------------
-struct block_q4_0
-{
-    half d;
-    uint8_t qs[QK4_0 / 2];
-};
-
-// This function requires the original shuffled weights.
-// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
-// packed together in a byte, so are (q[1], q[17]) and so on.
-inline float block_q_4_0_dot_y_flat(
-        global uchar * x,
-        global half  * dh,
-        float sumy,
-        float16 yl,
-        int il
-) {
-    float           d   = *dh;
-    global ushort * qs  = ((global ushort *)x + il/2);
-    float           acc = 0.f;
-
-    acc += yl.s0 * (qs[0] & 0x000F);
-    acc += yl.s1 * (qs[0] & 0x0F00);
-    acc += yl.s8 * (qs[0] & 0x00F0);
-    acc += yl.s9 * (qs[0] & 0xF000);
-
-    acc += yl.s2 * (qs[1] & 0x000F);
-    acc += yl.s3 * (qs[1] & 0x0F00);
-    acc += yl.sa * (qs[1] & 0x00F0);
-    acc += yl.sb * (qs[1] & 0xF000);
-
-    acc += yl.s4 * (qs[2] & 0x000F);
-    acc += yl.s5 * (qs[2] & 0x0F00);
-    acc += yl.sc * (qs[2] & 0x00F0);
-    acc += yl.sd * (qs[2] & 0xF000);
-
-    acc += yl.s6 * (qs[3] & 0x000F);
-    acc += yl.s7 * (qs[3] & 0x0F00);
-    acc += yl.se * (qs[3] & 0x00F0);
-    acc += yl.sf * (qs[3] & 0xF000);
-
-    return d * (sumy * -8.f + acc);
-}
-
-//
-// This variant outputs 8 values.
-//
-#undef N_DST
-#undef N_SIMDGROUP
-#undef N_SIMDWIDTH
-
-#ifdef INTEL_GPU
-#define N_DST 8 // each SIMD group works on 8 rows
-#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
-#define N_SIMDWIDTH 16 // subgroup size
-#elif defined (ADRENO_GPU)
-#define N_DST 8
-#define N_SIMDGROUP 1
-#define N_SIMDWIDTH 64
-#endif
-
-inline void mul_vec_q_n_f32_8x_flat(
-        global char  * src0_q,
-        global half  * src0_d,
-        global float * src1,
-        global float * dst,
-        int ne00,
-        int ne01,
-        int ne02,
-        int ne10,
-        int ne12,
-        int ne0,
-        int ne1,
-        int r2,
-        int r3
-) {
-    const ulong nb = ne00/QK4_0;
-
-    int r0 = get_group_id(0);
-    int r1 = get_group_id(1);
-    int im = 0;
-
-    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
-
-    int i12 = im%ne12;
-    int i13 = im/ne12;
-
-    // The number of scales is the same as the number of blocks.
-    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
-    ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
-
-    global uchar * x = (global uchar *) src0_q + offset0_q;
-    global half  * d = (global half  *) src0_d + offset0_d;
-    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;
-
-    float16 yl;
-    float8 sumf = 0.f;
-
-    int ix = get_sub_group_local_id()/2;
-    int il = 8*(get_sub_group_local_id()%2);
-
-    global float * yb = y + ix*QK4_0 + il;
-
-    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
-        float sumy = 0.f;
-
-        sumy += yb[0];
-        sumy += yb[1];
-        sumy += yb[2];
-        sumy += yb[3];
-        sumy += yb[4];
-        sumy += yb[5];
-        sumy += yb[6];
-        sumy += yb[7];
-
-        sumy += yb[16];
-        sumy += yb[17];
-        sumy += yb[18];
-        sumy += yb[19];
-        sumy += yb[20];
-        sumy += yb[21];
-        sumy += yb[22];
-        sumy += yb[23];
-
-        yl.s0 = yb[0];
-        yl.s1 = yb[1]/256.f;
-
-        yl.s2 = yb[2];
-        yl.s3 = yb[3]/256.f;
-
-        yl.s4 = yb[4];
-        yl.s5 = yb[5]/256.f;
-
-        yl.s6 = yb[6];
-        yl.s7 = yb[7]/256.f;
-
-        yl.s8 = yb[16]/16.f;
-        yl.s9 = yb[17]/4096.f;
-
-        yl.sa = yb[18]/16.f;
-        yl.sb = yb[19]/4096.f;
-
-        yl.sc = yb[20]/16.f;
-        yl.sd = yb[21]/4096.f;
-
-        yl.se = yb[22]/16.f;
-        yl.sf = yb[23]/4096.f;
-
-        sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
-        sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
-        sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
-        sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
-
-        sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
-        sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
-        sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
-        sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
-
-        yb += QK4_0 * (N_SIMDWIDTH/2);
-    }
-
-    float8 tot = (float8)(
-        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
-        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
-        sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
-        sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
-    );
-
-    if (get_sub_group_local_id() == 0) {
-        if (first_row + 0 < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
-        }
-        if (first_row + 1 < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
-        }
-        if (first_row + 2 < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
-        }
-        if (first_row + 3 < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
-        }
-
-        if (first_row + 4 < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
-        }
-        if (first_row + 5 < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
-        }
-        if (first_row + 6 < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
-        }
-        if (first_row + 7 < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
-        }
-    }
-}
-
-#ifdef INTEL_GPU
-REQD_SUBGROUP_SIZE_16
-#elif defined (ADRENO_GPU)
-REQD_SUBGROUP_SIZE_64
-#endif
-kernel void kernel_mul_mv_id_q4_0_f32_8x_flat(
-        global char  *  src0_q,
-        global half  *  src0_d,
-        global float *  src1,
-        ulong           offset1,
-        global char  *  src2,
-        ulong           offset2,
-        global float *  dst,
-        ulong           offsetd,
-        int             ne00,
-        int             ne01,
-        int             ne02,
-        ulong           nb00,
-        ulong           nb02,
-        int             ne10,
-        int             ne11,
-        int             ne12,
-        ulong           nb11,
-        ulong           nb12,
-        int             ne20,
-        int             ne21,
-        ulong           nb21,
-        int             ne0,
-        int             ne1,
-        int             r2,
-        int             r3
-) {
-    src1 = (global float *)((global char *)src1 + offset1);
-    src2 = (global char  *)((global char *)src2 + offset2);
-    dst  = (global float *)((global char *)dst  + offsetd);
-
-    const int iid1 = get_group_id(2)/ne20;
-    const int idx  = get_group_id(2)%ne20;
-
-    const int i02 = ((global int *)(src2 + iid1*nb21))[idx];
-
-    const int i11 = idx%ne11;
-    const int i12 = iid1;
-
-    const int i1 = idx;
-    const int i2 = i12;
-
-    global char  * src0_q_cur = src0_q + (i02*nb02/nb00)*(QK4_0/2);
-    global half  * src0_d_cur = src0_d + (i02*nb02/nb00);
-    global float * src1_cur   = (global float *)((global char *) src1  + i11*nb11 + i12*nb12);
-    global float * dst_cur    = dst + i1*ne0 + i2*ne1*ne0;
-
-    mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
-}
--- a/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl
@ -1,144 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#ifdef cl_intel_subgroups
-#pragma OPENCL EXTENSION cl_intel_subgroups : enable
-#else
-#pragma OPENCL EXTENSION cl_khr_subgroups : enable
-#endif
-
-#ifdef cl_intel_required_subgroup_size
-#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
-#define INTEL_GPU 1
-#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
-#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
-#elif defined(cl_qcom_reqd_sub_group_size)
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#endif
-
-#define QK_MXFP4 32
-typedef struct {
-    uchar e; // E8M0
-    uchar qs[QK_MXFP4/2];
-} block_mxfp4;
-
-constant static float kvalues_mxfp4_f[16] = {
-    0, .5f, 1.f, 1.5f, 2.f, 3.f, 4.f, 6.f, -0, -.5f, -1.f, -1.5f, -2.f, -3.f, -4.f, -6.f
-};
-
-static inline float e8m0_to_fp32(uchar x) {
-    int bits;
-
-    if (x == 0) {
-        bits = 0x00400000;
-    } else {
-        bits = (uint) x << 23;
-    }
-
-    return as_float(bits);
-}
-
-#ifdef INTEL_GPU
-#define N_R0_MXFP4 2 // number of rows each subgroup works on
-#define N_SG_MXFP4 2 // number of subgroups in a work group
-#define N_SIMDWIDTH 16 // subgroup size
-#elif defined (ADRENO_GPU)
-#define N_R0_MXFP4 2
-#define N_SG_MXFP4 2
-#define N_SIMDWIDTH 64
-#endif
-
-#ifdef INTEL_GPU
-REQD_SUBGROUP_SIZE_16
-#elif defined (ADRENO_GPU)
-REQD_SUBGROUP_SIZE_64
-#endif
-kernel void kernel_mul_mv_mxfp4_f32(
-    global char * src0,
-    ulong         offset0,
-    global char * src1,
-    ulong         offset1,
-    global char * dst,
-    ulong         offsetd,
-    int ne00,
-    ulong nb01,
-    ulong nb02,
-    ulong nb03,
-    int ne12,
-    ulong nb11,
-    ulong nb12,
-    ulong nb13,
-    int ne0,
-    int ne1,
-    int r2,
-    int r3,
-    local  char * shmem
-) {
-    src0 = (global char*)((global char*)src0 + offset0);
-    src1 = (global char*)((global char*)src1 + offset1);
-    dst  = (global char*)((global char*)dst  + offsetd);
-
-    local float * shmem_f32 = (local float *) shmem;
-    int nb = ne00/QK_MXFP4;
-
-    int r0 = get_group_id(0);
-    int r1 = get_group_id(1);
-    int im = get_group_id(2);
-
-    int first_row = (r0 * N_SG_MXFP4 + get_sub_group_id()) * N_R0_MXFP4;
-
-    uint i12 = im%ne12;
-    uint i13 = im/ne12;
-
-    ulong offset_src0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
-    ulong offset_src1 =        r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
-
-    global block_mxfp4 * x = (global block_mxfp4 *) (src0 + offset_src0);
-    global float       * y = (global float       *) (src1 + offset_src1);
-
-    const short ix = get_sub_group_local_id()/2;  // 0...15
-    const short it = get_sub_group_local_id()%2;  // 0 or 1
-
-    shmem_f32[get_sub_group_local_id()] = kvalues_mxfp4_f[get_sub_group_local_id()%16];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    float4 yl[4];
-    float sumf[N_R0_MXFP4] = {0.f};
-
-    global float * yb = y + ix * QK_MXFP4 + it * 8;
-
-    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
-        global float4 * y4 = (global float4 *)yb;
-        yl[0] = y4[0];
-        yl[1] = y4[4];
-        yl[2] = y4[1];
-        yl[3] = y4[5];
-
-        for (short row = 0; row < N_R0_MXFP4; row++) {
-            global block_mxfp4 * xb = x + row*nb + ib;
-            global uchar       * q2 = (global uchar *)(xb->qs + 8*it);
-
-            float4 acc1 = yl[0]*(float4)(shmem_f32[q2[0] &  0x0F], shmem_f32[q2[1] &  0x0F], shmem_f32[q2[2] &  0x0F], shmem_f32[q2[3] &  0x0F]);
-            float4 acc2 = yl[1]*(float4)(shmem_f32[q2[0] >> 4   ], shmem_f32[q2[1] >> 4   ], shmem_f32[q2[2] >> 4   ], shmem_f32[q2[3] >> 4   ]);
-            float4 acc3 = yl[2]*(float4)(shmem_f32[q2[4] &  0x0F], shmem_f32[q2[5] &  0x0F], shmem_f32[q2[6] &  0x0F], shmem_f32[q2[7] &  0x0F]);
-            float4 acc4 = yl[3]*(float4)(shmem_f32[q2[4] >> 4   ], shmem_f32[q2[5] >> 4   ], shmem_f32[q2[6] >> 4   ], shmem_f32[q2[7] >> 4   ]);
-
-            acc1 = (acc1 + acc3) + (acc2 + acc4);
-
-            sumf[row] += e8m0_to_fp32(xb->e) * ((acc1.s0 + acc1.s1) + (acc1.s2 + acc1.s3));
-        }
-
-        yb += (N_SIMDWIDTH/2) * QK_MXFP4;
-    }
-
-    global float * dst_f32 = (global float *) dst + (ulong)im*ne0*ne1 + (ulong)r1*ne0;
-
-    for (int row = 0; row < N_R0_MXFP4 && first_row + row < ne0; ++row) {
-        float sum_all = sub_group_reduce_add(sumf[row]);
-        if (get_sub_group_local_id() == 0) {
-            dst_f32[first_row + row] = sum_all;
-        }
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/pad.cl
+++ b/ggml/src/ggml-opencl/kernels/pad.cl
@ -1,30 +0,0 @@
-kernel void kernel_pad(
-        global const void * src0_ptr,
-        ulong src0_offset,
-        global void * dst_ptr,
-        ulong dst_offset,
-        int s_ne0, int s_ne1, int s_ne2,
-        int d_ne0, int d_ne1, int d_ne2
-) {
-    global const float * src0 = (global const float *)((global const char *)src0_ptr + src0_offset);
-    global float * dst = (global float *)((global char *)dst_ptr + dst_offset);
-
-    int nidx   = get_global_id(0);
-    int idx_d1 = get_group_id(1);
-    int idx_d2 = get_group_id(2);
-
-    if (nidx >= d_ne0) {
-        return;
-    }
-
-    int dst_el_offset = nidx + idx_d1 * d_ne0 + idx_d2 * d_ne0 * d_ne1;
-
-    bool in_src_bounds = (nidx < s_ne0) && (idx_d1 < s_ne1) && (idx_d2 < s_ne2);
-
-    if (in_src_bounds) {
-        int src_el_offset = nidx + idx_d1 * s_ne0 + idx_d2 * s_ne0 * s_ne1;
-        dst[dst_el_offset] = src0[src_el_offset];
-    } else {
-        dst[dst_el_offset] = 0.0f;
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/repeat.cl
+++ b/ggml/src/ggml-opencl/kernels/repeat.cl
@ -1,39 +0,0 @@
-kernel void kernel_repeat(
-    global const char * src0_data_in,
-    global       char * dst_data_in,
-    ulong src0_offset,
-    ulong dst_offset,
-    int src0_ne0, int src0_ne1, int src0_ne2, int src0_ne3,
-    ulong src0_nb0, ulong src0_nb1, ulong src0_nb2, ulong src0_nb3,
-    int dst_ne0, int dst_ne1, int dst_ne2, int dst_ne3,
-    ulong dst_nb0, ulong dst_nb1, ulong dst_nb2, ulong dst_nb3
-) {
-    global const char * src0_data = src0_data_in + src0_offset;
-    global       char * dst_data  = dst_data_in + dst_offset;
-
-    const int d3 = get_global_id(2);
-    const int d2 = get_global_id(1);
-    const int d1 = get_global_id(0);
-
-    if (d3 >= dst_ne3 || d2 >= dst_ne2 || d1 >= dst_ne1) {
-        return;
-    }
-
-    const int s3 = d3 % src0_ne3;
-    const int s2 = d2 % src0_ne2;
-    const int s1 = d1 % src0_ne1;
-
-    const global char * p_src0_slice = src0_data + (ulong)s3*src0_nb3 + (ulong)s2*src0_nb2 + (ulong)s1*src0_nb1;
-    global char * p_dst_slice  = dst_data  + (ulong)d3*dst_nb3 + (ulong)d2*dst_nb2 + (ulong)d1*dst_nb1;
-
-    for (int d0 = 0; d0 < dst_ne0; ++d0) {
-        // Determine source index for dimension 0 based on tiling/broadcasting.
-        const int s0 = d0 % src0_ne0;
-
-        const global char * restrict current_src_el_ptr = p_src0_slice + (ulong)s0*src0_nb0;
-        global char * restrict current_dst_el_ptr  = p_dst_slice  + (ulong)d0*dst_nb0;
-        for (int k = 0; k < src0_nb0; ++k) {
-            current_dst_el_ptr[k] = current_src_el_ptr[k];
-        }
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/set_rows.cl
+++ b/ggml/src/ggml-opencl/kernels/set_rows.cl
@ -1,95 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-kernel void kernel_set_rows_f32(
-        global char * src0,
-        ulong         offset0,
-        global char * src1,
-        ulong         offset1,
-        global char * dst,
-        ulong         offsetd,
-        int           ne01,
-        ulong         nb01,
-        ulong         nb02,
-        ulong         nb03,
-        int           ne11,
-        int           ne12,
-        ulong         nb10,
-        ulong         nb11,
-        ulong         nb12,
-        int           nblk0,
-        ulong         nb1,
-        ulong         nb2,
-        ulong         nb3
-) {
-    src0 = src0 + offset0;
-    src1 = src1 + offset1;
-    dst  = dst  + offsetd;
-
-    int i03 = get_group_id(2);
-    int i02 = get_group_id(1);
-    int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
-
-    if (i01 >= ne01) {
-        return;
-    }
-
-    int i12 = i03%ne12;
-    int i11 = i02%ne11;
-
-    int i10 = i01;
-    long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
-
-    global float * dst_row = (global float *) (dst  +  i1*nb1  + i02*nb2  + i03*nb3);
-    global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03);
-
-    for (int ind = get_local_id(0); ind < nblk0; ind += get_local_size(0)) {
-        dst_row[ind] = (float)src_row[ind];
-    }
-}
-
-kernel void kernel_set_rows_f16(
-        global char * src0,
-        ulong         offset0,
-        global char * src1,
-        ulong         offset1,
-        global char * dst,
-        ulong         offsetd,
-        int           ne01,
-        ulong         nb01,
-        ulong         nb02,
-        ulong         nb03,
-        int           ne11,
-        int           ne12,
-        ulong         nb10,
-        ulong         nb11,
-        ulong         nb12,
-        int           nblk0,
-        ulong         nb1,
-        ulong         nb2,
-        ulong         nb3
-) {
-    src0 = src0 + offset0;
-    src1 = src1 + offset1;
-    dst  = dst  + offsetd;
-
-    int i03 = get_group_id(2);
-    int i02 = get_group_id(1);
-    int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
-
-    if (i01 >= ne01) {
-        return;
-    }
-
-    int i12 = i03%ne12;
-    int i11 = i02%ne11;
-
-    int i10 = i01;
-    long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
-
-    global half  * dst_row = (global half  *) (dst  +  i1*nb1  + i02*nb2  + i03*nb3);
-    global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03);
-
-    for (int ind = get_local_id(0); ind < nblk0; ind += get_local_size(0)) {
-        dst_row[ind] = src_row[ind];
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/sigmoid.cl
+++ b/ggml/src/ggml-opencl/kernels/sigmoid.cl
@ -1,29 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-//------------------------------------------------------------------------------
-// sigmoid
-//------------------------------------------------------------------------------
-
-kernel void kernel_sigmoid_f32(
-        global float * src0,
-        ulong offset0,
-        global float * dst,
-        ulong offsetd
-) {
-    src0 = (global float*)((global char*)src0 + offset0);
-    dst = (global float*)((global char*)dst + offsetd);
-
-    dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
-}
-
-kernel void kernel_sigmoid_f16(
-        global half * src0,
-        ulong offset0,
-        global half * dst,
-        ulong offsetd
-) {
-    src0 = (global half*)((global char*)src0 + offset0);
-    dst = (global half*)((global char*)dst + offsetd);
-
-    dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
-}
--- a/ggml/src/ggml-opencl/kernels/sub.cl
+++ b/ggml/src/ggml-opencl/kernels/sub.cl
@ -1,138 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-//------------------------------------------------------------------------------
-// sub
-//------------------------------------------------------------------------------
-kernel void kernel_sub(
-        global char * src0,
-        ulong offset0,
-        global char * src1,
-        ulong offset1,
-        global char * dst,
-        ulong offsetd,
-        ulong nb00,
-        ulong nb01,
-        ulong nb02,
-        ulong nb03,
-        int ne10,
-        int ne11,
-        int ne12,
-        int ne13,
-        ulong nb10,
-        ulong nb11,
-        ulong nb12,
-        ulong nb13,
-        int ne0,
-        ulong nb0,
-        ulong nb1,
-        ulong nb2,
-        ulong nb3
-) {
-    src0 = src0 + offset0;
-    src1 = src1 + offset1;
-    dst  = dst + offsetd;
-
-    int i03 = get_group_id(2);
-    int i02 = get_group_id(1);
-    int i01 = get_group_id(0);
-
-    int i13 = i03 % ne13;
-    int i12 = i02 % ne12;
-    int i11 = i01 % ne11;
-
-    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
-    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
-    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
-
-    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
-        const int i10 = i0 % ne10;
-        *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) - *((global float *)(src1_ptr + i10*nb10));
-    }
-}
-
-// assumption: src1 is a row
-// broadcast src1 into src0
-kernel void kernel_sub_row(
-        global float4 * src0,
-        ulong offset0,
-        global float4 * src1,
-        ulong offset1,
-        global float4 * dst,
-        ulong offsetd,
-        int ne
-) {
-    src0 = (global float4*)((global char*)src0 + offset0);
-    src1 = (global float4*)((global char*)src1 + offset1);
-    dst = (global float4*)((global char*)dst + offsetd);
-
-    // This performs better than using %.
-    uint gid = get_global_id(0);
-    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
-    dst[gid] = src0[gid] - src1[idx1];
-}
-
-kernel void kernel_sub_f16(
-        global char * src0,
-        ulong offset0,
-        global char * src1,
-        ulong offset1,
-        global char * dst,
-        ulong offsetd,
-        ulong nb00,
-        ulong nb01,
-        ulong nb02,
-        ulong nb03,
-        int ne10,
-        int ne11,
-        int ne12,
-        int ne13,
-        ulong nb10,
-        ulong nb11,
-        ulong nb12,
-        ulong nb13,
-        int ne0,
-        ulong nb0,
-        ulong nb1,
-        ulong nb2,
-        ulong nb3
-) {
-    src0 = src0 + offset0;
-    src1 = src1 + offset1;
-    dst  = dst + offsetd;
-
-    int i03 = get_group_id(2);
-    int i02 = get_group_id(1);
-    int i01 = get_group_id(0);
-
-    int i13 = i03 % ne13;
-    int i12 = i02 % ne12;
-    int i11 = i01 % ne11;
-
-    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
-    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
-    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
-
-    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
-        const int i10 = i0 % ne10;
-        *((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) - *((global half *)(src1_ptr + i10*nb10));
-    }
-}
-
-kernel void kernel_sub_row_f16(
-        global half4 * src0,
-        ulong offset0,
-        global half4 * src1,
-        ulong offset1,
-        global half4 * dst,
-        ulong offsetd,
-        int ne
-) {
-    src0 = (global half4*)((global char*)src0 + offset0);
-    src1 = (global half4*)((global char*)src1 + offset1);
-    dst = (global half4*)((global char*)dst + offsetd);
-
-    // This performs better than using %.
-    uint gid = get_global_id(0);
-    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
-    dst[gid] = src0[gid] - src1[idx1];
-}
--- a/ggml/src/ggml-opencl/kernels/sum_rows.cl
+++ b/ggml/src/ggml-opencl/kernels/sum_rows.cl
@ -1,39 +0,0 @@
-
-kernel void kernel_sum_rows_f32(
-    global float *  src0,
-    ulong           offset0,
-    global float *  dst,
-    ulong           offsetd,
-    int             ne00,
-    int             ne01,
-    int             ne02,
-    int             ne03,
-    ulong           nb01,
-    ulong           nb02,
-    ulong           nb03,
-    ulong           nb1,
-    ulong           nb2,
-    ulong           nb3
-) {
-    src0 = (global float *)((global char *)src0 + offset0);
-    dst  = (global float *)((global char *)dst  + offsetd);
-
-    int i3 = get_global_id(2);
-    int i2 = get_global_id(1);
-    int i1 = get_global_id(0);
-
-    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
-        return;
-    }
-
-    global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
-    global float * dst_row = (global float *) ((global char *) dst  + i1*nb1  + i2*nb2  + i3*nb3);
-
-    float row_sum = 0;
-
-    for (int i0 = 0; i0 < ne00; i0++) {
-        row_sum += src_row[i0];
-    }
-
-    dst_row[0] = row_sum;
-}
--- a/ggml/src/ggml-opencl/kernels/tanh.cl
+++ b/ggml/src/ggml-opencl/kernels/tanh.cl
@ -1,63 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#ifdef cl_intel_required_subgroup_size
-#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
-#define INTEL_GPU 1
-#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
-#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
-#elif defined(cl_qcom_reqd_sub_group_size)
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#endif
-
-kernel void kernel_tanh_f32_nd(
-    global void * p_src0_base, ulong off_src0_abs,
-    global void * p_dst_base,  ulong off_dst_abs,
-    int ne00, int ne01, int ne02, int ne03,
-    ulong nb00, ulong nb01, ulong nb02, ulong nb03,
-    int ne10, int ne11, int ne12, int ne13,
-    ulong nb10, ulong nb11, ulong nb12, ulong nb13
-) {
-    int i0 = get_global_id(0);
-    int i1 = get_global_id(1);
-    int i2 = get_global_id(2);
-
-    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
-        for (int i3 = 0; i3 < ne13; ++i3) {
-            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
-            global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
-
-            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
-            global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
-
-            *dst_val_ptr = tanh(*src_val_ptr);
-        }
-    }
-}
-
-kernel void kernel_tanh_f16_nd(
-    global void * p_src0_base, ulong off_src0_abs,
-    global void * p_dst_base,  ulong off_dst_abs,
-    int ne00, int ne01, int ne02, int ne03,
-    ulong nb00, ulong nb01, ulong nb02, ulong nb03,
-    int ne10, int ne11, int ne12, int ne13,
-    ulong nb10, ulong nb11, ulong nb12, ulong nb13
-) {
-    int i0 = get_global_id(0);
-    int i1 = get_global_id(1);
-    int i2 = get_global_id(2);
-
-    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
-        for (int i3 = 0; i3 < ne13; ++i3) {
-            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
-            global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
-
-            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
-            global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
-
-            *dst_val_ptr = tanh(*src_val_ptr);
-        }
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/tsembd.cl
+++ b/ggml/src/ggml-opencl/kernels/tsembd.cl
@ -1,48 +0,0 @@
-kernel void kernel_timestep_embedding(
-    global const void * p_timesteps,
-    ulong off_timesteps,
-    global void * p_dst,
-    ulong off_dst,
-    int dst_nb1_bytes,
-    int logical_dim,
-    int max_period
-) {
-    int local_i;
-    int local_j;
-    int local_half_dim;
-    float local_timestep_val;
-    float local_freq;
-    float local_arg;
-    global float * local_embed_data_ptr;
-    global const float * local_timesteps_input_ptr;
-    global float * local_dst_output_base_ptr;
-
-    local_timesteps_input_ptr = (global const float *)((global char *)p_timesteps + off_timesteps);
-    local_dst_output_base_ptr = (global float *)((global char *)p_dst + off_dst);
-
-    local_i = get_global_id(1);
-    local_j = get_global_id(0);
-
-    local_half_dim = logical_dim / 2;
-    local_embed_data_ptr = (global float *)((global char *)local_dst_output_base_ptr + local_i * dst_nb1_bytes);
-
-    if (logical_dim % 2 != 0 && local_j == ((logical_dim + 1) / 2)) {
-        local_embed_data_ptr[logical_dim] = 0.0f;
-    }
-
-    if (local_j >= local_half_dim) {
-        return;
-    }
-
-    local_timestep_val = local_timesteps_input_ptr[local_i];
-
-    if (local_half_dim == 0) {
-        local_freq = 1.0f;
-    } else {
-        local_freq = exp(-log((float)max_period) * (float)local_j / (float)local_half_dim);
-    }
-
-    local_arg = local_timestep_val * local_freq;
-    local_embed_data_ptr[local_j] = cos(local_arg);
-    local_embed_data_ptr[local_j + local_half_dim] = sin(local_arg);
-}
--- a/ggml/src/ggml-opencl/kernels/upscale.cl
+++ b/ggml/src/ggml-opencl/kernels/upscale.cl
@ -1,120 +0,0 @@
-kernel void kernel_upscale(
-    global const void * p_src0,
-    ulong off_src0,
-    global void * p_dst,
-    ulong off_dst,
-    ulong nb00,
-    ulong nb01,
-    ulong nb02,
-    ulong nb03,
-    int ne10,
-    int ne11,
-    int ne12,
-    int ne13,
-    float sf0,
-    float sf1,
-    float sf2,
-    float sf3
-) {
-    global const char * src_base = (global const char *)p_src0 + off_src0;
-    global float * dst_base = (global float *)((global char *)p_dst + off_dst);
-
-    int index = get_global_id(0);
-    int dst_total_elements = ne10 * ne11 * ne12 * ne13;
-
-    if (index >= dst_total_elements) {
-        return;
-    }
-
-    int i10 = index % ne10;
-    int i11 = (index / ne10) % ne11;
-    int i12 = (index / (ne10 * ne11)) % ne12;
-    int i13 = index / (ne10 * ne11 * ne12);
-
-    int i00 = (int)(i10 / sf0);
-    int i01 = (int)(i11 / sf1);
-    int i02 = (int)(i12 / sf2);
-    int i03 = (int)(i13 / sf3);
-
-    ulong offset_src_element = (ulong)i03 * nb03 + (ulong)i02 * nb02 + (ulong)i01 * nb01 + (ulong)i00 * nb00;
-    global const float * src_element_ptr = (global const float *)(src_base + offset_src_element);
-
-    dst_base[index] = *src_element_ptr;
-}
-
-kernel void kernel_upscale_bilinear(
-    global const void * p_src0,
-    ulong off_src0,
-    global void * p_dst,
-    ulong off_dst,
-    ulong nb00,
-    ulong nb01,
-    ulong nb02,
-    ulong nb03,
-    int ne00_src,
-    int ne01_src,
-    int ne10_dst,
-    int ne11_dst,
-    int ne12_dst,
-    int ne13_dst,
-    float sf0,
-    float sf1,
-    float sf2,
-    float sf3,
-    float pixel_offset
-) {
-    global const char * src_base = (global const char *)p_src0 + off_src0;
-    global float * dst_base = (global float *)((global char *)p_dst + off_dst);
-
-    int index = get_global_id(0);
-    int dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
-
-    if (index >= dst_total_elements) {
-        return;
-    }
-
-    int i10_dst = index % ne10_dst;
-    int i11_dst = (index / ne10_dst) % ne11_dst;
-    int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst;
-    int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst);
-
-    int i02_src = (int)(i12_dst / sf2);
-    int i03_src = (int)(i13_dst / sf3);
-
-    float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset;
-    long y0_src = (long)floor(y_src_f);
-    long y1_src = y0_src + 1;
-
-    y0_src = max(0L, min(y0_src, (long)ne01_src - 1));
-    y1_src = max(0L, min(y1_src, (long)ne01_src - 1));
-
-    float dy = y_src_f - (float)y0_src;
-    dy = max(0.0f, min(dy, 1.0f));
-
-    float x_src_f = ((float)i10_dst + pixel_offset) / sf0 - pixel_offset;
-    long x0_src = (long)floor(x_src_f);
-    long x1_src = x0_src + 1;
-
-    x0_src = max(0L, min(x0_src, (long)ne00_src - 1));
-    x1_src = max(0L, min(x1_src, (long)ne00_src - 1));
-
-    float dx = x_src_f - (float)x0_src;
-    dx = max(0.0f, min(dx, 1.0f));
-
-    global const float * p_a = (global const float *)(src_base + (ulong)x0_src * nb00 + (ulong)y0_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03);
-    global const float * p_b = (global const float *)(src_base + (ulong)x1_src * nb00 + (ulong)y0_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03);
-    global const float * p_c = (global const float *)(src_base + (ulong)x0_src * nb00 + (ulong)y1_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03);
-    global const float * p_d = (global const float *)(src_base + (ulong)x1_src * nb00 + (ulong)y1_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03);
-
-    const float val_a = *p_a;
-    const float val_b = *p_b;
-    const float val_c = *p_c;
-    const float val_d = *p_d;
-
-    float result = val_a * (1.0f - dx) * (1.0f - dy) +
-                   val_b * dx * (1.0f - dy) +
-                   val_c * (1.0f - dx) * dy +
-                   val_d * dx * dy;
-
-    dst_base[index] = result;
-}
--- a/ggml/src/ggml-sycl/quants.hpp
+++ b/ggml/src/ggml-sycl/quants.hpp
@ -1,110 +0,0 @@
-//
-// MIT license
-// Copyright (C) 2025 Codeplay Software Ltd.
-// Copyright (C) 2025 Intel Corporation
-// SPDX-License-Identifier: MIT
-//
-
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-
-#ifndef GGML_SYCL_QUANTS_HPP
-#define GGML_SYCL_QUANTS_HPP
-
-#include <utility>
-
-#include "ggml-common.h"
-#include "ggml.h"
-
-namespace ggml_sycl_reordered {
-
-// The reordered block moves the quants (qs) and the scales (d) into two
-// uniform regions of memory that are contiguous within the same tensor.
-// What this means is that instead of having:
-// [d0, qs0] [d1, qs1] [d2, qs2] ... [dN, qsN]
-// We have:
-// [qs0, qs1, qs2, ..., qsN]  [d0, d1, d2, ..., dN]
-//
-// Notes: out-of-bounds qs will run into d values
-// Alignment relies on the allocated size of qs
-
-template <ggml_type type> struct block_q_t;
-
-// qk number of weights / quants in a block
-// qr number of weights in a byte (described as 'before dequantization')
-//    for quantization types that have low and high bits split, qr is calculated
-//    using the lower bits, e.g. for Q6 quants QR6 is 2
-// qi number of 32 bit integers needed to represent all the quants from a block (`qs` field)
-// See ggml-common.h to see how these are calculated
-template <> struct block_q_t<GGML_TYPE_Q4_0> {
-    struct traits {
-        static constexpr uint32_t qk       = QK4_0;
-        static constexpr uint32_t qi       = QI4_0;
-        static constexpr uint32_t qr       = QR4_0;
-        static constexpr uint32_t vdr_mmvq = 2;
-    };
-
-    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
-        return { block_index * (QK4_0 / QR4_0), 0 };
-    }
-
-    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
-        return { (ncols / QR4_0 * nrows) + block_index * sizeof(ggml_half), 0 };
-    }
-
-    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
-};
-
-template <> struct block_q_t<GGML_TYPE_Q4_K> {
-    struct traits {
-        static constexpr uint32_t qk       = QK_K;
-        static constexpr uint32_t qi       = QI4_K;
-        static constexpr uint32_t qr       = QR4_K;
-        static constexpr uint32_t vdr_mmvq = 2;
-    };
-
-    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
-        return { block_index * (traits::qk / traits::qr), 0 };
-    }
-
-    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
-        auto nblocks = (nrows * (ncols / QK_K));
-        return { nblocks * (QK_K / 2) + (block_index * K_SCALE_SIZE),
-                 (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) };
-    }
-
-    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
-};
-
-template <> struct block_q_t<GGML_TYPE_Q6_K> {
-    struct traits {
-        static constexpr uint32_t qk       = QK_K;
-        static constexpr uint32_t qi       = QI6_K;
-        static constexpr uint32_t qr       = QR6_K;
-        static constexpr uint32_t vdr_mmvq = 1;
-    };
-
-    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
-        auto low_bits_index  = block_index * (QK_K / QR6_K);
-        // the high bits are stored after all of the low bits
-        auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4));
-        return { low_bits_index, high_bits_index };
-    }
-
-    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
-        auto nblocks        = (nrows * (ncols / QK_K));
-        auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4);
-        auto block_scales   = total_qs_bytes + block_index * (QK_K / 16);
-        auto sb_scale       = total_qs_bytes + nblocks * (QK_K / 16) + block_index * sizeof(ggml_half);
-        return { block_scales, sb_scale };
-    }
-
-    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
-};
-
-}  // namespace ggml_sycl_reordered
-
-#endif  // GGML_SYCL_QUANTS_HPP
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
--- a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl
@ -1,60 +0,0 @@
-enable f16;
-
-@group(0) @binding(0)
-var<storage, read_write> src: array<f32>;
-
-@group(0) @binding(1)
-var<storage, read_write> dst: array<f16>;
-
-struct Params {
-    ne: u32,            // total number of elements
-    offset_src: u32,    // in elements
-    offset_dst: u32,    // in elements
-
-    // Strides (in elements) — may be permuted
-    stride_src0: u32,
-    stride_src1: u32,
-    stride_src2: u32,
-    stride_src3: u32,
-
-    stride_dst0: u32,
-    stride_dst1: u32,
-    stride_dst2: u32,
-    stride_dst3: u32,
-
-    // Logical shape (same for both tensors)
-    ne0: u32,
-    ne1: u32,
-    ne2: u32,
-    ne3: u32,
-};
-
-@group(0) @binding(2)
-var<uniform> params: Params;
-
-override wg_size: u32;
-@compute @workgroup_size(wg_size)
-fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    if (gid.x >= params.ne) {
-        return;
-    }
-
-    var i = gid.x;
-
-    let i3 = i / (params.ne2 * params.ne1 * params.ne0);
-    i = i % (params.ne2 * params.ne1 * params.ne0);
-
-    let i2 = i / (params.ne1 * params.ne0);
-    i = i % (params.ne1 * params.ne0);
-
-    let i1 = i / params.ne0;
-    let i0 = i % params.ne0;
-
-    let src_idx = i0 * params.stride_src0 + i1 * params.stride_src1 +
-                  i2 * params.stride_src2 + i3 * params.stride_src3;
-
-    let dst_idx = i0 * params.stride_dst0 + i1 * params.stride_dst1 +
-                  i2 * params.stride_dst2 + i3 * params.stride_dst3;
-
-    dst[params.offset_dst + dst_idx] = f16(src[params.offset_src + src_idx]);
-}
--- a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py
@ -1,35 +0,0 @@
-import os
-import argparse
-
-
-def escape_triple_quotes(wgsl):
-    # Simple defense in case of embedded """
-    return wgsl.replace('"""', '\\"""')
-
-
-def to_cpp_string_literal(varname, content):
-    return f'const char* wgsl_{varname} = R"({content})";\n'
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--input', required=True)
-    parser.add_argument('--output', required=True)
-    args = parser.parse_args()
-
-    with open(args.output, 'w', encoding='utf-8') as out:
-        out.write("// Auto-generated shader embedding \n\n")
-        for fname in sorted(os.listdir(args.input)):
-            if not fname.endswith('.wgsl'):
-                continue
-            shader_path = os.path.join(args.input, fname)
-            varname = os.path.splitext(fname)[0]
-            with open(shader_path, 'r', encoding='utf-8') as f:
-                content = f.read()
-            content = escape_triple_quotes(content)
-            out.write(to_cpp_string_literal(varname, content))
-            out.write('\n')
-
-
-if __name__ == '__main__':
-    main()
--- a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl
@ -1,40 +0,0 @@
-@group(0) @binding(0)
-var<storage, read_write> output_buffer: array<u32>;
-
-struct Params {
-    offset: u32, // in bytes
-    size: u32,   // in bytes
-    value: u32,  // 4 8-bit values, which are either repeating (memset_tensor) or may be separate (cleaning up unaligned set_tensor operations)
-};
-
-@group(0) @binding(1)
-var<uniform> params: Params;
-
-override wg_size: u32;
-override bytes_per_thread: u32;
-
-@compute @workgroup_size(wg_size)
-fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let i = gid.x * bytes_per_thread;
-    let start = params.offset;
-    let end = params.offset + params.size;
-
-    for (var j: u32 = 0u; j < bytes_per_thread; j = j + 1u) {
-        let byte_index = start + i + j;
-        if (byte_index + 4u <= end) {
-            output_buffer[(byte_index >> 2u)] = params.value;
-        } else {
-            // Handle tail (unaligned)
-            for (var k: u32 = 0u; k < 4u; k = k + 1u) {
-                let idx = byte_index + k;
-                if (idx < end) {
-                    let word_idx = idx >> 2u;
-                    let byte_offset = (idx & 3u) * 8u;
-                    let mask = ~(0xffu << byte_offset);
-                    let existing = output_buffer[word_idx];
-                    output_buffer[word_idx] = (existing & mask) | ((params.value & 0xffu) << byte_offset);
-                }
-            }
-        }
-    }
-}
--- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl
@ -1,56 +0,0 @@
-struct MulMatParams {
-    m: u32,
-    n: u32,
-    k: u32,
-    // all strides are in elements
-    stride_01: u32,
-    stride_11: u32,
-    stride_02: u32,
-    stride_12: u32,
-    stride_03: u32,
-    stride_13: u32,
-
-    bs02: u32,
-    bs03: u32,
-    broadcast2: u32,
-    broadcast3: u32
-};
-
-@group(0) @binding(0) var<storage, read_write> src0: array<f32>; // N rows, K columns
-@group(0) @binding(1) var<storage, read_write> src1: array<f32>; // M rows, K columns (transposed)
-@group(0) @binding(2) var<storage, read_write> dst: array<f32>; // M rows, N columns
-
-@group(0) @binding(3) var<uniform> params: MulMatParams;
-
-@compute @workgroup_size(64)
-fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
-    let total = params.m * params.n * params.bs02 * params.broadcast2 * params.bs03 * params.broadcast3;
-    if (global_id.x >= total) {
-        return;
-    }
-
-    let dst2_stride = params.m * params.n;
-    let dst3_stride = dst2_stride * params.bs02 * params.broadcast2;
-
-    let dst3_idx = global_id.x / dst3_stride;
-    let src03_idx = dst3_idx / params.broadcast3; // src0 may be broadcast along the third dimension
-    let src13_idx = dst3_idx; // src1 is not broadcast
-    let dst3_rem = global_id.x % dst3_stride;
-
-    let dst2_idx = dst3_rem / dst2_stride;
-    let src02_idx = dst2_idx / params.broadcast2; // src0 may also be broadcast along the second dimension
-    let src12_idx = dst2_idx; // src1 is not broadcast
-
-    let dst2_rem = dst3_rem % dst2_stride;
-
-    let row = dst2_rem / params.n; // output row
-    let col = dst2_rem % params.n; // output column
-
-    var sum = 0.0;
-    for (var i: u32 = 0u; i < params.k; i = i + 1u) {
-        let src0_idx = src03_idx * params.stride_03 + src02_idx * params.stride_02 + col * params.stride_01 + i;
-        let src1_idx = src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11 + i;
-        sum = sum + src0[src0_idx] * src1[src1_idx];
-    }
-    dst[dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum;
-}
--- a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl
@ -1,82 +0,0 @@
-enable f16;
-
-@group(0) @binding(0)
-var<storage, read_write> src: array<f32>;
-
-@group(0) @binding(1)
-var<storage, read_write> idx: array<u32>;
-
-@group(0) @binding(2)
-var<storage, read_write> dst: array<f16>;
-
-@group(0) @binding(3)
-var<storage, read_write> error: atomic<u32>;
-
-struct Params {
-    offset_src: u32, // in elements
-    offset_idx: u32, // in elements
-    offset_dst: u32, // in elements
-
-    // Strides (in elements)
-    stride_src1: u32,
-    stride_src2: u32,
-    stride_src3: u32,
-
-    stride_idx0: u32,
-    stride_idx1: u32,
-    stride_idx2: u32,
-
-    stride_dst1: u32,
-    stride_dst2: u32,
-    stride_dst3: u32,
-
-    // Shape of src
-    ne0: u32,
-    n_rows: u32,
-    ne2: u32,
-    ne3: u32,
-
-    // Shape of idx
-    idx1: u32,
-    idx2: u32,
-};
-
-@group(0) @binding(4)
-var<uniform> params: Params;
-
-override wg_size: u32;
-@compute @workgroup_size(wg_size)
-fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    if (gid.x >= params.n_rows * params.ne2 * params.ne3) {
-        return;
-    }
-    var i = gid.x;
-    let i_src3 = i / (params.ne2 * params.n_rows);
-    let i_dst3 = i / (params.ne2 * 3);
-
-    i = i % (params.ne2 * params.n_rows);
-    let i_src2 = i / params.n_rows;
-    let i_src1 = i % params.n_rows;
-
-    let i_idx2 = i_src3 % params.idx2;
-    let i_idx1 = i_src2 % params.idx1;
-    let i_idx0 = i_src1;
-
-    let idx_high = (params.offset_idx + i_idx0 * params.stride_idx0 + i_idx1 * params.stride_idx1 + i_idx2 * params.stride_idx2) * 2;
-
-    let idx_high_val = idx[idx_high];
-    let idx_low_val = idx[idx_high + 1];
-
-    if (idx_low_val != 0) {
-        // Upper bits of index are not zero, output will be incorrect
-        atomicStore(&error, 1);
-        return;
-    }
-
-    let i_dst_row = params.offset_dst + idx_high_val * params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3;
-    let i_src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3;
-
-    for (var i: u32 = 0; i < params.ne0; i++) {
-      dst[i_dst_row + i] = f16(src[i_src_row + i]);
-    }
-}
--- a/ggml/src/ggml-zdnn/ggml-zdnn-impl.h
+++ b/ggml/src/ggml-zdnn/ggml-zdnn-impl.h
@ -1,97 +0,0 @@
-#ifndef GGML_ZDNN_IMPL
-#define GGML_ZDNN_IMPL
-
-#include "zdnn.h"
-#include "ggml.h"
-#include "ggml-zdnn.h"
-
-#include <vector>
-#include <memory>
-#include <vecintrin.h>
-
-#define GGML_ZDNN_NAME    "zDNN"
-#define GGML_ZDNN_VERSION ZDNN_VERNUM
-
-#define vec_neg(a)    (-(a))                // Vector Negate
-#define vec_add(a, b) ((a) + (b))           // Vector Add
-#define vec_sub(a, b) ((a) - (b))           // Vector Subtract
-#define vec_mul(a, b) ((a) * (b))           // Vector Multiply
-#define vec_div(a, b) ((a) / (b))           // Vector Divide
-#define vec_sl(a, b)  ((a) << (b))          // Vector Shift Left
-#define vec_sra(a, b) ((a) >> (b))          // Vector Shift Right Algebraic
-#define vec_sr(a, b)  ((a) >> (b))          // Vector Shift Right
-#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
-#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet
-
-#ifndef vec_and
-#define vec_and(a, b) ((a) & (b)) // Vector AND
-#endif
-
-#ifndef vec_or
-#define vec_or(a, b)  ((a) | (b)) // Vector OR
-#endif
-
-#ifndef vec_xor
-#define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
-#endif
-
-typedef   signed char char8x16_t  __attribute__((vector_size(16))); // all types below are 16-byte vectors; names read <element><bits>x<lanes>
-typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));
-
-typedef int8_t   int8x16_t  __attribute__((vector_size(16)));
-typedef int16_t  int16x8_t  __attribute__((vector_size(16)));
-typedef int32_t  int32x4_t  __attribute__((vector_size(16)));
-typedef uint8_t  uint8x16_t __attribute__((vector_size(16)));
-typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
-typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
-
-typedef float float32x4_t   __attribute__((vector_size(16)));
-typedef double double64x2_t __attribute__((vector_size(16)));
-
-typedef   signed long long long64x2_t  __attribute__((vector_size(16)));
-typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));
-
-// Evaluates a zDNN call exactly once and asserts that it returned ZDNN_OK.
-// The macro deliberately does not end in ';' - the caller writes it - so that
-// `ZDNN_CHECK(x);` is a single statement and stays safe inside an unbraced
-// if/else.
-#define ZDNN_CHECK(stmt)                \
-    do {                                \
-        zdnn_status status = (stmt);    \
-        GGML_ASSERT(status == ZDNN_OK); \
-    } while (0)
-
-struct ggml_backend_zdnn_device_context {
-    int zdnn_device; // 0 while released, set to 1 by ggml_backend_zdnn_device_acq
-    int zdnn_device_ref_count; // incremented on acquire, decremented on release
-
-    bool has_parmblkformat_0; // result of zdnn_is_nnpa_parmblk_fmt_installed(1, NNPA_PARMBLKFORMAT_0)
-    bool has_parmblkformat_1; // result of zdnn_is_nnpa_parmblk_fmt_installed(1, NNPA_PARMBLKFORMAT_1)
-
-    size_t max_size; // zdnn_get_nnpa_max_dim_idx_size(); upper bound applied to matmul dimensions in supports_op
-
-    char name[128]; // filled with GGML_ZDNN_NAME on acquire
-};
-
-struct ggml_backend_zdnn_context {
-    int device; // set to 1 by ggml_zdnn_init
-    ggml_cgraph * gf; // graph passed to the most recent graph_compute call; nullptr after init
-};
-
-struct ggml_backend_zdnn_buffer {
-    void * data; // host memory: the tensor's data pointer, or the base of the whole allocation
-    size_t size; // size of `data` in bytes
-
-    zdnn_tensor_desc pre_tfm_desc; // descriptor filled by zdnn_init_pre_transformed_desc
-    zdnn_tensor_desc tfm_desc; // descriptor generated from pre_tfm_desc
-    zdnn_ztensor     ztensor; // zDNN tensor created by zdnn_init_ztensor_with_malloc
-
-    char name[GGML_MAX_NAME]; // copy of the ggml tensor name
-};
-
-struct ggml_backend_zdnn_buffer_context {
-    void * all_data; // base address reported by get_base
-    size_t all_size; // number of bytes covered by all_data
-    bool owned; // true when all_data came from ggml_aligned_malloc, false when wrapping a caller pointer
-
-    int n_buffers; // bookkeeping counter kept alongside `buffers`
-    std::vector<std::unique_ptr<ggml_backend_zdnn_buffer>> buffers; // per-tensor entries; tensor->extra points into these
-};
-
-#endif  // GGML_ZDNN_IMPL
--- a/ggml/src/ggml-zdnn/ggml-zdnn.cpp
+++ b/ggml/src/ggml-zdnn/ggml-zdnn.cpp
@ -1,846 +0,0 @@
-#include "zdnn.h"
-#include "ggml-zdnn.h"
-#include "ggml-zdnn-impl.h"
-
-#include "ggml-impl.h"
-#include "ggml-backend-impl.h"
-
-#include <vector>
-#include <memory>
-#include <csignal>
-#include <unistd.h>
-
-// Maps a ggml element type onto the zDNN data type used when building tensor
-// descriptors. Types without a mapping abort the process.
-inline zdnn_data_types ggml_zdnn_type_mapping(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_F32:  return FP32;
-        case GGML_TYPE_F16:  return FP16;
-        case GGML_TYPE_BF16: return BFLOAT;
-        case GGML_TYPE_I32:  return INT32;
-        // both 8-bit types are described to zDNN as INT8
-        case GGML_TYPE_I8:
-        case GGML_TYPE_Q8_0: return INT8;
-        default:
-            break;
-    }
-
-    GGML_ABORT("%s: fatal: unable to determine zTensor data type",
-               __func__);
-}
-
-// Builds the pre-transformed and transformed descriptors for a tensor of shape
-// `ne` (element type taken from `src`) and allocates `ztensor` for it.
-// The four extents are handed to zDNN in reverse ggml order (ne[3] first).
-inline void ggml_zdnn_create_tensor(zdnn_tensor_desc  & pre_tfm_desc,
-                                    zdnn_tensor_desc  & tfm_desc,
-                                    zdnn_ztensor      & ztensor,
-                              const ggml_tensor       * src,
-                              const int64_t           * ne,
-                              const zdnn_data_layouts   layout) {
-    const zdnn_data_types dtype = ggml_zdnn_type_mapping(src->type);
-
-    zdnn_init_pre_transformed_desc(layout, dtype, &pre_tfm_desc,
-                                   ne[3], ne[2], ne[1], ne[0]);
-
-    ZDNN_CHECK(zdnn_generate_transformed_desc(&pre_tfm_desc, &tfm_desc));
-    ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&pre_tfm_desc, &tfm_desc, &ztensor));
-}
-
-// Transforms the host data at `buffer` into `ztensor`; asserts on failure.
-inline void ggml_zdnn_load_tensor(zdnn_ztensor & ztensor, void * buffer) {
-    ZDNN_CHECK(zdnn_transform_ztensor(&ztensor, buffer));
-}
-
-// Fills the descriptors of `buffer` for `tensor` and allocates its ztensor.
-// MUL_MAT results are described as 2D (ne[1] x ne[0]); every other tensor is
-// described as a 4D NHWC tensor.
-inline void ggml_zdnn_init_tensor(ggml_backend_zdnn_buffer * buffer, const ggml_tensor * tensor) {
-    const zdnn_data_types dtype = ggml_zdnn_type_mapping(tensor->type);
-
-    if (tensor->op == GGML_OP_MUL_MAT) {
-        zdnn_init_pre_transformed_desc(ZDNN_2D, dtype, &buffer->pre_tfm_desc,
-                                       tensor->ne[1], tensor->ne[0]);
-    } else {
-        // For 4D tensors, GGML uses NCHW layout. However, because zDNN
-        // automatically transforms everything to NHWC, we will use it
-        // directly to avoid the performance penalty changing the
-        // layout and reshaping the tensor.
-        zdnn_init_pre_transformed_desc(ZDNN_NHWC, dtype, &buffer->pre_tfm_desc,
-                                       tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0]);
-
-        // TODO: Consider adding a ggml check.
-        // TODO: If tensor = 4D, use ZDNN_NCHW by default.
-        // TODO: If tensor = 2D, use ZDNN_NHWC by default.
-    }
-
-    ZDNN_CHECK(zdnn_generate_transformed_desc(&buffer->pre_tfm_desc, &buffer->tfm_desc));
-    ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&buffer->pre_tfm_desc, &buffer->tfm_desc, &buffer->ztensor));
-}
-
-// Runs one matrix multiplication through zdnn_matmul_transpose_op, using src1
-// as the input matrix and src0 as the (transposed) weight matrix, then
-// transforms the result back into dst->data.
-//
-// The ztensors of src0, src1 and dst are taken from tensor->extra, which is
-// set up by ggml_backend_zdnn_buffer_init_tensor. The op is invoked with
-// MATMUL_OP_ADDITION, so a temporary all-zero bias is created for the call.
-static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const enum ggml_type type = src0->type;
-
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    // the shared inner dimension must agree; GGML_ASSERT is used instead of
-    // assert() so the check is not compiled out when NDEBUG is defined
-    GGML_ASSERT(ne10 == ne00);
-
-    const ggml_tensor * weights = src0;
-    const ggml_tensor * inputs  = src1;
-          ggml_tensor * output  = dst;
-
-    ggml_backend_zdnn_buffer * weights_extra = (ggml_backend_zdnn_buffer *)weights->extra;
-    ggml_backend_zdnn_buffer * inputs_extra  = (ggml_backend_zdnn_buffer *)inputs->extra;
-    ggml_backend_zdnn_buffer * output_extra  = (ggml_backend_zdnn_buffer *)output->extra;
-
-    zdnn_tensor_desc ptd_bias, td_bias;
-    zdnn_ztensor zt_bias;
-
-    // one zero bias value per output column
-    const int64_t bias_dim[GGML_MAX_DIMS] = { 1, 1, 1, ne0 };
-    ggml_zdnn_create_tensor(ptd_bias, td_bias, zt_bias, output, bias_dim, ZDNN_1D);
-
-    void * bias_data = calloc(ne0, ggml_element_size(output));
-    GGML_ASSERT(bias_data != NULL && "failed to allocate the zero bias");
-
-    // weights and inputs are transformed lazily, the first time they are used
-    if (weights_extra->ztensor.is_transformed == false) ggml_zdnn_load_tensor(weights_extra->ztensor, weights->data);
-    if (inputs_extra->ztensor.is_transformed == false) ggml_zdnn_load_tensor(inputs_extra->ztensor, inputs->data);
-    ggml_zdnn_load_tensor(zt_bias, bias_data);
-
-    GGML_ASSERT(weights_extra->pre_tfm_desc.dim1 == weights->ne[0] && "weights_extra->pre_tfm_desc.dim1 must match weights->ne[0]");
-    GGML_ASSERT(weights_extra->pre_tfm_desc.dim2 == weights->ne[1] && "weights_extra->pre_tfm_desc.dim2 must match weights->ne[1]");
-    GGML_ASSERT(inputs_extra->pre_tfm_desc.dim1  == inputs->ne[0]  && "inputs_extra->pre_tfm_desc.dim1 must match inputs->ne[0]");
-    GGML_ASSERT(inputs_extra->pre_tfm_desc.dim2  == inputs->ne[1]  && "inputs_extra->pre_tfm_desc.dim2 must match inputs->ne[1]");
-
-    ZDNN_CHECK(zdnn_matmul_transpose_op(&inputs_extra->ztensor, &weights_extra->ztensor, &zt_bias,
-                                        false, true, MATMUL_OP_ADDITION, &output_extra->ztensor));
-    // TODO: Remove in the future as we are currently DLF16 -> FP32 then in the next op, FP32 -> DLF16 again. Inefficient.
-    ZDNN_CHECK(zdnn_transform_origtensor(&output_extra->ztensor, output->data));
-
-    ZDNN_CHECK(zdnn_free_ztensor_buffer(&zt_bias));
-    free(bias_data);
-
-    GGML_UNUSED(ctx);
-}
-
-// Entry point for GGML_OP_MUL_MAT.
-//
-// ggml_zdnn_mul_mat_op is the only matmul implementation in this backend, so
-// every request is forwarded to it. The previous placeholder branches for
-// batched, vector and quantized kernels only logged a message and returned
-// without writing dst; they were also unreachable through the scheduler,
-// because ggml_zdnn_supports_op only accepts F32 x F32 matrices.
-static void ggml_zdnn_mul_mat_dispatch(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_zdnn_mul_mat_op(ctx, src0, src1, dst);
-}
-
-// Executes a single graph node. Returns false when the node's op is not
-// handled here (only GGML_OP_MUL_MAT is), true after the op has been run.
-static bool ggml_zdnn_compute_forward(ggml_backend_zdnn_context * ctx, ggml_tensor * dst) {
-    if (dst->op != GGML_OP_MUL_MAT) {
-        return false;
-    }
-
-    ggml_zdnn_mul_mat_dispatch(ctx, dst->src[0], dst->src[1], dst);
-    return true;
-}
-
-// Executes every node of `gf` in order on this backend.
-// Empty nodes and pure layout ops (NONE/RESHAPE/VIEW/PERMUTE/TRANSPOSE) are
-// skipped. An op that cannot be executed is logged and then trips
-// GGML_ASSERT, so the function only ever returns GGML_STATUS_SUCCESS.
-static enum ggml_status ggml_zdnn_graph_compute(ggml_backend_t backend, ggml_cgraph * gf) {
-    ggml_backend_zdnn_context * ctx = (ggml_backend_zdnn_context *)backend->context;
-
-    // remember which graph is being computed
-    ctx->gf = gf;
-
-    for (int i = 0; i < gf->n_nodes; i++) {
-        ggml_tensor * node = gf->nodes[i];
-
-        const bool nothing_to_do =
-            ggml_is_empty(node)
-            || node->op == GGML_OP_NONE
-            || node->op == GGML_OP_RESHAPE
-            || node->op == GGML_OP_VIEW
-            || node->op == GGML_OP_PERMUTE
-            || node->op == GGML_OP_TRANSPOSE;
-        if (nothing_to_do) {
-            continue;
-        }
-
-        bool ok = ggml_zdnn_compute_forward(ctx, node);
-        if (!ok) {
-            GGML_LOG_ERROR("%s: unsupported op %s (%s)\n",
-                           __func__, node->name, ggml_op_name(node->op));
-        }
-
-        GGML_ASSERT(ok);
-    }
-
-    return GGML_STATUS_SUCCESS;
-}
-
-// Reports whether this backend can execute `op`.
-// Layout-only ops are always accepted. MUL_MAT is accepted only for
-// contiguous, non-view F32 matrices whose dimensions do not exceed the
-// device's max_size. Everything else is rejected.
-static bool ggml_zdnn_supports_op(const ggml_backend_zdnn_device_context * ctx_dev, const ggml_tensor * op) {
-    switch (op->op) {
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_PERMUTE:
-            return true;
-
-        case GGML_OP_MUL_MAT:
-            break;
-
-        default:
-            return false;
-    }
-
-    const ggml_tensor * src0 = op->src[0];
-    const ggml_tensor * src1 = op->src[1];
-
-    if (!ggml_is_matrix(src0) || !ggml_is_matrix(src1)) {
-        return false;
-    }
-    if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
-        return false;
-    }
-    if (src0->view_src != nullptr || src1->view_src != nullptr) {
-        return false;
-    }
-    if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32) {
-        return false;
-    }
-
-    // the output extents and the shared inner dimension must fit the device limit
-    const int64_t limit = ctx_dev->max_size;
-    return op->ne[0] <= limit && op->ne[1] <= limit && src1->ne[0] <= limit;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-//
-// globals
-//
-
-// populated by ggml_backend_zdnn_reg(); zero-initialised until it has been called
-static ggml_backend_reg    g_ggml_backend_zdnn_reg;
-static ggml_backend_device g_ggml_backend_zdnn_device;
-
-static ggml_backend_zdnn_device_context g_ggml_ctx_dev_main = { // the real values are filled in by ggml_backend_zdnn_device_acq()
-    /* .zdnn_device           = */ 0,
-    /* .zdnn_device_ref_count = */ 0,
-    /* .has_parmblkformat_0   = */ false,
-    /* .has_parmblkformat_1   = */ false,
-    /* .max_size              = */ 0,
-    /* .name                  = */ "",
-};
-
-// Takes a reference on the device context. A released device (id 0) is marked
-// as device 1; the NNPA capabilities, dimension limit and name are refreshed
-// whenever the device id is >= 1. Returns the device id.
-static int ggml_backend_zdnn_device_acq(ggml_backend_zdnn_device_context * ctx) {
-    assert(ctx != NULL);
-
-    if (ctx->zdnn_device == 0) {
-        ctx->zdnn_device = 1;
-    }
-
-    const bool device_present = ctx->zdnn_device >= 1;
-    if (device_present) {
-        ctx->has_parmblkformat_0 = zdnn_is_nnpa_parmblk_fmt_installed(1, NNPA_PARMBLKFORMAT_0);
-        ctx->has_parmblkformat_1 = zdnn_is_nnpa_parmblk_fmt_installed(1, NNPA_PARMBLKFORMAT_1);
-        ctx->max_size            = zdnn_get_nnpa_max_dim_idx_size();
-        // copy at most sizeof(name) - 1 bytes, leaving the last byte untouched
-        strncpy(ctx->name, GGML_ZDNN_NAME, sizeof(ctx->name) - 1);
-    }
-
-    ctx->zdnn_device_ref_count += 1;
-    return ctx->zdnn_device;
-}
-
-// Drops one reference on the device context; when the last reference goes
-// away the device id is reset to 0.
-static void ggml_backend_zdnn_device_rel(ggml_backend_zdnn_device_context * ctx) {
-    assert(ctx != NULL);
-    assert(ctx->zdnn_device_ref_count > 0);
-
-    ctx->zdnn_device_ref_count -= 1;
-    if (ctx->zdnn_device_ref_count != 0) {
-        return;
-    }
-
-    if (ctx->zdnn_device >= 0) {
-        ctx->zdnn_device = 0;
-    }
-}
-
-// Allocates a backend context for `dev` and logs the capabilities recorded in
-// the device context. The context starts on device 1 with no graph attached.
-static ggml_backend_zdnn_context * ggml_zdnn_init(ggml_backend_dev_t dev) {
-    GGML_LOG_INFO("%s: allocating\n", __func__);
-    GGML_LOG_INFO("%s: found 1 device\n", __func__);
-
-    #ifdef STATIC_LIB
-    zdnn_init();
-    #endif
-
-    ggml_backend_zdnn_context * ctx = new ggml_backend_zdnn_context();
-    ctx->device = 1;
-    ctx->gf     = nullptr;
-
-    const ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *)dev->context;
-
-    GGML_LOG_INFO("%s: picking default device: %s\n", __func__, ctx_dev->name);
-    GGML_LOG_INFO("%s: NNPA name: %s\n", __func__, ctx_dev->name);
-    GGML_LOG_INFO("%s: NNPA_PARMBLKFORMAT_0 = %s\n", __func__, ctx_dev->has_parmblkformat_0 ? "true" : "false");
-    GGML_LOG_INFO("%s: NNPA_PARMBLKFORMAT_1 = %s\n", __func__, ctx_dev->has_parmblkformat_1 ? "true" : "false");
-
-    return ctx;
-}
-
-static void ggml_zdnn_free(ggml_backend_zdnn_context * ctx) { // counterpart of ggml_zdnn_init: logs and deletes the context
-    GGML_LOG_INFO("%s: deallocating\n", __func__);
-    delete ctx;
-}
-
-//
-// backend interface
-//
-
-// Releases everything owned by a zDNN backend buffer:
-//   - the ztensor storage of every entry that has one (entries are
-//     value-initialised, so a non-NULL ztensor.buffer was allocated by
-//     zdnn_init_ztensor_with_malloc, whether or not it was ever transformed),
-//   - the host allocation itself when this context owns it,
-//   - the context.
-static void ggml_backend_zdnn_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_backend_zdnn_buffer_context * ctx = (ggml_backend_zdnn_buffer_context *)buffer->context;
-
-    // iterate over the vector itself rather than trusting the n_buffers counter
-    for (auto & entry : ctx->buffers) {
-        if (entry->ztensor.buffer != NULL) {
-            ZDNN_CHECK(zdnn_free_ztensor_buffer(&entry->ztensor));
-        }
-    }
-
-    // memory wrapped by buffer_from_ptr (owned == false) belongs to the caller
-    if (ctx->owned && ctx->all_data != NULL) {
-        ggml_aligned_free(ctx->all_data, ctx->all_size);
-    }
-
-    delete ctx;
-}
-
-// Returns the base address of the buffer's host memory.
-static void * ggml_backend_zdnn_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return ((ggml_backend_zdnn_buffer_context *)buffer->context)->all_data;
-}
-
-// Attaches zDNN state to a tensor placed in this buffer.
-// Views are left alone. Every other tensor gets its own
-// ggml_backend_zdnn_buffer entry (descriptors + ztensor), which is stored in
-// the buffer context and exposed through tensor->extra.
-// Always returns GGML_STATUS_SUCCESS; descriptor failures assert inside
-// ggml_zdnn_init_tensor.
-static enum ggml_status ggml_backend_zdnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    if (tensor->view_src != NULL) {
-        assert(tensor->view_src->buffer->buft == buffer->buft);
-        return GGML_STATUS_SUCCESS;
-    }
-
-    ggml_backend_zdnn_buffer_context * ctx = (ggml_backend_zdnn_buffer_context *)buffer->context;
-
-    std::unique_ptr<ggml_backend_zdnn_buffer> zdnn_buffer = std::make_unique<ggml_backend_zdnn_buffer>();
-    zdnn_buffer->data = tensor->data;
-    zdnn_buffer->size = ggml_nbytes(tensor);
-    // the entry is value-initialised, so copying at most GGML_MAX_NAME - 1
-    // bytes keeps the name NUL-terminated
-    strncpy(zdnn_buffer->name, tensor->name, GGML_MAX_NAME - 1);
-
-    ggml_zdnn_init_tensor(zdnn_buffer.get(), tensor);
-    // the raw pointer stays valid: ownership moves into ctx->buffers below
-    tensor->extra = zdnn_buffer.get();
-
-    ctx->buffers.push_back(std::move(zdnn_buffer));
-    ctx->n_buffers++;
-
-    return GGML_STATUS_SUCCESS;
-}
-
-// Fills `size` bytes of the tensor's host data, starting at `offset`, with `value`.
-static void ggml_backend_zdnn_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
-    GGML_UNUSED(buffer);
-
-    char * dst = (char *)tensor->data + offset;
-    memset(dst, value, size);
-}
-
-// Copies `size` bytes from `data` into the tensor's host data at `offset`.
-static void ggml_backend_zdnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_UNUSED(buffer);
-
-    char * dst = (char *)tensor->data + offset;
-    memcpy(dst, data, size);
-}
-
-// Copies `size` bytes of the tensor's host data, starting at `offset`, into `data`.
-static void ggml_backend_zdnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_UNUSED(buffer);
-
-    const char * src = (const char *)tensor->data + offset;
-    memcpy(data, src, size);
-}
-
-// Fills the buffer's whole host allocation with `value`.
-static void ggml_backend_zdnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    ggml_backend_zdnn_buffer_context * buf_ctx = (ggml_backend_zdnn_buffer_context *)buffer->context;
-    memset(buf_ctx->all_data, value, buf_ctx->all_size);
-}
-
-static ggml_backend_buffer_i ggml_backend_zdnn_buffer_i = { // buffer callbacks; cpy_tensor and reset are not provided (NULL)
-    /* .free_buffer   = */ ggml_backend_zdnn_buffer_free_buffer,
-    /* .get_base      = */ ggml_backend_zdnn_buffer_get_base,
-    /* .init_tensor   = */ ggml_backend_zdnn_buffer_init_tensor,
-    /* .memset_tensor = */ ggml_backend_zdnn_buffer_memset_tensor,
-    /* .set_tensor    = */ ggml_backend_zdnn_buffer_set_tensor,
-    /* .get_tensor    = */ ggml_backend_zdnn_buffer_get_tensor,
-    /* .cpy_tensor    = */ NULL,
-    /* .clear         = */ ggml_backend_zdnn_buffer_clear,
-    /* .reset         = */ NULL,
-};
-
-//
-// default buffer type
-//
-
-// Name of the default zDNN buffer type.
-static const char * ggml_backend_zdnn_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(buft);
-
-    return GGML_ZDNN_NAME;
-}
-
-// Allocates a host buffer of at least `size` bytes, rounded up to a whole
-// number of pages, and wraps it in a ggml backend buffer.
-// Returns NULL when a non-empty allocation fails. A zero-sized request yields
-// a buffer with no backing memory and no entries.
-static ggml_backend_buffer_t ggml_backend_zdnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *)buft->device->context;
-    GGML_ASSERT(ctx_dev->zdnn_device >= 0);
-
-    // round the request up to the next page boundary
-    const size_t size_page = sysconf(_SC_PAGESIZE);
-
-    size_t size_aligned = size;
-    if ((size_aligned % size_page) != 0) {
-        size_aligned += size_page - (size_aligned % size_page);
-    }
-
-    ggml_backend_zdnn_buffer_context * ctx = new ggml_backend_zdnn_buffer_context();
-
-    ctx->all_data  = ggml_aligned_malloc(size_aligned);
-    ctx->all_size  = size_aligned;
-    ctx->owned     = true;
-    // n_buffers must mirror ctx->buffers.size(): free_buffer uses it as a loop bound
-    ctx->n_buffers = 0;
-
-    if (ctx->all_data == NULL) {
-        if (size_aligned > 0) {
-            GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n",
-                           __func__, size_aligned / 1024.0 / 1024.0);
-            delete ctx;
-            return NULL;
-        }
-    } else {
-        // first entry describes the allocation as a whole
-        std::unique_ptr<ggml_backend_zdnn_buffer> zdnn_buffer = std::make_unique<ggml_backend_zdnn_buffer>();
-        zdnn_buffer->data = ctx->all_data;
-        zdnn_buffer->size = size_aligned;
-        ctx->buffers.push_back(std::move(zdnn_buffer));
-        ctx->n_buffers++;
-    }
-
-    return ggml_backend_buffer_init(buft, ggml_backend_zdnn_buffer_i, ctx, size);
-}
-
-// Alignment reported for tensors placed in zDNN buffers: 256 bytes.
-static size_t ggml_backend_zdnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(buft);
-
-    return 256;
-}
-
-// zDNN buffers are reported as host buffers.
-static bool ggml_backend_zdnn_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(buft);
-
-    return true;
-}
-
-ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void) { // returns the address of a function-local static, so every call yields the same object
-    static ggml_backend_buffer_type ggml_backend_buffer_type_zdnn = {
-        /* .iface   = */ {
-            /* .get_name       = */ ggml_backend_zdnn_buffer_type_get_name,
-            /* .alloc_buffer   = */ ggml_backend_zdnn_buffer_type_alloc_buffer,
-            /* .get_alignment  = */ ggml_backend_zdnn_buffer_type_get_alignment,
-            /* .get_max_size   = */ NULL,
-            /* .get_alloc_size = */ NULL,  // defaults to ggml_nbytes
-            /* .is_host        = */ ggml_backend_zdnn_buffer_type_is_host,
-        },
-        /* .device  = */ &g_ggml_backend_zdnn_device,
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_buffer_type_zdnn;
-}
-
-// Name of the buffer type used for buffers that wrap caller-provided memory.
-static const char * ggml_backend_zdnn_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(buft);
-
-    return GGML_ZDNN_NAME "_Mapped";
-}
-
-static ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_from_ptr_type(void) { // same callbacks as the default buffer type except for get_name
-    static ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_zdnn = {
-        /* .iface = */ {
-            /* .get_name       = */ ggml_backend_zdnn_buffer_from_ptr_type_get_name,
-            /* .alloc_buffer   = */ ggml_backend_zdnn_buffer_type_alloc_buffer,
-            /* .get_alignment  = */ ggml_backend_zdnn_buffer_type_get_alignment,
-            /* .get_max_size   = */ NULL,
-            /* .get_alloc_size = */ NULL,  // defaults to ggml_nbytes
-            /* .is_host        = */ ggml_backend_zdnn_buffer_type_is_host,
-        },
-        /* .device  = */ &g_ggml_backend_zdnn_device,
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_buffer_from_ptr_type_zdnn;
-}
-
-//
-// backend
-//
-
-// Name reported for the backend instance.
-static const char * ggml_backend_zdnn_name(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
-
-    return GGML_ZDNN_NAME;
-}
-
-// Destroys a backend created by ggml_backend_zdnn_init / device_init: first
-// the zDNN context, then the malloc'ed backend object itself.
-static void ggml_backend_zdnn_free(ggml_backend_t backend) {
-    ggml_zdnn_free((ggml_backend_zdnn_context *)backend->context);
-    free(backend);
-}
-
-static enum ggml_status ggml_backend_zdnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { // interface adapter: forwards to ggml_zdnn_graph_compute
-    return ggml_zdnn_graph_compute(backend, cgraph);
-}
-
-static ggml_backend_i ggml_backend_zdnn_i = { // only get_name, free and graph_compute are provided; every other hook is NULL
-    /* .get_name           = */ ggml_backend_zdnn_name,
-    /* .free               = */ ggml_backend_zdnn_free,
-    /* .set_tensor_async   = */ NULL,
-    /* .get_tensor_async   = */ NULL,
-    /* .cpy_tensor_async   = */ NULL,
-    /* .synchronize        = */ NULL,
-    /* .graph_plan_create  = */ NULL,
-    /* .graph_plan_free    = */ NULL,
-    /* .graph_plan_update  = */ NULL,
-    /* .graph_plan_compute = */ NULL,
-    /* .graph_compute      = */ ggml_backend_zdnn_graph_compute,
-    /* .event_record       = */ NULL,
-    /* .event_wait         = */ NULL,
-};
-
-// Returns the backend GUID, which is the 16 characters of a static string
-// literal reinterpreted as a ggml_guid_t.
-static ggml_guid_t ggml_backend_zdnn_guid(void) {
-    static const char * guid_str = "IBM-ZDNN-ACCELER";
-
-    void * raw = (void *)guid_str;
-    return reinterpret_cast<ggml_guid_t>(raw);
-}
-
-// TODO: remove in the future
-// Creates a backend instance bound to device 0 of the zDNN registry.
-// Returns NULL (after logging) if the context or the backend object cannot be
-// allocated; the context is released again if only the second allocation fails.
-ggml_backend_t ggml_backend_zdnn_init(void) {
-    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_zdnn_reg(), 0);
-
-    ggml_backend_zdnn_context * ctx = ggml_zdnn_init(dev);
-    if (ctx == NULL) {
-        GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
-        return NULL;
-    }
-
-    // allocated with malloc because ggml_backend_zdnn_free releases it with free()
-    ggml_backend_t backend = (ggml_backend_t)malloc(sizeof(ggml_backend));
-    if (backend == NULL) {
-        GGML_LOG_ERROR("%s: error: failed to allocate backend\n", __func__);
-        ggml_zdnn_free(ctx);
-        return NULL;
-    }
-
-    *backend = (ggml_backend) {
-        /* .guid       = */ ggml_backend_zdnn_guid(),
-        /* .iface      = */ ggml_backend_zdnn_i,
-        /* .device     = */ dev,
-        /* .context    = */ ctx,
-    };
-
-    return backend;
-}
-
-// True when `backend` is non-NULL and carries the zDNN backend GUID.
-bool ggml_backend_is_zdnn(ggml_backend_t backend) {
-    if (backend == NULL) {
-        return false;
-    }
-
-    return ggml_guid_matches(backend->guid, ggml_backend_zdnn_guid());
-}
-
-//
-// backend device
-//
-
-// Name reported for the device.
-static const char * ggml_backend_zdnn_device_get_name(ggml_backend_dev_t dev) {
-    GGML_UNUSED(dev);
-
-    return GGML_ZDNN_NAME;
-}
-
-// Human-readable description of the device.
-static const char * ggml_backend_zdnn_device_get_description(ggml_backend_dev_t dev) {
-    return "IBM Z Neural Network Processing Assist (NNPA)";
-
-    // mark the parameter as intentionally unused, like the sibling accessors
-    GGML_UNUSED(dev);
-}
-
-// Reports the device's memory; both values are always written as 0.
-static void ggml_backend_zdnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    *free  = 0;
-    *total = 0;
-
-    // mark the parameter as intentionally unused, like the sibling accessors
-    GGML_UNUSED(dev);
-}
-
-// The device is reported as an accelerator.
-static enum ggml_backend_dev_type ggml_backend_zdnn_device_get_type(ggml_backend_dev_t dev) {
-    GGML_UNUSED(dev);
-
-    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
-}
-
-// Fills `props` with the device's name, description, type, memory figures and
-// capability flags (only buffer_from_host_ptr is set).
-static void ggml_backend_zdnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
-    const ggml_backend_dev_caps caps = {
-        /* .async                = */ false,
-        /* .host_buffer          = */ false,
-        /* .buffer_from_host_ptr = */ true,
-        /* .events               = */ false,
-    };
-
-    props->name        = ggml_backend_zdnn_device_get_name(dev);
-    props->description = ggml_backend_zdnn_device_get_description(dev);
-    props->type        = ggml_backend_zdnn_device_get_type(dev);
-    ggml_backend_zdnn_device_get_memory(dev, &props->memory_free, &props->memory_total);
-    props->caps        = caps;
-}
-
-// Creates a backend instance for `dev`; `params` is ignored.
-// Returns NULL (after logging) if the context or the backend object cannot be
-// allocated; the context is released again if only the second allocation fails.
-static ggml_backend_t ggml_backend_zdnn_device_init(ggml_backend_dev_t dev, const char * params) {
-    ggml_backend_zdnn_context * ctx = ggml_zdnn_init(dev);
-    if (ctx == NULL) {
-        GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
-        return NULL;
-    }
-
-    // allocated with malloc because ggml_backend_zdnn_free releases it with free()
-    ggml_backend_t backend = (ggml_backend *)malloc(sizeof(ggml_backend));
-    if (backend == NULL) {
-        GGML_LOG_ERROR("%s: error: failed to allocate backend\n", __func__);
-        ggml_zdnn_free(ctx);
-        return NULL;
-    }
-
-    *backend = (ggml_backend) {
-        /* .guid       = */ ggml_backend_zdnn_guid(),
-        /* .iface      = */ ggml_backend_zdnn_i,
-        /* .device     = */ dev,
-        /* .context    = */ ctx,
-    };
-
-    return backend;
-
-    GGML_UNUSED(params);
-}
-
-// The device's buffer type is the default zDNN buffer type.
-static ggml_backend_buffer_type_t ggml_backend_zdnn_device_get_buffer_type(ggml_backend_dev_t dev) {
-    GGML_UNUSED(dev);
-
-    return ggml_backend_zdnn_buffer_type();
-}
-
-// Wraps caller-owned memory [ptr, ptr + size) in a backend buffer without
-// copying it. The context records the original pointer and size and is marked
-// as not owning the memory; the single buffer entry describes the region after
-// its start has been moved down to a page boundary. `max_tensor_size` is not
-// used.
-static ggml_backend_buffer_t ggml_backend_zdnn_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
-    ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *)dev->context;
-    GGML_ASSERT(ctx_dev->zdnn_device >= 0);
-
-    ggml_backend_zdnn_buffer_context * ctx = new ggml_backend_zdnn_buffer_context();
-
-    ctx->all_data  = ptr;
-    ctx->all_size  = size;
-    ctx->owned     = false;
-    ctx->n_buffers = 0;
-
-    const size_t size_page = sysconf(_SC_PAGESIZE);
-
-    // page-align the data ptr
-    {
-        const uintptr_t offs = (uintptr_t) ptr % size_page;
-        ptr  = (void *)((char *)ptr - offs);
-        size += offs;
-    }
-
-    // page-rounded size, used for the log message only
-    size_t size_aligned = size;
-    if ((size_aligned % size_page) != 0) {
-        size_aligned += size_page - (size_aligned % size_page);
-    }
-
-    std::unique_ptr<ggml_backend_zdnn_buffer> zdnn_buffer = std::make_unique<ggml_backend_zdnn_buffer>();
-    zdnn_buffer->data = ptr;
-    zdnn_buffer->size = size;
-    ctx->buffers.push_back(std::move(zdnn_buffer));
-    ++ctx->n_buffers;
-
-    GGML_LOG_INFO("%s: allocated buffer, size = %8.2f MiB\n",
-                  __func__, size_aligned / 1024.0 / 1024.0);
-
-    return ggml_backend_buffer_init(ggml_backend_zdnn_buffer_from_ptr_type(), ggml_backend_zdnn_buffer_i, ctx, size);
-
-    GGML_UNUSED(max_tensor_size);
-}
-
-// Device-interface adapter: unwraps the device context and asks ggml_zdnn_supports_op.
-static bool ggml_backend_zdnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    return ggml_zdnn_supports_op((ggml_backend_zdnn_device_context *) dev->context, op);
-}
-
-// A buffer type is supported when it is one of the two zDNN buffer types,
-// recognised by the address of their get_name callback.
-static bool ggml_backend_zdnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(dev);
-
-    const auto name_fn = buft->iface.get_name;
-
-    return name_fn == ggml_backend_zdnn_buffer_type_get_name
-        || name_fn == ggml_backend_zdnn_buffer_from_ptr_type_get_name;
-}
-
-static ggml_backend_device_i ggml_backend_zdnn_device_i = { // device callbacks; host buffer type, offload_op and events are not provided (NULL)
-    /* .get_name             = */ ggml_backend_zdnn_device_get_name,
-    /* .get_description      = */ ggml_backend_zdnn_device_get_description,
-    /* .get_memory           = */ ggml_backend_zdnn_device_get_memory,
-    /* .get_type             = */ ggml_backend_zdnn_device_get_type,
-    /* .get_props            = */ ggml_backend_zdnn_device_get_props,
-    /* .init_backend         = */ ggml_backend_zdnn_device_init,
-    /* .get_buffer_type      = */ ggml_backend_zdnn_device_get_buffer_type,
-    /* .get_host_buffer_type = */ NULL,
-    /* .buffer_from_host_ptr = */ ggml_backend_zdnn_device_buffer_from_ptr,
-    /* .supports_op          = */ ggml_backend_zdnn_device_supports_op,
-    /* .supports_buft        = */ ggml_backend_zdnn_device_supports_buft,
-    /* .offload_op           = */ NULL,
-    /* .event_new            = */ NULL,
-    /* .event_free           = */ NULL,
-    /* .event_synchronize    = */ NULL,
-};
-
-//
-// backend registry
-//
-
-// Name reported for the backend registry entry.
-static const char * ggml_backend_zdnn_reg_get_name(ggml_backend_reg_t reg) {
-    GGML_UNUSED(reg);
-
-    return GGML_ZDNN_NAME;
-}
-
-// One device when zdnn_is_nnpa_installed() reports the NNPA, otherwise none.
-static size_t ggml_backend_zdnn_reg_device_count(ggml_backend_reg_t reg) {
-    GGML_UNUSED(reg);
-
-    return zdnn_is_nnpa_installed() ? 1 : 0;
-}
-
-// Returns the single zDNN device; any index other than 0 trips the assertion.
-static ggml_backend_dev_t ggml_backend_zdnn_reg_device_get(ggml_backend_reg_t reg, size_t index) {
-    GGML_UNUSED(reg);
-
-    GGML_ASSERT(index == 0);
-
-    return &g_ggml_backend_zdnn_device;
-}
-
-static ggml_backend_feature g_ggml_backend_zdnn_features[] = { // the zDNN queries below run once, while this static is initialised
-    { "NNPA", zdnn_is_nnpa_installed() ? "1" : "0" },
-    { "NNPA_PARMBLKFORMAT_0", zdnn_is_nnpa_parmblk_fmt_installed(1, NNPA_PARMBLKFORMAT_0) ? "1" : "0" },
-    { "NNPA_PARMBLKFORMAT_1", zdnn_is_nnpa_parmblk_fmt_installed(1, NNPA_PARMBLKFORMAT_1) ? "1" : "0" },
-    { NULL, NULL }, // end-of-list marker
-};
-
-// Returns the NULL-terminated feature list of the backend.
-static ggml_backend_feature * ggml_backend_zdnn_get_features(ggml_backend_reg_t reg) {
-    GGML_UNUSED(reg);
-
-    return g_ggml_backend_zdnn_features;
-}
-
-// Looks up an optional entry point by name. Only "ggml_backend_get_features"
-// is known; every other name yields NULL.
-static void * ggml_backend_zdnn_get_proc_address(ggml_backend_reg_t reg, const char * name) {
-    GGML_UNUSED(reg);
-
-    const bool wants_features = strcmp(name, "ggml_backend_get_features") == 0;
-    if (!wants_features) {
-        return NULL;
-    }
-
-    return (void *) ggml_backend_zdnn_get_features;
-}
-
-static ggml_backend_reg_i ggml_backend_zdnn_reg_i = { // registry callbacks installed into g_ggml_backend_zdnn_reg
-    /* .get_name         = */ ggml_backend_zdnn_reg_get_name,
-    /* .get_device_count = */ ggml_backend_zdnn_reg_device_count,
-    /* .get_device       = */ ggml_backend_zdnn_reg_device_get,
-    /* .get_proc_address = */ ggml_backend_zdnn_get_proc_address,
-};
-
-static void ggml_zdnn_cleanup(void) { // atexit handler: drops one reference on the main device context
-    ggml_backend_zdnn_device_rel(&g_ggml_ctx_dev_main);
-}
-
-// Returns the zDNN backend registry entry.
-//
-// The one-time setup (acquiring the device, registering the atexit cleanup and
-// filling in the registry/device globals) lives in the initialiser of a
-// function-local static, so it runs exactly once even if several threads call
-// this function concurrently. Previously every call took another device
-// reference, registered another atexit handler and rewrote the globals while
-// other code could already be using them.
-ggml_backend_reg_t ggml_backend_zdnn_reg(void) {
-    static const bool initialized = [] {
-        ggml_backend_zdnn_device_acq(&g_ggml_ctx_dev_main);
-
-        // register cleanup callback
-        atexit(ggml_zdnn_cleanup);
-
-        g_ggml_backend_zdnn_reg = (ggml_backend_reg) {
-            /* .api_version = */ GGML_ZDNN_VERSION,
-            /* .iface       = */ ggml_backend_zdnn_reg_i,
-            /* .context     = */ NULL,
-        };
-
-        g_ggml_backend_zdnn_device = (ggml_backend_device) {
-            /* .iface       = */ ggml_backend_zdnn_device_i,
-            /* .reg         = */ &g_ggml_backend_zdnn_reg,
-            /* .context     = */ &g_ggml_ctx_dev_main,
-        };
-
-        return true;
-    }();
-    GGML_UNUSED(initialized);
-
-    return &g_ggml_backend_zdnn_reg;
-}
-
-GGML_BACKEND_DL_IMPL(ggml_backend_zdnn_reg)
--- a/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja
+++ b/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja
@ -1,59 +0,0 @@
-{# Alias tools -> available_tools #}
-{%- if tools and not available_tools -%}
-    {%- set available_tools = tools -%}
-{%- endif -%}
-{%- if messages[0]['role'] == 'system' %}
-     {%- set system_message = messages[0]['content'] %}
-     {%- set loop_messages = messages[1:] %}
- {%- else %}
-     {%- set system_message = "Knowledge Cutoff Date: April 2024. Today's Date: " + strftime_now('%B %d, %Y') + ". You are Granite, developed by IBM." %}
-     {%- if available_tools and documents %}
-         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request. Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
-     {%- elif available_tools %}
-         {%- set system_message = system_message + " You are a helpful assistant with access to the following tools. When a tool is required to answer the user's query, respond only with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request." %}
-     {%- elif documents %}
-         {%- set system_message = system_message + " Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
-    {%- elif thinking %}
-    {%- set system_message = system_message + " You are a helpful AI assistant.
-Respond to every user query in a comprehensive and detailed way. You can write down your thoughts and reasoning process before responding. In the thought process, engage in a comprehensive cycle of analysis, summarization, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. In the response section, based on various attempts, explorations, and reflections from the thoughts section, systematically present the final solution that you deem correct. The response should summarize the thought process. Write your thoughts between <think></think> and write your response between <response></response> for each user query." %}
-     {%- else %}
-         {%- set system_message = system_message + " You are a helpful AI assistant." %}
-     {%- endif %}
-     {%- if 'citations' in controls and documents %}
-         {%- set system_message = system_message + '
-Use the symbols <|start_of_cite|> and <|end_of_cite|> to indicate when a fact comes from a document in the search result, e.g <|start_of_cite|> {document_id: 1}my fact <|end_of_cite|> for a fact from document 1. Afterwards, list all the citations with their corresponding documents in an ordered list.' %}
-     {%- endif %}
-     {%- if 'hallucinations' in controls and documents %}
-         {%- set system_message = system_message + '
-Finally, after the response is written, include a numbered list of sentences from the response with a corresponding risk value that are hallucinated and not based in the documents.' %}
-     {%- endif %}
-     {%- set loop_messages = messages %}
- {%- endif %}
- {{- '<|start_of_role|>system<|end_of_role|>' + system_message + '<|end_of_text|>
-' }}
- {%- if available_tools %}
-     {{- '<|start_of_role|>available_tools<|end_of_role|>' }}
-     {{- available_tools | tojson(indent=4) }}
-     {{- '<|end_of_text|>
-' }}
- {%- endif %}
- {%- if documents %}
-     {%- for document in documents %}
-         {{- '<|start_of_role|>document {"document_id": "' + document['doc_id'] | string + '"}<|end_of_role|>
-' }}
-         {{- document['text'] }}
-         {{- '<|end_of_text|>
-' }}
-              {%- endfor %}
- {%- endif %}
- {%- for message in loop_messages %}
-     {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>
-' }}
-     {%- if loop.last and add_generation_prompt %}
-         {{- '<|start_of_role|>assistant' }}
-             {%- if controls %}
-                 {{- ' ' + controls | tojson()}}
-             {%- endif %}
-         {{- '<|end_of_role|>' }}
-     {%- endif %}
- {%- endfor %}
--- a/models/templates/openai-gpt-oss-120b.jinja
+++ b/models/templates/openai-gpt-oss-120b.jinja
@ -1,331 +0,0 @@
-{#-
-  In addition to the normal inputs of `messages` and `tools`, this template also accepts the
-  following kwargs:
-  - "builtin_tools": A list, can contain "browser" and/or "python".
-  - "model_identity": A string that optionally describes the model identity.
-  - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium".
- #}
-
-{#- Tool Definition Rendering ============================================== #}
-{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}
-    {%- if param_spec.type == "array" -%}
-        {%- if param_spec['items'] -%}
-            {%- if param_spec['items']['type'] == "string" -%}
-                {{- "string[]" }}
-            {%- elif param_spec['items']['type'] == "number" -%}
-                {{- "number[]" }}
-            {%- elif param_spec['items']['type'] == "integer" -%}
-                {{- "number[]" }}
-            {%- elif param_spec['items']['type'] == "boolean" -%}
-                {{- "boolean[]" }}
-            {%- else -%}
-                {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%}
-                {%- if inner_type == "object | object" or inner_type|length > 50 -%}
-                    {{- "any[]" }}
-                {%- else -%}
-                    {{- inner_type + "[]" }}
-                {%- endif -%}
-            {%- endif -%}
-            {%- if param_spec.nullable -%}
-                {{- " | null" }}
-            {%- endif -%}
-        {%- else -%}
-            {{- "any[]" }}
-            {%- if param_spec.nullable -%}
-                {{- " | null" }}
-            {%- endif -%}
-        {%- endif -%}
-    {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%}
-        {#- Handle array of types like ["object", "object"] from Union[dict, list] #}
-        {%- if param_spec.type | length > 1 -%}
-            {{- param_spec.type | join(" | ") }}
-        {%- else -%}
-            {{- param_spec.type[0] }}
-        {%- endif -%}
-    {%- elif param_spec.oneOf -%}
-        {#- Handle oneOf schemas - check for complex unions and fallback to any #}
-        {%- set has_object_variants = false -%}
-        {%- for variant in param_spec.oneOf -%}
-            {%- if variant.type == "object" -%}
-                {%- set has_object_variants = true -%}
-            {%- endif -%}
-        {%- endfor -%}
-        {%- if has_object_variants and param_spec.oneOf|length > 1 -%}
-            {{- "any" }}
-        {%- else -%}
-            {%- for variant in param_spec.oneOf -%}
-                {{- render_typescript_type(variant, required_params) -}}
-                {%- if variant.description %}
-                    {{- "// " + variant.description }}
-                {%- endif -%}
-                {%- if variant.default is defined %}
-                    {{ "// default: " + variant.default|tojson }}
-                {%- endif -%}
-                {%- if not loop.last %}
-                    {{- " | " }}
-                {% endif -%}
-            {%- endfor -%}
-        {%- endif -%}
-    {%- elif param_spec.type == "string" -%}
-        {%- if param_spec.enum -%}
-            {{- '"' + param_spec.enum|join('" | "') + '"' -}}
-        {%- else -%}
-            {{- "string" }}
-            {%- if param_spec.nullable %}
-                {{- " | null" }}
-            {%- endif -%}
-        {%- endif -%}
-    {%- elif param_spec.type == "number" -%}
-        {{- "number" }}
-    {%- elif param_spec.type == "integer" -%}
-        {{- "number" }}
-    {%- elif param_spec.type == "boolean" -%}
-        {{- "boolean" }}
-
-    {%- elif param_spec.type == "object" -%}
-        {%- if param_spec.properties -%}
-            {{- "{\n" }}
-            {%- for prop_name, prop_spec in param_spec.properties.items() -%}
-                {{- prop_name -}}
-                {%- if prop_name not in (param_spec.required or []) -%}
-                    {{- "?" }}
-                {%- endif -%}
-                {{- ": " }}
-                {{ render_typescript_type(prop_spec, param_spec.required or []) }}
-                {%- if not loop.last -%}
-                    {{-", " }}
-                {%- endif -%}
-            {%- endfor -%}
-            {{- "}" }}
-        {%- else -%}
-            {{- "object" }}
-        {%- endif -%}
-    {%- else -%}
-        {{- "any" }}
-    {%- endif -%}
-{%- endmacro -%}
-
-{%- macro render_tool_namespace(namespace_name, tools) -%}
-    {{- "## " + namespace_name + "\n\n" }}
-    {{- "namespace " + namespace_name + " {\n\n" }}
-    {%- for tool in tools %}
-        {%- set tool = tool.function %}
-        {{- "// " + tool.description + "\n" }}
-        {{- "type "+ tool.name + " = " }}
-        {%- if tool.parameters and tool.parameters.properties %}
-            {{- "(_: {\n" }}
-            {%- for param_name, param_spec in tool.parameters.properties.items() %}
-                {%- if param_spec.description %}
-                    {{- "// " + param_spec.description + "\n" }}
-                {%- endif %}
-                {{- param_name }}
-                {%- if param_name not in (tool.parameters.required or []) -%}
-                    {{- "?" }}
-                {%- endif -%}
-                {{- ": " }}
-                {{- render_typescript_type(param_spec, tool.parameters.required or []) }}
-                {%- if param_spec.default is defined -%}
-                    {%- if param_spec.enum %}
-                        {{- ", // default: " + param_spec.default }}
-                    {%- elif param_spec.oneOf %}
-                        {{- "// default: " + param_spec.default }}
-                    {%- else %}
-                        {{- ", // default: " + param_spec.default|tojson }}
-                    {%- endif -%}
-                {%- endif -%}
-                {%- if not loop.last %}
-                    {{- ",\n" }}
-                {%- else %}
-                    {{- ",\n" }}
-                {%- endif -%}
-            {%- endfor %}
-            {{- "}) => any;\n\n" }}
-        {%- else -%}
-            {{- "() => any;\n\n" }}
-        {%- endif -%}
-    {%- endfor %}
-    {{- "} // namespace " + namespace_name }}
-{%- endmacro -%}
-
-{%- macro render_builtin_tools(browser_tool, python_tool) -%}
-    {%- if browser_tool %}
-        {{- "## browser\n\n" }}
-        {{- "// Tool for browsing.\n" }}
-        {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }}
-        {{- "// Cite information from the tool using the following format:\n" }}
-        {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }}
-        {{- "// Do not quote more than 10 words directly from the tool output.\n" }}
-        {{- "// sources=web (default: web)\n" }}
-        {{- "namespace browser {\n\n" }}
-        {{- "// Searches for information related to `query` and displays `topn` results.\n" }}
-        {{- "type search = (_: {\n" }}
-        {{- "query: string,\n" }}
-        {{- "topn?: number, // default: 10\n" }}
-        {{- "source?: string,\n" }}
-        {{- "}) => any;\n\n" }}
-        {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }}
-        {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }}
-        {{- "// If `cursor` is not provided, the most recent page is implied.\n" }}
-        {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }}
-        {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }}
-        {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }}
-        {{- "type open = (_: {\n" }}
-        {{- "id?: number | string, // default: -1\n" }}
-        {{- "cursor?: number, // default: -1\n" }}
-        {{- "loc?: number, // default: -1\n" }}
-        {{- "num_lines?: number, // default: -1\n" }}
-        {{- "view_source?: boolean, // default: false\n" }}
-        {{- "source?: string,\n" }}
-        {{- "}) => any;\n\n" }}
-        {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }}
-        {{- "type find = (_: {\n" }}
-        {{- "pattern: string,\n" }}
-        {{- "cursor?: number, // default: -1\n" }}
-        {{- "}) => any;\n\n" }}
-        {{- "} // namespace browser\n\n" }}
-    {%- endif -%}
-
-    {%- if python_tool %}
-        {{- "## python\n\n" }}
-        {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }}
-        {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }}
-    {%- endif -%}
-{%- endmacro -%}
-
-{#- System Message Construction ============================================ #}
-{%- macro build_system_message() -%}
-    {%- if model_identity is not defined %}
-        {%- set model_identity = "You are ChatGPT, a large language model trained by OpenAI." %}
-    {%- endif %}
-    {{- model_identity + "\n" }}
-    {{- "Knowledge cutoff: 2024-06\n" }}
-    {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }}
-    {%- if reasoning_effort is not defined %}
-        {%- set reasoning_effort = "medium" %}
-    {%- endif %}
-    {{- "Reasoning: " + reasoning_effort + "\n\n" }}
-    {%- if builtin_tools %}
-        {{- "# Tools\n\n" }}
-        {%- set available_builtin_tools = namespace(browser=false, python=false) %}
-        {%- for tool in builtin_tools %}
-            {%- if tool == "browser" %}
-                {%- set available_builtin_tools.browser = true %}
-            {%- elif tool == "python" %}
-                {%- set available_builtin_tools.python = true %}
-            {%- endif %}
-        {%- endfor %}
-        {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }}
-    {%- endif -%}
-    {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }}
-    {%- if tools -%}
-        {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }}
-    {%- endif -%}
-{%- endmacro -%}
-
-{#- Main Template Logic ================================================= #}
-{#- Set defaults #}
-
-{#- Render system message #}
-{{- "<|start|>system<|message|>" }}
-{{- build_system_message() }}
-{{- "<|end|>" }}
-
-{#- Extract developer message #}
-{%- if messages[0].role == "developer" or messages[0].role == "system" %}
-    {%- set developer_message = messages[0].content %}
-    {%- set loop_messages = messages[1:] %}
-{%- else %}
-    {%- set developer_message = "" %}
-    {%- set loop_messages = messages %}
-{%- endif %}
-
-{#- Render developer message #}
-{%- if developer_message or tools %}
-    {{- "<|start|>developer<|message|>" }}
-    {%- if developer_message %}
-        {{- "# Instructions\n\n" }}
-        {{- developer_message }}
-        {{- "\n\n" }}
-    {%- endif %}
-    {%- if tools -%}
-        {{- "# Tools\n\n" }}
-        {{- render_tool_namespace("functions", tools) }}
-    {%- endif -%}
-    {{- "<|end|>" }}
-{%- endif %}
-
-{#- Render messages #}
-{%- set last_tool_call = namespace(name=none) %}
-{%- for message in loop_messages -%}
-    {#- At this point only assistant/user/tool messages should remain #}
-    {%- if message.role == 'assistant' -%}
-        {#- Checks to ensure the messages are being passed in the format we expect #}
-        {%- if "content" in message %}
-            {%- if "<|channel|>analysis<|message|>" in message.content or "<|channel|>final<|message|>" in message.content %}
-                {{- raise_exception("You have passed a message containing <|channel|> tags in the content field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }}
-            {%- endif %}
-        {%- endif %}
-        {%- if "thinking" in message %}
-            {%- if "<|channel|>analysis<|message|>" in message.thinking or "<|channel|>final<|message|>" in message.thinking %}
-                {{- raise_exception("You have passed a message containing <|channel|> tags in the thinking field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }}
-            {%- endif %}
-        {%- endif %}
-        {%- if "tool_calls" in message %}
-            {#- We need very careful handling here - we want to drop the tool call analysis message if the model #}
-            {#- has output a later <|final|> message, but otherwise we want to retain it. This is the only case #}
-            {#- when we render CoT/analysis messages in inference. #}
-            {%- set future_final_message = namespace(found=false) %}
-            {%- for future_message in loop_messages[loop.index:] %}
-                {%- if future_message.role == 'assistant' and "tool_calls" not in future_message %}
-                    {%- set future_final_message.found = true %}
-                {%- endif %}
-            {%- endfor %}
-            {#- We assume max 1 tool call per message, and so we infer the tool call name #}
-            {#- in "tool" messages from the most recent assistant tool call name #}
-            {%- set tool_call = message.tool_calls[0] %}
-            {%- if tool_call.function %}
-                {%- set tool_call = tool_call.function %}
-            {%- endif %}
-            {%- if message.content and message.thinking %}
-                {{- raise_exception("Cannot pass both content and thinking in an assistant message with tool calls! Put the analysis message in one or the other, but not both.") }}
-            {%- elif message.content and not future_final_message.found %}
-                {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }}
-            {%- elif message.thinking and not future_final_message.found %}
-                {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
-            {%- endif %}
-            {{- "<|start|>assistant to=" }}
-            {{- "functions." + tool_call.name + "<|channel|>commentary " }}
-            {{- (tool_call.content_type if tool_call.content_type is defined else "json") + "<|message|>" }}
-            {{- tool_call.arguments|tojson }}
-            {{- "<|call|>" }}
-            {%- set last_tool_call.name = tool_call.name %}
-        {%- elif loop.last and not add_generation_prompt %}
-            {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #}
-            {#- This is a situation that should only occur in training, never in inference. #}
-            {%- if "thinking" in message %}
-                {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
-            {%- endif %}
-            {#- <|return|> indicates the end of generation, but <|end|> does not #}
-            {#- <|return|> should never be an input to the model, but we include it as the final token #}
-            {#- when training, so the model learns to emit it. #}
-            {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }}
-        {%- else %}
-            {#- CoT is dropped during all previous turns, so we never render it for inference #}
-            {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }}
-            {%- set last_tool_call.name = none %}
-        {%- endif %}
-    {%- elif message.role == 'tool' -%}
-        {%- if last_tool_call.name is none %}
-            {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
-        {%- endif %}
-        {{- "<|start|>functions." + last_tool_call.name }}
-        {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }}
-    {%- elif message.role == 'user' -%}
-        {{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
-    {%- endif -%}
-{%- endfor -%}
-
-{#- Generation prompt #}
-{%- if add_generation_prompt -%}
-<|start|>assistant
-{%- endif -%}